In [1]:
import pandas as pd
import re

In [2]:
#Dataset

df = pd.read_csv('urdu_sarcastic_dataset.csv',header=0)

#the NaN are there because there are consecutive commas
df.head()

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,🤣😂😂 ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہ...,1.0,,,,,,
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,1.0,,,,,,
2,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...,0.0,,,,,,
3,نہیں پائین 😎,0.0,,,,,,
4,`` مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی...,1.0,,,,,,


In [3]:
df.tail()

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
20055,,,,,,,,
20056,,,,,,,,
20057,,,,,,,,
20058,,,,,,,,
20059,,,,,,,,


# Data Cleaning

In [4]:
#see if there are any rows which have all comlumns as NaN
empty_rows = df.isna().all(axis=1)

#Display the rows that are completely empty
empty_records = df[empty_rows]

empty_records

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
20004,,,,,,,,
20005,,,,,,,,
20006,,,,,,,,
20007,,,,,,,,
20008,,,,,,,,
20009,,,,,,,,
20010,,,,,,,,
20011,,,,,,,,
20012,,,,,,,,
20013,,,,,,,,


In [5]:
#dropping all these rows

df = df.dropna(how='all')

empty_records = df.isna().all(axis=1)

#we can see all those rows are dropped
df[empty_records]

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7


In [6]:
df.head()

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,🤣😂😂 ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہ...,1.0,,,,,,
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,1.0,,,,,,
2,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...,0.0,,,,,,
3,نہیں پائین 😎,0.0,,,,,,
4,`` مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی...,1.0,,,,,,


In [7]:
df.describe()

Unnamed: 0,is_sarcastic,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 7
count,20004.0,0.0,0.0,0.0,0.0,38.0
mean,0.50005,,,,,0.526316
std,0.500012,,,,,0.506009
min,0.0,,,,,0.0
25%,0.0,,,,,0.0
50%,1.0,,,,,1.0
75%,1.0,,,,,1.0
max,1.0,,,,,1.0


In [8]:
#dropping all the cols containing all NaN only

df = df.dropna(axis=1,how='all')

df.describe()

Unnamed: 0,is_sarcastic,Unnamed: 7
count,20004.0,38.0
mean,0.50005,0.526316
std,0.500012,0.506009
min,0.0,0.0
25%,0.0,0.0
50%,1.0,1.0
75%,1.0,1.0
max,1.0,1.0


In [9]:
df.head()

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 6,Unnamed: 7
0,🤣😂😂 ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہ...,1.0,,
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,1.0,,
2,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...,0.0,,
3,نہیں پائین 😎,0.0,,
4,`` مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی...,1.0,,


In [10]:
# Drop rows where the first column contains NaN values
df = df.dropna(subset= [df.columns[0]])

df[df.isna().all(axis=1)]

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 6,Unnamed: 7


# Phase 1: Text Preprocessing

removing stopwords

In [11]:
df['urdu_text'].describe()

count                                                 19955
unique                                                15813
top       جزاک اللہ 🍃🌸🍃 ہمیشہ خوش رہے ۔اللہ تعالٰی ساری ...
freq                                                     42
Name: urdu_text, dtype: object

# Punctuation,emojis and URL Removal

In [12]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
def remove_urls(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

df['urdu_text'] = df['urdu_text'].apply(remove_urls)

df.head(50)


Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 6,Unnamed: 7
0,🤣😂😂 ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہ...,1.0,,
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,1.0,,
2,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...,0.0,,
3,نہیں پائین 😎,0.0,,
4,`` مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی...,1.0,,
5,قابل اعتبار ہی اکثر قاتل اعتبار ہوتے ہیں 💔🔥,1.0,,
6,انساں کو تھکا دیتا ہے سوچوں کا سفر بھی ... 🍁🥀,0.0,,
7,حامد میر صاحب ویلڈن👏😊,0.0,,
8,یار وچارہ ویلا ہوندا ہے اس آرے لگا ہویا ہے😂😂 ت...,1.0,,
9,یہ سمجھتے ہیں سارا پاکستان بیوقوف ھے 😂😂😂,1.0,,


In [14]:
def remove_punc(text):
    punctuation_marks = "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~۔۔۔؟…،"
    return text.translate(str.maketrans('', '', punctuation_marks))



df['urdu_text'] = df['urdu_text'].apply(remove_punc)

df.head(50)

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 6,Unnamed: 7
0,🤣😂😂 ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہ...,1.0,,
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,1.0,,
2,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...,0.0,,
3,نہیں پائین 😎,0.0,,
4,مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی ت...,1.0,,
5,قابل اعتبار ہی اکثر قاتل اعتبار ہوتے ہیں 💔🔥,1.0,,
6,انساں کو تھکا دیتا ہے سوچوں کا سفر بھی 🍁🥀,0.0,,
7,حامد میر صاحب ویلڈن👏😊,0.0,,
8,یار وچارہ ویلا ہوندا ہے اس آرے لگا ہویا ہے😂😂 ت...,1.0,,
9,یہ سمجھتے ہیں سارا پاکستان بیوقوف ھے 😂😂😂,1.0,,


In [15]:

def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # Chinese characters
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # miscellaneous symbols
        u"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
        u"\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
        u"\U00002600-\U000026FF"  # miscellaneous symbols
        u"\U0001F018-\U0001F270"  # various emoji
        u"\U0001F1E6-\U0001F1FF"  # flags
        "]+", flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)



df['urdu_text'] = df['urdu_text'].apply(remove_emoji)

df.head(50)


Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 6,Unnamed: 7
0,ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہیں ...,1.0,,
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,1.0,,
2,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...,0.0,,
3,نہیں پائین,0.0,,
4,مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی ت...,1.0,,
5,قابل اعتبار ہی اکثر قاتل اعتبار ہوتے ہیں,1.0,,
6,انساں کو تھکا دیتا ہے سوچوں کا سفر بھی,0.0,,
7,حامد میر صاحب ویلڈن,0.0,,
8,یار وچارہ ویلا ہوندا ہے اس آرے لگا ہویا ہے تسی...,1.0,,
9,یہ سمجھتے ہیں سارا پاکستان بیوقوف ھے,1.0,,


# Remove Digits

In [16]:
urdu_digits = ['۶', '۴', '۵', '۸', '۲', '۰', '۷', '۹', '۳', '۱']

english_digits=['1','2','3','4','5','6','7','8','9','0']

def remove_numbers(text):
    for letter in text:
        if letter in urdu_digits or letter in english_digits :
            text = text.replace(letter, '')
    return text

df['urdu_text'] = df['urdu_text'].apply(remove_numbers)

df.head(50)

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 6,Unnamed: 7
0,ہو لینے دے میری شادی فسادن ٹھیک ہے کوجی نہیں ...,1.0,,
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,1.0,,
2,کامران خان آپکی دن بھریہ زمہ داری لگائی گئی اپ...,0.0,,
3,نہیں پائین,0.0,,
4,مراد علی شاہ کے بھیس میں ڈی جی آئی ایس آئی ت...,1.0,,
5,قابل اعتبار ہی اکثر قاتل اعتبار ہوتے ہیں,1.0,,
6,انساں کو تھکا دیتا ہے سوچوں کا سفر بھی,0.0,,
7,حامد میر صاحب ویلڈن,0.0,,
8,یار وچارہ ویلا ہوندا ہے اس آرے لگا ہویا ہے تسی...,1.0,,
9,یہ سمجھتے ہیں سارا پاکستان بیوقوف ھے,1.0,,


In [17]:
import json

# Load stopwords from a JSON file
with open('urdu_stopwords.json', 'r', encoding='utf-8') as f:
    urdu_stopwords = json.load(f)


    
def remove_stopwords(text):
    temp=[]

    words=text.split()
    for letter in words:
        if letter not in urdu_stopwords:
            if letter !='':
                temp.append(letter)
            
    return (' '.join(temp) )
    

In [18]:
df['urdu_text']= df['urdu_text'].apply(remove_stopwords)

df.head(50)

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 6,Unnamed: 7
0,لینے میری شادی فسادن کوجی نہیں چاہیے,1.0,,
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,1.0,,
2,کامران خان آپکی دن بھریہ زمہ داری لگائی اپوزیش...,0.0,,
3,نہیں پائین,0.0,,
4,مراد علی شاہ بھیس میں ڈی جی ایس حامد میر,1.0,,
5,قابل اعتبار قاتل اعتبار,1.0,,
6,انساں کو تھکا دیتا سوچوں کا سفر بھی,0.0,,
7,حامد میر صاحب ویلڈن,0.0,,
8,یار وچارہ ویلا ہوندا اس آرے لگا ہویا تسی تے پک...,1.0,,
9,سمجھتے سارا پاکستان بیوقوف ھے,1.0,,


# Filter short words

In [19]:
def short_words(text):
    temp = []
    for word in text.split():
        if len(word) > 3:
            if word !='':
                temp.append(word)
                
    return ' '.join(temp)
            
    

In [20]:
df['urdu_text'] = df['urdu_text'].apply(short_words)

df.head(50)

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 6,Unnamed: 7
0,لینے میری شادی فسادن کوجی نہیں چاہیے,1.0,,
1,مہمانوں کھانا چڑیل چاچی دسدی,1.0,,
2,کامران آپکی بھریہ داری لگائی اپوزیشن کردار اور...,0.0,,
3,نہیں پائین,0.0,,
4,مراد بھیس حامد,1.0,,
5,قابل اعتبار قاتل اعتبار,1.0,,
6,انساں تھکا دیتا سوچوں,0.0,,
7,حامد صاحب ویلڈن,0.0,,
8,وچارہ ویلا ہوندا ہویا نجومی منندے,1.0,,
9,سمجھتے سارا پاکستان بیوقوف,1.0,,


In [21]:
df.tail(50)

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 6,Unnamed: 7
19954,,0.0,,
19955,چاپلوسی نہیں اگلے الیکشن کھڑے بغیر کیسے محترمہ,1.0,,
19956,چھوڑ نہیں سکتی دیکھنا کیونکہ تجسسس لیکن بڈھے س...,0.0,,
19957,ابھرتے سورج دیکھ مجھے یقین آزادی کشمیر سورج ای...,0.0,,
19958,ھاھاھاھاھا بیچارے پرندے ساتھ,0.0,,
19959,بائیٹ اتنا الزام,0.0,,
19960,ملتان روٹی پکانے پابندی,1.0,,
19961,کیسے بدلتے رشتے پٹواری جیالا لگتاہے,0.0,,
19962,پنکھے,0.0,,
19963,گائے بھرنا نہیں چاہئیے,0.0,,


# Phase 2: Stemming and Lemmatization

In [22]:
pip install lughaatNLP




[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [23]:
from LughaatNLP import LughaatNLP
urdu_text_processing = LughaatNLP()






In [24]:
df['urdu_text'] = df['urdu_text'].apply(urdu_text_processing.urdu_stemmer)

In [25]:
df.head(50)

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 6,Unnamed: 7
0,لینہ میری شادی فسادن کوجی نہا چاہیہ,1.0,,
1,مہمانا کھانا چڑیل چاچی دسدی,1.0,,
2,کامران آپکی بھریہ داری لگائی اپوزیشن کردار اور...,0.0,,
3,نہا پائین,0.0,,
4,مراد بھیس حامد,1.0,,
5,قابل اعتبر قاتل اعتبر,1.0,,
6,انساں تھکا دیت سوچا,0.0,,
7,حامد صاحب ویلڈن,0.0,,
8,وچارہ ویلا ہوندا ہویا نجومی منندہ,1.0,,
9,سمجھتہ سارا پاکستن بیوقوف,1.0,,


In [26]:
df['urdu_text'] = df['urdu_text'].apply(urdu_text_processing.lemmatize_sentence)

In [27]:
df.head(50)

Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 6,Unnamed: 7
0,لینہ میرا شادی فسادن کوجی نہانا چاہیہ,1.0,,
1,مہمانا کھا چڑیل چاچی دسدی,1.0,,
2,کامران میرا بھریہ داری لگنا اپوزیشن کردار اورا...,0.0,,
3,نہانا پائین,0.0,,
4,مراد بھیس حامد,1.0,,
5,قابل اعتبر قاتل اعتبر,1.0,,
6,انساں تھکنا دیت سوچنا,0.0,,
7,حامد صاحب ویلڈن,0.0,,
8,وچارہ ویلا ہوندا ہویا نجومی منندہ,1.0,,
9,سمجھتہ سارا پاکستن بیوقوف,1.0,,


# Phase 3: Feature Extraction

In [28]:
tokens = df['urdu_text'].apply(urdu_text_processing.urdu_tokenize)
tokens

0            [لینہ, میرا, شادی, فسادن, کوجی, نہانا, چاہیہ]
1                          [مہمانا, کھا, چڑیل, چاچی, دسدی]
2        [کامران, میرا, بھریہ, داری, لگنا, اپوزیشن, کرد...
3                                           [نہانا, پائین]
4                                       [مراد, بھیس, حامد]
                               ...                        
19999                   [راجہ, صاحب, چھڈیا, آواز, یوتھیئہ]
20000                                 [بےبی, پرائم, منسٹر]
20001            [اتنا, بونگا, وزیر, اعظم, ڈھونڈنہ, نہانا]
20002    [کاکا, عِدت, پوری, نہانا, ہونہ, عوام, کیسہ, تی...
20003    [جتنا, مرضی, بلیک, نہانا, گاجتنہ, مرضی, جلسہ, ...
Name: urdu_text, Length: 19955, dtype: object

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
vectorizer = TfidfVectorizer()

tfidf_vectors = vectorizer.fit_transform(df['urdu_text'][:10])

# Convert the TF-IDF vectors to a DataFrame
tfidf_df = pd.DataFrame(tfidf_vectors.toarray(), columns=vectorizer.get_feature_names_out())

# Print the TF-IDF DataFrame
tfidf_df.head(10)

Unnamed: 0,اعتبر,انساں,اوراس,اپوزیشن,بدبوآپ,بھریہ,بھیس,بیوقوف,تھکنا,حامد,...,چڑیل,کامران,کردار,کماناچاہتےہا,کوجی,کھا,کےسفرما,ہوندا,ہویا,ہےہما
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.399177,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0
2,0.0,0.0,0.198906,0.198906,0.198906,0.198906,0.0,0.0,0.0,0.0,...,0.0,0.198906,0.198906,0.198906,0.0,0.0,0.198906,0.0,0.0,0.198906
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.606043,0.0,0.0,0.515192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.816497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.515192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,0.408248,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
from gensim.models import Word2Vec

model_cbow = Word2Vec(sentences=tokens, min_count=1, window=4,sg=0,vector_size=5)

cbow_vectors = model_cbow.wv

#creating a data frame for cbow word2vec

df_cbow = pd.DataFrame(cbow_vectors.vectors, index = cbow_vectors.index_to_key)

df_cbow.head(10)

Unnamed: 0,0,1,2,3,4
نہانا,3.416219,1.321064,4.90689,-2.999023,-0.671047
میرا,2.474938,2.697515,3.75554,-3.753396,-0.181537
اللہ,2.709821,2.484616,4.540586,-5.886301,-7.439035
صاحب,1.340916,-0.335377,5.094649,-4.360559,-0.053183
سندھ,0.605115,4.714535,6.849089,-2.732483,3.490943
پاکستن,0.914264,0.669867,5.644705,-4.324653,0.695528
والا,1.502854,4.203449,3.92495,-4.174159,-0.110684
کرنہ,2.612968,3.226404,4.423012,-4.177242,0.786045
️,-2.850546,6.124665,2.687757,-10.056393,-1.100048
نواز,2.222471,-2.427465,8.010687,-4.135088,3.169136


In [32]:
similar_words = model_cbow.wv.most_similar("اچھا", topn=5)

In [33]:
similar_words

[('بولئہ', 0.999354898929596),
 ('بیوقوف', 0.9988667964935303),
 ('فائدہ', 0.9985321760177612),
 ('مذہبی', 0.9981155395507812),
 ('گالی', 0.9980341792106628)]

In [34]:
from gensim.models import Word2Vec

model_skipgram = Word2Vec(sentences=tokens, min_count=1, window=4,sg=1, vector_size=5)

sk_vectors = model_skipgram.wv

#creating a data frame for skipgram word2vec

df_sk = pd.DataFrame(sk_vectors.vectors, index = sk_vectors.index_to_key)

df_sk.head(10)

Unnamed: 0,0,1,2,3,4
نہانا,0.771271,1.03896,1.886552,-0.754217,0.057614
میرا,1.362549,1.236548,1.505917,-0.799155,0.227722
اللہ,1.841931,0.187674,0.741421,-1.937345,-1.884491
صاحب,0.493784,-0.45875,1.8848,-1.195364,0.155398
سندھ,-0.81001,1.496649,2.796462,-1.550992,1.488128
پاکستن,-0.029442,-0.183883,1.895952,-1.39632,0.537332
والا,0.376829,1.985472,1.707943,-0.84906,-0.277413
کرنہ,0.088573,1.516109,1.630515,-1.125776,0.465819
️,-0.693844,2.075157,0.332344,-3.428743,-0.929718
نواز,1.970098,-1.227998,2.781157,-1.329027,0.837791


In [35]:
similar_words_sk = model_skipgram.wv.most_similar("اچھا", topn=5)

In [36]:
similar_words_sk

[('وَاٰلِہٖ', 0.9973012208938599),
 ('مــن', 0.9958191514015198),
 ('یوٹیوبر', 0.9955233335494995),
 ('نانسینس', 0.995385468006134),
 ('چاہیےجسما', 0.9942165613174438)]

# Phase 5 : N-gram Analysis

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
# Initialize the CountVectorizer with desired unigram-gram range
unigram_vectorizer = CountVectorizer()

# Fit and transform the text data
unigram_vectors = unigram_vectorizer.fit_transform(df['urdu_text'][:10])

# Convert the unigram vectors to a DataFrame
unigram_df = pd.DataFrame(unigram_vectors.toarray(), columns=unigram_vectorizer.get_feature_names_out())

# Print the uni-gram DataFrame
unigram_df.head(10)

Unnamed: 0,اعتبر,انساں,اوراس,اپوزیشن,بدبوآپ,بھریہ,بھیس,بیوقوف,تھکنا,حامد,...,چڑیل,کامران,کردار,کماناچاہتےہا,کوجی,کھا,کےسفرما,ہوندا,ہویا,ہےہما
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,0,0,1,1,1,1,0,0,0,0,...,0,1,1,1,0,0,1,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
9,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Sum the trigram counts across all rows
unigram_counts = unigram_df.sum(axis=0)

# Sort the trigrams by frequency in descending order and get the top 10
top_10_unigrams = unigram_counts.sort_values(ascending=False).head(10)

top_10_unigrams

نہانا         3
اعتبر         2
میرا          2
حامد          2
پاکستن        1
مراد          1
منندہ         1
موٹرسائیکل    1
مہمانا        1
نجومی         1
dtype: int64

In [40]:
# Initialize the CountVectorizer with desired unigram-gram range
bigram_vectorizer = CountVectorizer(ngram_range=(2,2))

# Fit and transform the text data
bigram_vectors = bigram_vectorizer.fit_transform(df['urdu_text'][:10])

# Convert the unigram vectors to a DataFrame
bigram_df = pd.DataFrame(bigram_vectors.toarray(), columns=bigram_vectorizer.get_feature_names_out())

# Print the uni-gram DataFrame
bigram_df.head(10)

Unnamed: 0,اعتبر قاتل,انساں تھکنا,اوراس پربھونکناہےآپ,اپوزیشن کردار,بدبوآپ نہانا,بھریہ داری,بھیس حامد,تھکنا دیت,حامد صاحب,خوشامدگری وچاپلوسی,...,چڑیل چاچی,کامران میرا,کردار اوراس,کماناچاہتےہا موٹرسائیکل,کوجی نہانا,کھا چڑیل,کےسفرما ضمیرکی,ہوندا ہویا,ہویا نجومی,ہےہما فالوکرا
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,0,0,1,1,1,1,0,0,0,1,...,0,1,1,1,0,0,1,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Sum the trigram counts across all rows
bigram_counts = bigram_df.sum(axis=0)

# Sort the trigrams by frequency in descending order and get the top 10
top_10_bigrams = bigram_counts.sort_values(ascending=False).head(10)

top_10_bigrams

اعتبر قاتل             1
پاکستن بیوقوف          1
موٹرسائیکل سےپیجارو    1
مہمانا کھا             1
میرا بھریہ             1
میرا شادی              1
نجومی منندہ            1
نہانا سےالتجاگزارش     1
نہانا پائین            1
نہانا چاہیہ            1
dtype: int64

In [42]:
# Initialize the CountVectorizer with desired unigram-gram range
trigram_vectorizer = CountVectorizer(ngram_range=(3,3))

# Fit and transform the text data
trigram_vectors = trigram_vectorizer.fit_transform(df['urdu_text'][:10])

# Convert the unigram vectors to a DataFrame
trigram_df = pd.DataFrame(trigram_vectors.toarray(), columns=trigram_vectorizer.get_feature_names_out())

# Print the uni-gram DataFrame
trigram_df.head(10)

Unnamed: 0,اعتبر قاتل اعتبر,انساں تھکنا دیت,اوراس پربھونکناہےآپ خوشامدگری,اپوزیشن کردار اوراس,بدبوآپ نہانا سےالتجاگزارش,بھریہ داری لگنا,تھکنا دیت سوچنا,حامد صاحب ویلڈن,خوشامدگری وچاپلوسی سےاورکتنی,داری لگنا اپوزیشن,...,چڑیل چاچی دسدی,کامران میرا بھریہ,کردار اوراس پربھونکناہےآپ,کماناچاہتےہا موٹرسائیکل سےپیجارو,کوجی نہانا چاہیہ,کھا چڑیل چاچی,کےسفرما ضمیرکی سےاٹھتی,ہوندا ہویا نجومی,ہویا نجومی منندہ,ہےہما فالوکرا شکریہ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,0,0,1,1,1,1,0,0,1,1,...,0,1,1,1,0,0,1,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Sum the trigram counts across all rows
trigram_counts = trigram_df.sum(axis=0)

# Sort the trigrams by frequency in descending order and get the top 10
top_10_trigrams = trigram_counts.sort_values(ascending=False).head(10)

top_10_trigrams

اعتبر قاتل اعتبر             1
انساں تھکنا دیت              1
موٹرسائیکل سےپیجارو پراڈو    1
مہمانا کھا چڑیل              1
میرا بھریہ داری              1
میرا شادی فسادن              1
نہانا سےالتجاگزارش ہےہما     1
وچارہ ویلا ہوندا             1
وچاپلوسی سےاورکتنی دولت      1
ویلا ہوندا ہویا              1
dtype: int64

# Phase 5 and 6: Sentiment and Classification Model and Evaluation

In [44]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Model Evaluation Libraries
from sklearn.metrics import classification_report, confusion_matrix

In [45]:
#splitting data into test and train

X_train, X_test, Y_train, Y_test = train_test_split(df['urdu_text'], df['is_sarcastic'], test_size = 0.20, random_state = 7)

In [46]:
print('Shape of X_train', X_train.shape)
print('Shape of X_test', X_test.shape)
print('Shape of Y_train', Y_train.shape)
print('Shape of Y_test', Y_test.shape)

Shape of X_train (15964,)
Shape of X_test (3991,)
Shape of Y_train (15964,)
Shape of Y_test (3991,)


In [47]:
max_feature_num = 50000
vectorizer = TfidfVectorizer(max_features=max_feature_num)
train_vecs = vectorizer.fit_transform(X_train)
test_vecs = TfidfVectorizer(max_features=max_feature_num, vocabulary=vectorizer.vocabulary_).fit_transform(X_test)



In [48]:
def LR_classifier(train_vecs, Y_train, test_vecs, Y_test):
    # Training
    LR = LogisticRegression()
    LR.fit(train_vecs, Y_train)

    # testing
    test_predictionLR = LR.predict(test_vecs)
    return classification_report(test_predictionLR, Y_test) , confusion_matrix(test_predictionLR, Y_test)

In [49]:
class_report , conf_matrix = LR_classifier(train_vecs, Y_train, test_vecs, Y_test)
print('Results of Logistic Regression Classifier on Tf idf Vectorizer')
print(class_report)
print(conf_matrix)

Results of Logistic Regression Classifier on Tf idf Vectorizer
              precision    recall  f1-score   support

         0.0       0.79      0.80      0.79      1969
         1.0       0.80      0.79      0.79      2022

    accuracy                           0.79      3991
   macro avg       0.79      0.79      0.79      3991
weighted avg       0.79      0.79      0.79      3991

[[1566  403]
 [ 425 1597]]


In [50]:
def MNB_classifier(train_vecs, Y_train, test_vecs, Y_test):
    # Training
    MNB = MultinomialNB()
    MNB.fit(train_vecs, Y_train)

    # testing
    test_predictionMNB = MNB.predict(test_vecs)
    return classification_report(test_predictionMNB, Y_test) , confusion_matrix(test_predictionMNB, Y_test)

In [51]:
class_report , conf_matrix = MNB_classifier(train_vecs, Y_train, test_vecs, Y_test)
print('Results of Multi nomial Naive Bayes Classifier on Tf idf Vectorizer')
print(class_report)
print(conf_matrix)

Results of Multi nomial Naive Bayes Classifier on Tf idf Vectorizer
              precision    recall  f1-score   support

         0.0       0.67      0.83      0.74      1609
         1.0       0.86      0.72      0.79      2382

    accuracy                           0.76      3991
   macro avg       0.76      0.78      0.76      3991
weighted avg       0.78      0.76      0.77      3991

[[1331  278]
 [ 660 1722]]


In [52]:
#evaluation
"""
Using the above two reports, we can see that there is not a huge difference in the accuracy, macro and micro
average of both the models. However, Logistic Regression performs slightly better than the Bayesian Classifier.

Precision and Recall are used as the evaluation metrics. 
Precision is the % of items the system detected (i.e., items the
system labeled as positive) that are in fact positive
(according to the human gold labels)

--> Logistic Regression has a 79% precision as compared to NB's 67% for the 0's
--> Logistic Regression has an 80% precision as compared to NB's 86% for the 1's.

Logistic Regression has better precision for class 0.0 (negative) compared to Multinomial Naive Bayes, 
which means Logistic Regression is making fewer false positives when predicting class 0.0.
For class 1.0 (positive), Naive Bayes has a higher precision, indicating that 
Naive Bayes is slightly better at minimizing false positives when predicting class 1.0.

Recall is the % of items actually present in the input that were
correctly identified by the system.

--> Logistic Regression has a recall of 80% as compared to NB's 83% for the 0's
--> Logistic Regression has a recall of 79% as compared to NB's 80% for the 1's

Naive Bayes has a better recall for class 0.0, meaning it correctly identifies 
more of the negative class compared to Logistic Regression.
Logistic Regression, however, has better recall for class 1.0, indicating 
it identifies more of the positive class compared to Naive Bayes.


f-score is a single number that combines Precision and Recall

--> Logistic Regression has 79% f1-score as compared to NB's 74% for the 0's
--> Logistic Regression has 79% f1-score as compared to NB's 79% for the 0's

Logistic Regression has a higher F1-score for class 0.0, indicating a better
balance between precision and recall for the negative class.
For class 1.0, both models perform equally well in terms of the F1-score (0.79).
"""

"\nUsing the above two reports, we can see that there is not a huge difference in the accuracy, macro and micro\naverage of both the models. However, Logistic Regression performs slightly better than the Bayesian Classifier.\n\nPrecision and Recall are used as the evaluation metrics. \nPrecision is the % of items the system detected (i.e., items the\nsystem labeled as positive) that are in fact positive\n(according to the human gold labels)\n\n--> Logistic Regression has a 79% precision as compared to NB's 67% for the 0's\n--> Logistic Regression has an 80% precision as compared to NB's 86% for the 1's.\n\nLogistic Regression has better precision for class 0.0 (negative) compared to Multinomial Naive Bayes, \nwhich means Logistic Regression is making fewer false positives when predicting class 0.0.\nFor class 1.0 (positive), Naive Bayes has a higher precision, indicating that \nNaive Bayes is slightly better at minimizing false positives when predicting class 1.0.\n\nRecall is the % of

In [53]:
"""Multinomial Naive Bayes Report:
Class 0.0 (Negative class):

Precision: 0.67 (Out of all instances predicted as 0.0, 67% were correct)
Recall: 0.83 (Out of all actual 0.0 instances, the model identified 83%)
F1-Score: 0.74 (This is the harmonic mean of precision and recall; it balances both metrics)
Support: 1609 (Number of actual instances of class 0.0)
Class 1.0 (Positive class):

Precision: 0.86 (Out of all instances predicted as 1.0, 86% were correct)
Recall: 0.72 (Out of all actual 1.0 instances, the model identified 72%)
F1-Score: 0.79
Support: 2382
Overall Metrics:

Accuracy: 0.76 (76% of the total predictions were correct)
Macro Average:
Precision: 0.76
Recall: 0.78
F1-Score: 0.76
Weighted Average (takes into account class imbalance):
Precision: 0.78
Recall: 0.76
F1-Score: 0.77


[[1331  278]  ->  Class 0.0: 1331 correctly classified as 0.0, 278 misclassified as 1.0
 [ 660 1722]] ->  Class 1.0: 1722 correctly classified as 1.0, 660 misclassified as 0.0

"""

'Multinomial Naive Bayes Report:\nClass 0.0 (Negative class):\n\nPrecision: 0.67 (Out of all instances predicted as 0.0, 67% were correct)\nRecall: 0.83 (Out of all actual 0.0 instances, the model identified 83%)\nF1-Score: 0.74 (This is the harmonic mean of precision and recall; it balances both metrics)\nSupport: 1609 (Number of actual instances of class 0.0)\nClass 1.0 (Positive class):\n\nPrecision: 0.86 (Out of all instances predicted as 1.0, 86% were correct)\nRecall: 0.72 (Out of all actual 1.0 instances, the model identified 72%)\nF1-Score: 0.79\nSupport: 2382\nOverall Metrics:\n\nAccuracy: 0.76 (76% of the total predictions were correct)\nMacro Average:\nPrecision: 0.76\nRecall: 0.78\nF1-Score: 0.76\nWeighted Average (takes into account class imbalance):\nPrecision: 0.78\nRecall: 0.76\nF1-Score: 0.77\n\n\n[[1331  278]  ->  Class 0.0: 1331 correctly classified as 0.0, 278 misclassified as 1.0\n [ 660 1722]] ->  Class 1.0: 1722 correctly classified as 1.0, 660 misclassified as 0

In [54]:
"""Class 0.0 (Negative class):

Precision: 0.79 (Out of all instances predicted as 0.0, 79% were correct)
Recall: 0.80 (Out of all actual 0.0 instances, the model identified 80%)
F1-Score: 0.79
Support: 1969
Class 1.0 (Positive class):

Precision: 0.80 (Out of all instances predicted as 1.0, 80% were correct)
Recall: 0.79 (Out of all actual 1.0 instances, the model identified 79%)
F1-Score: 0.79
Support: 2022
Overall Metrics:

Accuracy: 0.79 (79% of the total predictions were correct)
Macro Average:
Precision: 0.79
Recall: 0.79
F1-Score: 0.79
Weighted Average:
Precision: 0.79
Recall: 0.79
F1-Score: 0.79

[[1566  403]  ->  Class 0.0: 1566 correctly classified as 0.0, 403 misclassified as 1.0
 [ 425 1597]] ->  Class 1.0: 1597 correctly classified as 1.0, 425 misclassified as 0.0


"""

'Class 0.0 (Negative class):\n\nPrecision: 0.79 (Out of all instances predicted as 0.0, 79% were correct)\nRecall: 0.80 (Out of all actual 0.0 instances, the model identified 80%)\nF1-Score: 0.79\nSupport: 1969\nClass 1.0 (Positive class):\n\nPrecision: 0.80 (Out of all instances predicted as 1.0, 80% were correct)\nRecall: 0.79 (Out of all actual 1.0 instances, the model identified 79%)\nF1-Score: 0.79\nSupport: 2022\nOverall Metrics:\n\nAccuracy: 0.79 (79% of the total predictions were correct)\nMacro Average:\nPrecision: 0.79\nRecall: 0.79\nF1-Score: 0.79\nWeighted Average:\nPrecision: 0.79\nRecall: 0.79\nF1-Score: 0.79\n\n[[1566  403]  ->  Class 0.0: 1566 correctly classified as 0.0, 403 misclassified as 1.0\n [ 425 1597]] ->  Class 1.0: 1597 correctly classified as 1.0, 425 misclassified as 0.0\n\n\n'

In [55]:
#optimization
"""
- I have used TfIdf vectorizer but MNB works better with Count Vectorizer because it assumes independence b/w features
  so using Tfidf may improve MNB
- MNB works better with simpler features, 
  but incorporating context through n-grams helps it learn relationships between adjacent words.
- We can tune the alpha parameter for Laplace smooting
- working closely with stopwords removal and normalization of text can also optimize both the models
- LR has a c paramter by default, we can tune it for optimization

"""

'\n- I have used TfIdf vectorizer but MNB works better with Count Vectorizer because it assumes independence b/w features\n  so using Tfidf may improve MNB\n- MNB works better with simpler features, \n  but incorporating context through n-grams helps it learn relationships between adjacent words.\n- We can tune the alpha parameter for Laplace smooting\n- working closely with stopwords removal and normalization of text can also optimize both the models\n- LR has a c paramter by default, we can tune it for optimization\n\n'

In [56]:
#challenges
"""
- Difficult to validate stemming and lemmatization
- punctuation marks are also different, difficulty in data cleaning
- difficulty in ensuring data cleanliness
- Challenges that may occur when working with social media urdu data
   includes that the language used is not even pure Urdu, 
   it is a mix of other languages too. Many posts include Punjabi and English along with Urdu
   and working with multiple languages in a single data set is time-consuming and difficult
   It is also difficult to differentiate punjabi and urdu. Also, there are instances of 
   Roman Urdu and English as well.
"""

'\n- Difficult to validate stemming and lemmatization\n- punctuation marks are also different, difficulty in data cleaning\n- difficulty in ensuring data cleanliness\n- Challenges that may occur when working with social media urdu data\n   includes that the language used is not even pure Urdu, \n   it is a mix of other languages too. Many posts include Punjabi and English along with Urdu\n   and working with multiple languages in a single data set is time-consuming and difficult\n   It is also difficult to differentiate punjabi and urdu. Also, there are instances of \n   Roman Urdu and English as well.\n'