In [261]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
import string
import re
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB,CategoricalNB
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mohammed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [201]:
df=pd.read_csv("twitter_training.csv")

In [202]:
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


# Preprocessing

In [203]:
# removing the extra columns
df.drop(columns=['2401','Borderlands'],inplace=True)
# renaming the columns
df.rename(columns={df.columns[0]:'label',df.columns[1]:'text'},inplace=True)

In [204]:
df.head()

Unnamed: 0,label,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [205]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   74681 non-null  object
 1   text    73995 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [206]:
# Dropping null values
df.dropna(inplace=True)

In [207]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73995 entries, 0 to 74680
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   73995 non-null  object
 1   text    73995 non-null  object
dtypes: object(2)
memory usage: 1.7+ MB


In [208]:
df['label'].value_counts()

label
Negative      22358
Positive      20654
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64

In [209]:
pd.get_dummies(df['label']).astype('int')

Unnamed: 0,Irrelevant,Negative,Neutral,Positive
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
74676,0,0,0,1
74677,0,0,0,1
74678,0,0,0,1
74679,0,0,0,1


In [210]:
encoder=LabelEncoder()
df['label']=encoder.fit_transform(df['label'])

In [211]:
df.head()

Unnamed: 0,label,text
0,3,I am coming to the borders and I will kill you...
1,3,im getting on borderlands and i will kill you ...
2,3,im coming on borderlands and i will murder you...
3,3,im getting on borderlands 2 and i will murder ...
4,3,im getting into borderlands and i can murder y...


In [212]:
st=stopwords.words("english")

# Cleaning
<ol>
    <li>
        Turning text into lower case
    </li>
    <li>
        removing stopwords
    </li>
    <li>
        removing non-English characters
    </li>
</ol>
 

In [213]:
def cleaning(text):
    text=text.lower()
    text=re.sub(r'\b(im)\s',"",text)
    text=re.sub(r'\b(all)',"",text)
    text_split=text.split(" ")
    clean_text=[]
    for word in text_split:
        if word not in st:
            clean_text.append(word)
    text=" ".join(clean_text)
    chars=string.ascii_letters+" "
    for char in text:
        if char not in chars:
            text=text.replace(char,"")
    return text  

In [214]:
df['clean']=df['text'].apply(cleaning)

In [215]:
df.drop(columns=['text'],inplace=True)

In [222]:
df.head()

Unnamed: 0,label,clean
0,3,coming borders kill
1,3,getting borderlands kill
2,3,coming borderlands murder
3,3,getting borderlands murder
4,3,getting borderlands murder


In [229]:
transformer=TfidfVectorizer()

In [230]:
X=transformer.fit_transform(df['clean'])

In [234]:
y=df['label'].values

In [233]:
df.shape

(73995, 2)

In [237]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)

In [239]:
clf=DecisionTreeClassifier()

In [240]:
clf.fit(X_train,y_train)

In [242]:
preds=clf.predict(X_test)

In [243]:
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       0.77      0.70      0.74      3266
           1       0.81      0.80      0.81      5579
           2       0.79      0.75      0.77      4492
           3       0.74      0.82      0.78      5162

    accuracy                           0.78     18499
   macro avg       0.78      0.77      0.77     18499
weighted avg       0.78      0.78      0.78     18499



In [246]:
nb=BernoulliNB()

In [247]:
nb.fit(X_train,y_train)

In [248]:
preds=nb.predict(X_test)

In [249]:
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       0.90      0.51      0.65      3266
           1       0.74      0.81      0.77      5579
           2       0.86      0.56      0.68      4492
           3       0.61      0.89      0.72      5162

    accuracy                           0.72     18499
   macro avg       0.78      0.69      0.71     18499
weighted avg       0.76      0.72      0.71     18499



In [253]:
t='today is a good day'

In [254]:
t=cleaning(t)

In [257]:
t=transformer.transform([t])

In [259]:
clf.predict(t)

array([3])

In [260]:
encoder.classes_

array(['Irrelevant', 'Negative', 'Neutral', 'Positive'], dtype=object)

In [262]:
xgb=XGBClassifier()

In [263]:
xgb.fit(X_train,y_train)
preds=xgb.predict(X_test)
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       0.74      0.37      0.49      3266
           1       0.58      0.83      0.68      5579
           2       0.67      0.55      0.61      4492
           3       0.67      0.68      0.68      5162

    accuracy                           0.64     18499
   macro avg       0.67      0.61      0.61     18499
weighted avg       0.66      0.64      0.63     18499

