In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import re

In [15]:
df_tweet = pd.read_csv("train.csv")
df_tweet.shape

(29992, 2)

In [16]:
df_tweet.columns

Index(['tweets', 'labels'], dtype='object')

In [17]:
from nltk.corpus import stopwords

def process_text(text):
    text = text.lower()
    url = "https?://([A-z0-9_\.%\-\?&=]+/)*[A-z0-9_\.%\-\?&=]+"
    hashtags = "#[A-z0-9_\.\-%]{1,}"
    mentions = "@[A-z0-9_\.\-%]+"
    
    
    sw = stopwords.words("english")
    #### Now lets remove the patterns
    text = re.sub(url,"",text)
    text = re.sub(hashtags,"",text)
    text = re.sub(mentions,"",text)
    
    #### tokenize sentence
    words = word_tokenize(text)
    st = SnowballStemmer("english")
    stemmed = [st.stem(w) for w in words]
    final = [w for w in stemmed if w not in sw]
    text = " ".join(final)
    return text



df_tweet["tweets"] = df_tweet["tweets"].apply(process_text)

In [25]:
mapper = {
    "Anxious": 0,
    "Normal": 1,
    "Stressed": 2,
    "Lonely": 3
}

df_tweet["labels"] = df_tweet["labels"].map(mapper)

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [30]:
vc = CountVectorizer()
tc = TfidfVectorizer()

### train_test split
df_train, df_test = train_test_split(df_tweet,train_size=0.75,random_state=3)

X_train_cv = vc.fit_transform(df_train["tweets"])
X_train_tf = tc.fit_transform(df_train["tweets"])
y_train = df_train["labels"]
y_test = df_test["labels"]

### Fitting the model

In [41]:
nb_cv = MultinomialNB()
nb_tf = MultinomialNB()

nb_cv = nb_cv.fit(X_train_cv,y_train)
nb_tf = nb_tf.fit(X_train_tf,y_train)

### Predicting the model on train data count vectorizer

In [45]:
from sklearn.metrics import confusion_matrix,classification_report
y_train_pred_cv  = nb_cv.predict(X_train_cv)
cf = confusion_matrix(y_train,y_train_pred_cv)
print(cf)
print(classification_report(y_train,y_train_pred_cv))

[[4365   94   84 1733]
 [1056 4353  404  206]
 [ 216   94 4821   26]
 [2395   87   63 2497]]
              precision    recall  f1-score   support

           0       0.54      0.70      0.61      6276
           1       0.94      0.72      0.82      6019
           2       0.90      0.93      0.92      5157
           3       0.56      0.50      0.53      5042

    accuracy                           0.71     22494
   macro avg       0.74      0.71      0.72     22494
weighted avg       0.73      0.71      0.72     22494



### test prediction count vectorizer

In [48]:
X_test_cv = vc.transform(df_test["tweets"])
y_test_pred_cv  = nb_cv.predict(X_test_cv)
cf = confusion_matrix(y_test,y_test_pred_cv)
print(cf)
print(classification_report(y_test,y_test_pred_cv))

[[1122   61   34  895]
 [ 555 1050  226  126]
 [ 136   63 1466   18]
 [1167   38   26  515]]
              precision    recall  f1-score   support

           0       0.38      0.53      0.44      2112
           1       0.87      0.54      0.66      1957
           2       0.84      0.87      0.85      1683
           3       0.33      0.29      0.31      1746

    accuracy                           0.55      7498
   macro avg       0.60      0.56      0.57      7498
weighted avg       0.60      0.55      0.56      7498



### Predicting the model on train data TfiDf vectorizer

In [50]:
y_train_pred_cv  = nb_tf.predict(X_train_cv)
cf = confusion_matrix(y_train,y_train_pred_cv)
print(cf)
print(classification_report(y_train,y_train_pred_cv))

[[5152   47   86  991]
 [1701 3757  436  125]
 [ 393   72 4678   14]
 [3364   48   61 1569]]
              precision    recall  f1-score   support

           0       0.49      0.82      0.61      6276
           1       0.96      0.62      0.76      6019
           2       0.89      0.91      0.90      5157
           3       0.58      0.31      0.41      5042

    accuracy                           0.67     22494
   macro avg       0.73      0.67      0.67     22494
weighted avg       0.73      0.67      0.67     22494



In [52]:
X_test_tf = tc.transform(df_test["tweets"])
y_test_pred_tf  = nb_tf.predict(X_test_tf)
cf = confusion_matrix(y_test,y_test_pred_tf)
print(cf)
print(classification_report(y_test,y_test_pred_tf))

[[1442   83   44  543]
 [ 745  991  181   40]
 [ 249   82 1344    8]
 [1420   55   29  242]]
              precision    recall  f1-score   support

           0       0.37      0.68      0.48      2112
           1       0.82      0.51      0.63      1957
           2       0.84      0.80      0.82      1683
           3       0.29      0.14      0.19      1746

    accuracy                           0.54      7498
   macro avg       0.58      0.53      0.53      7498
weighted avg       0.58      0.54      0.53      7498

