### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
import re
from nltk.stem import SnowballStemmer,PorterStemmer
from nltk.corpus import stopwords

### Reading the data

In [2]:
df_tweet = pd.read_csv("train.csv")
df_tweet.shape

(29992, 2)

In [3]:
df_tweet.columns

Index(['tweets', 'labels'], dtype='object')

In [4]:
df_tweet['labels'].value_counts()

Anxious     8388
Normal      7976
Stressed    6840
Lonely      6788
Name: labels, dtype: int64

### Converting class Labels into numbers

In [5]:
mapper = {
    "Anxious": 0,
    "Normal": 1,
    "Stressed": 2,
    "Lonely": 3
}

df_tweet["labels"] = df_tweet["labels"].map(mapper)
df_tweet.head()

Unnamed: 0,tweets,labels
0,sending solidarity whoever doctor manage incre...,2
1,need see hair amp beard gat book appointment b...,0
2,next time meet someone new dont ask ask love,1
3,surprise someone love give la senza gift box r...,3
4,raise hand junhoes ocean lotion life rent free...,1


### Preprocessing the text
1. convert text to lowercase
2. Tokenize the text
3. Stem the text
4. Regex based removal of (URL, Mentions and Hashtag)
5. Remove the stopwords


In [6]:
def process_text(text):
    text = text.lower()
    
    ### Regex based cleaning
    url = "https?://([A-z0-9_\.\-%]+/)*[A-z0-9_\.\-%&=\?]+"
    hashtags = "#[A-z0-9_\.\-]+"
    mentions = "@[A-z0-9_\.\-]+"
    text = re.sub(url,"",text)
    text = re.sub(hashtags,"",text)
    text = re.sub(mentions,"",text)
    ### Tokenize the text 
    words = word_tokenize(text)
        
    ### Stem the words
    stemmer = SnowballStemmer("english")
    words = [stemmer.stem(w) for w in words]
    
    ### Stwopwords removal
    sw = stopwords.words("english")
    words = [w for w in words if w not in sw]
    
    text = " ".join(words)
    return text

In [7]:
df_tweet_model = df_tweet.copy()
df_tweet_model['tweets'] = df_tweet_model['tweets'].apply(process_text) 

### Generate features from text using countvectorizer

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [9]:
### Splitting the data into train and test 
X_train,X_test,y_train,y_test = train_test_split(df_tweet_model[['tweets']],df_tweet_model['labels'],
                                                 train_size=0.75,random_state=2)

### Transform data
vc = CountVectorizer()
X_train = vc.fit_transform(X_train['tweets'])
X_train_bk = X_train.toarray()
X_test = vc.transform(X_test["tweets"])
X_test_bk = X_test.toarray()

In [10]:
X_test_bk.shape

(7498, 16529)

In [11]:
X_train.shape

(22494, 16529)

### Logistic Regression model

In [12]:
from sklearn.metrics import confusion_matrix, classification_report

lm = LogisticRegression()

params = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100]
}

lm_grid = GridSearchCV(estimator=lm,param_grid=params,cv=5,scoring='balanced_accuracy', n_jobs=-1, verbose=10).fit(X_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [13]:
print(lm_grid.best_score_)
lm =lm_grid.best_estimator_


### Evaluate Model
def evaluate_model(model,X,y,X_test,y_test):
    y_train_pred = model.predict(X)
    train_cf = confusion_matrix(y,y_train_pred)
    print("Train confusion matrix:")
    print(train_cf)
    print("Train report:")
    print(classification_report(y,y_train_pred))
    
    y_test_pred = model.predict(X_test)
    test_cf = confusion_matrix(y_test,y_test_pred)
    print("Test confusion matrix:")
    print(test_cf)
    print("Test report:")
    print(classification_report(y_test,y_test_pred))
    
    

    
evaluate_model(lm,X_train,y_train,X_test,y_test)
    


0.6594062349486429
Train confusion matrix:
[[4115  394   75 1683]
 [ 163 5689   56  110]
 [  46  245 4825   31]
 [2238  274   44 2506]]
Train report:
              precision    recall  f1-score   support

           0       0.63      0.66      0.64      6267
           1       0.86      0.95      0.90      6018
           2       0.96      0.94      0.95      5147
           3       0.58      0.50      0.53      5062

    accuracy                           0.76     22494
   macro avg       0.76      0.76      0.76     22494
weighted avg       0.76      0.76      0.76     22494

Test confusion matrix:
[[1052  148   33  888]
 [  51 1831   27   49]
 [  16  126 1538   13]
 [1128   98   24  476]]
Test report:
              precision    recall  f1-score   support

           0       0.47      0.50      0.48      2121
           1       0.83      0.94      0.88      1958
           2       0.95      0.91      0.93      1693
           3       0.33      0.28      0.30      1726

    accuracy  

### The test score is very low hence we will try Decision Tree

In [14]:
params = {
    "min_samples_split": [20,50,100,500],
    "min_samples_leaf": [20,100,500],
    "max_depth": [10,20,30],
}

dt = DecisionTreeClassifier(random_state=1)

### grid search
dt_grid = GridSearchCV(estimator=dt,param_grid=params,cv=5,n_jobs=-1,verbose=10,scoring="balanced_accuracy").fit(X_train,y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [15]:
dt = dt_grid.best_estimator_

dt = dt.fit(X_train,y_train)
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

evaluate_model(dt,X_train,y_train,X_test,y_test)

Train confusion matrix:
[[4528  597   91 1051]
 [ 194 5547   88  189]
 [  83  439 4575   50]
 [3292  421   63 1286]]
Train report:
              precision    recall  f1-score   support

           0       0.56      0.72      0.63      6267
           1       0.79      0.92      0.85      6018
           2       0.95      0.89      0.92      5147
           3       0.50      0.25      0.34      5062

    accuracy                           0.71     22494
   macro avg       0.70      0.70      0.68     22494
weighted avg       0.70      0.71      0.69     22494

Test confusion matrix:
[[1449  195   31  446]
 [  58 1801   37   62]
 [  33  162 1485   13]
 [1229  136   22  339]]
Test report:
              precision    recall  f1-score   support

           0       0.52      0.68      0.59      2121
           1       0.79      0.92      0.85      1958
           2       0.94      0.88      0.91      1693
           3       0.39      0.20      0.26      1726

    accuracy                     

### The model fit is good but The test performance for lonely is very low. 

In [16]:
rf = RandomForestClassifier(random_state=1)
params = {
    "min_samples_split": [20,50,100],
    "min_samples_leaf": [20,50,100],
    "max_features": [10,100,1000],
    "max_depth": [10,20,30],
}

rf_grid = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1, 
                  scoring="balanced_accuracy", verbose=10).fit(X_train,y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [17]:
rf_grid.best_score_

0.6687449753528389

In [18]:
rf = rf_grid.best_estimator_

evaluate_model(rf,X_train,y_train,X_test,y_test)

Train confusion matrix:
[[4462  589   88 1128]
 [ 295 5514   79  130]
 [ 100  506 4530   11]
 [3167  461   65 1369]]
Train report:
              precision    recall  f1-score   support

           0       0.56      0.71      0.62      6267
           1       0.78      0.92      0.84      6018
           2       0.95      0.88      0.91      5147
           3       0.52      0.27      0.36      5062

    accuracy                           0.71     22494
   macro avg       0.70      0.69      0.68     22494
weighted avg       0.70      0.71      0.69     22494

Test confusion matrix:
[[1411  190   37  483]
 [  77 1801   29   51]
 [  36  186 1470    1]
 [1198  144   24  360]]
Test report:
              precision    recall  f1-score   support

           0       0.52      0.67      0.58      2121
           1       0.78      0.92      0.84      1958
           2       0.94      0.87      0.90      1693
           3       0.40      0.21      0.27      1726

    accuracy                     

In [19]:
ad = AdaBoostClassifier(random_state=1)
params = {
    "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3]
}

ad_grid = GridSearchCV(estimator=ad, param_grid=params, cv=5, n_jobs=-1, 
                  scoring="balanced_accuracy", verbose=10).fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [20]:
ad_grid.best_score_

0.6712538139358134

In [21]:
ad = ad_grid.best_estimator_
evaluate_model(ad,X_train,y_train,X_test,y_test)

Train confusion matrix:
[[3941  953   66 1307]
 [ 145 5612   74  187]
 [  88  487 4541   31]
 [3027  660   39 1336]]
Train report:
              precision    recall  f1-score   support

           0       0.55      0.63      0.59      6267
           1       0.73      0.93      0.82      6018
           2       0.96      0.88      0.92      5147
           3       0.47      0.26      0.34      5062

    accuracy                           0.69     22494
   macro avg       0.68      0.68      0.67     22494
weighted avg       0.67      0.69      0.67     22494

Test confusion matrix:
[[1342  290   30  459]
 [  32 1831   29   66]
 [  27  170 1490    6]
 [1076  205   18  427]]
Test report:
              precision    recall  f1-score   support

           0       0.54      0.63      0.58      2121
           1       0.73      0.94      0.82      1958
           2       0.95      0.88      0.91      1693
           3       0.45      0.25      0.32      1726

    accuracy                     

In [22]:
gb = GradientBoostingClassifier(random_state=1)
params = {
    "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3],
    "max_depth": [3,5],
    "n_estimators": [50,100]
}

gb_grid = GridSearchCV(estimator=gb, param_grid=params, cv=5, n_jobs=-1, 
                  scoring="balanced_accuracy", verbose=10).fit(X_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [23]:
gb = gb_grid.best_estimator_

In [24]:
gb_grid.best_score_

0.6878818029441043

In [25]:
y_test_pred = gb.predict(X_test)
cf = confusion_matrix(y_test,y_test_pred)
cf

array([[ 761,  196,   38, 1126],
       [  15, 1823,   37,   83],
       [   3,  114, 1565,   11],
       [ 584,  146,   27,  969]], dtype=int64)

In [26]:
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.56      0.36      0.44      2121
           1       0.80      0.93      0.86      1958
           2       0.94      0.92      0.93      1693
           3       0.44      0.56      0.50      1726

    accuracy                           0.68      7498
   macro avg       0.68      0.69      0.68      7498
weighted avg       0.68      0.68      0.67      7498



### XGBoost model

In [27]:
from xgboost import XGBClassifier

xg = XGBClassifier()

xg.fit(X_train,y_train)
#y_train_pred = xg.predict(X_train)
#print(y_train_pred)
#y_test_pred = xg.predict(X_test)
#print(confusion_matrix(y_train,y_train_pred))
evaluate_model(xg,X_train,y_train,X_test,y_test)






Train confusion matrix:
[[4409  416   54 1388]
 [ 153 5635   68  162]
 [  24  245 4862   16]
 [2728  286   33 2015]]
Train report:
              precision    recall  f1-score   support

           0       0.60      0.70      0.65      6267
           1       0.86      0.94      0.89      6018
           2       0.97      0.94      0.96      5147
           3       0.56      0.40      0.47      5062

    accuracy                           0.75     22494
   macro avg       0.75      0.75      0.74     22494
weighted avg       0.75      0.75      0.74     22494

Test confusion matrix:
[[1247  142   28  704]
 [  50 1819   30   59]
 [  16  105 1562   10]
 [1201  105   20  400]]
Test report:
              precision    recall  f1-score   support

           0       0.50      0.59      0.54      2121
           1       0.84      0.93      0.88      1958
           2       0.95      0.92      0.94      1693
           3       0.34      0.23      0.28      1726

    accuracy                    

### Still confusion between class 1 and class 3

### Printing all the best model on cross validation

In [28]:
print("Logistic Regression", lm_grid.best_estimator_)
print("Decision Tree", dt_grid.best_estimator_)
print("Random Forest", rf_grid.best_estimator_)
print("Ada Boost", ad_grid.best_estimator_)
print("Gradient Boostin", gb_grid.best_estimator_)

Logistic Regression LogisticRegression(C=0.1)
Decision Tree DecisionTreeClassifier(max_depth=20, min_samples_leaf=20, min_samples_split=20,
                       random_state=1)
Random Forest RandomForestClassifier(max_depth=30, max_features=1000, min_samples_leaf=20,
                       min_samples_split=20, random_state=1)
Ada Boost AdaBoostClassifier(learning_rate=0.3, random_state=1)
Gradient Boostin GradientBoostingClassifier(learning_rate=0.3, n_estimators=50, random_state=1)


### Let us see the TFIDF vectorizer

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
vc = TfidfVectorizer()

df_tweet_model.shape

(29992, 2)

### df_tweet_model is preprocessed text hance we can use that

In [30]:
df_train,df_test = train_test_split(df_tweet_model,train_size=0.75,random_state=3)

In [31]:
X_train_tf = vc.fit_transform(df_train["tweets"])
X_test_tf = vc.transform(df_test["tweets"])
y_train = df_train["labels"]
y_test = df_test["labels"]

### Fit Models

In [32]:
### Splitting the data into train and test 
X_train,X_test,y_train,y_test = train_test_split(df_tweet_model[['tweets']],df_tweet_model['labels'],
                                                 train_size=0.75,random_state=2)

### Transform data
vc = TfidfVectorizer()
X_train = vc.fit_transform(X_train['tweets'])
X_train_bk = X_train.toarray()
X_test = vc.transform(X_test["tweets"])
X_test_bk = X_test.toarray()

X_test_bk.shape

X_train.shape

### Logistic Regression model

from sklearn.metrics import confusion_matrix, classification_report

lm = LogisticRegression()

params = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100]
}

lm_grid = GridSearchCV(estimator=lm,param_grid=params,cv=5,scoring='balanced_accuracy', n_jobs=-1, verbose=10).fit(X_train,y_train)

print(lm_grid.best_score_)
lm =lm_grid.best_estimator_


### Evaluate Model
def evaluate_model(model,X,y,X_test,y_test):
    y_train_pred = model.predict(X)
    train_cf = confusion_matrix(y,y_train_pred)
    print("Train confusion matrix:")
    print(train_cf)
    print("Train report:")
    print(classification_report(y,y_train_pred))
    
    y_test_pred = model.predict(X_test)
    test_cf = confusion_matrix(y_test,y_test_pred)
    print("Test confusion matrix:")
    print(test_cf)
    print("Test report:")
    print(classification_report(y_test,y_test_pred))
    
    

print("logistic Regression + tfidf")    
evaluate_model(lm,X_train,y_train,X_test,y_test)
    


### The test score is very low hence we will try Decision Tree

params = {
    "min_samples_split": [20,50,100,500],
    "min_samples_leaf": [20,100,500],
    "max_depth": [10,20,30],
}

dt = DecisionTreeClassifier(random_state=1)

### grid search
dt_grid = GridSearchCV(estimator=dt,param_grid=params,cv=5,n_jobs=-1,verbose=10,scoring="balanced_accuracy").fit(X_train,y_train)


dt = dt_grid.best_estimator_

dt = dt.fit(X_train,y_train)
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

print("Decision Tree + tfidf") 
evaluate_model(dt,X_train,y_train,X_test,y_test)

### The model fit is good but The test performance for lonely is very low. 

rf = RandomForestClassifier(random_state=1)
params = {
    "min_samples_split": [20,50,100],
    "min_samples_leaf": [20,50,100],
    "max_features": [10,100,1000],
    "max_depth": [10,20,30],
}

rf_grid = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1, 
                  scoring="balanced_accuracy", verbose=10).fit(X_train,y_train)

rf_grid.best_score_

rf = rf_grid.best_estimator_

print("Random Forest + tfidf") 
evaluate_model(rf,X_train,y_train,X_test,y_test)



ad = AdaBoostClassifier(random_state=1)
params = {
    "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3]
}

ad_grid = GridSearchCV(estimator=ad, param_grid=params, cv=5, n_jobs=-1, 
                  scoring="balanced_accuracy", verbose=10).fit(X_train,y_train)

ad_grid.best_score_

ad = ad_grid.best_estimator_

print("Adaboost + tfidf") 
evaluate_model(ad,X_train,y_train,X_test,y_test)

gb = GradientBoostingClassifier(random_state=1)
params = {
    "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3],
    "max_depth": [3,5],
    "n_estimators": [50,100]
}

gb_grid = GridSearchCV(estimator=gb, param_grid=params, cv=5, n_jobs=-1, 
                  scoring="balanced_accuracy", verbose=10).fit(X_train,y_train)

gb = gb_grid.best_estimator_

gb_grid.best_score_



y_test_pred = gb.predict(X_test)
cf = confusion_matrix(y_test,y_test_pred)
cf

print("GBM + tfidf") 
evaluate_model(gb,X_train,y_train,X_test,y_test)

### Naive Bayes

Fitting 5 folds for each of 6 candidates, totalling 30 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6452736654202411
logistic Regression + tfidf
Train confusion matrix:
[[4397  333   63 1474]
 [ 169 5721   51   77]
 [  47  254 4833   13]
 [2365  258   41 2398]]
Train report:
              precision    recall  f1-score   support

           0       0.63      0.70      0.66      6267
           1       0.87      0.95      0.91      6018
           2       0.97      0.94      0.95      5147
           3       0.61      0.47      0.53      5062

    accuracy                           0.77     22494
   macro avg       0.77      0.77      0.76     22494
weighted avg       0.77      0.77      0.77     22494

Test confusion matrix:
[[1081  158   30  852]
 [  66 1813   31   48]
 [  18  122 1542   11]
 [1198  101   20  407]]
Test report:
              precision    recall  f1-score   support

           0       0.46      0.51      0.48      2121
           1       0.83      0.93      0.87      1958
           2       0.95      0.91      0.93      1693
           3       0.31      0.24      0.

### XGBoost Classifier

In [33]:
xg.fit(X_train,y_train)
#y_train_pred = xg.predict(X_train)
#print(y_train_pred)
#y_test_pred = xg.predict(X_test)
#print(confusion_matrix(y_train,y_train_pred))
evaluate_model(xg,X_train,y_train,X_test,y_test)



Train confusion matrix:
[[3723  333   40 2171]
 [ 145 5703   40  130]
 [  23  224 4887   13]
 [1828  231   26 2977]]
Train report:
              precision    recall  f1-score   support

           0       0.65      0.59      0.62      6267
           1       0.88      0.95      0.91      6018
           2       0.98      0.95      0.96      5147
           3       0.56      0.59      0.58      5062

    accuracy                           0.77     22494
   macro avg       0.77      0.77      0.77     22494
weighted avg       0.77      0.77      0.77     22494

Test confusion matrix:
[[ 916  132   27 1046]
 [  68 1826   22   42]
 [  16  104 1563   10]
 [ 934   90   18  684]]
Test report:
              precision    recall  f1-score   support

           0       0.47      0.43      0.45      2121
           1       0.85      0.93      0.89      1958
           2       0.96      0.92      0.94      1693
           3       0.38      0.40      0.39      1726

    accuracy                     

In [34]:
print("Logistic Regression", lm_grid.best_estimator_)
print("Decision Tree", dt_grid.best_estimator_)
print("Random Forest", rf_grid.best_estimator_)
print("Ada Boost", ad_grid.best_estimator_)
print("Gradient Boostin", gb_grid.best_estimator_)

Logistic Regression LogisticRegression(C=1)
Decision Tree DecisionTreeClassifier(max_depth=20, min_samples_leaf=20, min_samples_split=500,
                       random_state=1)
Random Forest RandomForestClassifier(max_depth=30, max_features=1000, min_samples_leaf=20,
                       min_samples_split=100, random_state=1)
Ada Boost AdaBoostClassifier(learning_rate=0.3, random_state=1)
Gradient Boostin GradientBoostingClassifier(learning_rate=0.3, n_estimators=50, random_state=1)
