In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. the local `models` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from models.classical_models import SentimixModel

In [3]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
import re
import numpy as np

In [4]:
# Read the preprocessed training set and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="../data/processed_train.csv")
data.head(n=5)

Unnamed: 0,id,sentiment,text,language_labels,clean_text,labels
0,23081,neutral,RT @ RD _ BANA Kahan Ho ???? Zinda Samadhi Kab...,"['Eng', 'O', 'Hin', 'O', 'Hin', 'Hin', 'Hin', ...",rt mention rd bana kahan ho zinda samadhi kab ...,1
1,29854,negative,In pro-indian hazraat ka Bughazzay Pak fauj da...,"['Eng', 'Eng', 'Hin', 'Hin', 'Eng', 'Hin', 'En...",in proindian hazraat ka bughazzay pak fauj dai...,0
2,35319,neutral,RT @ Sm4bjp @ sardesairajdeep Some media walas...,"['Eng', 'O', 'Eng', 'O', 'Hin', 'Hin', 'Eng', ...",rt mention sm4bjp mention sardesairajdeep some...,1
3,9572,positive,@ aapkadharam Hello sir ji 🙏🙏🙏🙏🙏 Sir ji mere d...,"['O', 'Hin', 'Hin', 'Hin', 'Hin', 'O', 'Hin', ...",mention aapkadharam hello sir ji sir ji mere d...,2
4,24598,neutral,@ OmarAyubKhan sir aaj subah sehri se light ka...,"['O', 'Hin', 'Hin', 'Hin', 'Hin', 'Hin', 'Hin'...",mention omarayubkhan sir aaj subah sehri se li...,1


In [5]:
# Summarise column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14374 entries, 0 to 14373
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               14374 non-null  int64 
 1   sentiment        14374 non-null  object
 2   text             14374 non-null  object
 3   language_labels  14374 non-null  object
 4   clean_text       14374 non-null  object
 5   labels           14374 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 673.9+ KB


In [6]:

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is reproducible.
# Labels are kept 1-D, shape (n_samples,): scikit-learn estimators expect that
# shape and emit a DataConversionWarning on every fit when handed a column vector.
X_train, X_test, y_train, y_test = train_test_split(
    data['clean_text'].values,
    data["labels"].values,
    test_size=0.2,
    random_state=0,
)

In [7]:
%%time

lr_param_grid = {
                'vectorizer__max_features': [50000, 100000],
                'classifier__C': [0.1,1,5,10,100],
                'classifier__penalty': ['l1', 'l2'],
                    
            }

LR = LogisticRegression(C=4, max_iter=1000)
tfidf = TfidfVectorizer(strip_accents="unicode", max_features=100000, token_pattern='\w+', ngram_range=(1, 2))

lr = SentimixModel(vectorizer=tfidf, classifier=LR)


CPU times: user 58 µs, sys: 0 ns, total: 58 µs
Wall time: 62.2 µs


In [8]:
%%time 
lr.train(X_train, y_train, random_search=True, param_grid=lr_param_grid)
print(lr.evaluate(X_test, y_test))

              precision    recall  f1-score   support

           0       0.63      0.66      0.64       863
           1       0.51      0.53      0.52      1002
           2       0.72      0.66      0.69      1010

    accuracy                           0.61      2875
   macro avg       0.62      0.62      0.62      2875
weighted avg       0.62      0.61      0.62      2875

CPU times: user 8.83 s, sys: 1.09 s, total: 9.93 s
Wall time: 3min 2s


In [26]:
# Report the winning hyper-parameter combination from the search.
# NOTE(review): this reaches into SentimixModel's private `_model` attribute —
# presumably the fitted search object; confirm, or expose a public accessor.
print("Best Params for Logistic Regression", lr._model.best_params_)


Best Params for Logistic Regression {'vectorizer__max_features': 50000, 'classifier__penalty': 'l2', 'classifier__C': 1}


In [27]:
# Tabulate the per-candidate cross-validation results (fit/score times,
# per-split test scores, mean/std and rank). NaN scores mark candidates whose fit failed.
pd.DataFrame(lr._model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vectorizer__max_features,param_classifier__penalty,param_classifier__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.574465,0.363149,0.0,0.0,50000,l1,5,"{'vectorizer__max_features': 50000, 'classifie...",,,,,,,,6
1,3.563742,0.302562,0.0,0.0,100000,l1,5,"{'vectorizer__max_features': 100000, 'classifi...",,,,,,,,7
2,12.071874,1.38822,0.880677,0.427337,50000,l2,1,"{'vectorizer__max_features': 50000, 'classifie...",0.603913,0.598261,0.615217,0.600435,0.61418,0.606401,0.007018,1
3,3.454413,0.259546,0.0,0.0,50000,l1,1,"{'vectorizer__max_features': 50000, 'classifie...",,,,,,,,8
4,21.939345,0.718899,0.634604,0.078964,50000,l2,10,"{'vectorizer__max_features': 50000, 'classifie...",0.60087,0.587391,0.6,0.58,0.601131,0.593878,0.008645,5
5,3.611558,0.089805,0.0,0.0,100000,l1,100,"{'vectorizer__max_features': 100000, 'classifi...",,,,,,,,9
6,53.279123,1.412098,0.628104,0.09714,100000,l2,100,"{'vectorizer__max_features': 100000, 'classifi...",0.601304,0.586522,0.598261,0.581304,0.608525,0.595183,0.009926,4
7,21.704108,1.671295,0.632774,0.072101,100000,l2,1,"{'vectorizer__max_features': 100000, 'classifi...",0.599565,0.595217,0.619565,0.59913,0.613745,0.605445,0.009459,2
8,33.14922,2.695113,0.766105,0.331796,100000,l2,10,"{'vectorizer__max_features': 100000, 'classifi...",0.603913,0.596522,0.606522,0.587391,0.609395,0.600749,0.00793,3
9,4.218071,0.578347,0.0,0.0,50000,l1,100,"{'vectorizer__max_features': 50000, 'classifie...",,,,,,,,10


In [28]:
# Persist the trained model to disk so it can be reloaded later from this path.
lr.save("../weights/sentimix_logistic_regression.joblib")

In [35]:
%%time 
# Rebuild a model from the saved file to check that it round-trips.
# NOTE(review): the .joblib extension suggests pickle-based loading — only load
# weight files that come from a trusted source.
lr2 = SentimixModel(model_path='../weights/sentimix_logistic_regression.joblib')

CPU times: user 1.26 s, sys: 342 ms, total: 1.6 s
Wall time: 2.63 s


In [45]:
# Evaluate the reloaded model on the same held-out split; the report should
# match the one printed for the in-memory model.
print(lr2.evaluate(X_test, y_test))


              precision    recall  f1-score   support

           0       0.63      0.66      0.64       863
           1       0.51      0.53      0.52      1002
           2       0.72      0.66      0.69      1010

    accuracy                           0.61      2875
   macro avg       0.62      0.62      0.62      2875
weighted avg       0.62      0.61      0.62      2875



In [42]:
%%time
# Smoke-test inference on a single sentence wrapped in a one-element array.
# NOTE(review): this calls the in-memory `lr`, not the reloaded `lr2`, and the
# input is raw text — confirm whether it needs the same cleaning as `clean_text`.
lr.predict(np.array(["kaise hain yaar"]))

CPU times: user 3.16 ms, sys: 1.97 ms, total: 5.13 ms
Wall time: 6.52 ms


array([1])