In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
import re

In [2]:
data = pd.read_csv("../../data/processed_train.csv")
data.head()

Unnamed: 0,id,sentiment,text,language_labels,clean_text
0,23081,neutral,RT @ RD _ BANA Kahan Ho ???? Zinda Samadhi Kab...,"['Eng', 'O', 'Hin', 'O', 'Hin', 'Hin', 'Hin', ...",rt mention rd bana kahan ho zinda samadhi kab ...
1,29854,negative,In pro-indian hazraat ka Bughazzay Pak fauj da...,"['Eng', 'Eng', 'Hin', 'Hin', 'Eng', 'Hin', 'En...",in proindian hazraat ka bughazzay pak fauj dai...
2,35319,neutral,RT @ Sm4bjp @ sardesairajdeep Some media walas...,"['Eng', 'O', 'Eng', 'O', 'Hin', 'Hin', 'Eng', ...",rt mention sm4bjp mention sardesairajdeep some...
3,9572,positive,@ aapkadharam Hello sir ji 🙏🙏🙏🙏🙏 Sir ji mere d...,"['O', 'Hin', 'Hin', 'Hin', 'Hin', 'O', 'Hin', ...",mention aapkadharam hello sir ji sir ji mere d...
4,24598,neutral,@ OmarAyubKhan sir aaj subah sehri se light ka...,"['O', 'Hin', 'Hin', 'Hin', 'Hin', 'Hin', 'Hin'...",mention omarayubkhan sir aaj subah sehri se li...


In [3]:
data.sentiment.unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15131 entries, 0 to 15130
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               15131 non-null  int64 
 1   sentiment        15131 non-null  object
 2   text             15131 non-null  object
 3   language_labels  15131 non-null  object
 4   text_cleaned     15131 non-null  object
 5   target           15131 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 709.4+ KB


In [24]:
def train_and_evaluate(df, classifier, vectorizer, random_search=False, param_grid=None, cv=5, n_iter_search=10):
    
    
#     text_transformer = Pipeline(steps=[('vect', vectorizer)])
#     preprocessor = ColumnTransformer(n_jobs=4, transformers=[('text', text_transformer, ["text_cleaned"])])

    model = Pipeline(steps=[('vectorizer', vectorizer),
                          ('classifier', classifier)])


    
    le = LabelEncoder()
    df["target"] = le.fit_transform(df['sentiment'])
    
    X_train, X_test, y_train, y_test = train_test_split(df['clean_text'].values, df["target"].values.reshape(-1, 1), test_size=0.2, random_state=0)
#     print(type(X_train), type(y_train), X_train.shape)
   
    if random_search and param_grid:
        model = RandomizedSearchCV(model, param_grid, cv=cv, n_iter=n_iter_search, n_jobs=-1, refit=True)
        
        
    model.fit(X_train, y_train)

    print("model score: %.3f" % model.score(X_test, y_test))
    
    y_pred = model.predict(X_test)
#     print(y_pred[0:10], y_pred.shape)
    
    scores = metrics.classification_report(y_test, y_pred, output_dict=True)
    
    return model, le, scores



In [None]:
# %%time
# ridge_param_grid = {
#                'preprocessor__text__tfidf__max_features': [75000, 100000, 50000],
#                'regressor__alpha': stats.uniform()
#               }
# ridge, y_scaler1 = train_and_test_sklearn(df, Ridge(max_iter=200, tol=0.01), random_search=True, param_grid=ridge_param_grid, n_iter_search=10, cv=2)
# testset_output1 = run_testset(testset, ridge, y_scaler1)

In [25]:
%%time

lr_param_grid = {
                'vectorizer__max_features': [50000, 100000],
                'classifier__C': [0.1,1,5,10,100],
                'classifier__penalty': ['l1', 'l2'],
                    
            }

LR = LogisticRegression(C=4, max_iter=1000)
tfidf = TfidfVectorizer(strip_accents="unicode", max_features=100000, token_pattern='\w+', ngram_range=(1, 2))

# lr_pipeline, le, lr_scores = train_and_evaluate(data, LR, tfidf)
lr_pipeline, le, lr_scores = train_and_evaluate(data, LR, tfidf, random_search=True, param_grid=lr_param_grid)
lr_scores

TypeError: Parameter value is not iterable or distribution (key='cv', value=3)

In [19]:
lr_pipeline.best_params_

{'vectorizer__max_features': 50000,
 'classifier__penalty': 'l2',
 'classifier__C': 1}

In [20]:
pd.DataFrame(lr_pipeline.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vectorizer__max_features,param_classifier__penalty,param_classifier__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.86793,0.084408,0.0,0.0,100000,l1,5.0,"{'vectorizer__max_features': 100000, 'classifi...",,,,,,,,6
1,35.894753,3.6305,0.267992,0.024346,100000,l2,100.0,"{'vectorizer__max_features': 100000, 'classifi...",0.58087,0.596087,0.594783,0.611304,0.616355,0.59988,0.012678,3
2,1.952212,0.063167,0.0,0.0,50000,l1,1.0,"{'vectorizer__max_features': 50000, 'classifie...",,,,,,,,7
3,2.081063,0.048666,0.0,0.0,50000,l1,100.0,"{'vectorizer__max_features': 50000, 'classifie...",,,,,,,,8
4,6.890387,0.803464,0.230721,0.018165,50000,l2,1.0,"{'vectorizer__max_features': 50000, 'classifie...",0.585217,0.588261,0.61087,0.622174,0.624619,0.606228,0.016603,1
5,1.912451,0.049333,0.0,0.0,50000,l1,5.0,"{'vectorizer__max_features': 50000, 'classifie...",,,,,,,,9
6,23.369458,0.657975,0.243769,0.035683,50000,l2,100.0,"{'vectorizer__max_features': 50000, 'classifie...",0.572609,0.590435,0.582174,0.606957,0.600261,0.590487,0.012292,4
7,3.373211,0.055133,0.226372,0.025564,50000,l2,0.1,"{'vectorizer__max_features': 50000, 'classifie...",0.568261,0.566957,0.586522,0.605652,0.598521,0.585182,0.015603,5
8,22.003486,5.489806,0.213391,0.078421,100000,l2,10.0,"{'vectorizer__max_features': 100000, 'classifi...",0.580435,0.596087,0.603478,0.618696,0.616355,0.60301,0.014014,2
9,1.924534,0.093944,0.0,0.0,100000,l1,10.0,"{'vectorizer__max_features': 100000, 'classifi...",,,,,,,,10


In [21]:
%%time
svc_param_grid = {
                'vectorizer__max_features': [50000, 100000],
                'classifier__gamma': ['scale', 'auto'], 
                'classifier__kernel': ['rbf', 'poly', 'sigmoid'],
                    
            }


svc = SVC()
tfidf = TfidfVectorizer(strip_accents="unicode", max_features=100000, token_pattern='\w+', ngram_range=(1, 2))
# svc_pipeline, le, svc_scores = train_and_evaluate(data, svc, tfidf)
svc_pipeline, le, svc_scores = train_and_evaluate(data, svc, tfidf, random_search=True, param_grid=svc_param_grid)

svc_scores

  return f(*args, **kwargs)


model score: 0.611


  return f(*args, **kwargs)


model score: 0.613
CPU times: user 1min 48s, sys: 2.16 s, total: 1min 50s
Wall time: 15min 25s


{'0': {'precision': 0.5935483870967742,
  'recall': 0.6789667896678967,
  'f1-score': 0.6333907056798623,
  'support': 813},
 '1': {'precision': 0.5548841893252769,
  'recall': 0.507366482504604,
  'f1-score': 0.53006253006253,
  'support': 1086},
 '2': {'precision': 0.6922268907563025,
  'recall': 0.6752049180327869,
  'f1-score': 0.6836099585062242,
  'support': 976},
 'accuracy': 0.6128695652173913,
 'macro avg': {'precision': 0.6135531557261179,
  'recall': 0.6205127300684293,
  'f1-score': 0.6156877314162056,
  'support': 2875},
 'weighted avg': {'precision': 0.6124426134591581,
  'recall': 0.6128695652173913,
  'f1-score': 0.611407955084421,
  'support': 2875}}

In [22]:
svc_pipeline.best_params_

{'vectorizer__max_features': 100000,
 'classifier__kernel': 'sigmoid',
 'classifier__gamma': 'scale'}

In [23]:
svc_pipeline

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('vectorizer',
                                              TfidfVectorizer(max_features=100000,
                                                              ngram_range=(1,
                                                                           2),
                                                              strip_accents='unicode',
                                                              token_pattern='\\w+')),
                                             ('classifier', SVC())]),
                   n_jobs=-1,
                   param_distributions={'classifier__gamma': [1, 0.1, 'scale',
                                                              'auto'],
                                        'classifier__kernel': ['rbf', 'poly',
                                                               'sigmoid'],
                                        'vectorizer__max_features': [50000,
                       

In [None]:
from sklearn.externals import joblib
joblib.dump(svc_pipeline.best_estimator_, '../models/svc.pkl')