In [334]:
import pandas as pd
import numpy as np

In [335]:
data = pd.read_csv("../data/stg/sentiment/senteval2016.csv")

In [336]:
data.sentiment.value_counts()

 0    49943
-1     5702
 1     2334
Name: sentiment, dtype: int64

In [337]:
data.loc[data.sentiment == 1, "review"].iloc[12]

'rt рейтинг стоимости брендов apple  лидер две российские компании  сбербанк и мтс вошли в топ'

In [338]:
from sklearn.model_selection import train_test_split

In [339]:
X = data['review']
y = data.sentiment

In [340]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, shuffle=True, random_state=17
)

In [353]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from catboost import CatBoostClassifier
from sklearn.model_selection import KFold, GridSearchCV

In [354]:
model = make_pipeline(
    TfidfVectorizer(max_df=.3, min_df=5),
    TruncatedSVD(n_components=200,),
    CatBoostClassifier(n_estimators=200, random_state=17, verbose=0)
)

params = {
    "truncatedsvd__n_components": [100, 200,],
    "catboostclassifier__n_estimators": [200, 400, 600]
}
kf = KFold(n_splits=4, shuffle=True, random_state=7)
gcv = GridSearchCV(model, params, scoring='f1_macro', n_jobs=-1, cv=kf, verbose=2)
gcv

In [356]:
gcv.fit(X_train, y_train)

Fitting 4 folds for each of 6 candidates, totalling 24 fits
[CV] END catboostclassifier__n_estimators=200, truncatedsvd__n_components=100; total time= 1.1min
[CV] END catboostclassifier__n_estimators=200, truncatedsvd__n_components=100; total time= 1.1min
[CV] END catboostclassifier__n_estimators=200, truncatedsvd__n_components=100; total time= 1.1min
[CV] END catboostclassifier__n_estimators=200, truncatedsvd__n_components=100; total time= 1.1min
[CV] END catboostclassifier__n_estimators=200, truncatedsvd__n_components=200; total time= 1.7min
[CV] END catboostclassifier__n_estimators=200, truncatedsvd__n_components=200; total time= 1.7min
[CV] END catboostclassifier__n_estimators=200, truncatedsvd__n_components=200; total time= 1.7min
[CV] END catboostclassifier__n_estimators=200, truncatedsvd__n_components=200; total time= 1.8min
[CV] END catboostclassifier__n_estimators=400, truncatedsvd__n_components=100; total time= 1.8min
[CV] END catboostclassifier__n_estimators=400, truncatedsv

In [358]:
gcv.best_score_, gcv.best_params_

(0.4515276402722095,
 {'catboostclassifier__n_estimators': 400, 'truncatedsvd__n_components': 200})

In [359]:
def get_cv_results(gcv):

    df1 = pd.DataFrame({
        "mean_score": gcv.cv_results_['mean_test_score'],
        "std_score": gcv.cv_results_['std_test_score'],
        "fit_time": gcv.cv_results_["mean_fit_time"]
    })

    df2 = pd.DataFrame(gcv.cv_results_["params"])

    return df2.join(df1)

In [360]:
get_cv_results(gcv)

Unnamed: 0,catboostclassifier__n_estimators,truncatedsvd__n_components,mean_score,std_score,fit_time
0,200,100,0.439828,0.007697,64.292179
1,200,200,0.445869,0.002311,101.43079
2,400,100,0.434334,0.007763,108.954692
3,400,200,0.451528,0.002099,207.322769
4,600,100,0.442495,0.003611,204.610718
5,600,200,0.449297,0.004869,244.548423


In [361]:
model_cb = make_pipeline(
    TfidfVectorizer(max_df=.3, min_df=5),
    TruncatedSVD(n_components=200,),
    CatBoostClassifier(n_estimators=200, random_state=17, verbose=0)
)

model_cb.fit(X_train, y_train)

In [365]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [369]:
model = make_pipeline(
    TfidfVectorizer(max_df=.3, min_df=5),
    TruncatedSVD(n_components=200,),
    StandardScaler(),
    LogisticRegression(random_state=17, n_jobs=-1, C=1)
)

params = {
    "truncatedsvd__n_components": [100, 200, 300],
    "logisticregression__C": [.1, 1, 10]
}
kf = KFold(n_splits=4, shuffle=True, random_state=7)
gcv = GridSearchCV(model, params, scoring='f1_macro', n_jobs=1, cv=kf, verbose=2)
gcv

In [370]:
gcv.fit(X_train, y_train)

Fitting 4 folds for each of 9 candidates, totalling 36 fits
[CV] END logisticregression__C=0.1, truncatedsvd__n_components=100; total time=   3.8s
[CV] END logisticregression__C=0.1, truncatedsvd__n_components=100; total time=   4.0s
[CV] END logisticregression__C=0.1, truncatedsvd__n_components=100; total time=   4.2s
[CV] END logisticregression__C=0.1, truncatedsvd__n_components=100; total time=   4.3s
[CV] END logisticregression__C=0.1, truncatedsvd__n_components=200; total time=   6.8s
[CV] END logisticregression__C=0.1, truncatedsvd__n_components=200; total time=   6.8s
[CV] END logisticregression__C=0.1, truncatedsvd__n_components=200; total time=   6.7s
[CV] END logisticregression__C=0.1, truncatedsvd__n_components=200; total time=   6.4s
[CV] END logisticregression__C=0.1, truncatedsvd__n_components=300; total time=  10.5s
[CV] END logisticregression__C=0.1, truncatedsvd__n_components=300; total time=  11.0s
[CV] END logisticregression__C=0.1, truncatedsvd__n_components=300; to

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END logisticregression__C=1, truncatedsvd__n_components=100; total time=   4.3s
[CV] END logisticregression__C=1, truncatedsvd__n_components=100; total time=   4.1s
[CV] END logisticregression__C=1, truncatedsvd__n_components=100; total time=   4.2s
[CV] END logisticregression__C=1, truncatedsvd__n_components=200; total time=   6.5s
[CV] END logisticregression__C=1, truncatedsvd__n_components=200; total time=   7.2s
[CV] END logisticregression__C=1, truncatedsvd__n_components=200; total time=   6.8s
[CV] END logisticregression__C=1, truncatedsvd__n_components=200; total time=   7.4s
[CV] END logisticregression__C=1, truncatedsvd__n_components=300; total time=  10.3s
[CV] END logisticregression__C=1, truncatedsvd__n_components=300; total time=  10.9s
[CV] END logisticregression__C=1, truncatedsvd__n_components=300; total time=  10.5s
[CV] END logisticregression__C=1, truncatedsvd__n_components=300; total time=  10.9s
[CV] END logisticregression__C=10, truncatedsvd__n_components=100

In [371]:
get_cv_results(gcv)

Unnamed: 0,logisticregression__C,truncatedsvd__n_components,mean_score,std_score,fit_time
0,0.1,100,0.413214,0.002011,3.976497
1,0.1,200,0.436147,0.003392,6.547688
2,0.1,300,0.455198,0.004394,10.449225
3,1.0,100,0.417063,0.002633,4.103623
4,1.0,200,0.43734,0.003623,6.833779
5,1.0,300,0.457727,0.004755,10.462302
6,10.0,100,0.414232,0.002654,4.199411
7,10.0,200,0.437527,0.006738,7.224829
8,10.0,300,0.455064,0.006986,10.593552


In [373]:
gcv.best_score_, gcv.best_params_

(0.45772745839093326,
 {'logisticregression__C': 1, 'truncatedsvd__n_components': 300})

In [374]:
model_lr = make_pipeline(
    TfidfVectorizer(max_df=.3, min_df=5),
    TruncatedSVD(n_components=300,),
    StandardScaler(),
    LogisticRegression(random_state=17, n_jobs=-1, C=1)
)

model_lr.fit(X_train, y_train)

In [385]:
predictions_dict = {
    "catboost": model_cb.predict(X_test).flatten(),
    "logreg": model_lr.predict(X_test).flatten(),}

In [386]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [388]:
accuracies = {
    key: f1_score(y_test, value, average="macro") 
    for key, value in predictions_dict.items()
}

In [389]:
accuracies

{'catboost': 0.45058479033172977, 'logreg': 0.4511511324739843}

In [390]:
print(classification_report(y_test, predictions_dict['catboost']))

              precision    recall  f1-score   support

          -1       0.52      0.25      0.33      1703
           0       0.88      0.97      0.93     14969
           1       0.50      0.05      0.09       722

    accuracy                           0.86     17394
   macro avg       0.63      0.42      0.45     17394
weighted avg       0.83      0.86      0.83     17394



In [391]:
print(classification_report(y_test, predictions_dict['logreg']))

              precision    recall  f1-score   support

          -1       0.58      0.26      0.36      1703
           0       0.88      0.98      0.93     14969
           1       0.43      0.04      0.07       722

    accuracy                           0.87     17394
   macro avg       0.63      0.42      0.45     17394
weighted avg       0.84      0.87      0.84     17394

