In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, f1_score

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
X_train = df_train.text
y_train_r = df_train.rating
y_train_p = df_train.positive
X_test = df_test.text
y_test_r = df_test.rating
y_test_p = df_test.positive


In [4]:
y_train_p = y_train_p.to_numpy()
y_test_p = y_test_p.to_numpy()
y_train_r = y_train_r.to_numpy()
y_test_r = y_test_r.to_numpy()

In [5]:
tfv = TfidfVectorizer()

In [6]:
tfv.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [7]:
X_train_tfv = tfv.transform(X_train)
X_test_tfv = tfv.transform(X_test)

### TfidfVectorizer + LogReg
accuracy_score
train - 0.97376
test - 0.88272

In [62]:
clf = LogisticRegression(C = 4.5)
clf.fit(X_train_tfv, y_train_p)
predictions = clf.predict(X_test_tfv)
accuracy_score(y_test_p, predictions)

0.88272

In [63]:
clf = LogisticRegression(C = 4.5)
clf.fit(X_train_tfv, y_train_p)
predictions = clf.predict(X_train_tfv)
accuracy_score(y_train_p, predictions)

0.97376

-----

In [53]:
clf = LogisticRegression(C=4.5, penalty='l2', solver='saga', n_jobs=-1)
clf.fit(X_train_tfv, y_train_p)
predictions = clf.predict(X_test_tfv)
accuracy_score(y_test_p, predictions)

0.88268

In [55]:
clf = LogisticRegression(C=4.5, penalty='l2', solver='saga', n_jobs=-1)
clf.fit(X_train_tfv, y_train_p)
predictions = clf.predict(X_train_tfv)
accuracy_score(y_train_p, predictions)

0.97368

In [14]:
logistic = LogisticRegression(random_state=0, n_jobs=-1)
distributions = dict(C=range(0, 10), penalty=['l2', 'l1'], solver=['saga', 'lbfgs'])
clf = RandomizedSearchCV(logistic, distributions, random_state=0, scoring='accuracy')
search = clf.fit(X_train_tfv, y_train_p)

In [15]:
search.best_score_

0.86136

In [16]:
search.best_params_

{'solver': 'saga', 'penalty': 'l1', 'C': 2}

In [90]:
search.cv_results_

{'mean_fit_time': array([4.76672099e+01, 2.88965297e+00, 4.23567777e+00, 1.74793963e+00,
        2.46784707e+01, 3.09325218e-02, 3.64074416e+00, 2.91362286e-02,
        3.93447373e+01, 4.27881789e+00]),
 'std_fit_time': array([2.03639987e+00, 6.10184259e-02, 1.60443193e-01, 1.33750626e-01,
        4.30518051e-01, 2.67468068e-03, 1.68977035e-01, 7.51063215e-04,
        1.31709803e+00, 1.86357782e-01]),
 'mean_score_time': array([0.00439873, 0.00698109, 0.00497928, 0.00560608, 0.0057806 ,
        0.        , 0.00558486, 0.        , 0.00619125, 0.0073925 ]),
 'std_score_time': array([0.00050291, 0.00126   , 0.00125973, 0.00134722, 0.00182887,
        0.        , 0.00135442, 0.        , 0.00132193, 0.00330832]),
 'param_solver': masked_array(data=['saga', 'saga', 'lbfgs', 'saga', 'saga', 'lbfgs',
                    'saga', 'lbfgs', 'saga', 'lbfgs'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
      

-----

In [82]:
clf = MultinomialNB(alpha=2.6)
clf.fit(X_train_tfv, y_train_p)
predictions = clf.predict(X_test_tfv)
accuracy_score(y_test_p, predictions)

0.8318

In [83]:
clf = MultinomialNB(alpha=2.6)
clf.fit(X_train_tfv, y_train_p)
predictions = clf.predict(X_train_tfv)
accuracy_score(y_train_p, predictions)

0.898

-----

In [109]:
rfc = RandomForestClassifier(n_estimators=200, min_samples_leaf=5, min_samples_split=5, max_depth=80, bootstrap=False, criterion='entropy', n_jobs=-1, random_state=42)
rfc.fit(X_train_tfv, y_train_p)
predictions = rfc.predict(X_test_tfv)
accuracy_score(y_test_p, predictions)

0.8476

In [95]:
rfc = RandomForestClassifier(n_estimators=100, min_samples_leaf=4, min_samples_split=5, max_depth=80, bootstrap=False, criterion='entropy', n_jobs=-1)
rfc.fit(X_train_tfv, y_train_p)
predictions = rfc.predict(X_train_tfv)
accuracy_score(y_train_p, predictions)

0.98536

In [91]:
base_clf = RandomForestClassifier(n_estimators=20, criterion='entropy', n_jobs=-1, random_state=42)
params = dict(max_depth=range(10, 100, 10), min_samples_split = [2, 5, 10], min_samples_leaf = [1, 2, 4], max_features = ['auto', 'sqrt'], bootstrap = [True, False])
clf = RandomizedSearchCV(base_clf, params, random_state=0, scoring='accuracy')
search = clf.fit(X_train_tfv, y_train_p)

In [92]:
search.best_score_

0.8074

In [93]:
search.best_params_

{'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 80,
 'bootstrap': False}

-----

In [8]:
rfc = RandomForestClassifier(n_estimators=100, min_samples_leaf=4, min_samples_split=5, max_depth=80, bootstrap=False, criterion='entropy', n_jobs=-1)
rfc.fit(X_train_tfv, y_train_r)
predictions = rfc.predict(X_test_tfv)
accuracy_score(y_test_r, predictions)

0.3624

array([10,  1,  1, ..., 10, 10, 10], dtype=int64)

In [8]:
rfc = RandomForestClassifier(n_estimators=150, min_samples_leaf=4, min_samples_split=5, max_depth=100, bootstrap=False, criterion='gini', n_jobs=-1)
rfc.fit(X_train_tfv, y_train_r)
predictions = rfc.predict(X_test_tfv)
accuracy_score(y_test_r, predictions)

0.37184

In [9]:
f1_score(y_test_r, predictions, average='micro')

0.37184

In [15]:
logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)
ovr.fit(X_train_tfv, y_train_r)
predictions = ovr.predict(X_test_tfv)
accuracy_score(y_test_r, predictions)

0.42168

In [16]:
f1_score(y_test_r, predictions, average='macro')

0.2805836556149856