In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
#import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC

#import pickle5

In [2]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [7]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [8]:
df_train

Unnamed: 0,text,rating,positive
0,story man unnatural feelings pig starts openin...,2,0
1,airport starts brand new luxury plane loaded v...,3,0
2,film lacked something couldnt put finger first...,3,0
3,sorry everyone know supposed art film wow hand...,0,0
4,little parents took along theater see interior...,0,0
...,...,...,...
24995,seeing vote average pretty low fact clerk vide...,6,1
24996,plot wretched unbelievable twists however chem...,5,1
24997,amazed movieand others average stars lower cra...,7,1
24998,christmas together actually came time ive rais...,5,1


In [9]:
X_train = df_train.text
X_test = df_test.text
y_train = df_train.rating
y_test = df_test.rating

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42, test_size = 0.1, shuffle = True)

In [11]:
tfv = TfidfVectorizer(min_df=10, max_df=0.95, strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}', ngram_range=(1, 2), sublinear_tf=True)

tfv.fit(list(X_train) + list(X_val) + list(X_test))
X_train_tfv =  tfv.transform(X_train) 
X_val_tfv = tfv.transform(X_val)
X_test_tfv = tfv.transform(X_test)

In [12]:
clf = LogisticRegression(C=1.2)
clf.fit(X_train_tfv, y_train)

predictions = clf.predict(X_val_tfv)
metrics.accuracy_score(y_val, predictions)

0.4576

In [25]:
predictions = clf.predict(X_test_tfv)
metrics.accuracy_score(y_test, predictions)

0.4312

In [26]:
metrics.f1_score(y_test, predictions, average='macro')

0.2967648164440115

In [28]:
predictions = clf.predict(X_train_tfv)
metrics.accuracy_score(y_train, predictions)

0.8517777777777777

In [29]:
metrics.f1_score(y_train, predictions, average='macro')

0.8479154505548965

In [33]:
clf = LinearSVC(C=5)
clf.fit(X_train_tfv, y_train)
predictions = clf.predict(X_val_tfv)

metrics.accuracy_score(y_val, predictions)

0.3908

In [34]:
predictions = clf.predict(X_test_tfv)
metrics.accuracy_score(y_test, predictions)

0.3622

In [42]:
predictions = clf.predict(X_train_tfv)
metrics.accuracy_score(y_train, predictions)

0.9175555555555556

In [35]:
ctv = CountVectorizer(min_df = 3, max_df = 0.95, analyzer='word',token_pattern=r'\w{1,}', ngram_range=(1, 2))

ctv.fit(list(X_train) + list(X_val) + list(X_test))
X_train_ctv =  ctv.transform(X_train) 
X_val_ctv = ctv.transform(X_val)
X_test_ctv = ctv.transform(X_test)

In [38]:
clf = LogisticRegression(C=1.2)
clf.fit(X_train_ctv, y_train)

predictions = clf.predict(X_val_ctv)
metrics.accuracy_score(y_val, predictions)

0.3972

In [39]:
predictions = clf.predict(X_test_ctv)
metrics.accuracy_score(y_test, predictions)

0.3798

In [88]:
predictions = clf.predict(X_train_ctv)
metrics.accuracy_score(y_train, predictions)

0.9998666666666667

In [41]:
clf = MultinomialNB()
clf.fit(X_train_tfv, y_train)
predictions = clf.predict(X_val_tfv)
metrics.accuracy_score(y_val, predictions)

0.3844

In [25]:
clf = LogisticRegression(C=1.0)
clf.fit(X_train_tfv, y_train)
predictions = clf.predict_proba(X_train_tfv)

multiclass_logloss(y_train, predictions)

1.074762409160698

In [26]:
predictions = clf.predict_proba(X_test_tfv)
multiclass_logloss(y_test, predictions)

1.604938378272961

In [27]:
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(X_train_tfv)
X_train_svd = svd.transform(X_train_tfv)
X_val_svd = svd.transform(X_val_tfv)
X_test_svd = svd.transform(X_test_tfv)

scl = preprocessing.StandardScaler()
scl.fit(X_train_svd)
X_train_svd_scl = scl.transform(X_train_svd)
X_val_svd_scl = scl.transform(X_val_svd)

In [31]:
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(X_train_svd_scl, y_train)
predictions = clf.predict_proba(X_val_svd_scl)

In [32]:
multiclass_logloss(y_val, predictions)

1.463663135076308

In [33]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(X_train_tfv.tocsc(), y_train)
predictions = clf.predict_proba(X_val_tfv.tocsc())

multiclass_logloss(y_val, predictions)

1.6038875416576863

In [34]:
predictions = clf.predict(X_test_tfv.tocsc())

metrics.accuracy_score(y_test, predictions)

0.39344

In [37]:
metrics.f1_score(y_test, predictions, average = 'macro')

0.2766583240627062

In [14]:
with open("TFIDF_Vectorizer.pkl", 'wb') as f:
    pickle5.dump(tfv, f)

In [15]:
with open("Classifier.pkl", 'wb') as f:
    pickle5.dump(clf, f)

array([2, 7, 3, ..., 4, 7, 0], dtype=int64)