In [17]:
# Reference: https://www.kaggle.com/abhishek/approaching-almost-any-nlp-problem-on-kaggle

In [2]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Loss Function

In [3]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: Array-like with the true targets. Either a 1-D sequence of
        integer class indices (one per sample) or a 2-D one-hot/indicator
        matrix with the same shape as ``predicted``.
    :param predicted: Array-like of shape (n_samples, n_classes) holding one
        predicted probability per class.
    :param eps: Probabilities are clipped to ``[eps, 1 - eps]`` before the
        logarithm is taken, so ``log(0)`` is never evaluated.
    :return: The negative log-likelihood averaged over the samples.
    :raises ValueError: If ``predicted`` is not 2-D, or if ``actual`` (after
        one-hot encoding) does not have the same shape as ``predicted``.
    """
    # Accept lists, tuples and pandas objects as well as numpy arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted, dtype=float)
    if predicted.ndim != 2:
        raise ValueError("predicted must be a 2-D (n_samples, n_classes) matrix")

    # Convert 1-D class indices to a one-hot matrix in a single vectorized
    # assignment instead of a per-row Python loop.
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual.astype(int)] = 1
        actual = one_hot

    # Guard against silent broadcasting, which would produce a wrong loss.
    if actual.shape != predicted.shape:
        raise ValueError(
            "actual and predicted have incompatible shapes: %s vs %s"
            % (actual.shape, predicted.shape)
        )

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    total = np.sum(actual * np.log(clipped))
    return -1.0 / rows * total

# Data Featurization

In [4]:
# Read the three competition CSV files (training set, test set and the sample
# submission) into DataFrames, in that order.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/spooky-author-identification/train.csv',
        '../data/spooky-author-identification/test.csv',
        '../data/spooky-author-identification/sample_submission.csv',
    )
)

In [5]:
# Map the author column of the training frame to integer class ids; the fitted
# encoder is kept so ids can be mapped back to the original values later.
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train["author"].values)

In [6]:
# Hold out 10% of the training texts for validation. The split is shuffled,
# stratified on the encoded labels, and seeded so it is repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train.text.values,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [7]:
# Word-level TF-IDF features over unigrams, bigrams and trigrams; terms that
# occur in fewer than 3 documents are dropped.
tfv = TfidfVectorizer(min_df=3,
                      max_features=None,
                      strip_accents='unicode',
                      analyzer='word',
                      token_pattern=r'\w{1,}',
                      ngram_range=(1, 3),
                      # Real booleans rather than 1: scikit-learn validates
                      # parameter types and newer releases reject integers
                      # passed for boolean options.
                      use_idf=True,
                      smooth_idf=True,
                      sublinear_tf=True,
                      stop_words='english')

# The vocabulary and IDF weights are learned from the training AND validation
# texts together (no labels are involved).
# NOTE(review): this lets validation-set term statistics shape the features,
# so the validation scores below may be slightly optimistic.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

In [8]:
# Raw term-count features over word unigrams, bigrams and trigrams with
# English stop words removed.
ctv = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    token_pattern=r'\w{1,}',
    analyzer='word',
)

# The vocabulary is learned from the training and validation texts together
# (not from the test set); both splits are then turned into count matrices.
ctv.fit([*xtrain, *xvalid])
xtrain_ctv, xvalid_ctv = ctv.transform(xtrain), ctv.transform(xvalid)

# Logistic regression on tf-idf

In [9]:
%%time
clf = LogisticRegression(C=1.0, solver='newton-cg')
clf.fit(xtrain_tfv, ytrain)

predictions = clf.predict_proba(xvalid_tfv)
print(multiclass_logloss(yvalid, predictions))

0.5715620486412905
Wall time: 394 ms


# Logistic regression on word counts

In [10]:
%%time
clf = LogisticRegression(C=1.0, solver='newton-cg')
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print(multiclass_logloss(yvalid, predictions))

0.526557604889774
Wall time: 5.71 s


# Naive Bayes on tf-idf

In [11]:
%%time
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print(multiclass_logloss(yvalid, predictions))

0.5778049688756708
Wall time: 4.99 ms


# Naive Bayes on word counts

In [12]:
%%time
clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print(multiclass_logloss(yvalid, predictions))

0.4854149231348956
Wall time: 29.9 ms


# SVM on standardized tf-idf

In [13]:
# Reduce the sparse TF-IDF matrix to 120 dense components with truncated SVD
# (the author's rule of thumb: 120-200 components are good enough for an SVM).
# random_state is fixed because the solver is randomized by default; without a
# seed the components, and the SVM score computed from them, change per run.
svd = decomposition.TruncatedSVD(n_components=120, random_state=42)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Standardize the SVD components using statistics from the training split only.
# The scaled matrices get their own names so the unscaled SVD output remains
# available for reuse.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [14]:
%%time
# Fitting a simple SVM
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

print(multiclass_logloss(yvalid, predictions))

0.7308069556462726
Wall time: 2min 22s


# SVM on standardized word counts

In [15]:
# Reduce the sparse count matrix to 120 dense components with truncated SVD
# (the author's rule of thumb: 120-200 components are good enough for an SVM).
# random_state is fixed because the solver is randomized by default; without a
# seed the components, and the SVM score computed from them, change per run.
# NOTE: this cell reuses the names svd/xtrain_svd/... from the TF-IDF section,
# so those earlier objects are overwritten here.
svd = decomposition.TruncatedSVD(n_components=120, random_state=42)
svd.fit(xtrain_ctv)
xtrain_svd = svd.transform(xtrain_ctv)
xvalid_svd = svd.transform(xvalid_ctv)

# Standardize the SVD components using statistics from the training split only.
# The scaled matrices get their own names so the unscaled SVD output remains
# available for reuse.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [16]:
%%time
# Fitting a simple SVM
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

print(multiclass_logloss(yvalid, predictions))

0.7825439040183609
Wall time: 2min 33s
