In [32]:
import pandas as pd
import numpy as np
import nltk
import sklearn
from sklearn import preprocessing, model_selection
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import decomposition
from sklearn.linear_model import perceptron
from sklearn.neural_network import MLPClassifier
import gensim
import torch
import torch.autograd as ag
# from gensim.models.tfidfmodel import TfidfModel
eps = 1e-15

# Loading Data

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("sample_submission.csv")

In [3]:
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
sample.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.403494,0.287808,0.308698
1,id24541,0.403494,0.287808,0.308698
2,id00134,0.403494,0.287808,0.308698
3,id27757,0.403494,0.287808,0.308698
4,id04081,0.403494,0.287808,0.308698


# Log Loss Function


In [5]:
def multiclass_logloss(actual, predicted):
    '''Calculated multi-class log loss for Kaggle competition
    :actual: actual vector of the target classes 
    :predicted: matrix containing the predicted probability for each class
    '''
    if len(actual.shape)==1:
        actual2 = np.zeros((predicted.shape[0],predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i,val] = 1
        actual = actual2
        
    predicted_clip = np.clip(predicted, eps, 1 - eps)
    
    return (-1.0 * (np.sum(actual*np.log(predicted_clip))) / actual.shape[0])

In [6]:
### using sklearn to transfer authors to 0,1,2
label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(train.author.values)

# Train-Validation Split

In [7]:
### train/test split
# shuffle before splitting, set random state for the same result regardless of iteration, 
# stratify: data is split in a stratified fashion, using this as the class labels.
xtrain, xvalid, ytrain, yvalid = model_selection.train_test_split(train.text.values, y, 
                                                                  test_size=0.1, stratify=y, 
                                                                  random_state=42, shuffle=True)

In [8]:
print(xtrain.shape, xvalid.shape)

(17621,) (1958,)


# Feature: Tf-idf vectorizer 
# Model: LogisticRegression (sklearn)

### We are trying to learn mapping from a matrix (tfidf) to a vector of 3 classes (probabilities) using logistic regression

In [9]:
### Parameters based on experience
#smooth_idf, use_idf, max_features are set to default
# sublinear_tf: add 1 to the result of log, 
# ngram_range: consider phrases between 1 to 3 words, token_pattern: words
tfidf_vectorizer = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', 
                                   token_pattern=r'\w{1,}', ngram_range=(1, 3), use_idf=1,
                                   smooth_idf=1,sublinear_tf=1,stop_words = 'english')

In [10]:
tfidf_vectorizer.fit(list(xtrain)+list(xvalid))
xtrain_tfidf = tfidf_vectorizer.transform(xtrain)
xvalid_tfidf = tfidf_vectorizer.transform(xvalid)

In [11]:
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfidf, ytrain)
predictions = clf.predict_proba(xvalid_tfidf)
print(predictions[1])
print(yvalid[1])
print("loglos is : %0.3f " % multiclass_logloss(yvalid, predictions) )



[0.7279392  0.11053272 0.16152808]
0
loglos is : 0.626 


In [12]:
results = pd.DataFrame(predictions)
results.columns = ["EAP", "HPL", "MWS"]
results.head()

Unnamed: 0,EAP,HPL,MWS
0,0.645703,0.071989,0.282308
1,0.727939,0.110533,0.161528
2,0.594639,0.160637,0.244724
3,0.690826,0.147396,0.161778
4,0.616144,0.129763,0.254093


In [13]:
xtest_tfidf = tfidf_vectorizer.transform(test.text.values)
out = pd.DataFrame(clf.predict_proba(xtest_tfidf))
out.columns = ["EAP", "HPL", "MWS"]
idvalues = pd.DataFrame(test.id)
out.head()
final_results = idvalues.join(out)
final_results.head()
final_results.set_index("id", inplace = True)
final_results.to_csv("final_results")

# Feature: CountVectorizer 

In [14]:
count_vectorizer = CountVectorizer(analyzer="word", token_pattern=r"\w{1,}", 
                                  ngram_range=(1,3), stop_words="english")
count_vectorizer.fit(list(xtrain)+list(xvalid))
xtrain_count_vectorizer = count_vectorizer.transform(xtrain)
xvalid_count_vectorizer = count_vectorizer.transform(xvalid)

In [15]:
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_count_vectorizer, ytrain)
predictions = clf.predict_proba(xvalid_count_vectorizer)
print(predictions[1])
print(yvalid[1])
print("loglos is : %0.3f " % multiclass_logloss(yvalid, predictions) )

[0.85869362 0.04464736 0.09665902]
0
loglos is : 0.528 


In [16]:
xtest_count_vectorizer = count_vectorizer.transform(test.text.values)
out = pd.DataFrame(clf.predict_proba(xtest_count_vectorizer))
out.columns = ["EAP", "HPL", "MWS"]
idvalues = pd.DataFrame(test.id)
out.head()
final_results = idvalues.join(out)
final_results.head()
final_results.set_index("id", inplace = True)
final_results.to_csv("results_count_vectorizer")

# Model: Naive Bayes

In [17]:
clf = MultinomialNB()
clf.fit(xtrain_tfidf, ytrain)
predictions = clf.predict_proba(xvalid_tfidf)
print(predictions[1])
print(yvalid[1])
print("loglos is : %0.3f " % multiclass_logloss(yvalid, predictions) )

[0.63027919 0.13074833 0.23897248]
0
loglos is : 0.578 


In [18]:
clf = MultinomialNB()
clf.fit(xtrain_count_vectorizer, ytrain)
predictions = clf.predict_proba(xvalid_count_vectorizer)
print(predictions[1])
print(yvalid[1])
print("loglos is : %0.3f " % multiclass_logloss(yvalid, predictions) )

[0.86376799 0.01769372 0.11853828]
0
loglos is : 0.485 


In [19]:
xtest_count_vectorizer = count_vectorizer.transform(test.text.values)
out = pd.DataFrame(clf.predict_proba(xtest_count_vectorizer))
out.columns = ["EAP", "HPL", "MWS"]
idvalues = pd.DataFrame(test.id)
out.head()
final_results = idvalues.join(out)
final_results.head()
final_results.set_index("id", inplace = True)
final_results.to_csv("results_count_vectorizer_bayes")

# Feature: SVD
# Model: SVM

In [20]:
# Sigular value decomposition
# 120-200 components are good enough for SVM model
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfidf)
xtrain_svd = svd.transform(xtrain_tfidf)
xvalid_svd = svd.transform(xvalid_tfidf)

In [21]:
# Standardize data before svm
scalar = preprocessing.StandardScaler()
scalar.fit(xtrain_svd)
xtrain_svd_normal = scalar.transform(xtrain_svd)
xvalid_svd_normal = scalar.transform(xvalid_svd)

In [22]:
clf = SVC(C=1.0, probability=True)
clf.fit(xtrain_svd_normal, ytrain)
predictions = clf.predict_proba(xvalid_svd_normal)
print(predictions[1])
print(yvalid[1])
print("loglos is : %0.3f " % multiclass_logloss(yvalid, predictions) )

[0.76075407 0.10952052 0.12972541]
0
loglos is : 0.738 


# Model: Neural Network

In [27]:
# scalar = preprocessing.StandardScaler()
# scalar.fit(xtrain_count_vectorizer)
# xtrain_normal = scalar.transform(xtrain_count_vectorizer)
# xvalid_normal = scalar.transform(xvalid_count_vectorizer)

# torch.tensor(targets_df['targets'].values)

# xtrain_normal
# xtrain_torch = ag.Variable(torch.from_numpy(xtrain_count_vectorizer))
xtrain_count_vectorizer
len(xvalid_svd_normal[0])

120

In [37]:
clf = MLPClassifier(hidden_layer_sizes=(120,120,3,), activation='logistic')
clf.fit(xtrain_svd_normal, ytrain)
predictions = clf.predict_proba(xvalid_svd_normal)
print(predictions[1])
print(yvalid[1])
print("loglos is : %0.3f " % multiclass_logloss(yvalid, predictions) )

[9.93283348e-01 9.17819674e-04 5.79883183e-03]
0
loglos is : 1.475 


