In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# notebook can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/MyDrive/Colab Notebooks/text

/content/drive/MyDrive/Colab Notebooks/text


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import word_tokenize
from nltk.corpus import stopwords

from tqdm import tqdm

import nltk
# Fetch the NLTK stopword corpus, then load the English stopword list that is
# used later to filter tokens.
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Fetch the Punkt tokenizer models (presumably what word_tokenize relies on).
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the training data from the current working directory.
df = pd.read_csv('train.csv')

In [None]:
# Display the loaded frame (pandas elides the middle rows of a long frame).
df

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL
...,...,...,...
19574,id17718,"I could have fancied, while I looked at it, th...",EAP
19575,id08973,The lids clenched themselves together as if in...,EAP
19576,id05267,"Mais il faut agir that is to say, a Frenchman ...",EAP
19577,id17513,"For an item of news like this, it strikes us i...",EAP


In [None]:
# Remove the id column. A new frame is assigned instead of mutating in place,
# and errors='ignore' keeps the cell safe to re-run once the column is gone.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the categorical author labels into integer codes 0, 1, 2.
# NOTE(review): LabelEncoder orders classes by sorted label, so the mapping is
# presumably EAP -> 0, HPL -> 1, MWS -> 2 -- confirm with lb.classes_.
lb = LabelEncoder()
df['author'] = lb.fit_transform(df['author'])

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

# Features are the raw sentences; the target is the author column.
X, y = df['text'], df['author']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101
)

In [None]:
# Sanity check: sizes of the training and test splits.
X_train.shape, X_test.shape

((15663,), (3916,))

In [None]:
# Load the pre-trained GloVe vectors into a {token: vector} dictionary.
# The file is opened in a `with` block so the handle is closed even if a line
# fails to parse, and the encoding is explicit so the result does not depend
# on the platform default.
embeddings_index = {}
with open('glove.840B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        # Split on a literal space only (not on arbitrary whitespace).
        # NOTE(review): presumably some tokens contain other whitespace
        # characters, so a bare split() would break them -- keep as is.
        values = line.split(" ")
        word = values[0]
        # Everything after the token is the embedding, stored as float32.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

2196017it [03:21, 10876.56it/s]


Found 2196016 word vectors.


In [None]:
# Natural Language Processing
# Each sentence is turned into a single fixed-length vector:
# 1) convert the text to lowercase
# 2) tokenize it into words
# 3) remove stopwords
# 4) remove non-alphabetic tokens (punctuation, numbers, ...)
# 5) sum the GloVe vectors of the remaining words and scale to unit length
# The GloVe vectors used here have 300 features.

def sent2vec(s, dim=300):
    """Return a unit-length sentence vector built from GloVe word vectors.

    The text is lowercased and tokenized; stopwords, non-alphabetic tokens and
    words missing from ``embeddings_index`` are dropped. The vectors of the
    remaining words are summed and the sum is divided by its L2 norm.

    Parameters
    ----------
    s : object
        The sentence; it is converted with ``str(s)`` before processing.
    dim : int, optional
        Length of the zero vector returned when no word of the sentence has an
        embedding. Defaults to 300.

    Returns
    -------
    numpy.ndarray
        The normalized sentence vector, or an all-zero vector when no word has
        an embedding or the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    # Set membership is constant-time; stop_words itself is a list.
    stop_set = set(stop_words)
    # An explicit membership test replaces the former bare `except:`, which
    # also swallowed unrelated errors such as KeyboardInterrupt.
    vectors = [
        embeddings_index[w]
        for w in tokens
        if w.isalpha() and w not in stop_set and w in embeddings_index
    ]
    if not vectors:
        return np.zeros(dim)
    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Word vectors cancelled out exactly; avoid dividing by zero (NaNs).
        return np.zeros_like(v)
    return v / norm

In [None]:
# Embed every sentence of both splits as a GloVe sentence vector
xtrain_glove = list(map(sent2vec, X_train))
xtest_glove = list(map(sent2vec, X_test))

In [None]:
# Turn the lists of sentence vectors into arrays for scikit-learn
xtrain_glove, xtest_glove = np.array(xtrain_glove), np.array(xtest_glove)

In [None]:
# The training matrix has one row per sentence (15663 rows) and
# one column per embedding feature (300 columns).
xtrain_glove.shape

(15663, 300)

In [None]:
# 1st method: Decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Baseline: fit a default decision tree on the training vectors and report
# precision/recall/F1 on the held-out test split (no cross-validation here).
# random_state is fixed (same seed as the train/test split) because the tree's
# split search shuffles the features, so an unseeded tree can give slightly
# different scores on every run.
dt = DecisionTreeClassifier(random_state=101)
dt.fit(xtrain_glove, y_train)
y_pred = dt.predict(xtest_glove)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.51      0.50      1556
           1       0.42      0.42      0.42      1108
           2       0.47      0.44      0.46      1252

    accuracy                           0.47      3916
   macro avg       0.46      0.46      0.46      3916
weighted avg       0.46      0.47      0.46      3916



In [None]:
# hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Candidate settings: every combination of split criterion and depth limit.
parameters = {'criterion':['gini','entropy'],
              'max_depth': [30, 40]}

# Seed the tree so the search picks the same winner on every run; an unseeded
# tree can change which parameter combination scores best.
dt = DecisionTreeClassifier(random_state=101)
# GridSearchCV scores each combination by cross-validation on the training
# data and then refits the best one on the whole training set.
gridcv = GridSearchCV(dt, parameters)
gridcv.fit(xtrain_glove, y_train)

print(gridcv.best_params_)

# Evaluate the refitted best estimator on the held-out test split.
y_pred = gridcv.predict(xtest_glove)
print(classification_report(y_test, y_pred))

{'criterion': 'entropy', 'max_depth': 30}
              precision    recall  f1-score   support

           0       0.48      0.50      0.49      1556
           1       0.42      0.42      0.42      1108
           2       0.45      0.44      0.44      1252

    accuracy                           0.46      3916
   macro avg       0.45      0.45      0.45      3916
weighted avg       0.46      0.46      0.46      3916



Using a decision tree, we get a low accuracy of about 46–47%. Even after hyperparameter tuning with a cross-validated grid search, the accuracy remains at about 46%. This makes sense, since a single decision tree is not well suited to datasets with a large number of features.

In [None]:
# 2nd method: SVM
from sklearn.svm import SVC

# Baseline: a default-parameter SVC trained on the GloVe sentence vectors
# and scored on the held-out test split.
svc = SVC().fit(xtrain_glove, y_train)
y_pred = svc.predict(xtest_glove)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.76      0.73      1556
           1       0.74      0.67      0.71      1108
           2       0.73      0.73      0.73      1252

    accuracy                           0.72      3916
   macro avg       0.73      0.72      0.72      3916
weighted avg       0.72      0.72      0.72      3916



In [None]:
# 5-fold cross-validation of a fresh default SVC on the training vectors;
# the cell shows the mean and standard deviation of the fold accuracies.
svc = SVC()
cv = cross_val_score(estimator=svc, X=xtrain_glove, y=y_train, cv=5)
(cv.mean(), cv.std())

(0.7232968250066548, 0.009784380880692311)

Using an SVM, we get better performance than with the decision tree: 72% accuracy on the test set.

After performing 5-fold cross-validation, the mean accuracy remains the same, at 72%.

Since training the SVM is very slow, we were unable to perform hyperparameter tuning using grid search.