<a href="https://colab.research.google.com/github/mrScissors/spookyAuthors/blob/master/spookyAuthors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Getting the data from Google Drive

In [None]:
# Mount Google Drive at /content/gdrive so the dataset archive stored there can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import zipfile

# Location of the dataset archive inside the mounted Drive.
# Deliberately not named `zip`: binding that name would shadow the built-in
# zip() for every later cell in the kernel session.
archive_path = '/content/gdrive/My Drive/spooky-author-identification.zip'

# Unpack the archive into the current working directory.
with zipfile.ZipFile(archive_path, "r") as z:
  z.extractall(".")

In [None]:
# Unpack the nested train and test archives, in that order, into the working directory.
for inner_archive in ('train.zip', 'test.zip'):
  with zipfile.ZipFile(inner_archive, "r") as z:
    z.extractall(".")

In [None]:
# List the working directory to check what the extraction steps produced.
! ls

gdrive	     sample_submission.zip  test.zip   train.zip
sample_data  test.csv		    train.csv


# Importing libraries, downloading NLP resources, and reading the data

In [None]:
import pandas as pd
import numpy as np

import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK setup: download the 'popular' data collection first, then build the
# English stop-word set and the Porter stemmer that preprocessText relies on.
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
nltk.download('popular')
stop_words = set(stopwords.words('english'))
from nltk.stem import PorterStemmer
ps =PorterStemmer()

# spaCy setup: `nlp` is the English language object preprocessText calls for
# tokenization, lemmas and stop-word flags.
from spacy.lang.en import English
nlp = English()
from spacy.lang.en.stop_words import STOP_WORDS
# Download spaCy's 'en' model through the spaCy command-line interface.
# NOTE(review): `nlp` above is built from English() directly rather than loaded
# from a downloaded model — confirm this download is actually needed.
! python -m spacy download en

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [None]:
# Load the extracted CSV files into DataFrames.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [None]:
# (rows, columns) of the training frame.
train.shape

(19579, 3)

# Preprocessing function

In [None]:
def preprocessText(text, stop, lemma = False, stemm = True):
  """Normalize one document into a space-separated string of cleaned words.

  Steps, in order: lower-case, strip punctuation, strip digits, optionally
  lemmatize, optionally remove stop words, optionally stem.

  Args:
    text: The raw document (a string).
    stop: Stop-word method. 'nltk' filters against the module-level NLTK
      `stop_words` set; 'spacy' re-tokenizes with `nlp` and drops tokens whose
      vocabulary entry is flagged as a stop word. Any other value skips
      stop-word removal.
    lemma: If True, tokenize with `nlp` and use each token's lemma instead of
      splitting on whitespace.
    stemm: If True (the default), apply the Porter stemmer `ps` to every word.

  Returns:
    The processed words joined by single spaces. If no words survive, the
    lower-cased text with punctuation and digits removed is returned instead.
  """
  text = text.lower()

  # Remove punctuation: anything that is neither a word character nor whitespace.
  text = re.sub(r'[^\w\s]', '', text)

  # Remove digits. The pattern must be a raw string: '\d' in a plain string
  # literal is an invalid escape sequence and triggers a warning on compile.
  text = re.sub(r'\d', '', text)

  if lemma:
    # Skip whitespace tokens (runs of spaces left behind by the removals above)
    # and fall back to the token text when lemma_ is empty, which can happen
    # when the pipeline has no lemmatizer; otherwise blank "words" would be
    # carried into the output.
    words = [token.lemma_ or token.text for token in nlp(text) if not token.is_space]
  else:
    words = text.split()

  if stop == 'nltk':
    words = [word for word in words if word not in stop_words]
  elif stop == 'spacy':
    # Re-tokenize the joined words and keep only tokens not flagged as stop words.
    doc = nlp(' '.join(words))
    words = [token.text for token in doc if not nlp.vocab[token.text].is_stop]

  if stemm:
    words = [ps.stem(word) for word in words]

  # If every word was filtered out, return the cleaned text rather than an
  # empty result.
  if not words:
    return text
  return ' '.join(words)

In [None]:
# Clean every training sentence: spaCy stop-word removal, lemmatization on,
# stemming left at its default (on). Keyword arguments are forwarded by apply.
train['cleanedText'] = train['text'].apply(preprocessText, stop='spacy', lemma=True)
train.head()

Unnamed: 0,id,text,author,cleanedText
0,id26305,"This process, however, afforded me no means of...",EAP,process afford mean ascertain dimens dungeon c...
1,id17569,It never once occurred to me that the fumbling...,HPL,occur fumbl mere mistak
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,left hand gold snuff box caper hill cut manner...
3,id27763,How lovely is spring As we looked from Windsor...,MWS,love spring look windsor terrac sixteen fertil...
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,find gold superintend abandon attempt perplex ...


In [None]:
# Apply the same cleaning settings to the test sentences as to the training ones.
test['cleanedText'] = test['text'].apply(preprocessText, stop='spacy', lemma=True)
test.head()

Unnamed: 0,id,text,cleanedText
0,id02310,"Still, as I urged our leaving Ireland with suc...",urg leav ireland inquietud impati father thoug...
1,id24541,"If a fire wanted fanning, it could readily be ...",fire want fan readili fan newspap govern grew ...
2,id00134,And when they had broken down the frail door t...,broken frail door found cleanli pick human ske...
3,id27757,While I was thinking how I should possibly man...,think possibl manag actual tumbl head roll ste...
4,id04081,I am not sure to what limit his knowledge may ...,sure limit knowledg extend


In [None]:
# Hold out part of the labelled data for validation (default split proportions).
# A fixed random_state makes the split, and therefore the scores reported
# below, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(train['cleanedText'], train['author'], random_state=42)

# Using TF-IDF features for classification

In [None]:
# Fit the TF-IDF vectorizer on the training split only and vectorize that split.
tfIdfVectorizer=TfidfVectorizer(use_idf=True)
tfTrain = tfIdfVectorizer.fit_transform(X_train.tolist())

In [None]:
# Dimensions of the training TF-IDF matrix.
tfTrain.shape

(14684, 14028)

In [None]:
# Vectorize the held-out split with the already-fitted vectorizer (transform only, no refit).
tfTest = tfIdfVectorizer.transform(X_test.tolist())

In [None]:
# Dimensions of the held-out TF-IDF matrix; the column count should match tfTrain.
tfTest.shape

(4895, 14028)

In [None]:
# Random forest with 600 trees trained on the TF-IDF features.
# random_state pins the forest's randomness so the fitted model is repeatable;
# n_jobs=-1 builds the trees on all available cores without changing the
# result for a given seed.
model = RandomForestClassifier(n_estimators=600, random_state=42, n_jobs=-1)
model.fit(tfTrain, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=600,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Score the forest on the held-out split: overall accuracy, then the per-class report.
yPred = model.predict(tfTest)
print(accuracy_score(y_test, yPred))
print(classification_report(y_test, yPred))

0.7160367722165475
              precision    recall  f1-score   support

         EAP       0.70      0.76      0.73      1959
         HPL       0.69      0.69      0.69      1365
         MWS       0.76      0.68      0.72      1571

    accuracy                           0.72      4895
   macro avg       0.72      0.71      0.71      4895
weighted avg       0.72      0.72      0.72      4895



In [None]:
# Vectorize the unlabelled test set with the already-fitted vectorizer.
# Use the preprocessed column: the vectorizer and model were fitted on
# cleanedText (stop words removed, stemmed), so feeding raw text would produce
# tokens that do not line up with the learned vocabulary.
tfTestSubmit = tfIdfVectorizer.transform(test['cleanedText'].tolist())
tfTestSubmit.shape

(8392, 14028)

In [None]:
# Unpack the sample submission archive into the working directory.
with zipfile.ZipFile('sample_submission.zip',"r") as z:
  z.extractall(".")

In [None]:
# Load the sample submission to see the expected column layout.
submit = pd.read_csv('sample_submission.csv')
submit.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.403494,0.287808,0.308698
1,id24541,0.403494,0.287808,0.308698
2,id00134,0.403494,0.287808,0.308698
3,id27757,0.403494,0.287808,0.308698
4,id04081,0.403494,0.287808,0.308698


In [None]:
# Class labels as stored on the fitted model.
model.classes_

array(['EAP', 'HPL', 'MWS'], dtype=object)

In [None]:
# Class probabilities for every test row, one column per author.
# Column labels are taken from model.classes_ (the order predict_proba's
# columns follow) instead of being left as positional 0/1/2 names that must be
# mapped by hand.
submitDf = pd.DataFrame(model.predict_proba(tfTestSubmit), columns=model.classes_)
submitDf['id'] = test['id']
submitDf.head()

Unnamed: 0,0,1,2,id
0,0.181667,0.073333,0.745,id02310
1,0.801667,0.091667,0.106667,id24541
2,0.493333,0.273333,0.233333,id00134
3,0.543333,0.295,0.161667,id27757
4,0.571667,0.221667,0.206667,id04081


In [None]:
# Label the positional probability columns 0/1/2 with the author codes.
submitDf = submitDf.rename(columns={0: 'EAP', 1: 'HPL', 2: 'MWS'})
submitDf.head()

Unnamed: 0,EAP,HPL,MWS,id
0,0.181667,0.073333,0.745,id02310
1,0.801667,0.091667,0.106667,id24541
2,0.493333,0.273333,0.233333,id00134
3,0.543333,0.295,0.161667,id27757
4,0.571667,0.221667,0.206667,id04081


In [None]:
# Put the columns in the order id, EAP, HPL, MWS.
submissionColumns = ['id', 'EAP', 'HPL', 'MWS']
submitDf = submitDf.loc[:, submissionColumns]
submitDf.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.181667,0.073333,0.745
1,id24541,0.801667,0.091667,0.106667
2,id00134,0.493333,0.273333,0.233333
3,id27757,0.543333,0.295,0.161667
4,id04081,0.571667,0.221667,0.206667


In [None]:
# Write the submission file, leaving out the DataFrame index.
submitDf.to_csv('submission.csv',index=False)

# Using doc2vec for classification