In [1]:
import numpy as np
import pandas as pd
import os
import nltk
import pickle 

from collections import OrderedDict

from joblib import Parallel, delayed

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 

from autocorrect import Speller

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV 

ps = PorterStemmer() 
spell = Speller()

stop = stopwords.words('english')

# Read data

In [2]:
path_train = os.path.join(os.getcwd(), "ch02-train")
filenames_train = os.listdir(path_train)

data = OrderedDict()
labels = []
for filename in filenames_train:
    with open(os.path.join(os.getcwd(), "ch02-train", filename), mode = 'r', encoding = 'latin-1') as f:
        try:
            data[filename] = f.read()
            labels.append(filename[-1])
        except UnicodeDecodeError as error:
            print(filename, "   ", error)

In [3]:
path_test = os.path.join(os.getcwd(), "ch02-test")
filenames_test = os.listdir(path_test)

test = OrderedDict()
for filename in filenames_test:
    with open(os.path.join(os.getcwd(), "ch02-test", filename), mode = 'r', encoding = 'latin-1') as f:
        try:
            test[filename] = f.read()
        except UnicodeDecodeError as error:
            print(filename, "   ", error)

# Preprocess data

In [4]:
def token_ise(text):
    text = nltk.re.sub('[^A-Za-z]', ' ', text).lower()
    tokens = nltk.re.sub(' +', ' ', text).strip().split(" ")
#     tokens = [ps.stem(spell(word)) for word in tokens]  # with spellcheck the function takes forever
    tokens = [ps.stem(word) for word in tokens]
    
    clear_tokens = [token for token in tokens if token not in stop]
    
    return " ".join(clear_tokens[1:])

In [5]:
%%time
tokenized_data = Parallel(n_jobs = -1)(delayed(token_ise)(example) for example in data.values())
tokenized_test = Parallel(n_jobs = -1)(delayed(token_ise)(example) for example in test.values())

Wall time: 1min 27s


# Optimize model parameters

In [6]:
vectorizer_opt = CountVectorizer(stop_words = stop)
X_train_transformed = vectorizer_opt.fit_transform(tokenized_data)


param_grid = {'C': [0.0001, 0.0005, 0.001, 0.005, 0.01]}  
grid = GridSearchCV(LinearSVC(), param_grid, refit = True, verbose = 3, n_jobs = -1, scoring= make_scorer(balanced_accuracy_score)) 
  
grid.fit(X_train_transformed, labels) 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:   17.1s remaining:   47.3s
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:   23.4s remaining:   11.7s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   26.5s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.0001, 0.0005, 0.001, 0.005, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(balanced_accuracy_score), verbose=3)

In [8]:
grid.best_estimator_

LinearSVC(C=0.0005, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [9]:
grid.best_score_

0.9998279878191938

# Testing

In [10]:
vectorizer_test = CountVectorizer(stop_words = stop)

X_train = vectorizer_test.fit_transform(tokenized_data)
X_test = vectorizer_test.transform(tokenized_test)

In [11]:
svc_test = LinearSVC(C = 0.005)  # value from the grid search best C

svc_test.fit(X_train, labels)
predictions_test = svc_test.predict(X_test)

In [12]:
pd.Series(predictions_test, index = test.keys()).to_csv("pred_of_test_data.csv")

  """Entry point for launching an IPython kernel.
