In [1]:
import numpy as np
import pandas as pd
import os
import nltk
import pickle 

from collections import OrderedDict

from joblib import Parallel, delayed

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 

from autocorrect import Speller

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier

# Shared NLP helpers used by the preprocessing cell further down.
# Porter stemmer applied to every token inside token_ise.
ps = PorterStemmer() 
# Spell checker; only referenced by the commented-out line in token_ise.
spell = Speller()

# English stop-word list from the NLTK corpus; used by token_ise and passed to CountVectorizer.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded beforehand — confirm.
stop = stopwords.words('english')

# Read data

In [None]:
# Load every training document into `data` (filename -> raw text) and collect the
# class label of each document, taken from the last character of its filename.
path_train = os.path.join(os.getcwd(), "ch02-train")
# os.listdir returns entries in arbitrary order; sort them so the order of
# `data`/`labels` (and therefore any seeded split built on them) is reproducible.
# Non-file entries (e.g. notebook checkpoint folders) are skipped because open() would fail on them.
filenames_train = sorted(
    name
    for name in os.listdir(path_train)
    if os.path.isfile(os.path.join(path_train, name))
)

data = OrderedDict()
labels = []
for filename in filenames_train:
    # latin-1 maps every possible byte to a code point, so decoding cannot raise
    # UnicodeDecodeError; no per-file error handling is needed for this encoding.
    with open(os.path.join(path_train, filename), mode="r", encoding="latin-1") as f:
        data[filename] = f.read()
    # Appended right after the text is stored so `labels` stays aligned with `data`.
    labels.append(filename[-1])

In [None]:
# Load every test document into `test` (filename -> raw text).
path_test = os.path.join(os.getcwd(), "ch02-test")
# os.listdir returns entries in arbitrary order; sort them so the row order of the
# final predictions is reproducible. Non-file entries are skipped because open()
# would fail on them.
filenames_test = sorted(
    name
    for name in os.listdir(path_test)
    if os.path.isfile(os.path.join(path_test, name))
)

test = OrderedDict()
for filename in filenames_test:
    # latin-1 maps every possible byte to a code point, so decoding cannot raise
    # UnicodeDecodeError; no per-file error handling is needed for this encoding.
    with open(os.path.join(path_test, filename), mode="r", encoding="latin-1") as f:
        test[filename] = f.read()

# Preprocess data


In [None]:
def token_ise(text):
    """Normalise a raw document into a space-separated string of stemmed tokens.

    Every non-letter character is replaced by a space, the text is lower-cased
    and split on whitespace, English stop words are removed, the remaining
    words are Porter-stemmed, and the first surviving token is discarded.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; an empty string when at
        most one token survives the filtering.
    """
    # Local import: the regex module used to be reached through `nltk.re`,
    # which is not a public nltk attribute and may disappear between releases.
    import re

    stop_set = set(stop)  # set membership instead of scanning the list per token

    # Only letters and spaces remain after the substitution, so a plain
    # whitespace split yields the same words as collapsing runs of spaces first.
    words = re.sub('[^A-Za-z]', ' ', text).lower().split()
#     stems = [ps.stem(spell(word)) for word in words]  # with spellcheck the function takes forever

    # Stop words must be removed BEFORE stemming: the stop list holds unstemmed
    # forms, and stems such as "thi" (this) or "wa" (was) no longer match it.
    stems = [ps.stem(word) for word in words if word not in stop_set]

    # Keep the post-stemming check as well, so a stem that happens to coincide
    # with a stop word is still dropped, exactly as before.
    clear_tokens = [stem for stem in stems if stem not in stop_set]

    # The first remaining token is skipped, as in the previous implementation.
    # NOTE(review): presumably a header word common to all documents — confirm.
    return " ".join(clear_tokens[1:])

In [None]:
%%time
# Tokenise both corpora with joblib, one task per document, using all available cores.
tokenized_data = Parallel(n_jobs=-1)(
    delayed(token_ise)(document) for document in data.values()
)
tokenized_test = Parallel(n_jobs=-1)(
    delayed(token_ise)(document) for document in test.values()
)

# Optimize model parameters

In [None]:
# Hold out 10% of the training documents for validation; the seed is fixed so the split is repeatable.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    tokenized_data,
    labels,
    test_size=0.1,
    random_state=3,
)

In [None]:
# Bag-of-words counts for the validation experiment: the vocabulary is learned
# from the training part of the split only and then applied to the held-out part.
vectorizer_opt = CountVectorizer(stop_words=stop)

X_train_transformed = vectorizer_opt.fit_transform(X_train_1)
X_test_transformed = vectorizer_opt.transform(X_test_1)

In [None]:
# k-nearest-neighbours baseline (k = 21) scored on the held-out split.
knn = KNeighborsClassifier(n_neighbors=21)
knn.fit(X_train_transformed, y_train_1)

preds = knn.predict(X_test_transformed)
# scikit-learn metrics take (y_true, y_pred). Balanced accuracy averages recall over
# the classes of y_true, so it is not symmetric: passing the predictions first
# reports a different (incorrect) number.
balanced_accuracy_score(y_test_1, preds)

# Model

In [None]:
# Final feature extraction: learn the vocabulary from the complete training
# corpus and encode the test corpus with that same vocabulary.
vectorizer_test = CountVectorizer(stop_words=stop)

X_train = vectorizer_test.fit_transform(tokenized_data)
X_test = vectorizer_test.transform(tokenized_test)

In [None]:
# Refit the same k-NN configuration (k = 21) on the full training set, then
# label the test documents. `fit` returns the estimator itself, so it can be chained.
knn = KNeighborsClassifier(n_neighbors=21).fit(X_train, labels)
predictions_test = knn.predict(X_test)

In [None]:
# Write the predicted labels to disk, indexed by the test filenames they belong to.
(
    pd.Series(predictions_test, index=test.keys())
    .to_csv("pred_of_test_data_KNN.csv")
)