<center> <h2> KNN classification : </h2> </center>

In [1]:
import os
import re
import random
import string
import time
import operator
import numpy as np
import multiprocessing
from functools import partial
from multiprocessing import Pool
from collections import Counter
from gensim.models.word2vec import Word2Vec
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
random.seed(111417)

In [3]:
# remove dashes and apostrophes from punctuation marks 
punct = string.punctuation.replace('-', '').replace("'",'')
# regex to match intra-word dashes and intra-word apostrophes
my_regex = re.compile(r"(\b[-']\b)|[\W_]")


In [4]:
path_root = 'for_moodle/'

path_to_data = path_root + 'data/'
path_to_documents = path_root + 'data/documents/'
path_to_google_news = path_root

In [5]:
def atoi(text):
    return int(text) if text.isdigit() else text

def natural_keys(text):
    return [atoi(c) for c in re.split('(\d+)', text)]

# returns the vector of a word
def my_vector_getter(word, wv):
    try:
        # we use reshape because cosine similarity in sklearn now works only for multidimensional arrays
        word_array = wv.wv[word].reshape(1,-1)
        return (word_array)
    except KeyError:
        print('word: <', word, '> not in vocabulary!')
    
# performs basic pre-processing
def clean_string(string, punct=punct, my_regex=my_regex, to_lower=False):
    if to_lower:
        string = string.lower()
    # remove formatting
    str = re.sub('\s+', ' ', string)
     # remove punctuation
    str = ''.join(l for l in str if l not in punct)
    # remove dashes that are not intra-word
    str = my_regex.sub(lambda x: (x.group(1) if x.group(1) else ' '), str)
    # strip extra white space
    str = re.sub(' +',' ',str)
    # strip leading and trailing white space
    str = str.strip()
    return str

def to_parallelize(doc,collection,w2v):
    to_return = []
    for doc_train in collection:
        ### fill gap ### append the Word Mover's Distance between doc and doc_train
        to_return.append(w2v.wv.wmdistance(doc, doc_train))
    return to_return

In [6]:
t = time.time()

with open(path_to_data + 'smart_stopwords.txt', 'r') as my_file: 
    stpwds = my_file.read().splitlines()

doc_names = os.listdir(path_to_documents)
doc_names.sort(key=natural_keys)
docs = []
for idx,name in enumerate(doc_names):
    with open(path_to_documents + name,'r') as my_file:
        docs.append(my_file.read())
    if idx % round(len(doc_names)/10) == 0:
        print(idx)

with open(path_to_data + 'labels.txt', 'r') as my_file: 
    labels = my_file.read().splitlines()

labels = np.array([int(item) for item in labels])

print('documents, labels and stopwords loaded in', round(time.time() - t,2), 'second(s)')

shuffled_idxs = random.sample(range(len(docs)), len(docs)) # sample w/o replct
docs = [docs[idx] for idx in shuffled_idxs]
labels = [labels[idx] for idx in shuffled_idxs]

print('documents and labels shuffled')


0
1131
2262
3393
4524
5655
6786
7917
9048
10179
11310
documents, labels and stopwords loaded in 0.37 second(s)
documents and labels shuffled


In [7]:
t = time.time()

cleaned_docs = []
for idx, doc in enumerate(docs):
    # clean
    doc = clean_string(doc, punct, my_regex, to_lower=True)
    # tokenize (split based on whitespace)
    tokens = doc.split(' ')
    # remove stopwords
    tokens = [token for token in tokens if token not in stpwds]
    # remove digits
    tokens = [''.join([elt for elt in token if not elt.isdigit()]) for token in tokens]
    # remove tokens shorter than 3 characters in size
    tokens = [token for token in tokens if len(token)>2]
    # remove tokens exceeding 25 characters in size
    tokens = [token for token in tokens if len(token)<=25]
    cleaned_docs.append(tokens)
    if idx % round(len(docs)/10) == 0:
        print(idx)

print('documents cleaned in', round(time.time() - t,2), 'second(s)')

0
1131
2262
3393
4524
5655
6786
7917
9048
10179
11310
documents cleaned in 25.29 second(s)


In [8]:
# create empty word vectors for the words in vocabulary 
# we set size=300 to match dim of GNews word vectors
my_q = 300
mcount = 5
w2v = Word2Vec(size=my_q, min_count=mcount)

w2v.build_vocab(cleaned_docs)

# w2v.wv.vocab returns a dictionary
vocab = list(w2v.wv.vocab.keys())
all_tokens = [token for sublist in cleaned_docs for token in sublist]
t_counts = dict(Counter(all_tokens))
assert len(vocab) == len([token for token,count in t_counts.items() if count>=mcount])

In [9]:
t = time.time()

w2v.intersect_word2vec_format(path_to_google_news + 'GoogleNews-vectors-negative300.bin.gz', binary=True)

print('word vectors loaded in', round(time.time() - t,2), 'second(s)')

word vectors loaded in 89.85 second(s)


In [10]:
# NOTE: in-vocab words without an entry in the Google News file are not removed from the vocabulary
# instead, their vectors are silently initialized to random values
# we can detect those vectors via their norms which approach zero
norms = [np.linalg.norm(w2v[word]) for word in vocab]
idxs_zero_norms = [idx for idx,norm in enumerate(norms) if norm<=0.05]
# get the words with close to zero norms
no_entry_words = [vocab[idx] for idx in idxs_zero_norms]

# remove no-entry words and infrequent words
no_entry_words = set(no_entry_words)
for idx,doc in enumerate(cleaned_docs):
    cleaned_docs[idx] = [token for token in doc if token not in no_entry_words and t_counts[token]>=mcount]
    if idx % round(len(docs)/10) == 0:
        print(idx)

# retain only 'max_size' first words of each doc to speed-up computation of WMD
max_size = 100

cleaned_docs = [elt[:max_size] for elt in cleaned_docs]

print('documents truncated to', max_size, 'word(s)')

  after removing the cwd from sys.path.


0
1131
2262
3393
4524
5655
6786
7917
9048
10179
11310
documents truncated to 100 word(s)


In [11]:
# compute centroids of documents
t = time.time()

centroids = np.empty(shape=(len(cleaned_docs),my_q))

for idx,doc in enumerate(cleaned_docs):
    # computing the centroid by using mean and concatenate
    centroid = np.mean( np.concatenate( [my_vector_getter(token,w2v) for token in doc]) , axis=0)
    centroids[idx,:] = centroid
    if idx % round(len(docs)/10) == 0:
        print(idx)

print('centroids computed in', round(time.time() - t,2), 'second(s)')

0
1131
2262
3393
4524
5655
6786
7917
9048
10179
11310
centroids computed in 2.09 second(s)


In [12]:
# use the first n_train docs as training set and last n_test docs as test set
# compute distance between each element in the test set and each element in the training set

n_train = 100
n_test = 50

print('using', n_train, 'documents as examples')
print('using', n_test, 'documents for testing')

tfidf_vect = TfidfVectorizer(min_df=1, 
                             stop_words=None, 
                             lowercase=False, 
                             preprocessor=None)

# tfidf_vectorizer takes raw documents as input
doc_term_mtx = tfidf_vect.fit_transform([' '.join(elt) for elt in cleaned_docs[:n_train]])

using 100 documents as examples
using 50 documents for testing


In [13]:
t = time.time()

my_similarities = []
for idx,doc_test in enumerate(cleaned_docs[-n_test:]):
    # notice that we just transform
    doc_test_vect = tfidf_vect.transform([' '.join(doc_test)])
    sims = cosine(doc_term_mtx, Y=doc_test_vect, dense_output=True)
    my_similarities.append(sims[:,0])
    if idx % round(n_test/10) == 0:
        print(idx)

print('TFIDF cosine similarities computed in', round(time.time() - t,2), 'second(s)')


0
5
10
15
20
25
30
35
40
45
TFIDF cosine similarities computed in 0.06 second(s)


In [14]:
t = time.time()

my_centroid_similarities = []
for idx in range(n_test):
    sims = cosine(centroids[:n_train,:], 
                  Y=centroids[centroids.shape[0]-(idx+1),:].reshape(1, -1), 
                  dense_output=True)
    my_centroid_similarities.append(sims[:,0])
    if idx % round(n_test/10) == 0:
        print(idx)

print('centroid-based cosine similarities computed in', round(time.time() - t,2), 'second(s)')


0
5
10
15
20
25
30
35
40
45
centroid-based cosine similarities computed in 0.06 second(s)


In [15]:
t = time.time()

to_parallelize_partial = partial(to_parallelize,
                                 collection=cleaned_docs[:n_train],
                                 w2v=w2v)
 
n_jobs = multiprocessing.cpu_count()

print('using', n_jobs, 'core(s)')
pool = Pool(processes=n_jobs)
my_distances = pool.map(to_parallelize_partial, cleaned_docs[-n_test:])
pool.close()
pool.join()

using 8 core(s)


In [16]:
print('========== performance of centroids ==========')

for nn in [1,3,5,7,11,13,15,17,21,23]:
    
    preds_centroids = []
    for idx,sims in enumerate(my_centroid_similarities):
        idxs_sorted = np.argsort(sims).tolist()
        ### fill gap ### get labels of 'nn' nearest neighbors. Be cautious about the difference between distance and similarity!
        labels_nn = [labels[:n_train][elt] for elt in idxs_sorted[-nn:]]
        # select most frequent label as prediction
        counts = dict(Counter(labels_nn))
        max_counts = max(list(counts.values()))
        pred = [k for k,v in counts.items() if v==max_counts][0]
        preds_centroids.append(pred)
    
    # compare predictions to true labels
    
    print('accuracy for',nn,'nearest neighbors:',accuracy_score(labels[-n_test:],preds_centroids))


accuracy for 1 nearest neighbors: 0.08
accuracy for 3 nearest neighbors: 0.12
accuracy for 5 nearest neighbors: 0.08
accuracy for 7 nearest neighbors: 0.1
accuracy for 11 nearest neighbors: 0.12
accuracy for 13 nearest neighbors: 0.14
accuracy for 15 nearest neighbors: 0.14
accuracy for 17 nearest neighbors: 0.14
accuracy for 21 nearest neighbors: 0.1
accuracy for 23 nearest neighbors: 0.08


In [17]:
print('========== performance of TFIDF ==========')

for nn in [1,3,5,7,11,13,15,17,21,23]:
    
    preds_tfidf = []
    for idx,sims in enumerate(my_similarities):
        # sort by decreasing order
        idxs_sorted = np.argsort(sims).tolist()
        ### fill gap ### get labels of 'nn' nearest neighbors. Be cautious about the difference between distance and similarity!
        labels_nn = [labels[:n_train][elt] for elt in idxs_sorted[-nn:]]
        # select most frequent label as prediction
        counts = dict(Counter(labels_nn))
        max_counts = max(list(counts.values()))
        pred = [k for k,v in counts.items() if v==max_counts][0]
        preds_tfidf.append(pred)
    
    # compare predictions to true labels
    
    print('accuracy for',nn,'nearest neighbors:',accuracy_score(labels[-n_test:],preds_tfidf))

accuracy for 1 nearest neighbors: 0.2
accuracy for 3 nearest neighbors: 0.26
accuracy for 5 nearest neighbors: 0.24
accuracy for 7 nearest neighbors: 0.2
accuracy for 11 nearest neighbors: 0.2
accuracy for 13 nearest neighbors: 0.22
accuracy for 15 nearest neighbors: 0.14
accuracy for 17 nearest neighbors: 0.2
accuracy for 21 nearest neighbors: 0.16
accuracy for 23 nearest neighbors: 0.16
