In [11]:
import wikipedia as wp
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from collections import Counter
from operator import concat
from functools import reduce #python 3
from sklearn import svm
from gensim.models import KeyedVectors

# Pilot data (10 classes 100 articles per class)

## Data Re-formatting and Splitting

In [110]:
data = np.load('./data.npz')

In [119]:
X = data['X'].tolist()
y = data['y']

In [128]:
X = data['X']

In [130]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Preprocessing

In [13]:
def preprocess_documents_matrix(docs_mat):
    # paritally adopted from gensim documentation
    
    from string import punctuation
    punctuation = set(punctuation)

    docs_list = docs_mat.tolist()
    
    # remove common words and tokenize
    stoplist = set('for a of the and to in'.split())

    texts = [[''.join(ch for ch in word if ch not in punctuation)
              for word in (document.lower().split())
               if word not in stoplist]
              for document in docs_list]

    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    return texts

In [155]:
X_train_cleaned = preprocess_documents_matrix(X_train)

## Creating the Vocabulary

In [167]:
words = Counter()
words.update(reduce(concat, X_train_cleaned))
freq = words.most_common(2000)


In [174]:
vocab = [item[0] for item in freq]

## Creating the Docs-Vocabulary Matrix

In [36]:
def docs_vocab_freq_matrix(docs, vocab):
    from collections import Counter

    docs_freq_distribution = []
    for doc in docs:
        word_freq = Counter()
        word_freq.update(doc)
        doc_freq_dist = [word_freq[word] for word in vocab]
        docs_freq_distribution.append(doc_freq_dist)
    return docs_freq_distribution

In [199]:
docs_freq_dist = docs_vocab_freq_matrix(X_train_cleaned, vocab)

In [204]:
docs_freq_dist = np.array(docs_freq_dist)

## Logistic Regression

In [205]:
clf = LogisticRegression()
clf.fit(docs_freq_dist, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [206]:
X_test_cleaned = preprocess_documents_matrix(X_test)
test_docs_freq_dist = docs_vocab_freq_matrix(X_test_cleaned, vocab)
test_docs_freq_dist = np.array(test_docs_freq_dist)

In [215]:
clf.score(test_docs_freq_dist, y_test)

0.76000000000000001

## Saving test/train splits for comparison

In [220]:
np.savez_compressed('./pilot_data_test_train_split', 
                   X_train = docs_freq_dist, 
                   X_test = test_docs_freq_dist,
                   y_train = y_train,
                   y_test = y_test)

## Loading

In [4]:
data = np.load('./pilot_data_test_train_split.npz')
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']

## PCA

In [19]:
def compress_reconstruct_with_PCA(X, n_components=150):
    from sklearn.decomposition import PCA
    pca = PCA(n_components=n_components)
    pca.fit(X)
    X_mean = np.mean(X, axis=0)
    reconstructed_X = np.dot(pca.transform(X)[:,:n_components], 
                             pca.components_[:n_components,:]) + X
    return reconstructed_X

In [24]:
reconstructed_X_train = compress_reconstruct_with_PCA(X_train)

In [26]:
clf_reconstructed = LogisticRegression()
clf_reconstructed.fit(reconstructed_X_train, y_train)

reconstructed_X_test = compress_reconstruct_with_PCA(X_test)
PCA_reconstruction_score = clf_reconstructed.score(reconstructed_X_test, y_test)
print("=== PCA Reconstructed Score: {}".format(PCA_reconstruction_score))

=== PCA Reconstructed Score: 0.6966666666666667


# Full Data

## Preprocessing to make the splits

In [43]:
# Loading raw scrapped data
data = np.load('./data_10class_1000perClass_2.npz')
X = data['X']
y = data['y']

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Preprocessing and cleaning
X_train_cleaned = preprocess_documents_matrix(X_train)

# Building the vocabulary
words = Counter()
words.update(reduce(concat, X_train_cleaned))
freq = words.most_common(2000)
vocab = [item[0] for item in freq]

# Creating the Docs-Vocabulary Matrix
docs_freq_dist = docs_vocab_freq_matrix(X_train_cleaned, vocab)
docs_freq_dist = np.array(docs_freq_dist)
X_test_cleaned = preprocess_documents_matrix(X_test)
test_docs_freq_dist = docs_vocab_freq_matrix(X_test_cleaned, vocab)
test_docs_freq_dist = np.array(test_docs_freq_dist)


# Saving splits
np.savez_compressed('./full_data_test_train_split', 
                   X_train = docs_freq_dist, 
                   X_test = test_docs_freq_dist,
                   y_train = y_train,
                   y_test = y_test)

## Loading

In [48]:
data = np.load('./full_data_test_train_split.npz')
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']

## PCA on full data

In [49]:
reconstructed_X_train = compress_reconstruct_with_PCA(X_train)

In [50]:
clf_reconstructed = LogisticRegression()
clf_reconstructed.fit(reconstructed_X_train, y_train)

reconstructed_X_test = compress_reconstruct_with_PCA(X_test)
PCA_reconstruction_score = clf_reconstructed.score(reconstructed_X_test, y_test)
print("=== PCA Reconstructed Score: {}".format(PCA_reconstruction_score))

=== PCA Reconstructed Score: 0.6773333333333333


In [46]:
len(X_train)

7000

## SVM

In [56]:
clf_reconstructed_svm = svm.LinearSVC()
clf_reconstructed_svm.fit(reconstructed_X_train, y_train)

reconstructed_X_test = compress_reconstruct_with_PCA(X_test)
PCA_reconstruction_score = clf_reconstructed_svm.score(reconstructed_X_test, y_test)
print("=== PCA Reconstructed Score (SVM): {}".format(PCA_reconstruction_score))

=== PCA Reconstructed Score (SVM): 0.624


# Using word embeddings

Training with most common word distributions did not turn out so well. Even on the full dataset, I got around 16% accuracy which is barely higher than the random classifier (10%) on this task.

Upon inspections of the model, researching the community and reviewing the literature I found two main reasons for this:

1. Weight initializations
2. Text-data pre-processing

My initial input pre-processing was based on the experiments section of [this article](https://arxiv.org/abs/1705.02033), which uses the 2000 most frequent words as vocabulary (of course after removal of stop words and punctuation). However, given the poor performance of the end-to-end system using those features, I decided to use word-embeddings which boosted the classifier performance to 54%.

Weight initialization has always been an issue since the inception of autoencoders. So much so that in a famous paper in [Science](https://www.cs.toronto.edu/~hinton/science.pdf), Hinton et al. discuss using a Restricted Boltzmann machine precisely to find good initialization for the weights.

Given the recent progress in deep neural networks training, specifically that of [Batch Norm](https://arxiv.org/abs/1502.03167), I decided to add batch norm layers after each linear layer, because they are known to reduce the effect of bad initialization in the training of the network.

## Loading the word embeddings

In [6]:
en_model = KeyedVectors.load_word2vec_format('./wiki.en/wiki.en.vec')

In [65]:
def docs_embeddings_mat(X, y):
    docs_embeddings_list = []
    docs_labels_list = []
    for doc_ind, doc in enumerate(X):
        word_embeddings_list = []
        for word in doc:
            try:
                word_embedding = en_model.get_vector(word)
                word_embeddings_list.append(word_embedding)
            except KeyError:
                continue
        try: 
            doc_embedding = np.mean(word_embeddings_list, axis=0)
            docs_embeddings_list.append(doc_embedding)
            docs_labels_list.append(y[doc_ind])
        except FloatingPointError:
            continue
    docs_embeddings_mat = np.vstack(docs_embeddings_list)
    docs_labels = np.hstack(docs_labels_list)
    return docs_embeddings_mat, docs_labels

In [67]:
np.seterr(all='raise')

# Loading raw scrapped data
data = np.load('./data_10class_1000perClass_2.npz')
X = data['X']
y = data['y']

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Preprocessing and cleaning
X_train_cleaned = preprocess_documents_matrix(X_train)
X_test_cleaned = preprocess_documents_matrix(X_test)

# Creating the Docs-Embeddings Matrix
train_docs_embeddings_mat, train_labels = docs_embeddings_mat(X_train_cleaned, y_train)
test_docs_embeddings_mat, test_labels = docs_embeddings_mat(X_test_cleaned, y_test)

# Saving splits
np.savez_compressed('./full_data_test_train_split_with_embeddings', 
                   X_train = train_docs_embeddings_mat, 
                   X_test = test_docs_embeddings_mat,
                   y_train = train_labels,
                   y_test = test_labels)


  out=out, **kwargs)


In [69]:
train_labels.shape

(6988,)

## Training with reconstructed features

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np

In [23]:
data = np.load('./reconstructed_train.npz')
train_reconstructed_features = data['train_reconstructed_features']
train_labels = data['train_labels']

In [24]:
clf_reconstructed = LogisticRegression(verbose=True, solver='sag', n_jobs=4, tol=0.1)

In [25]:
clf_reconstructed.fit(train_reconstructed_features, train_labels)

convergence after 5 epochs took 378 seconds
convergence after 6 epochs took 452 seconds
convergence after 6 epochs took 503 seconds
convergence after 7 epochs took 524 seconds
convergence after 6 epochs took 463 seconds
convergence after 5 epochs took 380 seconds
convergence after 5 epochs took 413 seconds
convergence after 7 epochs took 483 seconds
convergence after 5 epochs took 114 seconds
convergence after 5 epochs took 56 seconds


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed: 16.0min finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=4,
          penalty='l2', random_state=None, solver='sag', tol=0.1,
          verbose=True, warm_start=False)

In [26]:
data_test = np.load('./reconstructed_test.npz')
reconstructed_test_features = data_test['test_reconstructed_features']
test_labels = data_test['test_labels']

In [27]:
reconstructed_score = clf_reconstructed.score(reconstructed_test_features, test_labels)
print("=== Reconstructed Score: {}".format(reconstructed_score))

=== Reconstructed Score: 0.6645548516172057


## Baseline (with embeddings): Original Signal


In [14]:
# Loading the splits
data = np.load('./full_data_test_train_split_with_embeddings.npz')
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']

In [15]:
clf_original = LogisticRegression(verbose=True, solver='sag', n_jobs=4, tol=0.1)

In [16]:
clf_original.fit(X_train, y_train)

convergence after 4 epochs took 1 seconds
convergence after 4 epochs took 1 seconds
convergence after 4 epochs took 1 seconds
convergence after 5 epochs took 2 seconds
convergence after 5 epochs took 2 seconds
convergence after 5 epochs took 2 seconds
convergence after 5 epochs took 2 seconds
convergence after 4 epochs took 1 seconds
convergence after 4 epochs took 0 seconds
convergence after 5 epochs took 0 seconds


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    3.0s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=4,
          penalty='l2', random_state=None, solver='sag', tol=0.1,
          verbose=True, warm_start=False)

In [17]:
original_score = clf_original.score(X_test, y_test)
print("=== Original Score (with embeddingS): {}".format(original_score))

=== Original Score (with embeddingS): 0.7342447482494164


## Baseline (with embeddings): 50-PCA

In [20]:
X_train_pca_reconstructed = compress_reconstruct_with_PCA(X_train, n_components=50)
X_test_pca_reconstructed = compress_reconstruct_with_PCA(X_test, n_components=50)

In [21]:
clf_pca_reconstructed = LogisticRegression(verbose=True, solver='sag', n_jobs=4, tol=0.1)
clf_pca_reconstructed.fit(X_train_pca_reconstructed, y_train)

convergence after 4 epochs took 1 seconds
convergence after 4 epochs took 1 seconds
convergence after 5 epochs took 1 seconds
convergence after 5 epochs took 1 seconds
convergence after 4 epochs took 1 seconds
convergence after 5 epochs took 2 seconds
convergence after 4 epochs took 2 seconds
convergence after 5 epochs took 2 seconds
convergence after 5 epochs took 1 seconds
convergence after 5 epochs took 0 seconds


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    3.0s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=4,
          penalty='l2', random_state=None, solver='sag', tol=0.1,
          verbose=True, warm_start=False)

In [22]:
pca_reconstruction_score = clf_pca_reconstructed.score(X_test_pca_reconstructed, y_test)
print("=== PCA Reconstructed Score (with embeddings): {}".format(pca_reconstruction_score))

=== PCA Reconstructed Score (with embeddings): 0.7262420806935646
