In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

import advanced_processor_chain_factory
from data import preprocess_data, load_dataset
from evaluation import analysis, cluster_analysis

In [2]:
def last_layer_activations(model, X):
    """Return the activations of the layer just before the model's output layer.

    Builds the activation list expected by ``model._forward_pass`` (the input
    followed by one empty slot per remaining layer), runs the forward pass and
    picks the entry at index ``n_layers_ - 2``.

    Note: ``_forward_pass`` is a private method of the fitted model, so this
    helper is tied to the installed library version.

    Parameters
    ----------
    model : fitted estimator exposing ``n_layers_`` and ``_forward_pass``.
    X : input matrix fed to the first layer.
    """
    layer_count = model.n_layers_
    activations = [X]
    activations.extend(None for _ in range(layer_count - 1))
    forwarded = model._forward_pass(activations)
    return forwarded[layer_count - 2]

In [3]:
# Flag forwarded as the `debug` argument of preprocess_data.
DEBUG = False

# Raw dataset as returned by the project's loader.
dataset = load_dataset()

In [4]:
# Seed the shuffle so the train/test partition, and every metric and plot
# derived from it, is identical on each Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    *preprocess_data(dataset, processor_chain=advanced_processor_chain_factory.create('lem'), debug=DEBUG),
    random_state=42)

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

In [5]:
# Bag-of-words encoder; max_features=2000 caps the vocabulary at 2000 terms.
vectorizer = CountVectorizer(max_features=2000)

In [6]:
# Learn the vocabulary from the training split only: fitting on train + test
# would leak information about the held-out documents into the features.
vectorizer.fit(X_train)
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Classifier whose final 5-unit hidden layer is later read out through
# last_layer_activations as the learned representation.
mlp = MLPClassifier(
    hidden_layer_sizes=(1000, 500, 250, 5),
    activation='tanh',
    warm_start=True,
    # Fixed seed: weight initialisation and batch shuffling are otherwise
    # different on every run, which changes all downstream results.
    random_state=42,
)
mlp.fit(X_train_vec, Y_train)

In [None]:
# Predict on the test matrix, then hand true and predicted labels to the
# project's analysis helper.
Y_test_pred = mlp.predict(X_test_vec)
analysis(Y_test, Y_test_pred)

In [None]:
# Swap the vectorized matrices for the MLP activations returned by
# last_layer_activations. NOTE: this rebinds X_train_vec / X_test_vec, so the
# cell is not idempotent - running it twice feeds activations back into the
# network instead of the vectorizer output.
X_train_vec, X_test_vec = (
    last_layer_activations(mlp, X_train_vec),
    last_layer_activations(mlp, X_test_vec),
)

In [None]:
def plot_clustering(Y, X_train_pca):
    """Scatter 2-D points, one series per distinct label, with a legend.

    Parameters
    ----------
    Y : array-like of shape (n_samples,)
        Label (true class or cluster id) of each point.
    X_train_pca : array-like of shape (n_samples, >= 2)
        Point coordinates; column 0 is drawn on the x axis and column 1 on
        the y axis. Despite the parameter name, any split may be passed.
    """
    # Convert up front so the boolean mask below is element-wise even when a
    # plain list (or a pandas Series) is passed.
    labels = np.asarray(Y)
    coords = np.asarray(X_train_pca)
    unique_labels = np.unique(labels)
    for label in unique_labels:
        members = coords[labels == label]
        plt.scatter(members[:, 0], members[:, 1], label=label)
    # The series are labelled above; without a legend those labels are never
    # shown. Skip it when nothing was drawn to avoid an empty-legend warning.
    if unique_labels.size:
        plt.legend()
    plt.show()

In [None]:
# Fit a 2-component PCA on the training matrix only, then project the test
# matrix with that same fitted transform so both splits share one 2-D space.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_vec)
X_test_pca = pca.transform(X_test_vec)

# Show Clusters

Let's look at something odd. The features the multi-layer perceptron generates for the training data are good enough
even for one-dimensional clustering, but they do not seem to separate our test data well at all.

In [None]:
# Training split: 2-D PCA projection, one scatter series per true label.
plot_clustering(Y_train, X_train_pca)

In [None]:
# Test split: 2-D PCA projection, one scatter series per true label.
plot_clustering(Y_test, X_test_pca)

## (Bonus) Train a Logistic Regression with the New Representation
Because the two-dimensional representation of the transformed test data was a total disaster, we wanted to know whether
this data is predictable at all by a model trained on our transformed training data. So we decided to measure
test performance of a Logistic Regression model trained on the transformed training data.
We report train and test performance with the conventional accuracy metric and with two clustering measures that are
also used in the next part.

In [None]:
from sklearn.linear_model import LogisticRegression

# Linear probe: fit a logistic regression on the training representation and
# report the same three scores for the training and the test split.
lr = LogisticRegression()
lr.fit(X_train_vec, Y_train)
for name, y, x in [('Training', Y_train, X_train_vec), ('Testing', Y_test, X_test_vec)]:
    # Predict once per split and reuse the result for all three metrics.
    y_pred = lr.predict(x)
    print('%s Accuracy: %f' % (name, metrics.accuracy_score(y, y_pred)))
    print('%s Rand Score: %f' % (name, metrics.rand_score(y, y_pred)))
    print('%s Fowlkes Mallows Score: %f' % (name, metrics.fowlkes_mallows_score(y, y_pred)))

# Agglomerative Clustering

In [None]:
# Agglomerative clustering with 2 and 5 clusters on the training matrix.
# The true labels are deliberately not passed: the estimator is unsupervised
# and ignores `y`, so supplying Y_train only suggested supervision.
agg2 = AgglomerativeClustering(n_clusters=2)
agg5 = AgglomerativeClustering(n_clusters=5)
Y_agg2 = agg2.fit_predict(X_train_vec)
Y_agg5 = agg5.fit_predict(X_train_vec)

## Number of Clusters = 2

In [None]:
# Training points in PCA space, grouped by the 2-cluster agglomerative assignment.
plot_clustering(Y_agg2, X_train_pca)

## Number of Clusters = 5

In [None]:
# Training points in PCA space, grouped by the 5-cluster agglomerative assignment.
plot_clustering(Y_agg5, X_train_pca)

Training Loss: (This approach can't be used for unseen data)

In [None]:
# Pass the true training labels and the 2-cluster agglomerative assignment
# to the project's cluster_analysis helper.
cluster_analysis(Y_train, Y_agg2)

# KMeans Clustering

In [None]:
# K-means with 2 and 5 clusters on the training matrix. The seed is pinned
# because the centroid initialisation is random, so cluster assignments (and
# their arbitrary ids) would otherwise change between runs. Labels are not
# passed: KMeans is unsupervised and ignores `y`.
kmeans2 = KMeans(n_clusters=2, random_state=42)
kmeans5 = KMeans(n_clusters=5, random_state=42)
Y_kmeans2 = kmeans2.fit_predict(X_train_vec)
Y_kmeans5 = kmeans5.fit_predict(X_train_vec)

## Number of Clusters = 2

In [None]:
# Training points in PCA space, grouped by the 2-cluster k-means assignment.
plot_clustering(Y_kmeans2, X_train_pca)

## Number of Clusters = 5

In [None]:
# Training points in PCA space, grouped by the 5-cluster k-means assignment.
plot_clustering(Y_kmeans5, X_train_pca)

## Performance on Training Data

In [None]:
# Pass the true training labels and the 2-cluster k-means assignment to the
# project's cluster_analysis helper.
cluster_analysis(Y_train, Y_kmeans2)

## Performance on Test Data

In [None]:
# Assign every test sample to one of the two fitted k-means clusters, then
# pass true labels and assignments to the project's cluster_analysis helper.
Y_test_kmeans2 = kmeans2.predict(X_test_vec)
cluster_analysis(Y_test, Y_test_kmeans2)

# GMM Clustering

## Number of Clusters = 2

In [None]:
# Two-component Gaussian mixture on the training matrix. The seed is pinned
# because the parameter initialisation is random; labels are not passed since
# the mixture model is unsupervised and ignores `y`.
gmm2 = GaussianMixture(n_components=2, random_state=42)
Y_gmm2 = gmm2.fit_predict(X_train_vec)
plot_clustering(Y_gmm2, X_train_pca)

## Number of Clusters = 5

In [None]:
# Five-component Gaussian mixture on the training matrix. The seed is pinned
# because the parameter initialisation is random; labels are not passed since
# the mixture model is unsupervised and ignores `y`.
gmm5 = GaussianMixture(n_components=5, random_state=42)
Y_gmm5 = gmm5.fit_predict(X_train_vec)
plot_clustering(Y_gmm5, X_train_pca)

## Performance on Training Data

In [None]:
# Pass the true training labels and the 2-component mixture assignment to
# the project's cluster_analysis helper.
cluster_analysis(Y_train, Y_gmm2)

## Performance on Test Data

In [None]:
# Assign every test sample to one of the two fitted mixture components, then
# pass true labels and assignments to the project's cluster_analysis helper.
Y_test_gmm2 = gmm2.predict(X_test_vec)
cluster_analysis(Y_test, Y_test_gmm2)

# Cluster Member's Similarity in Action

In [None]:
# Three-cluster k-means used for the qualitative inspection in the next cell.
# Seeded so the cluster ids printed there refer to the same groups on every run.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_train_vec)
Y_train_pred = kmeans.predict(X_train_vec)

In [None]:
# For each k-means cluster: draw its points in PCA space and print up to five
# of its X_train entries so cluster members can be compared by eye.
for cluster_id in range(kmeans.n_clusters):  # follows the fitted model instead of a hard-coded 3
    in_cluster = Y_train_pred == cluster_id
    print(f'Cluster {cluster_id}:')
    points = X_train_pca[in_cluster]
    plt.scatter(points[:, 0], points[:, 1], label=cluster_id)
    # Separate counter name so the sample index can never clobber the cluster id.
    for rank, item in enumerate(X_train[in_cluster].head(5)):
        print(f'{rank}\t{item}')
plt.legend()
plt.show()

# Fine-Tuning

## Without pre-training

In [None]:
from google_drive_downloader import GoogleDriveDownloader as gdd

# Fetch the second dataset from Google Drive into a local CSV, then load it.
dataset2_path = './dataset2.csv'
gdd.download_file_from_google_drive(
    file_id='1uykBJxWH5v5BsSuuwM0r9WLiKWQrDiDJ',
    dest_path=dataset2_path,
)
dataset2 = pd.read_csv(dataset2_path)

In [None]:
# Same preprocessing chain as for the first dataset. The split is seeded so
# both fine-tuning experiments below are evaluated on the same held-out rows
# on every run, and it honours the notebook-wide DEBUG flag.
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
    *preprocess_data(dataset2, processor_chain=advanced_processor_chain_factory.create('lem'), debug=DEBUG),
    random_state=42)

In [None]:
# Fresh bag-of-words encoder for the second dataset. The vocabulary is learned
# from the training split only so nothing from the held-out split leaks into
# the features.
vectorizer2 = CountVectorizer(max_features=2000)
vectorizer2.fit(X_train2)
X_train_vec2 = vectorizer2.transform(X_train2)
X_test_vec2 = vectorizer2.transform(X_test2)

In [None]:
# Baseline: the same architecture as `mlp`, trained from scratch on the second
# dataset only. Seeded so the comparison with the pre-trained network does not
# depend on a lucky or unlucky initialisation.
model = MLPClassifier(hidden_layer_sizes=(1000, 500, 250, 5), activation='tanh', random_state=42)
model.fit(X_train_vec2, Y_train2)
analysis(Y_test2, model.predict(X_test_vec2))

## With pre-training

In [None]:
# Re-encode the second dataset with the FIRST vectorizer so the feature
# columns match the ones `mlp` was trained on. This rebinds X_train_vec2 /
# X_test_vec2, replacing the matrices produced by vectorizer2.
X_train_vec2, X_test_vec2 = (
    vectorizer.transform(X_train2),
    vectorizer.transform(X_test2),
)

In [None]:
# Continue training the already-fitted `mlp` on the second dataset, then
# evaluate it on that dataset's test split.
# NOTE(review): per scikit-learn's docs partial_fit performs a single
# iteration over the data, whereas the baseline above uses a full fit() -
# confirm this asymmetry is intended.
mlp.partial_fit(X_train_vec2, Y_train2)
Y_test2_pred = mlp.predict(X_test_vec2)
analysis(Y_test2, Y_test2_pred)