In [15]:
import pickle
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans, KMeans

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split, GridSearchCV
from gensim.models import KeyedVectors

import advanced_processor_chain_factory
from data import preprocess_data, vectorize_data, load_dataset
from w2v_adapter import Word2VecAdapter

def last_layer_activations(model, X):
    acs = [X] + (model.n_layers_ - 1) * [None]
    return model._forward_pass(acs)[model.n_layers_ - 2]

In [2]:
with open('best.pkl', 'rb') as f:
    mlp = pickle.load(f)

In [3]:
dataset = load_dataset()
DEBUG = False

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(
    *preprocess_data(dataset, processor_chain=advanced_processor_chain_factory.create('lem'), debug=DEBUG))

Dask Apply:   0%|          | 0/8 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: 'vectors.kv'

In [5]:
vectorizer = Word2VecAdapter()

In [None]:
vectorizer = Word2VecAdapter(pre_trained_model=KeyedVectors.load('w2v.kv'))

In [6]:
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

Dask Apply:   0%|          | 0/8 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/8 [00:00<?, ?it/s]

# Show Clusters

In [13]:
tsne = PCA(n_components = 2)
X_tsne = tsne.fit_transform(X_train_vec[:1000])

In [None]:
u_labels = np.unique(Y_train[:1000])
for i in u_labels:
    points = X_tsne[Y_train[:1000] == i]
    plt.scatter(points[:,0] , points[:,1] , label = i)
plt.show()

In [31]:
import copy

def evaluate_cluster(model_factory , possible_configs, X_train_vec, X_test_vec, Y_train, Y_test):
    kmeans_summary = pd.DataFrame()
    for config in possible_configs:
        model = model_factory(**config)
        Y_train_pred = model.fit_predict(X_train_vec, Y_train)
        # Y_test_pred = model.predict(X_test_vec)
        train_ant_score = metrics.mutual_info_score(Y_train, Y_train_pred)
        train_rand_score = metrics.rand_score(Y_train, Y_train_pred)
        #test_ant_score = metrics.mutual_info_score(Y_test, Y_test_pred)
        #test_rand_score = metrics.rand_score(Y_test, Y_test_pred)
        ccon = copy.copy(config)
        config['train_mutual_info_score'] = train_ant_score
        config['train_rand_score'] = train_rand_score
        #config['test_mutual_info_score'] = test_ant_score
        #config['test_rand_score'] = test_rand_score
        kmeans_summary = kmeans_summary.append(config, ignore_index=True)
        print(f'Evaluation config {ccon} has been completed.')
    return kmeans_summary

In [32]:
evaluate_cluster(MiniBatchKMeans, [{'n_clusters':2}, {'n_clusters':10}, {'n_clusters':100}, {'n_clusters':1000}], X_train_vec[:4000], X_test_vec[:4000], Y_train[:4000], Y_test[:4000])

Evaluation config {'n_clusters': 2} has been completed.
Evaluation config {'n_clusters': 10} has been completed.
Evaluation config {'n_clusters': 100} has been completed.
Evaluation config {'n_clusters': 1000} has been completed.


Unnamed: 0,n_clusters,train_mutual_info_score,train_rand_score
0,2.0,0.000269,0.500151
1,10.0,0.001929,0.500174
2,100.0,0.013355,0.499982
3,1000.0,0.158764,0.500033


In [None]:
evaluate_cluster(GaussianMixture, [{'n_components':2}, {'n_components':10}, {'n_components':100}, {'n_components':4500}])

In [28]:

from sklearn.cluster import DBSCAN
evaluate_cluster(DBSCAN, [{'eps':0.07, 'min_samples':3, 'metric':'cosine'}], X_train_vec, X_test_vec, Y_train, Y_test)

Evaluation config {'eps': 0.07, 'min_samples': 3, 'metric': 'cosine'} has been completed.


Unnamed: 0,eps,metric,min_samples,train_mutual_info_score,train_rand_score
0,0.07,cosine,3.0,8.5e-05,1e-05
