In [None]:
%matplotlib inline

import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN

In [None]:
def get_papers(pattern='data/*.txt', min_len=5000, max_len=80000):
    """Index the paper text files and keep those whose length is in range.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the paper files (default ``'data/*.txt'``).
    min_len, max_len : int
        Inclusive bounds on the number of characters in the first line of a
        file; files outside ``[min_len, max_len]`` are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``paper`` (file path) and ``len`` (character count of the
        file's first line). The index holds each file's position in the
        sorted listing, so it has gaps where files were filtered out.
    """
    paths = sorted(glob.glob(pattern))

    lengths = []
    for path in paths:
        with open(path, 'r') as f:
            # readline() returns '' for an empty file, so an empty paper gets
            # length 0 and is filtered out below instead of raising IndexError.
            lengths.append(len(f.readline()))

    # Build the frame in one go (integer 'len' column) rather than assigning
    # rows one at a time into a pre-allocated object-dtype frame.
    df = pd.DataFrame({'paper': paths, 'len': pd.Series(lengths, dtype='int64')})
    df = df[(df['len'] >= min_len) & (df['len'] <= max_len)]

    return df

In [None]:
def build_bag_of_words(df, max_df = 0.7, min_df = 0.3):
    """Fit a TF-IDF vectorizer on the files listed in ``df.paper``.

    ``max_df`` and ``min_df`` are forwarded unchanged to ``TfidfVectorizer``.
    The vectorizer is created with ``input='filename'``, so it reads each
    document from disk itself.

    Returns the matrix produced by ``fit_transform``.
    """
    paper_paths = df['paper'].tolist()
    vectorizer = TfidfVectorizer(input='filename', min_df=min_df, max_df=max_df)
    tfidf_matrix = vectorizer.fit_transform(paper_paths)

    return tfidf_matrix

In [None]:
def apply_lsa(X_freq, n_components = 15):
    """Project ``X_freq`` onto ``n_components`` dimensions with TruncatedSVD.

    The decomposition is seeded with ``random_state=0``. Returns the output of
    ``fit_transform`` on ``X_freq``.
    """
    svd = TruncatedSVD(random_state=0, n_components=n_components)
    reduced = svd.fit_transform(X_freq)

    return reduced

In [None]:
def apply_dimensionality_reduction(X_lsa, n_components=2, random_state=0):
    """Embed ``X_lsa`` into a low-dimensional space with t-SNE.

    Parameters
    ----------
    X_lsa : array-like
        Input features, passed straight to ``TSNE.fit_transform``.
    n_components : int
        Dimensionality of the embedding (default 2).
    random_state : int or None
        Seed forwarded to ``TSNE``. Defaults to 0 so the embedding is
        reproducible between runs, like the seeded TruncatedSVD and KMeans
        steps of this notebook. Pass ``None`` for an unseeded embedding.

    Returns
    -------
    The embedding returned by ``TSNE.fit_transform``.
    """
    tsne = TSNE(n_components=n_components, random_state=random_state)
    X_embedded = tsne.fit_transform(X_lsa)

    return X_embedded

In [None]:
def kmeans_clustering(X_embedded, k = 20):
    """Cluster ``X_embedded`` into ``k`` groups with KMeans (``random_state=0``).

    Returns a ``(model, score)`` pair: the fitted KMeans estimator and the
    silhouette score of its labels computed on ``X_embedded``.
    """
    model = KMeans(random_state=0, n_clusters=k)
    model.fit(X_embedded)
    score = silhouette_score(X_embedded, model.labels_)

    return model, score

In [None]:
def dba_clustering(X_embedded, eps = 1.5, min_samples = 5):
    """Cluster ``X_embedded`` with DBSCAN and score the result.

    Parameters
    ----------
    X_embedded : array-like
        Points to cluster.
    eps, min_samples
        Forwarded unchanged to ``DBSCAN``.

    Returns
    -------
    (dbscan, shil)
        The fitted DBSCAN estimator and the silhouette score of its labels on
        ``X_embedded``. ``shil`` is -1 when the score is undefined, i.e. when
        the labelling has a single distinct label or as many distinct labels
        as samples.

    Notes
    -----
    The labels are scored exactly as DBSCAN produced them, so points labelled
    -1 are treated by the silhouette computation as one more group.
    """
    dbscan = DBSCAN(eps = eps, min_samples = min_samples).fit(X_embedded)
    n_labels = len(np.unique(dbscan.labels_))
    n_samples = len(dbscan.labels_)
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises
    # ValueError otherwise; fall back to the -1 sentinel in both bad cases.
    if 1 < n_labels < n_samples:
        shil = silhouette_score(X_embedded, dbscan.labels_)
    else:
        shil = -1

    return dbscan, shil

In [None]:
def plot_kmeans(X_embedded, kmeans):
    """Scatter the embedded points coloured by their KMeans label.

    Each cluster centre is annotated with its index in
    ``kmeans.cluster_centers_``. The figure is sized 16 x 16 inches.
    """
    fig, ax = plt.subplots()
    xs, ys = X_embedded[:, 0], X_embedded[:, 1]
    ax.scatter(xs, ys, c=kmeans.labels_, cmap="gist_rainbow")

    for cluster_id, centre in enumerate(kmeans.cluster_centers_):
        # ax is the current axes right after plt.subplots(), so annotating it
        # directly targets the same axes as the pyplot-level call would.
        ax.annotate(
            str(cluster_id),
            xy=(centre[0], centre[1]),
            xytext=(0, 0),
            textcoords='offset points',
            ha='center',
            va='center',
            bbox=dict(boxstyle='round,pad=0.5', fc='white', alpha=0.5),
        )

    fig.set_size_inches(16, 16)

In [None]:
def plot_dbscan(X_embedded, dbscan):
    """Scatter the embedded points coloured by their DBSCAN label.

    Every label other than -1 is annotated at the mean position of the points
    carrying it. The figure is sized 16 x 16 inches.
    """
    fig, ax = plt.subplots()
    ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=dbscan.labels_, cmap="gist_rainbow")

    for label in np.unique(dbscan.labels_).tolist():
        if label == -1:
            # No annotation for the -1 label.
            continue
        members = dbscan.labels_ == label
        centre_x = np.mean(X_embedded[members, 0])
        centre_y = np.mean(X_embedded[members, 1])
        # ax is the current axes right after plt.subplots(), so annotating it
        # directly targets the same axes as the pyplot-level call would.
        ax.annotate(
            str(label),
            xy=(centre_x, centre_y),
            xytext=(0, 0),
            textcoords='offset points',
            ha='center',
            va='center',
            bbox=dict(boxstyle='round,pad=0.5', fc='white', alpha=0.5),
        )

    fig.set_size_inches(16, 16)

In [None]:
# Smoke test: run every pipeline stage once with fixed hyper-parameters,
# print both silhouette scores and draw both clusterings.
df = get_papers()
X_freq = build_bag_of_words(df, min_df=0.3, max_df=0.7)
X_lsa = apply_lsa(X_freq, n_components=15)
X_embedded = apply_dimensionality_reduction(X_lsa)

kmeans, shil_kmeans = kmeans_clustering(X_embedded, k=20)
dbscan, shil_dbscan = dba_clustering(X_embedded, min_samples=5, eps=1.5)

# !s forces str(), matching plain string concatenation of the scores.
print(f'Shilouette score kmeans: {shil_kmeans!s}')
print(f'Shilouette score dbscan: {shil_dbscan!s}')

plot_kmeans(X_embedded, kmeans)
plot_dbscan(X_embedded, dbscan)

In [None]:
# Model selection's grid search discretisation and ranges
# Starting from a coarse grid
#
# The float grids use np.linspace with an explicit point count rather than
# np.arange with a float step: np.arange(0.7, 1, 0.1) returns FOUR values
# (0.7, 0.8, 0.9 and ~1.0) because (1 - 0.7) / 0.1 evaluates to
# 3.0000000000000004, which would add an unintended max_df ~= 1.0 to the grid.
max_df_values = np.linspace(0.7, 0.9, 3)   # 0.7, 0.8, 0.9
min_df_values = np.linspace(0.0, 0.2, 3)   # 0.0, 0.1, 0.2
n_components_values = range(5, 50)

# Number of clusters tried by k-means
k_values = range(2, 50)

# DBSCAN neighbourhood radius and minimum neighbourhood size
eps_values = np.arange(0.01, 3, 0.05)
min_samples_values = range(3,50)

In [None]:
# K-means model selection
# Cartesian product of the four hyper-parameter ranges, as
# (max_df, min_df, n_components, k) tuples with k varying fastest.
grid_search = [
    (df_upper, df_lower, n_dims, n_clusters)
    for df_upper in max_df_values
    for df_lower in min_df_values
    for n_dims in n_components_values
    for n_clusters in k_values
]
print(len(grid_search))

In [None]:
# Exhaustive k-means search: keep the model with the highest silhouette score.
df = get_papers()

best_shil = -1
best_model = None
best_params = None  # (max_df, min_df, n_components, k) that produced best_model

# TF-IDF, LSA and t-SNE depend only on (max_df, min_df, n_components), not on
# k. The embedding is therefore rebuilt only when those three values differ
# from the previous grid point, instead of once per value of k.
embedding_key = None

for max_df, min_df, n_components, k in tqdm(grid_search):
    if (max_df, min_df, n_components) != embedding_key:
        X_freq = build_bag_of_words(df, max_df = max_df, min_df = min_df)
        X_lsa = apply_lsa(X_freq, n_components = n_components)
        X_embedded = apply_dimensionality_reduction(X_lsa)
        embedding_key = (max_df, min_df, n_components)

    kmeans, shil_kmeans = kmeans_clustering(X_embedded, k = k)

    if shil_kmeans > best_shil:
        best_shil = shil_kmeans
        best_model = kmeans
        # Remember which hyper-parameters won, not just the fitted model.
        best_params = (max_df, min_df, n_components, k)