In [None]:
%matplotlib inline

import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN

In this notebook I try out the individual building blocks that I will later combine into a pipeline for categorising the CVPR 2018 papers. Each paper is represented with a bag-of-words model.

## Bag of words model

In [None]:
# Selecting the papers to process (see notebook 2).
# A paper is kept when the first line of its text file is between 5,000 and
# 80,000 characters long (both bounds included).
papers = sorted(glob.glob('data/*.txt'))

records = []
for paper in papers:
    # utf-8 is the encoding TfidfVectorizer uses by default when it re-reads
    # these same files, so measure the length under the same decoding.
    with open(paper, 'r', encoding='utf-8') as f:
        # readline() returns '' for an empty file (length 0, filtered out below)
        # instead of failing on a missing first line.
        first_line = f.readline()
    records.append((paper, len(first_line)))

# Build the frame in one go so that 'len' is a real integer column rather than
# an object column filled cell by cell.
df = pd.DataFrame(records, columns=['paper', 'len']).astype({'len': 'int64'})
df = df[df['len'].between(5000, 80000)]

In [None]:
# Replace the full glob listing with the paths that are still present in df.
papers = df['paper'].tolist()

In [None]:
# TF-IDF bag of words. The vectorizer reads each file itself (input='filename').
# Terms appearing in more than 70% or in fewer than 30% of the papers are dropped.
count_vect = TfidfVectorizer(
    input='filename',
    max_df=0.7,
    min_df=0.3,
)
X_freq = count_vect.fit_transform(papers)
X_freq.shape

In [None]:
# Inspect the fitted vocabulary: a mapping from each retained term to its
# column index in X_freq.
count_vect.vocabulary_

In [None]:
# Peek at a single TF-IDF weight: first paper (row 0), second vocabulary term (column 1).
X_freq[0,1]

## LSA

In [None]:
# Latent semantic analysis: project the TF-IDF matrix onto 15 SVD components.
X_lsa = (
    TruncatedSVD(n_components=15, random_state=0)
    .fit_transform(X_freq)
)

## Paper similarity

In [None]:
# Embed the LSA vectors in 2-D with t-SNE.
# t-SNE is stochastic: fix the seed so the embedding (and every clustering
# computed on it further down) is reproducible across runs, in line with the
# seeded TruncatedSVD and KMeans steps of this notebook.
X_embedded = TSNE(n_components=2, random_state=0).fit_transform(X_lsa)

In [None]:
# Scatter plot of the 2-D embedding, one point per paper, on a 16x16 inch figure.
fig, ax = plt.subplots()
ax.scatter(X_embedded[:, 0], X_embedded[:, 1])
fig.set_size_inches(16, 16)

## Clustering

In [None]:
# Model selection
RANGE_K = range(2, 100)
best_kmeans = None
max_shil = -1
shils = []
for k in tqdm(RANGE_K):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X_embedded)
    shil = silhouette_score(X_embedded, kmeans.labels_)
    shils.append(shil)
    if shil > max_shil:
        max_shil = shil
        best_kmeans = kmeans
kmeans = best_kmeans

In [None]:
# Silhouette score as a function of the number of clusters tried above.
fig, ax = plt.subplots()
ax.plot(RANGE_K, shils)
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Mean silhouette score')
ax.set_title('KMeans model selection');

In [None]:
# Embedding coloured by KMeans cluster, with each cluster id drawn on its centroid.
fig, ax = plt.subplots()
ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=kmeans.labels_, cmap="gist_rainbow")
for cluster_id, centroid in enumerate(kmeans.cluster_centers_):
    ax.annotate(
        str(cluster_id),
        xy=(centroid[0], centroid[1]), xytext=(0, 0),
        textcoords='offset points', ha='center', va='center',
        bbox=dict(boxstyle='round,pad=0.5', fc='white', alpha=0.5))
fig.set_size_inches(16, 16)

In [None]:
# List the papers that fall in each KMeans cluster.
paper_names = np.array(papers)
for label in np.unique(kmeans.labels_):
    print('CLUSTER ' + str(label))
    members = paper_names[kmeans.labels_ == label].tolist()
    for name in members:
        # Show the bare file name, without the data directory prefix.
        print('   ' + name.replace('data/', ''))
    print('=====================')

In [None]:
# Overall clustering score: mean silhouette over all papers for the selected
# KMeans model (computed on the 2-D embedding, like the model selection above).
silhouette_score(X_embedded, kmeans.labels_)

In [None]:
# Silhouette score per sample.
# Used to calculate the mean silhouette score for each cluster.
scores = silhouette_samples(X_embedded, kmeans.labels_)
df_scores = df.copy()
# Make sure the text length is numeric before averaging it.
df_scores['len'] = df_scores['len'].astype('int64')
# NumPy arrays are assigned positionally, so rows line up with the papers even
# though df keeps the index gaps left by the length filter.
df_scores['label'] = kmeans.labels_
df_scores['shil'] = scores
# Average only the numeric columns: including the string column 'paper' makes
# mean() raise a TypeError on pandas >= 2.0.
df_scores.groupby('label')[['len', 'shil']].mean().sort_values(by='shil', ascending=False)

## DBSCAN

In [None]:
# Model selection: exhaustive grid search over eps and min_samples, keeping the
# DBSCAN model with the highest mean silhouette score.
RANGE_EPS = np.arange(0.01, 5, 0.01)
RANGE_MIN_SAMPLES = range(3,50)
parameters = [[eps, min_samples]
              for eps in RANGE_EPS
              for min_samples in RANGE_MIN_SAMPLES]

best_dbscan = None
max_shil = -1
shils = []  # one score per entry of `parameters`; -1 when no score is defined
for eps, min_samples in tqdm(parameters):
    candidate = DBSCAN(eps=eps, min_samples=min_samples).fit(X_embedded)
    # The silhouette is only defined for at least two distinct labels.
    # NOTE(review): the noise label -1 is passed to silhouette_score like any
    # other label, so noise points are scored as if they formed a cluster.
    if len(np.unique(candidate.labels_)) > 1:
        shil = silhouette_score(X_embedded, candidate.labels_)
        if shil > max_shil:
            max_shil = shil
            best_dbscan = candidate
    else:
        shil = -1
    shils.append(shil)
if best_dbscan is None:
    # Fail here with a clear message instead of leaving `dbscan` set to None.
    raise ValueError(
        'No (eps, min_samples) combination produced more than one DBSCAN label')
dbscan = best_dbscan

In [None]:
# Embedding coloured by DBSCAN label, with each cluster id drawn at the mean
# position of its members.
fig, ax = plt.subplots()
ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=dbscan.labels_, cmap="gist_rainbow")

labels = np.unique(dbscan.labels_).tolist()

for label in labels:
    if label == -1:
        # -1 is DBSCAN's noise label: no annotation for it.
        continue
    members = dbscan.labels_ == label
    center = [np.mean(X_embedded[members, 0]),
              np.mean(X_embedded[members, 1])]
    ax.annotate(
        str(label),
        xy=(center[0], center[1]), xytext=(0, 0),
        textcoords='offset points', ha='center', va='center',
        bbox=dict(boxstyle='round,pad=0.5', fc='white', alpha=0.5))
fig.set_size_inches(16, 16)

In [None]:
# Overall clustering score: mean silhouette over all papers for the selected
# DBSCAN model. Every label in dbscan.labels_ is passed in as-is.
silhouette_score(X_embedded, dbscan.labels_)

In [None]:
# Silhouette score per sample.
# Used to calculate the mean silhouette score for each DBSCAN label.
scores = silhouette_samples(X_embedded, dbscan.labels_)
df_scores = df.copy()
# Make sure the text length is numeric before averaging it.
df_scores['len'] = df_scores['len'].astype('int64')
# NumPy arrays are assigned positionally, so rows line up with the papers even
# though df keeps the index gaps left by the length filter.
df_scores['label'] = dbscan.labels_
df_scores['shil'] = scores
# Average only the numeric columns: including the string column 'paper' makes
# mean() raise a TypeError on pandas >= 2.0.
df_scores.groupby('label')[['len', 'shil']].mean().sort_values(by='shil', ascending=False)

## Conclusion

I am ready to put all of this together into a pipeline in order to apply model selection. These are the parameters I have to deal with:

- min_df and max_df in TfidfVectorizer
- n_components in TruncatedSVD
- n_clusters in KMeans, or eps and min_samples in DBSCAN