In [None]:
%matplotlib inline

import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN

In this notebook we run a simple bag-of-words analysis of the papers:

- Extract a bag-of-words representation of the papers
- Cluster the papers
- Visualise the clusters and their labels
- Manually annotate each cluster to identify research topics in CVPR

## Bag of words model

In [None]:
# Selecting the papers to process (see notebook 2)
# Papers whose character count falls outside [MIN_PAPER_CHARS, MAX_PAPER_CHARS]
# are dropped before vectorisation.
MIN_PAPER_CHARS = 5000
MAX_PAPER_CHARS = 80000


def measure_papers(paths):
    """Return a DataFrame with one row per path.

    Columns:
        paper -- the path as given
        len   -- number of characters in the whole file

    The whole file is measured (not just its first line), so an empty file
    yields 0 instead of raising IndexError and a multi-line file is not
    under-counted. The encoding is explicit so the count does not depend on
    the platform's locale.
    """
    records = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            records.append({'paper': path, 'len': len(f.read())})
    # Passing `columns` keeps both columns present even when `paths` is empty.
    return pd.DataFrame(records, columns=['paper', 'len'])


papers = sorted(glob.glob('data/*.txt'))
df_all = measure_papers(papers)
# Same bounds as before (inclusive on both ends); the original index is kept.
df = df_all[(df_all['len'] >= MIN_PAPER_CHARS) & (df_all['len'] <= MAX_PAPER_CHARS)]

In [None]:
# Keep only the paths of the papers that survived the length filter
papers = df['paper'].tolist()

In [None]:
# Fit a TF-IDF vectorizer on the selected papers; with input='filename'
# the entries of `papers` are handed over as file paths rather than raw text.
count_vect = TfidfVectorizer(input='filename')
X_freq = count_vect.fit_transform(raw_documents=papers)

# (number of papers, vocabulary size)
X_freq.shape

In [None]:
# Displaying the whole term -> column-index mapping floods the cell output,
# so show its size and the first few terms in alphabetical order instead.
vocabulary = count_vect.vocabulary_
len(vocabulary), sorted(vocabulary)[:20]

In [None]:
# Spot-check a single entry of the matrix (row 0, column 1)
X_freq[0, 1]

## LDA

In [None]:
# NOTE: the LDA step is currently disabled; the line below is kept for reference.
#X_lda = LatentDirichletAllocation(n_components = 10).fit_transform(X_freq)

## Paper similarity

In [None]:
# Project the papers to 2-D with t-SNE.
# t-SNE is stochastic, so the seed is fixed: without it the embedding, and
# every clustering computed from it in the cells below, changes on each run.
# .toarray() converts X_freq to a dense array before fitting.
X_embedded = TSNE(n_components=2, random_state=0).fit_transform(X_freq.toarray())

In [None]:
# Scatter of the 2-D embedding on a 16 x 16 inch figure
fig, ax = plt.subplots(figsize=(16, 16))
xs, ys = X_embedded[:, 0], X_embedded[:, 1]
ax.scatter(xs, ys);

## Clustering

In [None]:
 kmeans = KMeans(n_clusters=30, random_state=0).fit(X_embedded)

In [None]:
# Embedding coloured by k-means label; each centroid is tagged with its
# cluster index so clusters can be referred to by number.
fig, ax = plt.subplots(figsize=(16, 16))
ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=kmeans.labels_, cmap="gist_rainbow")

for cluster_id, centroid in enumerate(kmeans.cluster_centers_):
    ax.annotate(
        str(cluster_id),
        xy=(centroid[0], centroid[1]), xytext=(0, 0),
        textcoords='offset points', ha='center', va='center',
        bbox=dict(boxstyle='round,pad=0.5', fc='white', alpha=0.5))

In [None]:
# Overall clustering score: silhouette over all papers for the k-means labels
silhouette_score(X_embedded, labels=kmeans.labels_)

In [None]:
# Silhouette score per sample
# Used to calculate the mean silhouette score for each cluster
scores = silhouette_samples(X_embedded, kmeans.labels_)
df_scores = df.copy()
df_scores['label'] = kmeans.labels_
df_scores['shil'] = scores
# Average only the score column: df also carries the string column 'paper',
# and a plain groupby(...).mean() fails on it with recent pandas versions.
df_scores.groupby('label')[['shil']].mean().sort_values(by='shil', ascending=False)

## DBSCAN

In [None]:
# Density-based clustering of the 2-D embedding
dbscan = DBSCAN(
    eps=1,
    min_samples=5,
).fit(X_embedded)

In [None]:
# Embedding coloured by DBSCAN label; every distinct label is written at the
# mean position of the points that carry it.
fig, ax = plt.subplots(figsize=(16, 16))
ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=dbscan.labels_, cmap="gist_rainbow")

labels = np.unique(dbscan.labels_).tolist()

for label in labels:
    members = dbscan.labels_ == label
    center = [np.mean(X_embedded[members, 0]),
              np.mean(X_embedded[members, 1])]
    ax.annotate(
        str(label),
        xy=(center[0], center[1]), xytext=(0, 0),
        textcoords='offset points', ha='center', va='center',
        bbox=dict(boxstyle='round,pad=0.5', fc='white', alpha=0.5))

In [None]:
# Overall clustering score: silhouette over all papers for the DBSCAN labels
# NOTE(review): DBSCAN presumably labels noise points -1; they are scored here
# as if they formed one ordinary cluster -- confirm this is intended.
silhouette_score(X_embedded, labels=dbscan.labels_)

In [None]:
# Silhouette score per sample
# Used to calculate the mean silhouette score for each cluster
scores = silhouette_samples(X_embedded, dbscan.labels_)
df_scores = df.copy()
df_scores['label'] = dbscan.labels_
df_scores['shil'] = scores
# Average only the score column: df also carries the string column 'paper',
# and a plain groupby(...).mean() fails on it with recent pandas versions.
df_scores.groupby('label')[['shil']].mean().sort_values(by='shil', ascending=False)