In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction

In [2]:
df = pd.read_csv("compiled_data_toclassify.csv",index_col=None, header=0, nrows=5000)
df=df.drop(df.columns[0],1)

In [3]:
slicit=(df['response_type']=='asker') 
cols = [col for col in df.columns if 'diag_' in col]
col_name = np.array([col.split('_')[1] for col in df.columns if 'diag_' in col])
diag = (df.loc[slicit,cols].as_matrix()>0)

In [4]:
synopses = df.loc[slicit,'response'].tolist()
titles = df.loc[slicit,'label'].tolist()

In [5]:
print titles[:10] #first 10 titles

['eczema', nan, 'hemangiomas', 'warts, genital warts', nan, nan, 'antihistamines, eczema', nan, 'contact dermatitis', 'intertrigo']


In [6]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [7]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems



In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.1, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(synopses) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 58.2 s, sys: 276 ms, total: 58.4 s
Wall time: 58.4 s
(5000, 58)


In [9]:
terms = tfidf_vectorizer.get_feature_names()

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [11]:
from sklearn.cluster import KMeans

num_clusters = 8

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 8.66 s, sys: 4 ms, total: 8.66 s
Wall time: 8.9 s


In [12]:
#from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

#joblib.dump(km,  'doc_cluster.pkl')

#km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [13]:
films = { 'title': titles, 'synopsis': synopses, 'cluster': clusters }

frame = pd.DataFrame(films, index = [clusters] , columns = ['title', 'synopsis', 'cluster'])

In [14]:
frame.cluster.value_counts()

6    1295
2     975
0     739
3     456
1     432
4     388
7     370
5     345
Name: cluster, dtype: int64

In [15]:
from collections import Counter


In [16]:
print("Top terms per cluster:")
print
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 
cluster_names={}
l=[item for item in frame['title'].str.split(', ').values.tolist() if isinstance(item,list) ]
to_count= [item for sublist in l for item in sublist]
counter = Counter(to_count)
names = counter.keys()
counts = np.array(counter.values())
freq = counts / np.linalg.norm(counts)
all_freq = dict(zip(names, freq))
for i in range(num_clusters):
    print("Cluster %d words:" % i)
    
    words=''
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        new_word=terms[ind]
        words=words+new_word+', '
        print(' %s,' % new_word)
    cluster_names[i]=words
    print #add whitespace
    print #add whitespace
    print("Cluster %d titles:" % i)
    l=[item for item in frame.ix[i]['title'].str.split(', ').values.tolist() if isinstance(item,list) ]
    to_count= [item for sublist in l for item in sublist]
    counter = Counter(to_count)
    counted=counter.most_common(10)
    names = counter.keys()
    freq = counts / np.linalg.norm(counts)
    clust_freq=dict(zip(names, freq))
    clust_norm={ k:clust_freq[k] / all_freq[k] for k in clust_freq }
    import operator
    sorted_x = sorted(clust_norm.items(), key=operator.itemgetter(1), reverse=True)
    for c in sorted_x[:25]:
        print(' %s= %s' % c)
    print #add whitespace
    print #add whitespace

Top terms per cluster:

Cluster 0 words:
 'm,
 n't,
 's,
 ve,
 day,
 like,


Cluster 0 titles:
 fungus infections= 26.5
 seborrheic dermatitis= 23.7
 chronic fatigue syndrome= 12.0
 outbreaks= 8.21428571429
 ringworm= 6.44444444444
 stasis dermatitis= 6.0
 vitiligo= 6.0
 kaposis sarcoma= 6.0
 shingles= 5.0
 kawasaki disease= 5.0
 asthma= 4.0
 doxycycline= 4.0
 leukoplakia= 4.0
 angular cheilitis= 3.5
 colorectal= 3.5
 mrsa= 3.33333333333
 tuberculosis= 3.14285714286
 epidermal cyst= 3.0
 body lice= 3.0
 atopic dermatitis= 3.0
 yeast infection= 2.484375
 staph= 2.4375
 rosacea= 2.0
 fungal diseases= 2.0
 chickenpox= 2.0


Cluster 1 words:
 test,
 negat,
 day,
 month,
 week,
 blood,


Cluster 1 titles:
 sinus infection= 79.0
 vaccine safety= 19.0
 chicken pox= 13.25
 outbreaks= 8.21428571429
 herpes zoster= 5.33333333333
 cmv= 3.33333333333
 toxoplasmosis= 2.0
 hair loss= 1.66666666667
 ear infection= 1.5
 arthritis= 1.44444444444
 tuberculosis= 1.28571428571
 flu= 1.21739130435
 trichom

In [17]:
sorted_x

[('shingles', 59.250000000000007),
 ('cfs', 22.0),
 ('blood clot', 16.0),
 ('chickenpox', 13.25),
 ('tretinoin', 6.3333333333333339),
 ('infertility', 3.3823529411764706),
 ('md', 2.8235294117647056),
 ('dry skin', 2.5),
 ('ear infection', 2.5),
 ('sore throat', 2.1538461538461537),
 ('high blood pressure', 2.0),
 ('pityriasis rosea', 2.0),
 ('hair loss', 1.3333333333333333),
 ('fordyce spots', 1.25),
 ('scabies', 1.0),
 ('infectious mononucleosis', 1.0),
 ('varicella', 1.0),
 ('sinus infection', 1.0),
 ('rosacea', 1.0),
 ('arthritis', 1.0),
 ('carpal tunnel syndrome', 1.0),
 ('fungus infections', 1.0),
 ('chronic fatigue syndrome', 0.75),
 ('vitamins', 0.66666666666666663),
 ('zoster', 0.66666666666666663),
 ('tinea', 0.64999999999999991),
 ('molluscum contagiosum', 0.5),
 ('keratosis pilaris', 0.5),
 ('thrombosis', 0.39999999999999997),
 ('common cold', 0.3529411764705882),
 ('biopsy', 0.339622641509434),
 ('giardiasis', 0.33333333333333331),
 ('flu', 0.2608695652173913),
 ('food poi

In [18]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

MDS()

# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_components, n_samples)

xs, ys = pos[:, 0], pos[:, 1]

In [None]:


#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

    
#set up cluster names using a dict
cluster_names



In [None]:
#some ipython magic to show the matplotlib plots inline
%matplotlib inline 

#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles)) 

#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the film title
for i in range(len(df)):
    ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  

    
    
plt.show() #show the plot

#uncomment the below to save the plot if need be
#plt.savefig('clusters_small_noaxes.png', dpi=200)