# KMeans clustering ANY documents

## Read in your files if they're all separate

In [54]:
import pandas as pd
import glob

filenames = glob.glob("fanfiction-harry-potter/*.txt")

# Maybe use open(filename, encoding='latin-1').read()
# if you're running into trouble
contents = [open(filename, encoding='latin-1').read() for filename in filenames]

df = pd.DataFrame({
    'text': contents,
    'filenames': filenames
})
df.head()

Unnamed: 0,filenames,text
0,fanfiction-harry-potter/10001898.txt,Prologue: The MissionDisclaimer: All character...
1,fanfiction-harry-potter/10004131.txt,BlackDisclaimer: I do not own Harry PotterAuth...
2,fanfiction-harry-potter/10004927.txt,"Chapter 1""I'm pregnant.""""""""Mum please say some..."
3,fanfiction-harry-potter/10007980.txt,"Author's Note: Hey, just so you know, this is ..."
4,fanfiction-harry-potter/10010343.txt,Disclaimer: I do not own Harry Potter and frie...


## Or read in your CSV with the text column if not

In [55]:
# import pandas as pd
# df = pd.read_csv("WHAT IS THE FILE???")

## Vectorize your documents

What are the options when creating a `TfidfVectorizer`?

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

TfidfVectorizer?

Let's think about:

* **max_features**: Can it make things faster? `1` and up
* **max_df**: Should we ignore words that show up too often? `0.0`-`1.0` for percent, OR an integer for absolute document counts
* **min_df**: Should we ignore words that show up too little? `0.0`-`1.0` for percent, OR an integer for absolute document counts
* **vocabulary**: Only care about certain words

Also... how many documents do we have?

In [57]:
df.shape

(1874, 2)

In [58]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

# cutsom stopwords
from sklearn.feature_extraction import stop_words
custom_stopwords = ['did', 'don', 'didn', 'said', 'thee', 'ye'] + list(stop_words.ENGLISH_STOP_WORDS)

# Vectorize and save into a new dataframe
vec = TfidfVectorizer(stop_words=custom_stopwords,
                      max_df=0.95, # If you are > 90%, ignore
                      min_df=0.65, # If four are only in 15%, ignore
                      max_features=1000,
                      use_idf=True)

# Fit from the 'text' column of our dataframe
matrix = vec.fit_transform(df['text'])

# Then turn it into a new dataframe
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())

CPU times: user 3.77 s, sys: 52.7 ms, total: 3.82 s
Wall time: 3.86 s


In [59]:
results.head()

Unnamed: 0,away,come,eyes,face,going,hand,harry,head,just,know,...,long,look,looked,make,right,think,thought,time,want,way
0,0.0,0.097431,0.16553,0.093909,0.460939,0.097375,0.380324,0.090659,0.309462,0.082139,...,0.196765,0.095081,0.464304,0.0,0.097153,0.0,0.184479,0.246149,0.098214,0.091064
1,0.266795,0.08276,0.468684,0.079769,0.052204,0.27571,0.026921,0.154017,0.240959,0.162798,...,0.083568,0.323057,0.420684,0.082997,0.082525,0.078746,0.130584,0.069695,0.139042,0.103136
2,0.0,0.0,0.083538,0.189572,0.558291,0.0,0.0,0.0,0.312351,0.248717,...,0.297902,0.287906,0.281183,0.295865,0.09806,0.187141,0.0,0.0,0.297391,0.091914
3,0.016474,0.051104,0.260467,0.049256,0.048354,0.119174,0.71482,0.190208,0.135264,0.15797,...,0.068804,0.099742,0.47083,0.034166,0.118902,0.129666,0.0,0.057381,0.068686,0.127371
4,0.194431,0.120625,0.409872,0.193775,0.114134,0.040185,0.196193,0.037414,0.063855,0.169488,...,0.121803,0.117716,0.344902,0.040323,0.080188,0.0,0.076132,0.40633,0.445845,0.300646


> ...Try it without the TextBlob tokenizer

## Cluster your documents

In [60]:
%%time
from sklearn.cluster import KMeans

# How many clusters?
number_of_clusters=3
km = KMeans(n_clusters=number_of_clusters)

print("Fitting", number_of_clusters, "clusters using a ", matrix.shape, "matrix")

# Let's fit it!
km.fit(matrix)
km.fit

Fitting 3 clusters using a  (1874, 22) matrix
CPU times: user 3.19 s, sys: 27.9 ms, total: 3.22 s
Wall time: 3.27 s


## See what they look like

In [61]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vec.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :3]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

Top terms per cluster:
Cluster 0: just like know
Cluster 1: eyes looked like
Cluster 2: harry just like


## Push the category back to the original dataframe

In [62]:
df['category'] = km.labels_
df

Unnamed: 0,filenames,text,category
0,fanfiction-harry-potter/10001898.txt,Prologue: The MissionDisclaimer: All character...,1
1,fanfiction-harry-potter/10004131.txt,BlackDisclaimer: I do not own Harry PotterAuth...,1
2,fanfiction-harry-potter/10004927.txt,"Chapter 1""I'm pregnant.""""""""Mum please say some...",0
3,fanfiction-harry-potter/10007980.txt,"Author's Note: Hey, just so you know, this is ...",2
4,fanfiction-harry-potter/10010343.txt,Disclaimer: I do not own Harry Potter and frie...,1
5,fanfiction-harry-potter/10017757.txt,Disclaimer: I don't own any character in the H...,1
6,fanfiction-harry-potter/10018490.txt,DISCLAIMER: I don't own Harry Potter and its c...,0
7,fanfiction-harry-potter/10018889.txt,Katherine Rose-TylerChapter One: the Introduct...,0
8,fanfiction-harry-potter/10019142.txt,I am no longer that shy little boy anymore.I w...,0
9,fanfiction-harry-potter/10019987.txt,Happy New year! *throws confetti*I've really b...,1


## Be pleased

In [63]:
df[df.category == 4]['text']

Series([], Name: text, dtype: object)