# KMeans clustering ANY documents

## Read in your files if they're all separate

In [1]:
import pandas as pd
import glob

# glob returns paths in arbitrary, filesystem-dependent order; sorting makes
# the row order of the dataframe the same on every run and every machine.
filenames = sorted(glob.glob("fanfiction-harry-potter/*.txt"))


def read_document(path, encoding='latin-1'):
    """Return the full text of the file at ``path``.

    latin-1 maps every possible byte to a character, so decoding never
    raises UnicodeDecodeError. Pass encoding='utf-8' instead if you know
    your files are UTF-8.

    The ``with`` block closes the file handle as soon as the text has been
    read, instead of leaving one open handle per document.
    """
    with open(path, encoding=encoding) as handle:
        return handle.read()


contents = [read_document(filename) for filename in filenames]

# One row per document: the raw text plus the path it was read from
df = pd.DataFrame({
    'text': contents,
    'filenames': filenames
})
df.head()

Unnamed: 0,filenames,text
0,fanfiction-harry-potter/10001898.txt,Prologue: The MissionDisclaimer: All character...
1,fanfiction-harry-potter/10004131.txt,BlackDisclaimer: I do not own Harry PotterAuth...
2,fanfiction-harry-potter/10004927.txt,"Chapter 1""I'm pregnant.""""""""Mum please say some..."
3,fanfiction-harry-potter/10007980.txt,"Author's Note: Hey, just so you know, this is ..."
4,fanfiction-harry-potter/10010343.txt,Disclaimer: I do not own Harry Potter and frie...


## Or read in your CSV with the text column if not

In [None]:
# If your documents are already in a single CSV, uncomment the two lines below
# and replace the placeholder with the path to your file. The rest of this
# notebook reads the documents from the 'text' column of df.
# import pandas as pd
# df = pd.read_csv("WHAT IS THE FILE???")

## Vectorize your documents

What are the options when creating a `TfidfVectorizer`?

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# A trailing ? is IPython help syntax (not plain Python): it shows the
# docstring for TfidfVectorizer.
TfidfVectorizer?

Let's think about:

* **max_features**: Can it make things faster? An integer, `1` and up: keep only that many of the most frequent terms
* **max_df**: Should we ignore words that show up too often? `0.0`-`1.0` for a proportion of documents, OR an integer for absolute document counts
* **min_df**: Should we ignore words that show up too rarely? `0.0`-`1.0` for a proportion of documents, OR an integer for absolute document counts
* **vocabulary**: Only care about certain words

Also... how many documents do we have?

In [18]:
# .shape is (rows, columns); df has one row per document, so the first number
# is the document count.
df.shape

(1874, 2)

In [2]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob


# Vectorize and save into a new dataframe
vec = TfidfVectorizer(stop_words='english',
                      max_df=0.9,   # ignore terms that appear in more than 90% of documents
                      min_df=0.15,  # ignore terms that appear in fewer than 15% of documents
                      max_features=1000,  # keep at most the 1000 most frequent remaining terms
                      use_idf=True)

# Fit from the 'text' column of our dataframe
matrix = vec.fit_transform(df['text'])

# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out(), exists from 1.0 onwards. Prefer the new method and
# fall back to the old one so the cell runs on both old and new versions.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# Then turn it into a new dataframe: one row per document, one column per term
results = pd.DataFrame(matrix.toarray(), columns=feature_names)

CPU times: user 4.7 s, sys: 153 ms, total: 4.85 s
Wall time: 5.02 s


In [3]:
results.head()

Unnamed: 0,able,actually,added,afraid,age,ago,agreed,air,albus,alive,...,write,writing,written,wrong,yeah,year,years,yes,young,younger
0,0.0,0.0,0.032304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020527,0.151146,0.0
1,0.007593,0.015065,0.0,0.0,0.018084,0.0,0.0,0.008366,0.020044,0.0,...,0.0,0.0,0.009807,0.01618,0.0,0.02715,0.035011,0.031348,0.084636,0.0
2,0.0,0.026802,0.0,0.0,0.032172,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.038641,0.0,0.022308,0.08213,0.0
3,0.008892,0.0,0.0,0.0,0.010589,0.0,0.012317,0.009797,0.011737,0.0,...,0.0,0.0,0.0,0.018949,0.04007,0.095386,0.061502,0.036712,0.054064,0.036292
4,0.019595,0.0,0.0,0.0,0.0,0.020292,0.0,0.043178,0.0,0.026512,...,0.024283,0.0,0.025308,0.0,0.0,0.04204,0.045177,0.03236,0.218421,0.0


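> ...Try it without the TextBlob tokenizer (the cell above already does: no `tokenizer` argument is passed, so scikit-learn's default tokenizer is used)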

## Cluster your documents

In [22]:
%%time
from sklearn.cluster import KMeans

# How many clusters?
number_of_clusters = 2

# KMeans starts from randomly chosen centroids, so without a fixed
# random_state the cluster numbering (and sometimes the membership) changes
# from run to run. n_init=10 pins the number of restarts explicitly, because
# the library default for it differs between scikit-learn versions.
km = KMeans(n_clusters=number_of_clusters, n_init=10, random_state=42)

print("Fitting", number_of_clusters, "clusters using a", matrix.shape, "matrix")

# Let's fit it!
km.fit(matrix)

Fitting 2 clusters usinga  (1874, 1000) matrix
CPU times: user 32.4 s, sys: 86.4 ms, total: 32.5 s
Wall time: 32.6 s


## See what they look like

In [23]:
# How many of the highest-weighted terms to show for each cluster
TOP_N_TERMS = 5

print("Top terms per cluster:")

# argsort() orders each centroid's term indices by ascending weight; reversing
# every row with [:, ::-1] puts the heaviest terms first.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on versions that predate it.
if hasattr(vec, 'get_feature_names_out'):
    terms = vec.get_feature_names_out()
else:
    terms = vec.get_feature_names()

# Iterate over the fitted centroids themselves rather than a separately stored
# cluster count, so the loop always matches the model that was actually fit.
for i, ranked_term_indices in enumerate(order_centroids):
    top_words = [terms[ind] for ind in ranked_term_indices[:TOP_N_TERMS]]
    print("Cluster {}: {}".format(i, ' '.join(top_words)))

Top terms per cluster:
Cluster 0: harry hermione draco said just
Cluster 1: lily james sirius remus said


## Push the category back to the original dataframe

In [24]:
# km was fit on the matrix built from df['text'], so km.labels_ holds one
# cluster id per row of df, in the same order.
df['category'] = km.labels_

# Preview the first rows only instead of displaying the whole dataframe
df.head(10)

Unnamed: 0,filenames,text,category
0,fanfiction-harry-potter/10001898.txt,Prologue: The MissionDisclaimer: All character...,0
1,fanfiction-harry-potter/10004131.txt,BlackDisclaimer: I do not own Harry PotterAuth...,1
2,fanfiction-harry-potter/10004927.txt,"Chapter 1""I'm pregnant.""""""""Mum please say some...",0
3,fanfiction-harry-potter/10007980.txt,"Author's Note: Hey, just so you know, this is ...",0
4,fanfiction-harry-potter/10010343.txt,Disclaimer: I do not own Harry Potter and frie...,0
5,fanfiction-harry-potter/10017757.txt,Disclaimer: I don't own any character in the H...,0
6,fanfiction-harry-potter/10018490.txt,DISCLAIMER: I don't own Harry Potter and its c...,0
7,fanfiction-harry-potter/10018889.txt,Katherine Rose-TylerChapter One: the Introduct...,0
8,fanfiction-harry-potter/10019142.txt,I am no longer that shy little boy anymore.I w...,0
9,fanfiction-harry-potter/10019987.txt,Happy New year! *throws confetti*I've really b...,1


## Be pleased

In [25]:
# Select only the documents that were assigned to cluster 1
df.loc[df['category'] == 1]

Unnamed: 0,filenames,text,category
1,fanfiction-harry-potter/10004131.txt,BlackDisclaimer: I do not own Harry PotterAuth...,1
9,fanfiction-harry-potter/10019987.txt,Happy New year! *throws confetti*I've really b...,1
16,fanfiction-harry-potter/10037071.txt,"For my friend, constant cheerleader and talent...",1
18,fanfiction-harry-potter/10038493.txt,Lily Potter was quite happy. Her favourite bro...,1
21,fanfiction-harry-potter/10043782.txt,Hey everybody this is my first fic and will en...,1
28,fanfiction-harry-potter/10055985.txt,A/N: Everyone needs their own version of a Sly...,1
41,fanfiction-harry-potter/10086672.txt,IntroductionSet in 1995 Christmas everyone is ...,1
43,fanfiction-harry-potter/10090801.txt,A/N: Sequel to Of Wolves and Men. Things will ...,1
46,fanfiction-harry-potter/10093402.txt,"""With the Animagus thing, how will I know when...",1
48,fanfiction-harry-potter/10093883.txt,My first 2nd Generation fic. Hope you like it!...,1
