# KMeans clustering ANY documents

## Read in your files if they're all separate

In [1]:
import pandas as pd
import glob

filenames = glob.glob("bible-text/*.txt")

# Maybe use open(filename, encoding='latin-1').read()
# if you're running into trouble
contents = [open(filename).read() for filename in filenames]

df = pd.DataFrame({
    'text': contents,
    'filenames': filenames
})
df.head()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb6 in position 165: invalid start byte

## Or read in your CSV with the text column if not

In [None]:
# import pandas as pd
# df = pd.read_csv("WHAT IS THE FILE???")

## Vectorize your documents

What are the options when creating a `TfidfVectorizer`?

In [None]:
TfidfVectorizer?

Let's think about:

* **max_features**: Can it make things faster? `1` and up
* **max_df**: Should we ignore words that show up too often? `0.0`-`1.0` for percent, OR an integer for absolute document counts
* **min_df**: Should we ignore words that show up too little? `0.0`-`1.0` for percent, OR an integer for absolute document counts
* **vocabulary**: Only care about certain words

Also... how many documents do we have?

In [None]:
df.shape

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

def textblob_tokenizer(str_input):
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    words = [token.stem() for token in tokens]
    return words

# Vectorize and save into a new dataframe
vec = TfidfVectorizer(tokenizer=textblob_tokenizer,
                      stop_words='english',
                      use_idf=True)

# Fit from the 'text' column of our dataframe
matrix = vec.fit_transform(df['text'])

# Then turn it into a new dataframe
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())

In [None]:
results.head()

> ...Try it without the TextBlob tokenizer

## Cluster your documents

In [None]:
%%time
from sklearn.cluster import KMeans

# How many clusters?
number_of_clusters=2
km = KMeans(n_clusters=number_of_clusters)

print("Fitting", number_of_clusters, "clusters usinga ", matrix.shape, "matrix")

# Let's fit it!
km.fit(matrix)
km.fit

## See what they look like

In [None]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vec.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

## Push the category back to the original dataframe

In [None]:
df['category'] = km.labels_
df

## Be pleased