# *Unsupervised learning: k-means clustering*

[http://scikit-learn.org/stable/auto_examples/text/document_clustering.html](http://scikit-learn.org/stable/auto_examples/text/document_clustering.html)

In [None]:
## Importing basic packages

import os
import random
import numpy as np

In [None]:
## Downloading several hundred New York Times articles as text files

os.chdir('/sharedfolder/')

!wget https://github.com/pcda17/pcda17.github.io/raw/master/week/10/nyt_articles_11-9-2017.zip

!unzip nyt_articles_11-9-2017.zip

os.chdir('/sharedfolder/nyt_articles_11-9-2017/')

In [None]:
## Loading all text files in the current directory as a list of strings

document_list = []

## sorted() fixes the document order (os.listdir() returns names in arbitrary order), so the
## list indices used in later cells refer to the same documents on every run.
## endswith() keeps only names with a .txt extension; the substring test '.txt' in item
## would also pick up names such as 'notes.txt.bak'.
for filename in sorted(item for item in os.listdir('./') if item.endswith('.txt')):
    ## 'with' closes each file as soon as it has been read.
    ## NOTE(review): assumes the article files are UTF-8 encoded -- confirm against the dataset.
    with open(filename, encoding='utf-8') as file_in:
        document_list.append(file_in.read())

print(len(document_list)) # Printing number of documents in list

random.choice(document_list) # Viewing a single document chosen at random

In [None]:
## Building the list of tokens to discard during preprocessing

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

## Order of the list: NLTK's English stop words first, then a few contraction fragments
## and typographic quote/dash characters, then each character of string.punctuation.
stop_words = list(stopwords.words('english'))
stop_words.extend(["'s", "'re", '”', '“', '’', '—'])
stop_words.extend(string.punctuation)

stop_words[:10]     ## Showing the first 10 entries

In [None]:
## Tokenizing, stemming, and removing stop words from our list of documents

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

## Membership tests against a set are constant-time; testing against the stop_words list
## would rescan the whole list once for every token of every document.
stop_word_set = set(stop_words)

documents_filtered = []

for document in document_list:
    token_list = word_tokenize(document.lower())                  ## Tokenizing the lowercased text
    tokens_filtered = [stemmer.stem(item)                         ## Stemming ...
                       for item in token_list
                       if item not in stop_word_set]              ## ... only the tokens that are not stop words
    documents_filtered.append(' '.join(tokens_filtered))          ## Re-joining into one space-separated string

In [None]:
## Viewing a preprocessed document
## (a randomly chosen entry of documents_filtered: lowercased, stop words removed,
##  remaining tokens stemmed and re-joined with single spaces)

random.choice(documents_filtered)

In [None]:
## Vectorizing preprocessed documents

from sklearn.feature_extraction.text import CountVectorizer

## CountVectorizer is created with its default settings.
vectorizer = CountVectorizer()

## X is the matrix that later cells cluster (fit_predict) and densify (X.toarray());
## the fitted vectorizer is reused later to transform new text.
X = vectorizer.fit_transform(documents_filtered) 

In [None]:
## Creating a vocabulary list corresponding to the vectors we created above

## get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favor of
## get_feature_names_out(); use whichever method this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    ## The newer method returns an array; tolist() keeps vocabulary a plain list of strings.
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()

vocabulary[1140:1160]   ## Viewing a 20-term slice of the vocabulary

In [None]:
## Grouping the vectorized documents into 8 clusters with k-means

from sklearn.cluster import KMeans

## Notes on the arguments:
## - n_clusters=8: documents are to be grouped in 8 clusters.
## - n_jobs and algorithm='auto' are no longer passed. Current scikit-learn releases reject
##   both (n_jobs was removed in 1.0, algorithm='auto' in 1.3); leaving them out selects the
##   library defaults, which is what the old values asked for.
## - random_state fixes the centroid initialization so cluster numbers are the same on every
##   run; without it, re-running the notebook reshuffles which documents land in which cluster.
kmeans_classifier = KMeans(n_clusters=8, init='k-means++', n_init=10, max_iter=500, tol=0.0001, verbose=0, copy_x=True, random_state=0)

cluster_classes = kmeans_classifier.fit_predict(X)

cluster_classes[:20]               ## Viewing first 20 cluster assignments

In [None]:
## The cluster assignments and both document lists should have the same length, in the same
## order: printing the three sizes one per line (assignments, filtered docs, original docs).

for sequence in (cluster_classes, documents_filtered, document_list):
    print(len(sequence))

In [None]:
## The same index addresses a document and its cluster assignment, so both can be shown together

index_num = 130

## One print call; sep='\n' puts every item on its own line and the empty string
## produces the blank line between the two sections.
print('Cluster assignment:', cluster_classes[index_num], '', 'Document:', document_list[index_num], sep='\n')

In [None]:
## Write each document to a new text file, with each cluster of documents in its own directory

clusters_root = '/sharedfolder/nyt_clusters_11-9-2017/'

for i in range(len(documents_filtered)):

    ## Directory pathname that includes the assigned cluster number
    out_dir = os.path.join(clusters_root, 'Cluster_' + str(cluster_classes[i]))

    ## Creates out_dir (and clusters_root) when missing. exist_ok=True only tolerates
    ## "already exists"; a bare `except: pass` would also hide real failures such as
    ## permission errors.
    os.makedirs(out_dir, exist_ok=True)

    ## Building the full path instead of calling os.chdir() keeps the notebook's working
    ## directory unchanged, so cells that read from './' still work when re-run.
    out_path = os.path.join(out_dir, 'Document_' + str(i) + '.txt')

    with open(out_path, 'w', encoding='utf-8') as file_out:
        file_out.write(document_list[i])                    ## Writing text from original (non-filtered) document list

## NOTE: files written by an earlier run are not removed; if cluster assignments change
## between runs, delete clusters_root first to avoid stale copies in old cluster directories.


In [None]:
## Classifying a new text into an existing cluster

new_text = 'Even the budget office is revising its estimates and has predicted the new numbers would be smaller.'

## The vectorizer and the k-means model were fitted on lowercased, stop-word-filtered, stemmed
## text, so the new text goes through the same steps. Without them, unstemmed words such as
## 'revising' would not match the stemmed vocabulary and would be dropped from the vector.
new_tokens = [stemmer.stem(item) for item in word_tokenize(new_text.lower()) if item not in stop_words]

input_vector = vectorizer.transform([' '.join(new_tokens)])

kmeans_classifier.predict(input_vector)   ## Array holding the predicted cluster number for the new text

In [None]:
## Projecting all vectors to 2 dimensions using linear discriminant analysis (LDA) and viewing a scatter plot

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

matplotlib.rcParams['figure.figsize'] = (10, 7)  # Setting default plot size

lda = LDA(n_components=2) #2-dimensional LDA

y = cluster_classes

lda_transformed = pd.DataFrame(lda.fit_transform(X.toarray(), y))

lda_transformed['y'] = y

plt.scatter(lda_transformed[lda_transformed['y']==0][0], lda_transformed[lda_transformed['y']==0][1], label='0', c='blue', alpha=0.4)
plt.scatter(lda_transformed[lda_transformed['y']==1][0], lda_transformed[lda_transformed['y']==1][1], label='1', c='green', alpha=0.4)
plt.scatter(lda_transformed[lda_transformed['y']==2][0], lda_transformed[lda_transformed['y']==2][1], label='2', c='red', alpha=0.4)
plt.scatter(lda_transformed[lda_transformed['y']==3][0], lda_transformed[lda_transformed['y']==3][1], label='3', c='violet', alpha=0.4)
plt.scatter(lda_transformed[lda_transformed['y']==4][0], lda_transformed[lda_transformed['y']==4][1], label='4', c='yellow', alpha=0.4)
plt.scatter(lda_transformed[lda_transformed['y']==5][0], lda_transformed[lda_transformed['y']==5][1], label='5', c='magenta', alpha=0.4)
plt.scatter(lda_transformed[lda_transformed['y']==6][0], lda_transformed[lda_transformed['y']==6][1], label='6', c='black', alpha=0.4)
plt.scatter(lda_transformed[lda_transformed['y']==7][0], lda_transformed[lda_transformed['y']==7][1], label='7', c='orange', alpha=0.4)
plt.scatter(lda_transformed[lda_transformed['y']==8][0], lda_transformed[lda_transformed['y']==8][1], label='8', c='indigo', alpha=0.4)

plt.legend(loc=2)

plt.show()

In [None]:
## A minimal scatter plot: two hand-written classes of 15 points each

import random

x_vals_1 = [1.92, 1.79, 1.96, 1.4, 1.61, 1.23, 1.43, 1.85, 2.07, 2.24, 2.11, 1.78, 2.21, 1.79, 1.33]
y_vals_1 = [2.5, 2.11, 2.19, 1.6, 2.83, 2.55, 2.33, 2.09, 2.32, 2.72, 2.05, 2.4, 2.55, 2.83, 1.58]

x_vals_2 = [3.63, 3.12, 3.21, 3.15, 3.56, 3.17, 3.05, 3.14, 2.87, 3.65, 2.82, 3.34, 3.7, 2.95, 3.15]
y_vals_2 = [3.1, 4.27, 4.03, 3.37, 3.22, 3.89, 3.27, 2.64, 3.09, 4.1, 3.61, 3.74, 3.71, 3.51, 2.9]

## Each entry: (x values, y values, legend label, marker color); drawn in this order.
scatter_series = [
    (x_vals_1, y_vals_1, 'Class 1', 'indigo'),
    (x_vals_2, y_vals_2, 'Class 2', 'blue'),
]

for series_x, series_y, series_label, series_color in scatter_series:
    plt.scatter(series_x, series_y, label=series_label, c=series_color, alpha=0.5)

## Both axes run from 0 to 5.
plt.ylim(ymin=0, ymax=5)
plt.xlim(xmin=0, xmax=5)

plt.legend(loc=4)

plt.show()