# Basic K-Means Clustering

##### Note: must run [pip install whoosh]

In [2]:
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import csv
import codecs
import pandas as pd
import re
import os.path
from nltk.tokenize import RegexpTokenizer

In [3]:
DIRECTORY = ""
OUTPUT = os.path.join(DIRECTORY, "output.csv")

In [4]:
OUTPUT

'output.csv'

## Step 1:
### Lower case, remove punctuation, stemming, remove stop words

In [11]:
def createIndex(schema):
    # Generate a temporary directory for the index
    indexDir = tempfile.mkdtemp()

    # create and return the index
    return index.create_in(indexDir, schema)


def addFilesToIndex(indexObj, csvPath):
    # open writer
    writer = indexObj.writer()
    #writer.add_field(index, format)
    #writer.add_field(cell_content, format)

    # open csv
    with codecs.open(csvPath, "r", "ISO-8859-1") as csvfile:
        # create csv reader object
        csvreader = csv.DictReader(csvfile)
        # create dictionary to hold document values
        doc = {}
        # instantiate index count
        i = 0
        
        # read each row in file
        for row in csvreader:
            value = row["Negative Feedback"]
            if value != "" and isinstance(value, str):
                doc[i] = value
                writer.update_document(index = str(i), cell_content = value)
            i += 1
        writer.commit()

In [12]:
# create text analyzer
cleaningFilter = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StemFilter()

# define a Schema with the new analyzer
schema = Schema(index = ID(stored=True),
                   cell_content = TEXT(analyzer = cleaningFilter, stored = True))

# create the index based on the new schema
index2 = createIndex(schema)

# index cell contents in refined.csv to index2
# TODO: put index2 into dataframe
addFilesToIndex(index2, OUTPUT)

#### Inspect Index

In [13]:
# Is it empty?
print("Index is empty?", index2.is_empty())

# How many cells indexed?
print("Number of indexed files:", index2.doc_count())


# define a reader object on the index
myReader = index2.reader()

#[(docnum, doc_dict) for (docnum, doc_dict) in myReader.iter_docs()][0:25]

# list indexed terms for field "cell_content"
word_list = [term for term in myReader.field_terms("cell_content")]
print(word_list)

Index is empty? False
Number of indexed files: 1110
['00', '000', '0000000', '0000565', '00923002555369', '01', '0100', '021', '024', '030', '04', '0420752', '05', '06', '0684', '07', '0785274576', '08', '09190333683', '09384396727', '0968751911', '0asf', '0borpt', '0c', '0cdovl', '0ddyo', '0egkhpzi', '0fbdkwnza', '0fgwwftyei', '0fk', '0g', '0gcsqgsib', '0j', '0kteimcagcsqgsib', '0ma', '0mamnh', '0obbyefiedbyoibqnrqkca', '0pbaqdagh', '0pnmjrtnplaposxvrwvteosx', '0qhpzy', '0rbciwiiipki', '0syym', '0tglwqkurrpihqx', '0tnc', '0umesj', '0vslmsmywt', '0w', '0wdqyjkozihvcnaqelbqawgakxczajbgnv', '0wl', '0wwyaj', '0xntaxmdyymjm', '0yu', '0zcb', '10', '100', '1000', '10000', '1000000', '1003', '101', '1024', '1028', '103', '1080', '108000', '11', '110000', '110030', '111', '112168', '11629', '117847853825', '12', '121', '122', '1220', '123', '125', '128', '13', '1316297', '132', '1379017597407', '13873061', '14', '144', '1482133970', '15', '150', '16', '162', '16896686465', '17', '17400', '1740

In [17]:
# Create a binary encoding of dataset based on the selected features (X)
# go through each document --> tokenize that single document --> compare with total word list
tokenizer = RegexpTokenizer(r'\w+')
df_rows = []

with codecs.open(OUTPUT, "r", "ISO-8859-1") as csvfile:
    csvreader = csv.DictReader(csvfile)
    for i, row in enumerate(csvreader):
        value = row["Negative Feedback"]
        if value != "" and isinstance(value, str):
            file_words = tokenizer.tokenize(value)
            df_rows.append([1 if word in file_words else 0 for word in word_list]) 
    X = pd.DataFrame(df_rows, columns = word_list)

## Step 2

### K-means Clustering (Scikit-learn)
##### Note: must run pip install scikit-learn, pip install scipy

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from scipy.cluster import hierarchy

# vectorizer = TfidfVectorizer(stop_words='english')
# X = vectorizer.fit_transform(myReader.field_terms("cell_content"))

# # Number of clusters
# kmeans = KMeans(n_clusters=10)
# # Fitting the input data
# kmeans = kmeans.fit(df_rows)
# # Getting the cluster labels
# labels = kmeans.predict(df_rows)
# # Centroid values
# centroids = kmeans.cluster_centers_

# print(centroids)

In [19]:
from sklearn.cluster import SpectralClustering
from sklearn.metrics.pairwise import pairwise_distances

In [21]:
=similarity_matrix = 1 - pairwise_distances(df_rows, metric='cosine')
cosineScores = pd.DataFrame(similarity_matrix)

In [18]:
clusters = SpectralClustering(n_clusters = 5, affinity = 'precomputed').fit(cosineScores)



In [19]:
print(clusters.labels_)

[2 1 2 1 4 4 1 2 2 2 2 2 2 0 1 0 1 0 2 0 0 2 1 1 0 2 1 2 2 2 1 0 1 2 1 0 4
 0 2 1 2 2 1 2 2 4 1 2 2 1 2 4 4 0 2 2 1 1 4 1 1 1 1 0 1 4 2 0 1 1 2 1 0 0
 1 2 0 0 0 0 0 0 0 1 0 1 2 4 1 2 1 0 1 2 0 1 0 2 1 2 2 1 0 2 0 2 1 2 2 2 1
 0 4 1 2 4 0 1 1 1 0 1 1 2 1 1 1 0 2 2 3 1 2 2 2 1 1 2 1 1 2 1 1 4 1 0 2 4
 1 2 2 0 2 0 1 0 0 1 1 0 1 2 0 4 2 1 2 4 2 4 1 0 0 0 2 2 0 2 0 0 2 2 1 2 0
 2 2 2 2 2 2 1 1 1 0 2 2 2 2 0 1 2 2 3 1 2 1 3 4 2 2 1 0 1 4 4 0 1 1 2 2 2
 2 2 4 0 2 1 2 2 1 2 1 0 1 2 2 0 0 1 1 1 1 2 3 0 1 0 1 1 3 3 1 2 2 2 1 3 1
 1 2 1 2 1 3 1 2 3 0 0 1 1 2 2 1 2 1 4 2 1 0 2 4 4 0 1 2 2 1 2 0 1 2 0 1 1
 2 1 2 1 2 2 1 1 2 1 0 0 1 1 0 0 1 1 4 0 1 2 1 0 4 2 2 1 2 2 2 0 1 2 2 0 0
 2 2 0 1 1 1 2 2 2 2 0 2 0 2 2 0 2 2 1 4 2 0 1 2 0 2 1 2 1 2 1 1 1 1 0 1 3
 4 0 2 1 2 0 0 0 0 0 0 2 0 0 0 0 1 1 1 0 1 2 1 2 0 1 0 0 1 4 2 1 4 1 1 1 3
 1 4 2 1 2 3 2 2 2 1 0 1 1 0 4 2 2]


## Step 3 Label Clusters with Key Words

### TF-IDF Frequency Normalization

In [169]:
# pull out documents of each cluster --> tf idf for key words

import numpy as np

num_clusters = 5

# indices for cluster 0
indices = [index for index, clusterNum in enumerate(clusters.labels_) if clusterNum == 0]

# documents in cluster 0
clusterCorpus = [doc_dict['cell_content'] for (docnum, doc_dict) in myReader.iter_docs() if docnum in indices
print(clusterCorpus)

SyntaxError: invalid syntax (<ipython-input-169-a787f8044349>, line 12)

In [164]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(clusterCorpus)

response = vectorizer.transform(clusterCorpus)
feature_names = vectorizer.get_feature_names()

In [163]:
feature_array = np.array(vectorizer.get_feature_names())
tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]

n = 10
top_n = feature_array[tfidf_sorting][:n]

print(top_n)

['shut' 'settings' 'reset' 'ff' 'time' 'exploits' 'escape' 'esoteric'
 'eventually' 'everytime']
