In [1]:
#Import Statements
import os, itertools
import warnings
from matplotlib.pyplot import text
from numpy import vectorize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA
import re
import numpy as np
import warnings
import statistics
warnings.simplefilter(action='ignore', category=FutureWarning)
import math
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
#from JaccardIndexUpdated import JaccardIndex, ClustersData, summary
from sklearn.decomposition import TruncatedSVD
import umap

In [46]:
#Helper Functions
def drop_duplicates(labeled_df):
    data_copy_df = labeled_df.copy()
    data_no_duplicates = data_copy_df.drop_duplicates(subset = ['text'])
    return data_no_duplicates

def crop_documents(labeled_df, document_cutoff_length):
    '''Returns a df with 'cropped_documents' columns'''
    labeled_df_copy = labeled_df.copy()
    labeled_df_copy['tokens_cropped'] = labeled_df_copy['text'].apply(lambda x: x.split()[:document_cutoff_length])
    labeled_df_copy['cropped_documents'] = labeled_df_copy['tokens_cropped'].apply(lambda x: ' '.join(x))
    labeled_df_copy = labeled_df_copy.drop(columns = ['tokens_cropped'])
    return labeled_df_copy

def get_corpus(labeled_df, on_entire_doc, document_cut_off):
    if on_entire_doc == False:
        crop_documents_df = crop_documents(labeled_df, document_cut_off)
        corpus_to_train_on = crop_documents_df['cropped_documents'].to_list()
    else:
        corpus_to_train_on = labeled_df['text'].to_list()
    return corpus_to_train_on

def numUniqueLabels(cluster_l):
    from collections import Counter
    items = Counter(cluster_l).keys()
    return len(items)

def allScores(X, clusters_l, labeled_df, metric = 'cosine', withJack = False):
    sample_silhouette_values = silhouette_samples(X, clusters_l, metric= metric)
    labeled_df_copy = labeled_df.copy()
    #print(labeled_df_copy)
    labeled_df_copy['sil score'] = sample_silhouette_values
    grouped = labeled_df_copy.groupby(['labels']).mean()
    grouped_med = labeled_df_copy.groupby(['labels']).median()
    grouped['median_sil'] = grouped_med['sil score']
    grouped.loc['MEAN SIL SCORE'] = grouped.mean()
    grouped.loc['MEDIAN SIL SCORE'] = grouped.median()
    grouped.loc['David Bouldin Score'] = davies_bouldin_score(X, clusters_l)
    grouped.loc['Calinski Harabasz'] = calinski_harabasz_score(X, clusters_l)
    if(withJack):
        jack = JaccardIndex(X, clusters_l)
        grouped.loc['Jaccard Index'] = avgJack(jack)
    return grouped

def checkItems(scores, bestItems, bestParameters, dimensionReduction, numComponents, numLabels, maxFreq, minFreq, ngramRange, withJack):
    
    if(bestItems[0] < scores.iloc[numLabels]["sil score"]):
        bestParameters[0]["maxDfRange"] = maxFreq
        bestParameters[0]["minDfRange"] = minFreq
        bestParameters[0]["ngramRange"] = ngramRange
        bestItems[0] = scores.iloc[numLabels]["sil score"]
        bestParameters[0]["reduction"][0] = dimensionReduction
        bestParameters[0]["reduction"][1] = numComponents
    if(bestItems[1] < scores.iloc[numLabels + 1]["sil score"]):
        bestParameters[1]["maxDfRange"] = maxFreq
        bestParameters[1]["minDfRange"] = minFreq
        bestParameters[1]["ngramRange"] = ngramRange
        bestItems[1] = scores.iloc[numLabels + 1]["sil score"]
        bestParameters[1]["reduction"][0] = dimensionReduction
        bestParameters[1]["reduction"][1] = numComponents
    if(bestItems[2] > scores.iloc[numLabels + 2]["sil score"]):
        bestParameters[2]["maxDfRange"] = maxFreq
        bestParameters[2]["minDfRange"] = minFreq
        bestParameters[2]["ngramRange"] = ngramRange
        bestItems[2] = scores.iloc[numLabels + 2]["sil score"]
        bestParameters[2]["reduction"][0] = dimensionReduction
        bestParameters[2]["reduction"][1] = numComponents
    if(bestItems[3] < scores.iloc[numLabels + 3]["sil score"]):
        bestParameters[3]["maxDfRange"] = maxFreq
        bestParameters[3]["minDfRange"] = minFreq
        bestParameters[3]["ngramRange"] = ngramRange
        bestItems[3] = scores.iloc[numLabels + 3]["sil score"]
        bestParameters[3]["reduction"][0] = dimensionReduction
        bestParameters[3]["reduction"][1] = numComponents
    if(withJack and bestItems[4] < scores.iloc[numLabels + 4]["sil score"]):
        bestParameters[4]["maxDfRange"] = maxFreq
        bestParameters[4]["minDfRange"] = minFreq
        bestParameters[4]["ngramRange"] = ngramRange
        bestItems[4] = scores.iloc[numLabels + 4]["sil score"]
        bestParameters[4]["reduction"][0] = dimensionReduction
        bestParameters[4]["reduction"][1] = numComponents

def avgJack(jack):
    totalJackAvg = 0
    totalJackVar = 0
    jackSum = summary(jack.score_df)
    for i in range(len(jackSum)):
        totalJackAvg = totalJackAvg + jackSum.iloc[i]['Avg Score']
        totalJackVar = totalJackVar + jackSum.iloc[i]['Variance']
    return (totalJackAvg/ len(jackSum), totalJackVar/len(jackSum))


def bestScoresTFIDF(X, clusters_l, labeled_df, metric = 'cosine', withJack = False, printProgress = False):
    #ngramRangeList = [(1,1), (1,2), (1,3), (1,4), (2,2), (2,3), (2,4), (3,3), (3,3), (3,4)]
    ngramRangeList = [(1,2)]
    #maxDfRange = np.arange(0.7, 0.9, 0.05)
    maxDfRange = [0.8]
    #minDfRange = [0.005, 0.01, 0.02, 0.05, 0.1]
    mindDfRange = [0.005]
    n_comp = [10]

    numLabels = numUniqueLabels(clusters_l)
    bestSilMean = 0
    silMeanPar = {"maxDfRange": 0, "minDfRange": 0, "ngramRange": (0,0), "reduction": ["none", 0]}
    bestSilMed = 0
    silMedPar = {"maxDfRange": 0, "minDfRange": 0, "ngramRange": (0,0), "reduction": ["none", 0]}
    bestDav = 100
    davPar = {"maxDfRange": 0, "minDfRange": 0, "ngramRange": (0,0), "reduction": ["none", 0]}
    bestCal = 0
    calPar = {"maxDfRange": 0, "minDfRange": 0, "ngramRange": (0,0), "reduction": ["none", 0]}
    bestJack  = 0
    jackPar = {"maxDfRange": 0, "minDfRange": 0, "ngramRange": (0,0), "reduction": ["none", 0]}
    bestItems = [bestSilMean, bestSilMed, bestDav, bestCal, bestJack]
    bestParameters = [silMeanPar, silMedPar, davPar, calPar, jackPar]
    for i in maxDfRange:
        if(printProgress):
            print("MaxDfRange:", i)
        maxFreq = i
        for j in minDfRange:
            if(printProgress):
                print("MinDfRange:",j)
            minFreq = j
            for k in range(len(ngramRangeList)):
                if(printProgress):
                    print("Ngram Range", k)
                ngramRange = ngramRangeList[k]
                
                tfidfvectorizer = TfidfVectorizer(stop_words= 'english', lowercase = True, max_df = maxFreq, min_df = minFreq, ngram_range=ngramRange)
                tfidf_wm = tfidfvectorizer.fit_transform(X)
                tfidf_tokens = tfidfvectorizer.get_feature_names_out()
                df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),index = clusters_l,columns = tfidf_tokens)
                scores = allScores(df_tfidfvect, clusters_l, labeled_df, metric = metric)
                checkItems(scores, bestItems, bestParameters, "none", 0, numLabels, maxFreq, minFreq, ngramRange, withJack)
                #print(bestItems)
                for y in range(len(n_comp)):
                    if(printProgress):
                        print("SVD")
                    svd = TruncatedSVD(n_components = n_comp[y])
                    svdVect = svd.fit_transform(df_tfidfvect)
                    scores = allScores(svdVect, clusters_l, labeled_df)
                    checkItems(scores, bestItems, bestParameters, "SVD", n_comp[y], numLabels, maxFreq, minFreq, ngramRange, withJack)
                    
                    if(printProgress):
                        print("PCA")
                    pca = PCA(n_components = n_comp[y])
                    pcaVect = pca.fit_transform(df_tfidfvect)
                    scores = allScores(pcaVect, clusters_l, labeled_df)
                    checkItems(scores, bestItems, bestParameters, "PCA", n_comp[y], numLabels, maxFreq, minFreq, ngramRange, withJack)
                    
                    if(printProgress):
                        print("PCALower")
                    pcaIncLower = IncrementalPCA(n_components = n_comp[y])
                    pcaIncVect = pcaIncLower.fit_transform(df_tfidfvect)
                    scores = allScores(pcaIncVect, clusters_l, labeled_df)
                    checkItems(scores, bestItems, bestParameters, "PCAinc", n_comp[y], numLabels, maxFreq, minFreq, ngramRange, withJack)
                    if(printProgress):
                        print("PCA Kernel")
                    pcaKernelReduce = KernelPCA(n_components = n_comp[y])
                    pcaKernel = pcaKernelReduce.fit_transform(df_tfidfvect)
                    scores = allScores(pcaKernel, clusters_l, labeled_df)
                    checkItems(scores, bestItems, bestParameters, "PCAKernel", n_comp[y], numLabels, maxFreq, minFreq, ngramRange, withJack)
    
    if(withJack):
        results = [bestParameters, bestItems]
    else:
        results = [bestParameters[:-1], bestItems[:-1]]

    return results

def gridSearchReduction(text_l, labels, testFileSize = [], testFullLength = False, printProgress = False):
    results = []
    d = {"labels": labels, "text": text_l}
    textInput = pd.DataFrame(data = d)
    for word in testFileSize:
        textInputCut = get_corpus(textInput, False, word)
        d2 = {"labels": labels, "text": textInputCut}
        cleanTextDFsil = pd.DataFrame(data = d2)
        results.append(bestScoresTFIDF(textInputCut, cleanTextDFsil['labels'], cleanTextDFsil, printProgress = printProgress))
    d = {"labels": labels, "text": text_l}
    textInput = pd.DataFrame(data = d)
    results.append(bestScoresTFIDF(text_l,textInput["labels"], textInput))
    return results

def printResults(optimalParameters, docLengthTest):
    parameterTypes = ["Silhouette Mean", "Silhouette Median", "Davies-Bouldin", "Calinski-Harabasz"]
    print()
    for i in range(len(docLengthTest)):
        print("Words: ", docLengthTest[i])
        for x in range(len(optimalParameters[i][0])):
            print(parameterTypes[x])
            for key, value in optimalParameters[i][0][x].items():
                print(key, ": ", value)

            print("Score:", optimalParameters[i][1][x])
            print()
        


In [47]:
import sys
sys.path.insert(0, r'C:\Users\Matthew Arrieta\Desktop\Project3Testing\TextInput')
sys.path.insert(0, r'C:\Users\Matthew Arrieta\Desktop\Project3Testing\TestFiles')
sys.path.insert(0, r'C:\Users\Matthew Arrieta\Desktop\Project3Testing\Vectorizations')
sys.path.insert(0, r'C:\Users\Matthew Arrieta\Desktop\Project3Testing\Clustering')
sys.path.insert(0, r'C:\Users\Matthew Arrieta\Desktop\Project3Testing\KeywordExtraction')
from CleanText import getCleanText, getText

text_l, labels = getCleanText(r"C:\Users\Matthew Arrieta\Desktop\Project3Testing\TestFiles\KeywordFilesTight", RemoveNums = True, lem=True, extendStopWords= False)
d = {"labels": labels, "text": text_l}
textInput = pd.DataFrame(data = d)

In [48]:
ngramRangeList = [(1,2),(1,4)]
maxDfRange = np.arange(0.7, 0.9, 0.05)
minDfRange = [0.005, 0.01, 0.02, 0.05, 0.1]
numComponentRange = [10, 20]
docLengthTests = [50, 75]

optimalParameters = gridSearchReduction(text_l, labels, docLengthTests, False, printProgress= False)

printResults(optimalParameters, docLengthTests)


Words:  50
Silhouette Mean
maxDfRange :  0.8
minDfRange :  0.005
ngramRange :  (1, 2)
reduction :  ['PCAinc', 10]
Score: 0.6456227720715029

Silhouette Median
maxDfRange :  0.8
minDfRange :  0.02
ngramRange :  (1, 2)
reduction :  ['PCA', 10]
Score: 0.7672778976873513

Davies-Bouldin
maxDfRange :  0.8
minDfRange :  0.01
ngramRange :  (1, 2)
reduction :  ['PCAinc', 10]
Score: 0.7410108376236718

Calinski-Harabasz
maxDfRange :  0.8
minDfRange :  0.02
ngramRange :  (1, 2)
reduction :  ['PCAinc', 10]
Score: 95.8222731419529

Words:  75
Silhouette Mean
maxDfRange :  0.8
minDfRange :  0.005
ngramRange :  (1, 2)
reduction :  ['PCAinc', 10]
Score: 0.5748289364488502

Silhouette Median
maxDfRange :  0.8
minDfRange :  0.05
ngramRange :  (1, 2)
reduction :  ['PCAinc', 10]
Score: 0.7174195159573156

Davies-Bouldin
maxDfRange :  0.8
minDfRange :  0.02
ngramRange :  (1, 2)
reduction :  ['PCAinc', 10]
Score: 0.915907154269799

Calinski-Harabasz
maxDfRange :  0.8
minDfRange :  0.01
ngramRange :  (1, 2