In [1]:
#Import Statements
import os, itertools
import warnings
from matplotlib.pyplot import text
from numpy import vectorize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sentence_transformers import SentenceTransformer, util
import gensim.downloader as api
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import re
import numpy as np
import warnings
import statistics
from nltk.corpus import words
warnings.simplefilter(action='ignore', category=FutureWarning)
from nltk.stem import *
from nltk.stem.porter import *
import math
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from JaccardIndexUpdated import JaccardIndex, ClustersData, summary

In [3]:
#Helper Functions
def drop_duplicates(labeled_df):
    '''Return a copy of labeled_df keeping only the first row for each distinct 'text' value.

    The input frame is not modified; the surviving rows keep their original index labels.
    '''
    return labeled_df.copy().drop_duplicates(subset = ['text'])

def crop_documents(labeled_df, document_cutoff_length):
    '''Return a copy of labeled_df with an added 'cropped_documents' column.

    Each 'text' value is split on whitespace, cut to its first
    document_cutoff_length tokens and re-joined with single spaces.
    The input frame is left untouched.
    '''
    return (
        labeled_df.copy()
        # Temporary column holding the truncated token list of every document.
        .assign(tokens_cropped = lambda frame: frame['text'].apply(lambda doc: doc.split()[:document_cutoff_length]))
        .assign(cropped_documents = lambda frame: frame['tokens_cropped'].apply(' '.join))
        .drop(columns = ['tokens_cropped'])
    )

def get_corpus(labeled_df, on_entire_doc, document_cut_off):
    '''Return the documents of labeled_df as a list of strings.

    on_entire_doc == False --> each document is first cropped to document_cut_off words
    otherwise               --> the 'text' column is returned unchanged
    '''
    if on_entire_doc == False:
        source_column = crop_documents(labeled_df, document_cut_off)['cropped_documents']
    else:
        source_column = labeled_df['text']
    return source_column.to_list()

def vectorize_count_vectorizer(labeled_df, train_on_entire_doc, vectorize_entire_doc, document_cut_off, max_df, min_df, ngram_range, binary= False):
    '''Fit a CountVectorizer on the de-duplicated documents, then vectorize every row of labeled_df.

    train_on_entire_doc = True --> determine vocabulary and stop words using entire document, else use document_cut_off
    vectorize_entire_doc = True --> vectorize entire doc, otherwise only vectorize cropped document
    document_cut_off = N --> used to crop documents to N words
    max_df: any words appearing with frequency > max_df will be excluded
    min_df: any words appearing with frequency < min_df will be excluded
    ngram_range: use ngrams to create vocabulary and vectorize
    binary = True, vectorizes documents using 1 or 0, vs word count

    Returns a DataFrame with one row per row of labeled_df (same index) and one
    column per vocabulary term.
    '''
    # The vocabulary is learned without duplicate documents, but every row
    # (duplicates included) is vectorized.
    no_dups_df = drop_duplicates(labeled_df)
    training_corpus = get_corpus(no_dups_df, train_on_entire_doc, document_cut_off)
    vectorizing_corpus = get_corpus(labeled_df, vectorize_entire_doc, document_cut_off)
    vectorizer = CountVectorizer(ngram_range = ngram_range, min_df= min_df, max_df = max_df, binary = binary)
    vectorizer.fit(training_corpus)
    X = vectorizer.transform(vectorizing_corpus)
    # get_feature_names() is gone from newer scikit-learn releases; prefer the
    # replacement the rest of this notebook already uses and fall back for old installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    df = pd.DataFrame(X.toarray(), index = labeled_df.index, columns = feature_names)
    return df

def vectorize_with_tfidf(labeled_df, train_on_entire_doc, vectorize_entire_doc, document_cut_off, max_df, min_df, ngram_range, binary= False, use_idf = True):
    '''Fit a TfidfVectorizer on the de-duplicated documents, then vectorize every row of labeled_df.

    train_on_entire_doc = True --> determine vocabulary and stop words using entire document, else use document_cut_off
    vectorize_entire_doc = True --> vectorize entire doc, otherwise only vectorize cropped document
    document_cut_off = N --> used to crop documents to N words
    max_df: any words appearing with frequency > max_df will be excluded
    min_df: any words appearing with frequency < min_df will be excluded
    ngram_range: use ngrams to create vocabulary and vectorize
    binary = True, vectorizes documents using 1 or 0, vs word count
    use_idf = False sets idf(t) = 1 for all tokens t.

    Returns a DataFrame with one row per row of labeled_df (same index) and one
    column per vocabulary term.
    '''
    # The vocabulary / idf weights are learned without duplicate documents, but
    # every row (duplicates included) is vectorized.
    no_dups_df = drop_duplicates(labeled_df)
    training_corpus = get_corpus(no_dups_df, train_on_entire_doc, document_cut_off)
    vectorizing_corpus = get_corpus(labeled_df, vectorize_entire_doc, document_cut_off)
    vectorizer = TfidfVectorizer(ngram_range = ngram_range, min_df= min_df, max_df = max_df, binary = binary, use_idf= use_idf)
    vectorizer.fit(training_corpus)
    X = vectorizer.transform(vectorizing_corpus)
    # get_feature_names() is gone from newer scikit-learn releases; prefer the
    # replacement the rest of this notebook already uses and fall back for old installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # The row index must be labeled_df.index (as in vectorize_count_vectorizer);
    # passing the DataFrame itself as the index was a bug.
    df = pd.DataFrame(X.toarray(), index = labeled_df.index, columns = feature_names)
    return df

def allScores(X, clusters_l, labeled_df, metric = 'cosine', withJack = False):
    '''Build a table of per-label silhouette statistics plus overall clustering scores.

    X: feature matrix, one row per document
    clusters_l: cluster label of every row of X
    labeled_df: frame with a 'labels' column, row-aligned with X
    metric: distance metric handed to silhouette_samples
    withJack = True additionally appends a 'Jaccard Index' row

    Returns a DataFrame indexed by label with the columns 'sil score' (mean
    silhouette of the label) and 'median_sil', followed by the summary rows
    'MEAN SIL SCORE', 'MEDIAN SIL SCORE', 'David Bouldin Score',
    'Calinski Harabasz' and, optionally, 'Jaccard Index'.
    '''
    sample_silhouette_values = silhouette_samples(X, clusters_l, metric= metric)
    labeled_df_copy = labeled_df.copy()
    labeled_df_copy['sil score'] = sample_silhouette_values
    by_label = labeled_df_copy.groupby(['labels'])
    # numeric_only keeps the aggregation working when labeled_df carries text
    # columns (newer pandas raises instead of silently dropping them).
    grouped = by_label.mean(numeric_only = True)
    grouped['median_sil'] = by_label.median(numeric_only = True)['sil score']
    # Summarise from a snapshot of the per-label rows, so the median row is not
    # polluted by the mean row that is appended just before it.
    per_label = grouped.copy()
    grouped.loc['MEAN SIL SCORE'] = per_label.mean()
    grouped.loc['MEDIAN SIL SCORE'] = per_label.median()
    # Scalar scores are broadcast to every column of their row.
    grouped.loc['David Bouldin Score'] = davies_bouldin_score(X, clusters_l)
    grouped.loc['Calinski Harabasz'] = calinski_harabasz_score(X, clusters_l)
    if(withJack):
        jack = JaccardIndex(X, clusters_l)
        # avgJack returns a 2-tuple, which fills the two columns of the row.
        grouped.loc['Jaccard Index'] = avgJack(jack)
    return grouped

def bestScoresTFIDF(X, clusters_l, labeled_df, metric = 'cosine', withJack = False):
    '''Grid-search TF-IDF settings and dimensionality reductions, keeping the best configuration per score.

    X: raw documents to vectorize
    clusters_l: cluster label of every document
    labeled_df: frame with a 'labels' column, row-aligned with X (forwarded to allScores)
    metric: distance metric forwarded to allScores
    withJack = True also tracks the 'Jaccard Index' row of allScores

    For every (max_df, min_df, ngram range) combination the documents are
    vectorized and scored as-is ("none") and after TruncatedSVD ("SVD"), PCA
    ("PCA"), IncrementalPCA ("PCAinc") and KernelPCA ("PCAKernel") for each
    component count.

    Returns [params, bests]: two parallel lists ordered as mean silhouette,
    median silhouette, Davies-Bouldin, Calinski-Harabasz and (withJack only)
    Jaccard. Each params entry is a dict with the keys "maxDfRange",
    "minDfRange", "ngramRange" and "reduction" ([name, n_components]).

    Davies-Bouldin is minimised (lower is better); every other score is
    maximised, starting from 0 as in the original implementation.
    '''
    #maxDfRange = np.arange(0.6, 1.0, 0.1)
    #minDfRange = np.arange(0, .20, 0.1)
    #ngramRangeList = [(1,1), (1,2), (1,3), (1,4), (2,2), (2,3), (2,4), (3,3), (3,3), (3,4)]
    maxDfRange = np.arange(0.7, 0.9, 0.05)
    minDfRange = np.arange(0.025,0.125, 0.025)
    ngramRangeList = [(1,1), (1,2), (1,3), (1,4)]
    n_comp = [10,20,30,40,50,60,70,80,90,100]

    # (row label written by allScores, True when a larger value is better).
    # Rows are looked up by label so the search no longer assumes 41 clusters.
    criteria = [
        ('MEAN SIL SCORE', True),
        ('MEDIAN SIL SCORE', True),
        ('David Bouldin Score', False),
        ('Calinski Harabasz', True),
    ]
    if(withJack):
        criteria.append(('Jaccard Index', True))

    bestScores = [0 if higherIsBetter else math.inf for _, higherIsBetter in criteria]
    bestParams = [{"maxDfRange": 0, "minDfRange": 0, "ngramRange": (0,0), "reduction": ["none", 0]} for _ in criteria]

    def recordIfBetter(features, maxFreq, minFreq, ngramRange, reduction, nComponents):
        # Score one candidate representation and update every criterion it improves.
        scores = allScores(features, clusters_l, labeled_df, metric = metric, withJack = withJack)
        for slot, (rowLabel, higherIsBetter) in enumerate(criteria):
            value = scores.loc[rowLabel, "sil score"]
            if higherIsBetter:
                improved = bestScores[slot] < value
            else:
                improved = bestScores[slot] > value
            if improved:
                bestScores[slot] = value
                # A fresh dict per winner, so "none" never keeps a stale component count.
                bestParams[slot] = {
                    "maxDfRange": maxFreq,
                    "minDfRange": minFreq,
                    "ngramRange": ngramRange,
                    "reduction": [reduction, nComponents],
                }

    for maxFreq in maxDfRange:
        print("MaxDfRange:", maxFreq)
        for minFreq in minDfRange:
            print("MinDfRange:", minFreq)
            for k, ngramRange in enumerate(ngramRangeList):
                print("Ngram Range", k)

                tfidfvectorizer = TfidfVectorizer(stop_words= 'english', lowercase = True, max_df = maxFreq, min_df = minFreq, ngram_range=ngramRange)
                tfidf_wm = tfidfvectorizer.fit_transform(X)
                tfidf_tokens = tfidfvectorizer.get_feature_names_out()
                df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),index = clusters_l,columns = tfidf_tokens)
                recordIfBetter(df_tfidfvect, maxFreq, minFreq, ngramRange, "none", 0)

                for nComponents in n_comp:
                    # Same evaluation order as before: SVD, PCA, incremental PCA, kernel PCA.
                    reducers = [
                        ("SVD", TruncatedSVD(n_components = nComponents)),
                        ("PCA", PCA(n_components = nComponents)),
                        ("PCAinc", IncrementalPCA(n_components = nComponents)),
                        ("PCAKernel", KernelPCA(n_components = nComponents)),
                    ]
                    for reduction, reducer in reducers:
                        reduced = reducer.fit_transform(df_tfidfvect)
                        recordIfBetter(reduced, maxFreq, minFreq, ngramRange, reduction, nComponents)

    results = [bestParams, bestScores]
    return results

def avgJack(jack):
    '''Return (mean of 'Avg Score', mean of 'Variance') over the rows of summary(jack.score_df).'''
    jackSum = summary(jack.score_df)
    rowCount = len(jackSum)
    avgTotal = sum(jackSum.iloc[row]['Avg Score'] for row in range(rowCount))
    varTotal = sum(jackSum.iloc[row]['Variance'] for row in range(rowCount))
    return (avgTotal / rowCount, varTotal / rowCount)

def remove_page_nos(text):
    '''Return text with every "Page " marker and the digits directly after it removed.'''
    page_marker = re.compile(r'Page \d*')
    return page_marker.sub('', text)
    
def remove_metadata(text):
    '''Return the segment that follows the first 'Page 1\\n' marker, up to the next such marker.

    Text without the marker is returned unchanged.
    '''
    pieces = text.split('Page 1\n')
    return text if len(pieces) <= 1 else pieces[1]

def truncate(number, digits) -> float:
    '''Truncate (not round) number to at most `digits` decimal places.

    Values whose plain decimal repr already has `digits` or fewer decimals are
    returned untouched, which avoids floating point artefacts such as
    truncate(16.4, 2) = 16.39 or truncate(-1.13, 2) = -1.12.
    Non-finite values (nan, inf) are returned as-is.
    '''
    if not math.isfinite(number):
        return number
    # Only trust the repr when it is plain decimal notation: ints ("5") and
    # scientific notation ("1e-05", "1.5e-07") have no countable digit-only
    # fraction and used to crash or be mis-measured here.
    _, _, fraction = str(number).partition('.')
    if fraction.isdigit() and len(fraction) <= digits:
        return number
    stepper = 10.0 ** digits
    return math.trunc(stepper * number) / stepper

def printSamples(df, labels):
    '''Print the truncated mean and median cosine silhouette of every label, then two overall summary rows.

    df: feature matrix, one row per sample
    labels: label of every row of df (indexed by position)
    '''
    silhouettes = silhouette_samples(df, labels, metric = "cosine")
    # One bucket of silhouette values per distinct label.
    per_label = {label: [] for label in list(set(labels))}
    for position in range(len(labels)):
        per_label[labels[position]].append(silhouettes[position])

    pooled = []
    label_means = []
    label_medians = []
    row_format = "{:<30} {:<15} {:<15}"
    for label, values in per_label.items():
        pooled.extend(values)
        label_mean = truncate(statistics.mean(values), 4)
        label_median = truncate(statistics.median(values),4)
        label_means.append(label_mean)
        label_medians.append(label_median)
        print(row_format.format(label, label_mean, label_median))

    # The summary rows use a wider first column than the per-label rows.
    wide_format = "{:<40} {:<15} {:<15}"
    print(wide_format.format("Score Types:", "Silhouettes Mean", "Sihouettes Medina"))
    print(wide_format.format("Mean & Median", truncate(statistics.mean(pooled), 4), truncate(statistics.median(pooled), 4)))
    print(wide_format.format("Mean of mean & Median of Median", truncate(statistics.mean(label_means),4), truncate(statistics.median(label_medians), 4)))

def printMeanMedian(df, labels):
    '''Print two summary rows of cosine silhouette values.

    First row: mean and median over all samples.
    Second row: mean of the per-label means and median of the per-label medians.
    '''
    silhouettes = silhouette_samples(df, labels, metric = "cosine")
    # One bucket of silhouette values per distinct label.
    per_label = {label: [] for label in list(set(labels))}
    for position in range(len(labels)):
        per_label[labels[position]].append(silhouettes[position])

    pooled = []
    label_means = []
    label_medians = []
    for values in per_label.values():
        pooled.extend(values)
        label_means.append(statistics.mean(values))
        label_medians.append(statistics.median(values))

    row_format = "{:<30} {:<15} {:<15}"
    print(row_format.format("Mean & Median", statistics.mean(pooled), statistics.median(pooled)))
    print(row_format.format("Mean of mean & Median of Median", statistics.mean(label_means), statistics.median(label_medians)))


In [5]:
#Import Data
# Collect the path of every file below the corpus directory.
file_path_l = []

for root, dirs, files in os.walk('TextClustersForVectorizationAlexis'):
    for filename in files:
        file_path_l.append(os.path.join(root, filename))

# Read every document; the context manager closes the handle even if read() fails.
# NOTE(review): the "mbcs" codec only exists on Windows — confirm the files' real
# encoding before running this cell on another platform.
text_l = []
for file_path in file_path_l:
    with open(file_path,encoding="mbcs") as filey:
        text_l.append(filey.read())

# The label of a document is the second component of its path. Splitting on
# os.sep (instead of a hard-coded backslash) matches what os.path.join produced
# above on every platform.
types_l = []
for namey in file_path_l:
    splitted = namey.split(os.sep)
    types_l.append(splitted[1])
'''
stemmer = PorterStemmer()
plurals = ["dies", "buses", "bus", "ran", "running"]
stemText = text_l.copy()
print(text_l[0].sp)
for i in range(len(text_l)):
    words = text_l[i].split();
    for j in range(len(words)):
        stemText[i] = stemmer.stem(text_l[i][j])
stemText[i]
'''

'\nstemmer = PorterStemmer()\nplurals = ["dies", "buses", "bus", "ran", "running"]\nstemText = text_l.copy()\nprint(text_l[0].sp)\nfor i in range(len(text_l)):\n    words = text_l[i].split();\n    for j in range(len(words)):\n        stemText[i] = stemmer.stem(text_l[i][j])\nstemText[i]\n'

In [None]:
#Compare Cropped Data to Normal Data
# Load the labelled sample and discard the index column written by the CSV export.
training_df = pd.read_csv('InternSampleDataSmall.csv').drop(columns = ['Unnamed: 0'])
labels = training_df['labels']
numWords = 30

# TF-IDF built from, and applied to, only the first numWords words of each document.
tfidf_vectorized_df = vectorize_with_tfidf(
    training_df,
    train_on_entire_doc = False,
    vectorize_entire_doc = False,
    document_cut_off = numWords,
    max_df = .8,
    min_df = .005,
    ngram_range = (1,2),
    binary = False,
    use_idf = True,
)
print(training_df)

svd = TruncatedSVD(n_components=20)
SVDtfidfvect = svd.fit_transform(tfidf_vectorized_df)

# Cropped corpora: de-duplicated for training, complete for vectorizing.
no_dups_df = drop_duplicates(training_df)
training_corpus = get_corpus(no_dups_df, False, numWords)
vectorizing_corpus = get_corpus(training_df, False, numWords)


In [9]:
# Clean every raw document: page markers are stripped first, then the metadata header.
cleanText = [remove_metadata(remove_page_nos(raw_document)) for raw_document in text_l]

# One row per document: its cluster label and its cleaned text.
d = {"labels": types_l, "text": cleanText}
textInput = pd.DataFrame(data = d)
"""textInputCut = get_corpus(textInput, False, numWords)
print(textInputCut)
cleanTextDF = pd.DataFrame(data = textInputCut, index = types_l)"""



'textInputCut = get_corpus(textInput, False, numWords)\nprint(textInputCut)\ncleanTextDF = pd.DataFrame(data = textInputCut, index = types_l)'

In [11]:
#My countVectorize and TFIDFVectorize
# Vectorizer settings: document-frequency bounds and unigrams only.
maxFreq, minFreq, ngramRange = .9, .05, (1,1)

tfidfvectorizer = TfidfVectorizer(
    stop_words = 'english',
    lowercase = True,
    max_df = maxFreq,
    min_df = minFreq,
    ngram_range = ngramRange,
)
tfidf_wm = tfidfvectorizer.fit_transform(cleanText)
tfidf_tokens = tfidfvectorizer.get_feature_names_out()

# One row per document (indexed by its cluster label), one column per vocabulary term.
df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(), index = types_l, columns = tfidf_tokens)
print(df_tfidfvect)

score = silhouette_score(df_tfidfvect, types_l, metric = "cosine")
print(score)

# Disabled experiments, kept for reference (same vectorizer settings as above):
#   countvectorizer / df_countvect : CountVectorizer fitted on cleanText
#   tfidfAlexis / df_tfidfAlexis   : fitted on training_corpus, applied to vectorizing_corpus, indexed by labels
#   tfidfCut / df_tfidfCut         : fitted on cleanText, indexed by types_l
#   jack = JaccardIndex(df_tfidfvect, types_l); print(jack.display())
#   scores = allScores(df_tfidfvect, types_l, textInput)
# NOTE(review): df_tfidfAlexis and df_tfidfCut are read by later cells but are only
# ever built by these disabled lines.
"""totalJackAvg = 0
totalJackVar = 0
jackSum = summary(jack.score_df)

for i in range(len(jackSum)):
    totalJackAvg = totalJackAvg + jackSum.iloc[i]['Avg Score']
    totalJackVar = totalJackVar + jackSum.iloc[i]['Variance']
print(totalJackAvg/len(jackSum))
print(totalJackVar/len(jackSum))
print(jackSum)"""

                               00       000  0001  000â  001  002  003  004  \
Alcohol Control -- good  0.000000  0.000000   0.0   0.0  0.0  0.0  0.0  0.0   
Alcohol Control -- good  0.000000  0.000000   0.0   0.0  0.0  0.0  0.0  0.0   
Alcohol Control -- good  0.000000  0.040769   0.0   0.0  0.0  0.0  0.0  0.0   
Alcohol Control -- good  0.000000  0.000000   0.0   0.0  0.0  0.0  0.0  0.0   
Alcohol Control -- good  0.022229  0.000000   0.0   0.0  0.0  0.0  0.0  0.0   
...                           ...       ...   ...   ...  ...  ...  ...  ...   
StreetPermits-- good     0.000000  0.000000   0.0   0.0  0.0  0.0  0.0  0.0   
StreetPermits-- good     0.000000  0.000000   0.0   0.0  0.0  0.0  0.0  0.0   
StreetPermits-- good     0.000000  0.000000   0.0   0.0  0.0  0.0  0.0  0.0   
StreetPermits-- good     0.000000  0.000000   0.0   0.0  0.0  0.0  0.0  0.0   
StreetPermits-- good     0.000000  0.000000   0.0   0.0  0.0  0.0  0.0  0.0   

                         005  006  ...  years  year

"totalJackAvg = 0\ntotalJackVar = 0\njackSum = summary(jack.score_df)\n\nfor i in range(len(jackSum)):\n    totalJackAvg = totalJackAvg + jackSum.iloc[i]['Avg Score']\n    totalJackVar = totalJackVar + jackSum.iloc[i]['Variance']\nprint(totalJackAvg/len(jackSum))\nprint(totalJackVar/len(jackSum))\nprint(jackSum)"

In [None]:
# Run the TF-IDF grid search on documents cropped to several word counts.
wordCountSmall = [30,70,100]
wordResultsSmall = []

for word in wordCountSmall:
    print("Words:", word)
    # Keep only the first `word` words of every cleaned document.
    textInputCut = get_corpus(textInput, False, word)
    d2 = {"labels": types_l, "text": textInputCut}
    cleanTextDFsil = pd.DataFrame(data = d2)
    bestForWordCount = bestScoresTFIDF(
        textInputCut,
        cleanTextDFsil['labels'],
        cleanTextDFsil,
    )
    wordResultsSmall.append(bestForWordCount)

In [None]:
# Grid search over the full cleaned documents; results = [best parameters, best scores].
results = bestScoresTFIDF(
    X = cleanText,
    clusters_l = types_l,
    labeled_df = textInput,
)


In [None]:
# results[0]: one parameter dict per score; results[1]: the matching best values.
bestParameterSets, bestValues = results[0], results[1]

for parameterSet in bestParameterSets:
    print("")
    for name, setting in parameterSet.items():
        print(name, ": ", setting)

for bestValue in bestValues:
    print(bestValue)


In [None]:
def changeDic(dic):
    '''Set the "one" entry of dic to 2, mutating the mapping that was passed in.'''
    dic.update({"one": 2})

# Demonstrates that a dict argument is shared with the caller, not copied.
dic = dict(one = 1, two = 2)
changeDic(dic)
for entry in dic.items():
    print(entry[0], ": ", entry[1])

In [None]:
#Silhouette Scores
# Cosine silhouette of each document representation against its labels.
# `score` is overwritten by every call below and never printed in this cell.
# NOTE(review): this cell cannot run top-to-bottom as written —
#   * silhouette_scores is not defined or imported anywhere in this notebook
#     (allScores takes the same four arguments — confirm whether it was meant);
#   * df_tfidfAlexis and df_tfidfCut are only built by commented-out code;
#   * bert_df and df_combined are first assigned in a later cell.
score = silhouette_score(df_tfidfvect, types_l, metric = "cosine")
#print("TFIDF :", score)

# NOTE(review): `samples` is not used again in this cell; printSamples computes its own.
samples = silhouette_samples(df_tfidfvect, types_l, metric = "cosine")
printSamples(df_tfidfvect, types_l)

#print(silhouette_scores(df_tfidfAlexis, labels, training_df, 'cosine'))
score = silhouette_score(df_tfidfAlexis, labels, metric = "cosine")
#print("TFIDF Alexis:", score)
print(silhouette_scores(df_tfidfAlexis, labels,training_df, 'cosine'))
score = silhouette_score(df_tfidfCut, types_l, metric = "cosine")
#print("TFIDF Cut:", score)

score = silhouette_score(bert_df, types_l, metric = "cosine")
#print("BERT :", score)


score = silhouette_score(df_combined, types_l, metric = "cosine")
#print("Combined :", score)


In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

# Evaluate Dimension Reduction
# The same TruncatedSVD object is re-fitted on each representation in turn.
# NOTE(review): df_tfidfAlexis and df_tfidfCut are only built by commented-out
# code in an earlier cell — confirm they exist before running this cell.
svd = TruncatedSVD(n_components=30)
SVDtfidfvect = svd.fit_transform(df_tfidfvect)
SVDtfidfAlexis = svd.fit_transform(df_tfidfAlexis)
SVDtfidfCut = svd.fit_transform(df_tfidfCut)

"""print("TFIDF SVD:", silhouette_score(SVDtfidfvect, types_l, metric = "cosine"))
print("TFIDF")
printMeanMedian(SVDtfidfvect, types_l)
print("TFIDF Alexis")
printMeanMedian(SVDtfidfAlexis, labels)
print("TFIDF Cut OFf")
printMeanMedian(SVDtfidfCut,types_l)
print()"""
#print("TFIDF Alexis SVD:", silhouette_score(SVDtfidfAlexis, labels))
#print("TFIDF Cut SVD:", silhouette_score(SVDtfidfCut, types_l))

pca = PCA(n_components= 20)
PCAtfidfVect = pca.fit_transform(df_tfidfvect)
PCAtfidfAlexis = pca.fit_transform(df_tfidfAlexis)
PCAtfidfCut = pca.fit_transform(df_tfidfCut)
# Score the PCA projection here; this line used to re-score the SVD projection
# under the "TFIDF PCA:" heading.
print("TFIDF PCA:", silhouette_score(PCAtfidfVect, types_l, metric = "cosine"))
print("TFIDF")
printMeanMedian(PCAtfidfVect, types_l)
print("TFIDF Alexis")
printMeanMedian(PCAtfidfAlexis, labels)
print("TFIDF Cut OFf")
printMeanMedian(PCAtfidfCut,types_l)
print(PCAtfidfVect.shape)

In [None]:
#Encode BERT
# Alternative model that was tried: 'sentence-transformers/paraphrase-MiniLM-L6-v2'
model = SentenceTransformer('msmarco-distilbert-cos-v5')

# NOTE(review): the text8 corpus is loaded and materialised twice, but neither
# list is used by the encoding below — confirm it is still needed.
dataset = api.load('text8')
datasetList = [entry for entry in dataset]
data = list(dataset)

# Embed the raw (uncleaned) documents, one vector per document.
doc_emb = model.encode(text_l)

In [None]:
#DF shapes
print("TFIDF Dataframe shape:", df_tfidfvect.shape)
# One row per document, indexed by cluster label like df_tfidfvect.
bert_df = pd.DataFrame(doc_emb, index = types_l)
#print(bert_df)
print("BERT Datframe shape:", bert_df.shape)
#print(df_tfidfvect)
# Side-by-side concatenation of the TF-IDF columns and the BERT embedding columns.
df_combined = pd.concat([df_tfidfvect, bert_df], axis = 1, join = 'outer')
print("Combined Shape:",df_combined.shape)
printSamples(df_combined, types_l)
print()
# `svd` is the TruncatedSVD object created in the dimension-reduction cell.
SVDBert = svd.fit_transform(bert_df)
# fit_transform returns a bare ndarray, which pd.concat rejects; wrap it in a
# DataFrame that shares bert_df's index so the rows line up with df_tfidfvect.
SVDBert_df = pd.DataFrame(SVDBert, index = bert_df.index)
SVDCombined = pd.concat([df_tfidfvect, SVDBert_df], axis = 1, join = 'outer')
#printSamples(SVDCombined, types_l)

#I have been going back and forth between the different dimension reductions and combining them with BERT.
#BERT has not been working as well as I hoped, but I am going to see whether removing non-English characters improves the predictions.
#I was also going to try implementing stemming today to see if that helps the TFIDF models.
