# Clustering

## Introduction

In [197]:
# List : Articles_Google_In (2017-2022)

In [200]:
# Convert excel table to list of strings
import pandas
pandas.__version__
import numpy as np
import pandas as pd

In [201]:
# Convert a column to list of strings
def getExcelString(path, column1):
    """Return one column of an Excel workbook as a plain Python list.

    path    : location of the workbook, handed unchanged to ``pd.read_excel``
    column1 : label of the column to extract
    returns : the values of that column, as produced by ``Series.tolist()``
    """
    return pd.read_excel(path)[column1].tolist()

In [202]:
def getCSVString(path, column1):
    """Return one column of a CSV file as a plain Python list.

    path    : location of the CSV file, handed unchanged to ``pd.read_csv``
    column1 : label of the column to extract
    returns : the values of that column, as produced by ``Series.tolist()``
    """
    return pd.read_csv(path)[column1].tolist()

In [203]:
# Duplicates keywords based on # of records.
# NO PROCESSING
def duplicateKeyword(path, column1, column2):
    """Repeat every keyword of an Excel sheet as many times as its record count.

    path    : location of the Excel workbook
    column1 : label of the column holding the keywords
    column2 : label of the column holding the number of occurrences of each
              keyword (used as a repetition count)
    returns : a one-column DataFrame in which each keyword appears once per
              counted occurrence, in sheet order. No text processing is done.
    """
    # Both columns are fetched through getExcelString, one call per column
    words = getExcelString(path, column1)
    occurrences = getExcelString(path, column2)

    # Expand each keyword into `count` consecutive copies
    repeated = []
    for word, count in zip(words, occurrences):
        repeated.extend([word] * count)

    # Hand the expanded list back as a DataFrame
    return pd.DataFrame(repeated)

#### Import modules and install libraries

In [204]:
# This is already installed
# pip install fuzzywuzzy

In [205]:
# This is already installed
# pip install python-Levenshtein

In [206]:
# Import modules
import pandas as pd
import numpy as np

import fuzzywuzzy
from fuzzywuzzy import process
import chardet

# Seed NumPy's global random number generator with a fixed value so that
# anything drawing from it produces the same results on every run.
np.random.seed(0)

#### Preliminary text pre-processing

In [207]:
# Generalized version of the split function:
# splits each string of the input on any given separator string, for example ","

def splitCar(array, string):
    """Split every element of ``array`` on the literal separator ``string``.

    array   : iterable of strings (list or NumPy array)
    string  : separator, always taken literally (never as a regular
              expression), so characters such as '+', '*' or '(' are safe
    returns : a flat NumPy array holding all the pieces, in input order;
              elements that do not contain the separator are kept unchanged
    raises  : ValueError if ``string`` is empty (raised by ``str.split``)
    """
    pieces = []

    for item in array:
        # str.split already returns [item] when the separator is absent, so
        # no separate "does it contain the separator" check is needed
        pieces.extend(item.split(string))

    return np.array(pieces)

In [208]:
# Removes every occurrence of a given substring (for example an asterisk) from each string
def cutCar(array, string):
    """Delete every occurrence of ``string`` from each element of ``array``.

    array   : iterable of strings
    string  : substring to remove (replaced by the empty string)
    returns : a NumPy array with the cleaned strings, in input order
    """
    cleaned = [item.replace(string, "") for item in array]
    return np.array(cleaned)

In [209]:
# Pre-processes the keywords read from the Excel file AND removes DUPLICATES
# IMP : input is a pandas Series (one DataFrame column); output is a NumPy array

def preprocessing(array):
    """Normalize a column of keywords and return the distinct ones, sorted.

    array   : pandas Series of keywords (its ``.str`` accessor is used)
    returns : NumPy array of unique, lower-cased, stripped keywords in
              ascending order

    Steps: cast to str, lower-case, de-duplicate, split on '/', '&' and ','
    (in that order), drop '*' characters, strip surrounding white space,
    de-duplicate again and sort.
    """
    # Cast everything to str first so the string operations apply to every
    # cell, then lower-case and keep one copy of each keyword
    keywords = array.astype(str).str.lower().unique()

    # Break up compound entries, one separator after the other
    for separator in ('/', '&', ','):
        keywords = splitCar(keywords, separator)

    # Drop asterisks
    keywords = cutCar(keywords, "*")

    # Go through a DataFrame to get access to the vectorized strip
    frame = pd.DataFrame(keywords)
    frame[0] = frame[0].str.strip()

    # Splitting and stripping can create new duplicates: remove them again
    distinct = frame[0].unique()

    # Alphabetical order (in place)
    distinct.sort()

    return distinct

In [210]:
def replace_matches_in_column(df, column, string_to_match, min_ratio = 90):
    """Overwrite near-matches of ``string_to_match`` in ``df[column]``, in place.

    df              : DataFrame to modify
    column          : label of the column to scan
    string_to_match : reference string; also the replacement value
    min_ratio       : lowest score (inclusive) for a value to be replaced
    returns         : None; ``df`` is modified and a message is printed
    """
    # Candidates are the distinct values of the column
    candidates = df[column].unique()

    # Ask fuzzywuzzy for at most 10 candidates, scored with token_sort_ratio
    scored = fuzzywuzzy.process.extract(
        string_to_match,
        candidates,
        limit=10,
        scorer=fuzzywuzzy.fuzz.token_sort_ratio,
    )

    # Keep the candidates whose score reaches min_ratio; each result is
    # indexed as (candidate, score, ...)
    close_matches = []
    for result in scored:
        if result[1] >= min_ratio:
            close_matches.append(result[0])

    # Replace every row holding one of those candidates by the reference string
    is_close = df[column].isin(close_matches)
    df.loc[is_close, column] = string_to_match

    # let us know the function's done
    print("All done!")

# String similarity Fuzzy Matching + Clustering

In [216]:
# This is already installed
# conda install scikit-learn

In [217]:
# This is already installed
# pip install Distance

In [218]:
import numpy as np
from sklearn.cluster import AffinityPropagation
import distance

In [261]:
# IMP This can't take a list with duplicates !
def levCluster(keywords):
    """Cluster keywords with affinity propagation on negated Levenshtein distance.

    keywords : array of strings that supports indexing with an index array
               (it is indexed with the result of ``np.nonzero``)
    returns  : list of clusters; each cluster is a list whose first entry is
               the exemplar, followed by the distinct members of the cluster
    """
    # Pairwise edit distances, negated so that closer words score higher
    pairwise = [
        [distance.levenshtein(col_word, row_word) for col_word in keywords]
        for row_word in keywords
    ]
    lev_similarity = -1 * np.array(pairwise)

    affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
    affprop.fit(lev_similarity)

    labels = affprop.labels_
    centers = affprop.cluster_centers_indices_

    tblCluster = []
    for cluster_id in np.unique(labels):
        # The exemplar is the keyword sitting at the cluster's center index
        exemplar = keywords[centers[cluster_id]]
        members = np.unique(keywords[np.nonzero(labels == cluster_id)]).tolist()

        # Exemplar first, then the members
        tblCluster.append([exemplar] + members)

    return tblCluster

#### Counting

In [221]:
# gets the pre-processed list of keywords WITH duplicates kept (nothing is de-duplicated here)

def preprocessingDup(array):
    """Normalize a column of keywords while keeping repeated entries.

    array   : pandas Series of keywords (its ``.str`` accessor is used)
    returns : NumPy array of lower-cased, stripped keywords in ascending
              order; no de-duplication is performed at any step

    Steps: cast to str, lower-case, split on '/', '&' and ',' (in that
    order), drop '*' characters, strip surrounding white space and sort.
    """
    # Cast everything to str first so the string operations apply to every
    # cell, then lower-case; every row is kept
    keywordsDup = array.astype(str).str.lower().to_numpy()

    # Break up compound entries, one separator after the other
    for separator in ('/', '&', ','):
        keywordsDup = splitCar(keywordsDup, separator)

    # Drop asterisks
    keywordsDup = cutCar(keywordsDup, "*")

    # Go through a DataFrame to get access to the vectorized strip
    frame = pd.DataFrame(keywordsDup)
    frame[0] = frame[0].str.strip()

    # Back to a NumPy array, sorted alphabetically in place
    keywordsDup = frame[0].to_numpy()
    keywordsDup.sort()

    return keywordsDup

In [222]:
# finds all duplicates and puts them in a list
def getDupes(lst):
    """Return every element of ``lst`` whose value occurs at least twice.

    lst     : iterable of hashable items (keywords)
    returns : list with, in input order, each occurrence of every repeated
              value; a value present k >= 2 times therefore contributes k
              entries, and values present once contribute none

    The occurrences are tallied in one pass with ``Counter`` instead of
    re-scanning the whole list for every element.
    """
    from collections import Counter

    # Materialize once so that one-shot iterables can be traversed twice
    items = list(lst)
    occurrences = Counter(items)

    return [word for word in items if occurrences[word] >= 2]

In [223]:
# adds the duplicates to the cluster table

def addDupes(tblCluster, keywordsDup):
    """Expand each cluster so every member appears once per occurrence.

    tblCluster  : list of clusters; each cluster is a list whose first entry
                  is the exemplar and whose other entries are its members
    keywordsDup : iterable of keywords with repetitions kept
    returns     : ``tblCluster`` itself; its clusters are rewritten in place

    After the call a cluster is ``[exemplar, m1 * n1, m2 * n2, ...]`` where
    ``ni`` is the number of times member ``mi`` occurs in ``keywordsDup``
    (a member that never occurs there is kept once). ``len(cluster) - 1`` is
    then the total number of occurrences covered by the cluster.

    A member already listed in the cluster is not counted on top of its
    occurrences, and calling the function again with the same arguments
    leaves the clusters unchanged.
    """
    from collections import Counter

    occurrences = Counter(keywordsDup)

    for cluster in tblCluster:
        if not cluster:
            continue

        exemplar, members = cluster[0], cluster[1:]
        expanded = [exemplar]

        # dict.fromkeys keeps the first-seen order while dropping copies that
        # an earlier call may already have added
        for member in dict.fromkeys(members):
            expanded.extend([member] * max(occurrences[member], 1))

        # Slice assignment keeps the caller's list objects alive
        cluster[:] = expanded

    return tblCluster

In [224]:
# Counts the number of keywords under each exemplar and returns list of list of exemplar to count

def countKeywords(tblCluster, keywordsDup):
    """Return ``[exemplar, count]`` for every cluster.

    tblCluster  : list of clusters, each starting with its exemplar
    keywordsDup : keywords with repetitions kept, forwarded to ``addDupes``
    returns     : list of two-item lists: the first entry of the cluster and
                  the number of remaining entries once ``addDupes`` has run

    ``tblCluster`` is passed through ``addDupes`` first, so the counts are
    taken on the clusters as that function returns them.
    """
    merged = addDupes(tblCluster, keywordsDup)

    # The leading entry is the exemplar label, hence the "- 1"
    return [[cluster[0], len(cluster) - 1] for cluster in merged]

### Final program : Lev distance-based Keyword Clustering

In [225]:
def classify(path, column1, column2, output='outputTest.xlsx'):
    """Cluster the keywords of an Excel sheet and export the count per exemplar.

    path    : location of the input Excel workbook
    column1 : label of the column holding the keywords
    column2 : label of the column holding the number of records per keyword
    output  : path of the Excel file to write (defaults to 'outputTest.xlsx')
    returns : None; the result is written to ``output`` with one row per
              cluster (exemplar, count), without header or index
    """
    # One row per record: each keyword repeated according to its count
    array = duplicateKeyword(path, column1, column2)

    # Cluster the distinct, cleaned keywords
    keywords = preprocessing(array[0])
    tblCluster = levCluster(keywords)

    # Same cleaning, but with the repetitions kept for counting
    keywordsDup = preprocessingDup(array[0])

    # countKeywords merges the repetitions into the clusters itself (through
    # addDupes); calling addDupes here as well would merge them a second time
    tblKeywords = countKeywords(tblCluster, keywordsDup)

    pd.DataFrame(tblKeywords).to_excel(output, header=False, index=False)

###### Improvements

In [226]:
# clustering based on semantics rather than Levenshtein distance
# Output the list with the duplicates in the same file as well, so that it can be modified afterwards
# don't forget that the # of records was not exported from EndNote, so find a way to export this as well