In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
import ast
import re
import nltk
import math

In [None]:
# Load the validation export. Later cells read its 'triplets', 'Case Number'
# and 'Customer Request' columns.
issues_df = pd.read_csv('6_21_validation.csv')

In [None]:
# Preview the first rows of the loaded frame.
issues_df.head()

### Thresholding Impact
- 80% for all relations
    - 31% with no issues detected (776 / 2451)
- 90% for all relations
    - 34% with no issues detected (826 / 2451)

In [None]:
from utils.data import stop_words

# De-duplicate the project stop-word list (going through set() does not keep the original order).
# NOTE(review): stop_words_custom is not referenced again in the visible cells — confirm it is still needed.
stop_words_custom = list(set(stop_words))

In [None]:
from collections import defaultdict
from utils.data import isolate_issueContext, clean_sentence, clean_issues, flatten_list

# Derive the issue text of every case from its 'triplets' value, then clean it.
# NOTE(review): presumably each 'context_issue_cleaned' value is a list of issue strings — confirm in utils.data.
issues_df['context_issue'] = issues_df['triplets'].apply(isolate_issueContext)
issues_df['context_issue_cleaned'] = issues_df['context_issue'].apply(clean_issues)

# Expand every row's collection into one row per element (apply(pd.Series) + stack) while keeping the
# original row index, so each issue can be joined back onto the columns of the case it came from.
issues_stacked = issues_df.context_issue_cleaned.apply(pd.Series).stack().reset_index(level=1, drop=True).to_frame('Issues').dropna()
issues_stacked = issues_df.drop('context_issue_cleaned', axis=1).join(issues_stacked, how='inner').reset_index(drop=True) ## To avoid missing Issues use inner join

In [None]:
# SentenceTransformer was used here without ever being imported in the notebook, which raises a
# NameError on a fresh kernel; import it where it is first needed.
from sentence_transformers import SentenceTransformer

# Pre-trained sentence-embedding model used to encode the cleaned issue strings.
emb_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

In [None]:
# Encode every cleaned issue string with the embedding model.
# flatten_list is assumed to turn the per-case collections into one flat list — TODO confirm in utils.data.
issue_embeddings = emb_model.encode(flatten_list(issues_df['context_issue_cleaned'].values), show_progress_bar=True)

### UMAP / HDBSCAN
- Found to be much more accurate than the original KMeans approach
- HDBSCAN aims to be right about what it can cluster and to defer on anything it cannot assign with sufficient confidence
- As a result, it classifies many points/docs as "noise"
- [UMAP documentation link](https://umap-learn.readthedocs.io/en/latest/clustering.html)
- [HDBScan documentation link](https://hdbscan.readthedocs.io/en/latest/parameter_selection.html)

In [None]:
# TfidfVectorizer is used below; the original cell only imported CountVectorizer, so running the
# notebook top to bottom failed with a NameError (the TF-IDF import appeared much later).
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import hdbscan
import umap


# TF-IDF over unigrams and bigrams of the flattened issue strings.
# max_df / min_df are given as fractions of documents, so very common and very rare terms are dropped.
tfidf_vectorizer = TfidfVectorizer(max_df=0.1, min_df=0.0015, ngram_range=(1,2))
feature_matrix = tfidf_vectorizer.fit_transform(flatten_list(issues_df['context_issue_cleaned'].values))
print(feature_matrix.shape)

# Reduce the TF-IDF features to 10 dimensions before density-based clustering.
# NOTE(review): no random_state is set, so the embedding (and every cluster derived from it)
# can differ between runs — consider fixing a seed if results must be reproducible.
umap_model = umap.UMAP(n_neighbors=30,
                            min_dist=0.0,
                            n_components=10,
                            metric='hellinger')
umap_embeddings = umap_model.fit_transform(feature_matrix)

In [None]:
# Fit HDBSCAN on the reduced embeddings; the per-document labels are read from cluster.labels_.
cluster = hdbscan.HDBSCAN(min_cluster_size=15, 
                          min_samples=1,
                          cluster_selection_method='eom').fit(umap_embeddings)

In [None]:
# One row per flattened issue string, tagged with its fitted cluster label and a running id.
docs_df = pd.DataFrame(flatten_list(issues_df['context_issue_cleaned'].values), columns=["Doc"])
docs_df['Topic'] = cluster.labels_
docs_df['Doc_ID'] = range(len(docs_df))
# Concatenate all documents that share a topic into a single space-separated string per topic.
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

In [None]:
from utils.data import c_tf_idf

# Score terms per topic from the concatenated topic documents; m is the total number of
# flattened issue documents.
# NOTE(review): c_tf_idf is presumably a class-based TF-IDF returning (matrix, fitted counter) — confirm in utils.data.
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(flatten_list(issues_df['context_issue_cleaned'].values)))

In [None]:
from utils.data import extract_top_n_words_per_topic, extract_topic_sizes

# Keep the 20 highest-scoring terms per topic and show the ten first rows of the topic-size table.
# NOTE(review): the ordering of topic_sizes depends on extract_topic_sizes — confirm it sorts by size.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Topic reduction: ten times in a row, fold the topic with the fewest documents into the topic
# whose term profile is most similar to it, then re-number the topics and recompute the term scores.
data = flatten_list(issues_df['context_issue_cleaned'].values)
                    
for i in range(10):
    # Calculate cosine similarity
    # (the diagonal is zeroed so that a topic can never be chosen as its own nearest neighbour)
    similarities = cosine_similarity(tf_idf.T)
    np.fill_diagonal(similarities, 0)

    # Extract label to merge into and from where
    # Topics are sorted by document count (descending), so the last row is the smallest topic.
    topic_sizes = docs_df.groupby(['Topic']).count().sort_values("Doc", ascending=False).reset_index()
    topic_to_merge = topic_sizes.iloc[-1].Topic
    # The +1 / -1 shift converts between topic ids and row positions.
    # NOTE(review): this presumes row 0 of the similarity matrix is the noise topic (-1) and that
    # topic ids are contiguous; if no -1 topic exists the lookup is off by one — confirm.
    topic_to_merge_into = np.argmax(similarities[topic_to_merge + 1]) - 1

    # Adjust topics
    docs_df.loc[docs_df.Topic == topic_to_merge, "Topic"] = topic_to_merge_into
    old_topics = docs_df.sort_values("Topic").Topic.unique()
    # Re-number the remaining topics so they are contiguous again, starting at -1.
    map_topics = {old_topic: index - 1 for index, old_topic in enumerate(old_topics)}
    docs_df.Topic = docs_df.Topic.map(map_topics)
    # NOTE(review): documents are joined with ', ' here but with ' ' when docs_per_topic was first built.
    docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ', '.join})

    # Calculate new topic words
    m = len(data)
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m)
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)

# At this point topic_sizes still holds the table computed at the start of the last iteration,
# i.e. before the final merge; it is refreshed on the next line.
print(topic_sizes.count())
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)

In [None]:
from utils.data import clean_dups, word_count, clear_redundent_unigrams

top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=6)

# Topic id -> short human-readable label (up to three terms).
# Created here so the cell runs on a fresh kernel and is rebuilt from scratch on every re-run;
# it was previously never initialised anywhere in the notebook.
topic_dict = {}

# top_n_words maps a topic id to a list of (term, score) pairs.
# The loop variable is deliberately not called `cluster`: that name holds the fitted HDBSCAN
# model, and overwriting it breaks any re-run of the cells that read `cluster.labels_`.
for topic_id, scored_terms in top_n_words.items():
    # clear duplicates
    cleaned_terms = [clean_dups(term) for term, score in scored_terms]

    # remove duplicates while keeping the ranking order (list(set) loses ordering)
    final_topics = []
    for term in cleaned_terms:
        if term not in final_topics:
            final_topics.append(term)

    # clear unigrams that repeat in bigrams
    final_topics = clear_redundent_unigrams(final_topics)

    # keep the top 3 terms as one comma-separated label
    topic_dict[topic_id] = ', '.join(final_topics[0:3])

# Two-column frame: 'index' holds the topic id, column 0 holds the label string.
topics = pd.DataFrame(topic_dict, index=[0]).T.reset_index()
topics.iloc[:20]

In [None]:
# Attach the label string to every document. After the merge and drop the remaining columns are
# (Doc, Topic, 0); the rename turns the numeric topic id into 'Cluster' and the label text into 'Topic'.
cluster_df = pd.merge(docs_df, topics, left_on='Topic', right_on='index').drop(['Doc_ID', 'index'], axis=1)
cluster_df.columns = ['Issue', 'Cluster', 'Topic']
print('Total Clusters: ', cluster_df['Topic'].value_counts().count())
cluster_df['Topic'].value_counts().head(40)

### Merge "Noise" cluster with most similar clusters

In [None]:
# Split the documents into those labelled as noise (Cluster == -1) and those already assigned.
noise_inds = cluster_df[cluster_df['Cluster'] == -1].index

# Only the issue text is kept for the noise rows; cluster and label are re-derived in the next cells.
noise = cluster_df.loc[noise_inds].drop(['Cluster', 'Topic'], axis=1)
classified = cluster_df.drop(noise_inds)

In [None]:
# compute feature matrix for previous "noise"
feature_matrix = tfidf_vectorizer.transform(noise['Issue'].values)

# compute topic feature matrix for previous topics (ignoring "noise" i.e. -1)
# NOTE(review): the [1:] slice presumes the first row of docs_per_topic is topic -1 and that the
# remaining topic ids are 0..k in order, so that a column position equals a topic id — confirm.
topic_matrix = tfidf_vectorizer.transform(docs_per_topic.Doc.values[1:])

# cosine similarity of every noise document against every topic document; keep the best match
similarity = cosine_similarity(feature_matrix, topic_matrix)
top = np.argmax(similarity, axis=1)

# get top cluster and scores for all docs
top_scores = [(x, similarity[i][x]) for i, x in enumerate(top)]

# if the best cosine similarity is above 0.05 assign that cluster, else keep the document as noise
noise['Cluster'] = [x[0] if x[1] > 0.05 else -1 for x in top_scores]

In [None]:
# Look up the label string for the newly assigned cluster ids.
# The merge uses the default inner join, so rows whose cluster id has no entry in `topics`
# are dropped here.
noise = pd.merge(noise, topics, left_on='Cluster', right_on='index')
noise = noise.drop('index', axis=1)
noise.columns = ['Issue', 'Cluster', 'Topic']

# Documents that stayed unassigned get an explicit 'Noise' label instead of the -1 topic terms.
noise.loc[noise['Cluster'] == -1, 'Topic'] = 'Noise'

In [None]:
# Recombine the already-classified documents with the re-assigned noise documents.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0;
# the original row indices are kept, exactly as append did by default.
cluster_df = pd.concat([classified, noise])

In [None]:
# Generator over the distinct topic labels; the next cell can be re-run to step through them one at a time.
topic_gen = (i for i in cluster_df['Topic'].unique())

In [None]:
# Advance to the next topic label and show how many issues carry it, followed by the issues themselves.
# next() raises StopIteration once every label has been shown; re-create topic_gen to start over.
topic = next(topic_gen)
print(topic)
print('Instances: ', cluster_df[cluster_df['Topic'] == topic]['Topic'].count())

cluster_df[cluster_df['Topic'] == topic]['Issue'].values

In [None]:
# Map every stacked issue back to its cluster id and label by matching on the issue text.
cases_df = pd.merge(issues_stacked[['Case Number', 'Date', 'Severity', 'Customer Request', 'Issues']],
                    cluster_df[['Issue', 'Cluster', 'Topic']], 
                    left_on='Issues',
                    right_on='Issue',
                    how='inner').drop_duplicates().reset_index()

# Collapse to one row per case: a list of cluster ids and a set of label strings.
bi_cluster = cases_df.groupby('Case Number')['Cluster'].apply(list)
tfidf_cluster = cases_df.groupby('Case Number')['Topic'].apply(set)
# NOTE(review): the first merge is a left join but the second uses the default inner join, so
# cases without any clustered issue are dropped from cases_df — confirm that is intended.
cases_df = pd.merge(issues_df, bi_cluster, on='Case Number', how='left').merge(tfidf_cluster, on='Case Number')

In [None]:
# Load the spaCy pipeline stored in the local 'spacy_ner_may11/' directory (relative to the working directory).
nlp = spacy.load('spacy_ner_may11/')

In [None]:
# Spot check: draw one random case, render the entities found in its request text, then print its
# triplets and assigned topic labels. No seed is passed, so every run shows a different case.
sample = cases_df.sample(1)
spacy.displacy.render(nlp(sample['Customer Request'].values[0]), style='ent')

# 'triplets' is stored as a string and parsed back into Python objects; the first three
# positions of each triplet are printed as product, relation and text.
for trip in ast.literal_eval(sample['triplets'].values[0]):
    print('\n', '-'*100)
    print(f'PRODUCT: {trip[0]}')
    print(f'RELATION: {trip[1]}')
    print(f'TEXT: {trip[2]}')

# 'Cluster' holds a list of ids per case; this only matches a case whose list is exactly [-1].
if sample['Cluster'].values[0] == [-1]:
    print('Cluster Unassigned')
else:
    print('\nUMap Clusters: ', sample['Topic'].values[0])

## KMeans Clustering 

In [None]:
from sklearn.cluster import KMeans

In [None]:
def calculate_wcss(data, rand, k_min=2, k_max=100):
    """Fit KMeans for a range of cluster counts and plot the resulting elbow curve.

    Parameters
    ----------
    data : array-like or sparse matrix
        Feature matrix handed to ``KMeans.fit``.
    rand : int
        ``random_state`` used for every KMeans fit.
    k_min, k_max : int, optional
        Cluster counts ``k_min .. k_max - 1`` are evaluated. The defaults
        reproduce the previously hard-coded ``range(2, 100)``.

    Returns
    -------
    list of float
        Within-cluster sum of squares (``inertia_``) for each evaluated cluster
        count, in increasing order of k.
    """
    cluster_counts = list(range(k_min, k_max))
    wcss = []
    for n in tqdm(cluster_counts):
        kmeans = KMeans(n_clusters=n, random_state=rand)
        kmeans.fit(X=data)
        wcss.append(kmeans.inertia_)

    plt.figure()
    # x and y are passed by keyword: newer seaborn releases no longer accept them positionally.
    sns.lineplot(x=cluster_counts, y=wcss)
    plt.title('Within Cluster Sum of Squared Error')
    plt.xlabel("Number of cluster")
    plt.ylabel("SSE")
    plt.show()
    return wcss

def optimal_number_of_clusters(wcss, k_min=2):
    """Pick the elbow of a WCSS curve as the point farthest from the first-to-last chord.

    Parameters
    ----------
    wcss : sequence of float
        Within-cluster sum of squares for consecutive cluster counts, where
        ``wcss[i]`` belongs to ``k_min + i`` clusters.
    k_min : int, optional
        Cluster count of the first entry. Defaults to 2, the first k evaluated
        by ``calculate_wcss``.

    Returns
    -------
    int
        Cluster count with the largest perpendicular distance to the straight
        line through the first and the last point of the curve.

    Raises
    ------
    ValueError
        If ``wcss`` is empty.
    """
    n_points = len(wcss)
    if n_points == 0:
        raise ValueError("wcss must contain at least one value")
    if n_points == 1:
        # A single point has no chord to measure against; it is the only candidate.
        return k_min

    x1, y1 = k_min, wcss[0]
    # The x coordinate of the last point is derived from the length of the curve. It used to be
    # hard-coded to 100 although the last evaluated k is 99, which tilted the reference line.
    x2, y2 = k_min + n_points - 1, wcss[-1]

    # The denominator only depends on the two end points, so compute it once.
    denominator = math.sqrt((y2 - y1)**2 + (x2 - x1)**2)

    distances = []
    for x0, y0 in enumerate(wcss, start=k_min):
        numerator = abs((y2-y1)*x0 - (x2-x1)*y0 + x2*y1 - y2*x1)
        distances.append(numerator/denominator)

    return distances.index(max(distances)) + k_min

In [None]:
# Estimate the number of clusters for the sentence embeddings with the elbow heuristic, then fit
# KMeans with that number.
print("K value not provided, estimating optimal no of clusters")
print("calculating the within clusters sum-of-squares")
sum_of_squares = calculate_wcss(issue_embeddings, 43) #, op_dir, filename
print("calculating the optimal number of clusters")
n = optimal_number_of_clusters(sum_of_squares)
print("Optimal no of clusters : ", n)

kmeans = KMeans(n_clusters = n, random_state=43)
y_kmeans = kmeans.fit_predict(issue_embeddings)
# Shift the 0-based KMeans labels to 1-based ones for display.
y=y_kmeans+1
y

In [None]:
# Cluster the embeddings again with 1.5x the estimated number of clusters (rounded down).
extended_clusters = int(n*1.5)

#clustering_model = KMeans(n_clusters=n, random_state=26)
clustering_model = KMeans(n_clusters=extended_clusters, random_state=0)

clustering_model.fit(issue_embeddings)
# One label per embedded issue, in the same order as the flattened issue list.
cluster_assignment = clustering_model.labels_

## Analyze Clusters

In [None]:
# Pair every flattened issue string with the KMeans label assigned to it in the previous cell.
clusterdf = pd.DataFrame()
clusterdf['Issues']=flatten_list(issues_df['context_issue_cleaned'].values)
#clusterdf['Issues']=flatten_list(issues_df['text_clean'].values)

clusterdf['cluster']=cluster_assignment
print(clusterdf.shape)

In [None]:
# Preview the first ten issue/cluster pairs.
clusterdf.head(10)

In [None]:
# Number of issues per cluster, largest first.
clusterdf['cluster'].value_counts()

### Cluster Tf-idf matrix

In [None]:
# Re-fit the existing TF-IDF vectorizer on the flattened issue strings. This overwrites both the
# fitted vocabulary and the feature_matrix computed in earlier cells.
feature_matrix = tfidf_vectorizer.fit_transform(clusterdf['Issues'].values)

In [None]:
# Same elbow estimate and KMeans fit as for the embeddings, this time on the TF-IDF features.
# Overwrites n, kmeans, y_kmeans and y from the embedding-based run.
print("K value not provided, estimating optimal no of clusters")
print("calculating the within clusters sum-of-squares")
sum_of_squares = calculate_wcss(feature_matrix, 43) #, op_dir, filename
print("calculating the optimal number of clusters")
n = optimal_number_of_clusters(sum_of_squares)
print("Optimal no of clusters : ", n)

kmeans = KMeans(n_clusters = n, random_state=43)
y_kmeans = kmeans.fit_predict(feature_matrix)
# Shift the 0-based KMeans labels to 1-based ones for display.
y=y_kmeans+1
y

In [None]:
# Final TF-IDF based clustering with the estimated number of clusters; replaces the
# embedding-based clustering_model and cluster_assignment.
clustering_model = KMeans(n_clusters=n, random_state=0)

clustering_model.fit(feature_matrix)
cluster_assignment = clustering_model.labels_

In [None]:
# Rebuild the issue/cluster table, now using the labels of the TF-IDF based clustering.
clusterdf = pd.DataFrame()
clusterdf['Issues']=flatten_list(issues_df['context_issue_cleaned'].values)
#clusterdf['Issues']=flatten_list(issues_df['text_clean'].values)

clusterdf['cluster']=cluster_assignment
print(clusterdf.shape)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when the
# installed version provides it and fall back to the old accessor otherwise.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
cluster_centers = clustering_model.cluster_centers_

# Collect the five highest-weighted terms of every cluster centre.
# Iterating over the fitted centres (instead of range(n)) keeps this cell correct even if the
# global n was changed after the model was fitted.
key_terms = []
for center in cluster_centers:
    center_terms = dict(zip(terms, center))
    sorted_terms = sorted(center_terms, key=center_terms.get, reverse=True)
    key_terms.append(sorted_terms[:5])

In [None]:
# One row per cluster centre with its top terms as a comma-separated string; reset_index exposes the
# row position as an 'index' column, which is matched against the KMeans label.
cluster_terms = pd.DataFrame([', '.join(words) for words in key_terms], columns=['Top Terms']).reset_index()
tfidf_clusters = pd.merge(clusterdf, cluster_terms, left_on='cluster', right_on='index')
tfidf_clusters.head()

## Labeling Clusters

#### Bi-gram/Uni-gram

In [None]:
from utils.data import get_wordnet_pos

unq_clust = clusterdf["cluster"].unique()

lemmatizer = WordNetLemmatizer()

# One record per labelled cluster. The frame is built once after the loop: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
label_records = []

for uc in tqdm(unq_clust):
    # Distinct issue strings of this cluster. drop_duplicates keeps first-seen order, so ties in
    # the word counts are broken the same way on every run (a set gave an arbitrary order).
    temp_clust_corpus = (clusterdf.loc[clusterdf['cluster']==uc, 'Issues']
                         .dropna()
                         .drop_duplicates()
                         .to_list())

    try:
        # Unigram label: the most frequent lemma across the cluster's issues.
        wc_dict = Counter()
        for chunks in temp_clust_corpus:
            wc_dict.update(lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in word_tokenize(chunks))
        label = wc_dict.most_common(1)[0][0]

        # Bigram label: the most frequent bigram; equal counts are resolved by the
        # lexicographically largest bigram, as the previous reverse sort did.
        c_vec = CountVectorizer(ngram_range=(2, 2))
        bigrams = c_vec.fit_transform(temp_clust_corpus)
        vocab = c_vec.vocabulary_
        count_values = np.asarray(bigrams.sum(axis=0)).ravel()
        bigram_label = max(vocab, key=lambda gram: (count_values[vocab[gram]], gram))
    except (IndexError, ValueError) as err:
        # Only the expected "nothing to label" cases are skipped (no tokens at all, or no bigram
        # in the vocabulary). Anything else, such as a missing NLTK resource, now surfaces
        # instead of silently producing an empty label table.
        print(f'Skipping cluster {uc}: {err}')
        continue

    # Flag 2: the top unigram is part of the top bigram, so the bigram is the better label.
    # Flag 1: otherwise the unigram is used.
    flag_var = 2 if label in bigram_label.split(" ") else 1

    label_records.append({'Cluster2': bigram_label, 'Cluster1': label, 'Cluster_Flag': flag_var, 'cluster' : uc})

# Explicit columns keep the frame well-formed even when no cluster could be labelled.
label_df = pd.DataFrame(label_records, columns=['Cluster2', 'Cluster1', 'Cluster_Flag', 'cluster'])

In [None]:
# Final label per cluster: the unigram where the flag is 1, the bigram where the flag is 2.
label_df.loc[label_df['Cluster_Flag']==1, 'Cluster'] = label_df['Cluster1']
label_df.loc[label_df['Cluster_Flag']==2, 'Cluster'] = label_df['Cluster2']
label_df

In [None]:
# Attach the labels to every issue; the default inner join drops issues whose cluster has no row in label_df.
labeled_clusters = pd.merge(clusterdf, label_df, on='cluster')

#### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
# Note: this rebinds tfidf_vectorizer to a new model fitted on the raw 'Customer Request' text,
# replacing the vectorizer that earlier cells fitted on the cleaned issue strings.
tfidf_vectorizer = TfidfVectorizer(max_df=0.1, min_df=0.001, ngram_range=(1,2))
tfidf_vectorizer.fit(issues_df['Customer Request'].values)


unq_clust = clusterdf["cluster"].unique()

# One record per cluster holding all of its lemmatised tokens joined into a single text.
# Records are collected in a list and turned into a frame once: DataFrame.append was removed in
# pandas 2.0, and appending inside a loop copies the whole frame on every iteration.
cluster_text_rows = []

for uc in tqdm(unq_clust):
    tmp_clust_df = clusterdf.loc[clusterdf['cluster']==uc].copy()
    chunks = [word_tokenize(x) for x in list(set(tmp_clust_df['Issues'].dropna().to_list()))]
    chunk_corpus = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in flatten_list(chunks)]

    cluster_text_rows.append({"cluster": uc, 'text': ', '.join(chunk_corpus)})

unq_clusterdf = pd.DataFrame(cluster_text_rows, columns=['cluster', 'text'])

# One TF-IDF row per cluster, in the same order as unq_clust.
tfidf_matrix = tfidf_vectorizer.transform(unq_clusterdf.text.dropna())
print(tfidf_matrix.shape)

In [None]:
# Split the sparse matrix into one single-row matrix per cluster.
tfidf_matrix_lst = list(tfidf_matrix)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when the
# installed version provides it and fall back to the old accessor otherwise.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_terms = tfidf_vectorizer.get_feature_names()
term_lst = []
score_lst = []

for e in tfidf_matrix_lst:
    # Densify each row once and reuse it for both the ranking and the score lookup.
    row_scores = e.toarray()[0]
    # Positions of the four highest TF-IDF scores, best first.
    top_n_term_indx = np.argsort(row_scores)[::-1][:4]
    term_lst.append([tfidf_terms[idx] for idx in top_n_term_indx])
    score_lst.append([row_scores[idx] for idx in top_n_term_indx])

# unq_clust becomes a frame whose column 0 holds the cluster id; rows line up with the TF-IDF
# rows because both follow the order of the unique cluster ids.
unq_clust = pd.DataFrame(unq_clust)
unq_clust['Cluster_tfidf'] = term_lst
unq_clust['Score_tfidf'] = score_lst

labeled_clusters = pd.merge(labeled_clusters, unq_clust, left_on='cluster', right_on=0)
# Store the top terms as one comma-separated string per row.
labeled_clusters['Cluster_tfidf'] = list(labeled_clusters['Cluster_tfidf'].apply(', '.join))

In [None]:
# Map every stacked issue back to its KMeans cluster id, n-gram label and TF-IDF label by matching
# on the issue text.
cases_df = pd.merge(issues_stacked[['Case Number', 'Date', 'Severity', 'Customer Request', 'Issues']],
                    labeled_clusters[['Issues', 'cluster', 'Cluster', 'Cluster_tfidf']], 
                    on='Issues', 
                    how='inner').drop_duplicates().reset_index()

# Collapse to one row per case: a list of n-gram labels and a set of TF-IDF label strings.
bi_cluster = cases_df.groupby('Case Number')['Cluster'].apply(list)
tfidf_cluster = cases_df.groupby('Case Number')['Cluster_tfidf'].apply(set)
# NOTE(review): the first merge is a left join but the second uses the default inner join, so
# cases without any labelled issue are dropped from cases_df — confirm that is intended.
cases_df = pd.merge(issues_df, bi_cluster, on='Case Number', how='left').merge(tfidf_cluster, on='Case Number')