# NMF at message level
1. split messages further into sentences (splitting on commas as well as `.`, `!` and `?`)
2. add an additional `sentence_id` that records sentence order within a conversation (for aggregating later)
3. split df into 1 word and multi-word
    - ignore 1 word df for now
4. run nmf
5. aggregate at conv level
    - sort by `["conversation_id", "sentence_id"]`
    - group by `conversation_id`: concat text, concat topics, concat topic keywords

In [2]:
from typing import List
from pathlib import Path
import json
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from collections import Counter
from operator import itemgetter
from gensim.models.nmf import Nmf
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from tqdm import tqdm

from NMF_utils import *

In [3]:
# Notebook-wide settings used by the cells below.
best_num_topics = 200  # number of NMF components (topics) to fit
top_k = 3  # how many top-ranked topics to keep per sentence
n_top_words = 5  # how many keywords describe each topic
docweights_threshold = 0.1  # minimum document weight for a topic to count as relevant
save_folder = "june21"  # sub-folder that receives this run's outputs

# Make sure both output folders exist before anything is written to them.
for output_dir in (f"./nmf_topics/{save_folder}", f"./nmf_models/{save_folder}"):
    Path(output_dir).mkdir(parents=True, exist_ok=True)
Path(f"./nmf_models/{save_folder}").mkdir(parents=True, exist_ok=True)


In [4]:
# Load the raw export: one row per message, with its SESSION_ID and intent columns.
df = pd.read_csv("freedom_intent_no_escalate.csv")

In [5]:
# Preview the first rows of the raw data.
df.head(5)

Unnamed: 0.1,Unnamed: 0,CLUSTER_ID,MESSAGE,SESSION_ID,CHANNEL,MESSAGE_INTENT_1,MESSAGE_INTENT_CONFIDENCE_1,MESSAGE_INTENT_2,MESSAGE_INTENT_CONFIDENCE_2,MESSAGE_INTENT_3,MESSAGE_INTENT_CONFIDENCE_3,MESSAGE_INTENT,MESSAGE_CONFIDENCE,IF_VALID_INTENT,LANGUAGE
0,2,0,netel,d63b71e0-e4c7-4012-822b-135f884dd90d:::4,BOT,-1,-1,-1,-1,-1,-1,$shop_accessory_misc,0.953275,True,en
1,3,0,netel,5241a58a-9b9a-422e-b451-3832ac96e1dc:::6,BOT,-1,-1,-1,-1,-1,-1,$shop_accessory_misc,0.953275,True,en
2,11,0,netel,b864d516-66b3-4223-a852-e4b428b7578d:::4,BOT,-1,-1,-1,-1,-1,-1,$shop_accessory_misc,0.953275,True,en
3,15,0,netel,719801fc-34c6-4ff9-8d7b-bb32d4de6766:::4,BOT,-1,-1,-1,-1,-1,-1,$shop_accessory_misc,0.953275,True,en
4,22,0,netel,3808e306-4e23-4292-94f3-d1fd1b9bfd33:::4,BOT,-1,-1,-1,-1,-1,-1,$shop_accessory_misc,0.953275,True,en


# Preprocessing

In [6]:
# Drop messages that contain the literal marker `{ " com `. regex=False makes
# this a plain substring test, so `{` can never be parsed as regex syntax.
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of the raw data (which triggers SettingWithCopyWarning).
df = df[~df.MESSAGE.str.contains("""{ " com """, regex=False)].copy()

# SESSION_ID has the form "<conversation id>:::<message number>". Split it once
# and keep both halves so rows can be ordered within a conversation later.
session_parts = df["SESSION_ID"].str.split(":::")
df["conversation_id"] = session_parts.str[0]
df["message_id"] = session_parts.str[1].astype(int)

# Keep only the columns the rest of the notebook uses. Sorting is deferred
# until the messages have been split into sentences.
df = df[["conversation_id", "message_id", "MESSAGE"]]
df.head(5)

Unnamed: 0,conversation_id,message_id,MESSAGE
0,d63b71e0-e4c7-4012-822b-135f884dd90d,4,netel
1,5241a58a-9b9a-422e-b451-3832ac96e1dc,6,netel
2,b864d516-66b3-4223-a852-e4b428b7578d,4,netel
3,719801fc-34c6-4ff9-8d7b-bb32d4de6766,4,netel
4,3808e306-4e23-4292-94f3-d1fd1b9bfd33,4,netel


split the messages further into sentences

In [7]:
# Break every message into sentence-like fragments at "," "." "!" or "?".
fragment_table = df["MESSAGE"].str.split(r"[,.!?]", expand=True)
# One row per fragment, each carrying the index label of its parent message.
fragments = fragment_table.stack().reset_index(level=1, drop=True).rename("MESSAGE")
# Swap the original MESSAGE column for the fragments (index-aligned join).
df = df.drop("MESSAGE", axis=1).join(fragments)
df.tail(10)

Unnamed: 0,conversation_id,message_id,MESSAGE
3525,8087c72a-3246-42ee-ac83-db6d87be6bfc,1,how can i do that
3525,8087c72a-3246-42ee-ac83-db6d87be6bfc,1,
3526,e6c37c6b-52c8-4ed0-a02d-ec77d96ad839,3,is there any deal to buy apple watch 6 in mont...
3527,13d88447-fdc4-42ac-87d9-9bb1641a8062,1,hi i recently transferred my phone number from...
3527,13d88447-fdc4-42ac-87d9-9bb1641a8062,1,i am unable to access it due to the fact that...
3527,13d88447-fdc4-42ac-87d9-9bb1641a8062,1,the account name is under neper
3527,13d88447-fdc4-42ac-87d9-9bb1641a8062,1,i am looking to cancel the plan on the accoun...
3527,13d88447-fdc4-42ac-87d9-9bb1641a8062,1,thank you
3528,3836504886472679,18,have a great night
3529,3473478809418490,5,i topped up my account online but i got nothin...


In [8]:
# Order rows by conversation and message so sentences can be numbered in sequence.
df = df.sort_values(by=["conversation_id", "message_id"])
# sentence_id: 0-based running position of the sentence inside its conversation.
df = df.assign(sentence_id=df.groupby("conversation_id").cumcount())
# session_id: "<conversation_id>_<sentence_id>", a per-sentence key used for joins later.
df = df.assign(session_id=df["conversation_id"] + "_" + df["sentence_id"].astype(str))
df.tail(10)

Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id
883,ff9ac61f-8f98-4516-9342-7aa1177514d9,20,data + talk,3,ff9ac61f-8f98-4516-9342-7aa1177514d9_3
1062,ff9ac61f-8f98-4516-9342-7aa1177514d9,22,more than 10 gb,4,ff9ac61f-8f98-4516-9342-7aa1177514d9_4
1149,ff9ac61f-8f98-4516-9342-7aa1177514d9,24,15 gb freedom data,5,ff9ac61f-8f98-4516-9342-7aa1177514d9_5
73,ffb30da4-4274-4dde-a755-eb5160834e8c,4,netel,0,ffb30da4-4274-4dde-a755-eb5160834e8c_0
39,ffb30da4-4274-4dde-a755-eb5160834e8c,6,netel,1,ffb30da4-4274-4dde-a755-eb5160834e8c_1
3049,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,hello,0,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_0
3049,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,i purchased the cannes outdoor sofa however i...,1,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_1
3049,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,are they machine washable,2,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_2
3049,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,,3,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_3
247,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,5,message with a rep,4,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_4


In [9]:
# Strip unwanted words/phrases from each sentence. `remove_words` is joined
# into a single alternation pattern, so it has to be applied as a regular
# expression. regex=True is spelled out because pandas 2.0 changed the default
# of Series.str.replace to a literal match, which would silently stop removing
# anything.
# NOTE(review): the pattern has no word boundaries, so entries also match inside
# longer words (the preview shows "machine" reduced to "macne") — confirm
# whether that is intended before tightening the pattern.
df["MESSAGE"] = df["MESSAGE"].str.replace("|".join(remove_words), "", regex=True)

# Per-sentence word count and processed token list (helpers from NMF_utils).
df["word_count"] = df["MESSAGE"].apply(word_count)
df["processed_text"] = df["MESSAGE"].apply(process_text)

# Drop sentences that are left with no tokens after processing.
df = df[df["processed_text"].map(len) > 0]
print(df.shape)
df.tail(10)

(3496, 7)


Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id,word_count,processed_text
1240,ff097a90-5972-46d1-845a-3c9a399b70bc,1,i need help,0,ff097a90-5972-46d1-845a-3c9a399b70bc_0,3,"[need, help]"
1883,ff93a689-8d1a-49c2-a5bc-54dd321707a3,1,i am getting a phone for my son and looking to...,0,ff93a689-8d1a-49c2-a5bc-54dd321707a3_0,26,"[get, phone, son, look, plan, iphon, se, packag]"
859,ff9ac61f-8f98-4516-9342-7aa1177514d9,7,data + talk,0,ff9ac61f-8f98-4516-9342-7aa1177514d9_0,3,"[data, talk]"
993,ff9ac61f-8f98-4516-9342-7aa1177514d9,9,less than 10 gb,1,ff9ac61f-8f98-4516-9342-7aa1177514d9_1,4,[gb]
1284,ff9ac61f-8f98-4516-9342-7aa1177514d9,14,10 gb,2,ff9ac61f-8f98-4516-9342-7aa1177514d9_2,2,[gb]
883,ff9ac61f-8f98-4516-9342-7aa1177514d9,20,data + talk,3,ff9ac61f-8f98-4516-9342-7aa1177514d9_3,3,"[data, talk]"
1062,ff9ac61f-8f98-4516-9342-7aa1177514d9,22,more than 10 gb,4,ff9ac61f-8f98-4516-9342-7aa1177514d9_4,4,[gb]
1149,ff9ac61f-8f98-4516-9342-7aa1177514d9,24,15 gb freedom data,5,ff9ac61f-8f98-4516-9342-7aa1177514d9_5,4,"[gb, freedom, data]"
3049,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,i purchased the cannes outdoor sofa however i...,1,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_1,17,"[purchas, cann, outdoor, sofa, care, instruct,..."
3049,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,are they macne washable,2,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_2,5,"[macn, washabl]"


In [10]:
# df[df["conversation_id"]=="1895298307224761"]

## split df into 2 dfs
- `df1`: sentences with exactly one processed word
- `df2`: sentences with more than one processed word

**ignore df containing only 1 word sentences for now**

In [11]:
# Partition the sentences by how many processed tokens they contain.
token_counts = df["processed_text"].str.len()
df1 = df[token_counts == 1]  # exactly one token
df2 = df[token_counts != 1]  # every other sentence (the multi-word ones)
df2.head(5)

Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id,word_count,processed_text
2608,001bede7-810d-4284-b289-c29ab8a12bba,2,i need to activate new sim,1,001bede7-810d-4284-b289-c29ab8a12bba_1,7,"[need, activ, new, sim]"
2608,001bede7-810d-4284-b289-c29ab8a12bba,2,but it's got getting activated online,2,001bede7-810d-4284-b289-c29ab8a12bba_2,7,"[got, get, activ, onlin]"
987,0024cce3-b115-4a79-88c7-604d046076df,5,account details,0,0024cce3-b115-4a79-88c7-604d046076df_0,2,"[account, detail]"
3463,0028d3f2-964b-4903-bd37-3bb364459504,1,i would like to cancel service,0,0028d3f2-964b-4903-bd37-3bb364459504_0,6,"[cancel, servic]"
2591,00560815-b4b9-4919-bd75-5dbd07aadec2,5,i have a balance of but my wife 's account ag...,0,00560815-b4b9-4919-bd75-5dbd07aadec2_0,16,"[balanc, wife, account, say, suspend]"


In [12]:
# Keep only the first occurrence of each distinct MESSAGE text
# (duplicates are detected across the whole frame, not per conversation).
df2 = df2[~df2.duplicated(subset=["MESSAGE"])]

In [13]:
# Summary statistics of the per-sentence word counts in the multi-word set.
df2['word_count'].describe()

count    2495.000000
mean       10.488577
std         7.580401
min         1.000000
25%         6.000000
50%         9.000000
75%        13.000000
max       144.000000
Name: word_count, dtype: float64

# NMF on multi-word sentences

In [14]:
# Flatten all token lists into one list, then count the distinct tokens.
processed_text = []
for tokens in df2.processed_text:
    processed_text.extend(tokens)
len(set(processed_text))

1372

In [15]:
# Inputs for Gensim's NMF, which is used to choose the number of topics
# by coherence score.
texts = df2['processed_text']

# A gensim Dictionary maps each word to an integer id.
dictionary = Dictionary(texts)
# Bag-of-words representation of every document under that dictionary.
corpus = list(map(dictionary.doc2bow, texts))

In [16]:
# # Create a list of the topic numbers we want to try
# topic_nums = list(np.arange(100, 200 + 1, 5))

# # Run the nmf model and calculate the coherence score
# # for each number of topics
# coherence_scores = []

# for num in tqdm(topic_nums):
#     nmf = Nmf(
#         corpus=corpus,
#         num_topics=num,
#         id2word=dictionary,
#         chunksize=2000,
#         passes=5,
#         kappa=.1,
#         minimum_probability=0.01,
#         w_max_iter=300,
#         w_stop_condition=0.0001,
#         h_max_iter=100,
#         h_stop_condition=0.001,
#         eval_every=10,
#         normalize=True,
#         random_state=42
#     )
    
#     # Run the coherence model to get the score
#     cm = CoherenceModel(
#         model=nmf,
#         texts=texts,
#         dictionary=dictionary,
#         coherence='c_v'
#     )
    
#     coherence_scores.append(round(cm.get_coherence(), 5))

# # Get the number of topics with the highest coherence score
# scores = list(zip(topic_nums, coherence_scores))
# best_num_topics = sorted(scores, key=itemgetter(1), reverse=True)[0][0]

# # Plot the results
# fig = plt.figure(figsize=(30, 12))

# plt.plot(
#     topic_nums,
#     coherence_scores,
#     linewidth=3,
#     color='#4287f5'
# )

# plt.xlabel("Topic Num", fontsize=14)
# plt.ylabel("Coherence Score", fontsize=14)
# plt.title('Coherence Score by Topic Number - Best Number of Topics: {}'.format(best_num_topics), fontsize=18)
# plt.xticks(np.arange(5, max(topic_nums) + 1, 5), fontsize=12)
# plt.yticks(fontsize=12)

# file_name = 'c_score'

# plt.show()

In [18]:
# Fit the final topic model with scikit-learn, using `best_num_topics`
# components.

texts = df2['processed_text']

# Create the tfidf weights over 2- to 5-token n-grams. Each document is already
# a list of tokens, so the "preprocessor" just joins them back into one string
# for the vectorizer's own tokenizer.
tfidf_vectorizer = TfidfVectorizer(
    min_df=3,  # ignore n-grams found in fewer than 3 documents
    max_df=0.9,  # ignore n-grams found in more than 90% of documents
    max_features=10000,
    ngram_range=(2, 5),
    preprocessor=' '.join
)

tfidf = tfidf_vectorizer.fit_transform(texts)

# Save the feature names for later to create topic summaries.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when it exists and convert to a plain list so the result
# keeps the same type as before.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_fn = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_fn = tfidf_vectorizer.get_feature_names()

# Run the nmf model. No regularisation is requested: the former `alpha=0.0`
# argument is omitted because 0.0 is already the default, and the parameter
# itself was removed in scikit-learn 1.2 (passing it raises TypeError there).
nmf = NMF(
    n_components=best_num_topics,
    init='nndsvd',
    max_iter=1000,
    l1_ratio=0.0,
    solver='cd',
    tol=1e-4,
    random_state=42
).fit(tfidf)


## Generate topic keywords

In [19]:
# Document-topic weights for every multi-word sentence, obtained by
# re-vectorising the texts and projecting them onto the fitted topics.
docweights = nmf.transform(tfidf_vectorizer.transform(texts))

# Mapping from topic to its top `n_top_words` keywords (helper from NMF_utils).
topic_dict = get_topic_dictionary(nmf, tfidf_fn, n_top_words)
topic_dict[0]


['phone number',
 'number neacct',
 'phone number account',
 'suspend phone',
 'suspend phone number']

### Get top k topics

In [21]:
# Select `top_k` topic ids for each row of docweights (helper from NMF_utils).
# The preview shows rows of -1, presumably for sentences with no matching
# topic — verify against get_top_k_topics.
top_k_topics = get_top_k_topics(docweights, top_k = top_k)
# Show the first four rows.
tuple(top_k_topics[row] for row in range(4))

(array([101,  28,  12]),
 array([-1, -1, -1]),
 array([-1, -1, -1]),
 array([21, 67, 68]))

### map topics to their keywords

In [22]:
# Build a temporary frame keyed by session id that carries, for each of the
# top-k ranks, the topic number and that topic's keywords.
session_ids = df2['session_id'].tolist()
df_merge = pd.DataFrame({'session_id': session_ids})

# Two columns per rank: the topic id and its keyword list.
# Topic ids that are not keys of topic_dict map to NaN.
for i, topic_ids in enumerate(top_k_topics.T):
    df_merge[f"topic_{i+1}"] = topic_ids
    df_merge[f"topic_{i+1}_key_words"] = df_merge[f"topic_{i+1}"].map(topic_dict)
df_merge.head(5)

Unnamed: 0,session_id,topic_1,topic_1_key_words,topic_2,topic_2_key_words,topic_3,topic_3_key_words
0,001bede7-810d-4284-b289-c29ab8a12bba_1,101,"[activ new, activ new sim, need activ, post pa...",28,"[new sim, new sim card, need new sim, need new...",12,"[activ sim, activ sim card, want activ, sim ph..."
1,001bede7-810d-4284-b289-c29ab8a12bba_2,-1,,-1,,-1,
2,0024cce3-b115-4a79-88c7-604d046076df_0,-1,,-1,,-1,
3,0028d3f2-964b-4903-bd37-3bb364459504_0,21,"[cancel servic, servic freedom, freedom mobil ...",67,"[upgrad devic, doe work, hey want, good night,...",68,"[data work, year contract, hey want, good nigh..."
4,00560815-b4b9-4919-bd75-5dbd07aadec2_0,136,"[postal code, account say, payment account, sa...",105,"[check account, charg time, payment account, w...",99,"[log account, account set, account say, say el..."


In [23]:
# Attach the topic and keyword columns to the sentence rows. A left join keeps
# every row of df2, matched on the per-sentence session_id key.
df_topics2 = df2.merge(df_merge, how='left', on='session_id')
df_topics2.head(5)

Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id,word_count,processed_text,topic_1,topic_1_key_words,topic_2,topic_2_key_words,topic_3,topic_3_key_words
0,001bede7-810d-4284-b289-c29ab8a12bba,2,i need to activate new sim,1,001bede7-810d-4284-b289-c29ab8a12bba_1,7,"[need, activ, new, sim]",101,"[activ new, activ new sim, need activ, post pa...",28,"[new sim, new sim card, need new sim, need new...",12,"[activ sim, activ sim card, want activ, sim ph..."
1,001bede7-810d-4284-b289-c29ab8a12bba,2,but it's got getting activated online,2,001bede7-810d-4284-b289-c29ab8a12bba_2,7,"[got, get, activ, onlin]",-1,,-1,,-1,
2,0024cce3-b115-4a79-88c7-604d046076df,5,account details,0,0024cce3-b115-4a79-88c7-604d046076df_0,2,"[account, detail]",-1,,-1,,-1,
3,0028d3f2-964b-4903-bd37-3bb364459504,1,i would like to cancel service,0,0028d3f2-964b-4903-bd37-3bb364459504_0,6,"[cancel, servic]",21,"[cancel servic, servic freedom, freedom mobil ...",67,"[upgrad devic, doe work, hey want, good night,...",68,"[data work, year contract, hey want, good nigh..."
4,00560815-b4b9-4919-bd75-5dbd07aadec2,5,i have a balance of but my wife 's account ag...,0,00560815-b4b9-4919-bd75-5dbd07aadec2_0,16,"[balanc, wife, account, say, suspend]",136,"[postal code, account say, payment account, sa...",105,"[check account, charg time, payment account, w...",99,"[log account, account set, account say, say el..."


### Topics that pass a certain threshold

In [24]:
# Smallest and largest document-topic weight, for choosing a sensible threshold.
docweights.min(), docweights.max()

(0.0, 0.7908036657609471)

In [25]:

# Per sentence, the topics selected by threshold_docweights at the configured
# threshold (helper from NMF_utils).
# NOTE(review): presumably assigned positionally, which relies on df_topics2
# rows being in the same order as the rows of docweights — confirm.
threshold_column = f"topics_threshold_>={docweights_threshold}"
df_topics2[threshold_column] = threshold_docweights(docweights, threshold = docweights_threshold)


In [26]:
# Preview the sentence-level table, now including the thresholded topics column.
df_topics2.head(5)

Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id,word_count,processed_text,topic_1,topic_1_key_words,topic_2,topic_2_key_words,topic_3,topic_3_key_words,topics_threshold_>=0.1
0,001bede7-810d-4284-b289-c29ab8a12bba,2,i need to activate new sim,1,001bede7-810d-4284-b289-c29ab8a12bba_1,7,"[need, activ, new, sim]",101,"[activ new, activ new sim, need activ, post pa...",28,"[new sim, new sim card, need new sim, need new...",12,"[activ sim, activ sim card, want activ, sim ph...","[28, 101]"
1,001bede7-810d-4284-b289-c29ab8a12bba,2,but it's got getting activated online,2,001bede7-810d-4284-b289-c29ab8a12bba_2,7,"[got, get, activ, onlin]",-1,,-1,,-1,,[]
2,0024cce3-b115-4a79-88c7-604d046076df,5,account details,0,0024cce3-b115-4a79-88c7-604d046076df_0,2,"[account, detail]",-1,,-1,,-1,,[]
3,0028d3f2-964b-4903-bd37-3bb364459504,1,i would like to cancel service,0,0028d3f2-964b-4903-bd37-3bb364459504_0,6,"[cancel, servic]",21,"[cancel servic, servic freedom, freedom mobil ...",67,"[upgrad devic, doe work, hey want, good night,...",68,"[data work, year contract, hey want, good nigh...",[21]
4,00560815-b4b9-4919-bd75-5dbd07aadec2,5,i have a balance of but my wife 's account ag...,0,00560815-b4b9-4919-bd75-5dbd07aadec2_0,16,"[balanc, wife, account, say, suspend]",136,"[postal code, account say, payment account, sa...",105,"[check account, charg time, payment account, w...",99,"[log account, account set, account say, say el...",[]


# NMF on one-word sentences

Not done for now.

# Aggregate at conversation level text

In [27]:
# TODO: Combine df_topics_1 and df_topics_2
# Until then, only the multi-word results (df_topics2) are carried forward.

df_topics = df_topics2

In [28]:
# Sort by conversation and sentence id so that text and topics are concatenated
# in sentence order. sort_values returns a new frame, so df_topics itself is
# left unchanged.
df_conv_topics = df_topics.sort_values(by=["conversation_id", "sentence_id"])


In [29]:
# check 
# df_conv_topics[df_conv_topics["conversation_id"]=="1895298307224761"]

In [30]:
# Full conversation text: every row receives the space-joined MESSAGE values of
# its conversation, in the current row order.
df_conv_topics["concat_text"] = df_conv_topics.groupby(["conversation_id"])["MESSAGE"].transform(
                lambda x: " ".join(x)
            )

# generate new dfs
# Per conversation, concatenate the token lists of all its sentences into one
# flat list. An explicit flatten replaces `.agg(sum)`: summing lists rebuilds
# the accumulated list at every step (quadratic in conversation length) and
# depends on how pandas dispatches the builtin `sum`.
df_conv_concat_text = (
    df_conv_topics.groupby(["conversation_id"])["processed_text"]
    .apply(lambda token_lists: [token for tokens in token_lists for token in tokens])
    .reset_index(name="preprocessed_concat_text")
)
# Per conversation, collect the top-ranked topic (topic_1) of each sentence.
df_combined_topics = df_conv_topics.groupby(["conversation_id"])["topic_1"].apply(list).reset_index(name="combined_topics")
# df_combined_key_words = df_conv_topics.groupby(["conversation_id"])["topic_key_words"].apply(list).reset_index(name="combined_topic_key_words")

In [33]:
# Collapse to one row per conversation: attach the two aggregated columns,
# keep the first row of each conversation, and retain only the
# conversation-level columns.
conv_columns = ["conversation_id", "concat_text", "preprocessed_concat_text", "combined_topics"]
df_conv_topics = df_conv_topics.merge(df_conv_concat_text, on="conversation_id")
df_conv_topics = df_conv_topics.merge(df_combined_topics, on="conversation_id")
df_conv_topics = df_conv_topics.drop_duplicates(subset=["conversation_id"]).reset_index()
df_conv_topics = df_conv_topics[conv_columns]



In [34]:
# Preview the last rows of the conversation-level table.
df_conv_topics.tail(5)

Unnamed: 0,conversation_id,concat_text,preprocessed_concat_text,combined_topics
1204,fe64aeec-c81a-4759-8ac1-8e6257293a12,how does the 99/yr prepaid plan work in terms...,"[doe, yr, prepaid, plan, work, term, roam, cha...","[78, -1]"
1205,fe8a47dd-d103-44a9-9d3f-d88400aa31d8,i need to cancel my mother's phone plan she ...,"[need, cancel, mother, phone, plan, alzheim, r...","[15, 148, 126, -1]"
1206,fed5e9f2-ff88-453d-9461-c98f5a97af9d,billing information# my bill story# i 'm looki...,"[bill, inform, bill, stori, look, statement, f...","[-1, 53, -1, 0]"
1207,ff93a689-8d1a-49c2-a5bc-54dd321707a3,i am getting a phone for my son and looking to...,"[get, phone, son, look, plan, iphon, se, packag]",[110]
1208,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,i purchased the cannes outdoor sofa however i...,"[purchas, cann, outdoor, sofa, care, instruct,...","[-1, -1]"


In [None]:
# check 
# df_conv_topics[df_conv_topics["conversation_id"]=="1895298307224761"]

# Saving

In [37]:
# Persist this run's outputs: the sentence-level and conversation-level tables,
# the fitted vectorizer and NMF model, and the topic -> keywords mapping.
for table, csv_path in (
    (df_topics, f"nmf_topics/{save_folder}/freedom_msg_level_nmf_clusters.csv"),
    (df_conv_topics, f"nmf_topics/{save_folder}/freedom_conv_level_nmf_clusters.csv"),
):
    table.to_csv(csv_path, index=False)

for fitted_object, joblib_path in (
    (tfidf_vectorizer, f"nmf_models/{save_folder}/msg_level_vectorizer.joblib"),
    (nmf, f"nmf_models/{save_folder}/msg_level_nmf.joblib"),
):
    dump(fitted_object, joblib_path)

with open(f"nmf_topics/{save_folder}/topic_mapping.json", 'w') as mapping_file:
    json.dump(topic_dict, mapping_file, indent=4)

# Get sub-topics from topic 7

need to reprocess `processed_text` with `df2['processed_text'] = df2['MESSAGE'].apply(process_text)`