# Clustering with Sentence-BERT
1. split messages further into sentences (include commas)
2. add an additional `message_idx` (for aggregating later)
3. split df into 1 word and multi-word
    - ignore 1 word df for now
4. encode sentences with sentence-bert
    - scale to be non-negative
4. run nmf
5. aggregate at conv level
    - sort by `["conv_id", "msg_idx"]`
    - group by `conv_id`: concat text, concat topics, concat topic keywords

In [1]:
from collections import Counter
from pathlib import Path
from typing import Tuple
import pandas as pd
import numpy as np
from NMF_utils import *
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from joblib import dump
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import nltk

In [31]:
# settings
"""
couple models to try
 paraphrase-distilroberta-base-v2 
 paraphrase-TinyBERT-L6-v2 
 stsb-distilroberta-base-v2
 
"""
pretrained_model = "paraphrase-distilroberta-base-v2"
# set values for rest of script
best_num_topics = 200 
top_k = 3 # k for top_k topics
n_top_words = 5 # n for n_top_words (number of words for topic key words)
docweights_threshold = 0.75 # threshold for considering a topic is relevant in NMF
save_folder = "june18_bert"
Path(f"./nmf_topics/{save_folder}").mkdir(parents=True, exist_ok=True)
Path(f"./nmf_models/{save_folder}").mkdir(parents=True, exist_ok=True)

# encoding tests

In [3]:
# sentence transformer test

model = SentenceTransformer(pretrained_model)

#Our sentences we like to encode
# sentences = ['This framework generates embeddings for each input sentence',
#     'Sentences are passed as a list of string.',
#     'The quick brown fox jumps over the lazy dog.']

# #Sentences are encoded by calling model.encode()
# embeddings = model.encode(sentences)

"""
can run something like
sentences = df["MESSAGE"]
embeddings = model.encode(sentences)
print(embeddings.shape) results in shape (n, 768)
print(np.amin(embeddings), np.amax(embeddings)) (typically somewhere between -5, 5)
"""

'\ncan run something like\nsentences = df["MESSAGE"]\nembeddings = model.encode(sentences)\nprint(embeddings.shape) results in shape (n, 768)\nprint(np.amin(embeddings), np.amax(embeddings)) (typically somewhere between -5, 5)\n'

In [4]:
# embeddings
# np.amax(embeddings), np.amin(embeddings)

In [5]:
df = pd.read_csv("freedom_intent_no_escalate.csv")

In [6]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,CLUSTER_ID,MESSAGE,SESSION_ID,CHANNEL,MESSAGE_INTENT_1,MESSAGE_INTENT_CONFIDENCE_1,MESSAGE_INTENT_2,MESSAGE_INTENT_CONFIDENCE_2,MESSAGE_INTENT_3,MESSAGE_INTENT_CONFIDENCE_3,MESSAGE_INTENT,MESSAGE_CONFIDENCE,IF_VALID_INTENT,LANGUAGE
0,2,0,netel,d63b71e0-e4c7-4012-822b-135f884dd90d:::4,BOT,-1,-1,-1,-1,-1,-1,$shop_accessory_misc,0.953275,True,en
1,3,0,netel,5241a58a-9b9a-422e-b451-3832ac96e1dc:::6,BOT,-1,-1,-1,-1,-1,-1,$shop_accessory_misc,0.953275,True,en
2,11,0,netel,b864d516-66b3-4223-a852-e4b428b7578d:::4,BOT,-1,-1,-1,-1,-1,-1,$shop_accessory_misc,0.953275,True,en
3,15,0,netel,719801fc-34c6-4ff9-8d7b-bb32d4de6766:::4,BOT,-1,-1,-1,-1,-1,-1,$shop_accessory_misc,0.953275,True,en
4,22,0,netel,3808e306-4e23-4292-94f3-d1fd1b9bfd33:::4,BOT,-1,-1,-1,-1,-1,-1,$shop_accessory_misc,0.953275,True,en


# Preprocessing

In [7]:
df = df[~df.MESSAGE.str.contains("""{ " com """)]
# add conv id and message id to sort by
df["conversation_id"] = df["SESSION_ID"].str.split(":::").str[0]
df["message_id"] = df["SESSION_ID"].str.split(":::").str[1].astype(int)
# sort to make grouping easier
# df = df.sort_values(by=["conversation_id", "message_id"])
df = df[["conversation_id", "message_id", "MESSAGE"]]
df.head(5)

Unnamed: 0,conversation_id,message_id,MESSAGE
0,d63b71e0-e4c7-4012-822b-135f884dd90d,4,netel
1,5241a58a-9b9a-422e-b451-3832ac96e1dc,6,netel
2,b864d516-66b3-4223-a852-e4b428b7578d,4,netel
3,719801fc-34c6-4ff9-8d7b-bb32d4de6766,4,netel
4,3808e306-4e23-4292-94f3-d1fd1b9bfd33,4,netel


split the messages further into sentences

In [8]:
# split the messages further into sentences
df = df.drop('MESSAGE', axis=1).join(df['MESSAGE'].str.split(r"[,.!?]", expand=True).stack().reset_index(level=1, drop=True).rename('MESSAGE'))
df.tail(10)

Unnamed: 0,conversation_id,message_id,MESSAGE
3525,8087c72a-3246-42ee-ac83-db6d87be6bfc,1,how can i do that
3525,8087c72a-3246-42ee-ac83-db6d87be6bfc,1,
3526,e6c37c6b-52c8-4ed0-a02d-ec77d96ad839,3,is there any deal to buy apple watch 6 in mont...
3527,13d88447-fdc4-42ac-87d9-9bb1641a8062,1,hi i recently transferred my phone number from...
3527,13d88447-fdc4-42ac-87d9-9bb1641a8062,1,i am unable to access it due to the fact that...
3527,13d88447-fdc4-42ac-87d9-9bb1641a8062,1,the account name is under neper
3527,13d88447-fdc4-42ac-87d9-9bb1641a8062,1,i am looking to cancel the plan on the accoun...
3527,13d88447-fdc4-42ac-87d9-9bb1641a8062,1,thank you
3528,3836504886472679,18,have a great night
3529,3473478809418490,5,i topped up my account online but i got nothin...


In [9]:
# add sentence_id to track message order for aggregation
# sort to make grouping easier
df = df.sort_values(by=["conversation_id", "message_id"])
df["sentence_id"] = df.groupby("conversation_id").cumcount()
df["session_id"] = df["conversation_id"]+"_"+df["sentence_id"].astype(str)
df.tail(10)

Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id
883,ff9ac61f-8f98-4516-9342-7aa1177514d9,20,data + talk,3,ff9ac61f-8f98-4516-9342-7aa1177514d9_3
1062,ff9ac61f-8f98-4516-9342-7aa1177514d9,22,more than 10 gb,4,ff9ac61f-8f98-4516-9342-7aa1177514d9_4
1149,ff9ac61f-8f98-4516-9342-7aa1177514d9,24,15 gb freedom data,5,ff9ac61f-8f98-4516-9342-7aa1177514d9_5
73,ffb30da4-4274-4dde-a755-eb5160834e8c,4,netel,0,ffb30da4-4274-4dde-a755-eb5160834e8c_0
39,ffb30da4-4274-4dde-a755-eb5160834e8c,6,netel,1,ffb30da4-4274-4dde-a755-eb5160834e8c_1
3049,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,hello,0,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_0
3049,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,i purchased the cannes outdoor sofa however i...,1,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_1
3049,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,are they machine washable,2,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_2
3049,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,,3,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_3
247,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,5,message with a rep,4,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_4


In [10]:
# preprocess text
df["MESSAGE"] = df["MESSAGE"].str.replace("|".join(remove_words),'')# remove certain texts from messages
df['word_count'] = df['MESSAGE'].apply(word_count)
df['processed_text'] = df['MESSAGE'].apply(process_text)
df = df[df['processed_text'].map(lambda d: len(d)) > 0]
print(df.shape)
df.tail(10)

(3496, 7)


Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id,word_count,processed_text
1240,ff097a90-5972-46d1-845a-3c9a399b70bc,1,i need help,0,ff097a90-5972-46d1-845a-3c9a399b70bc_0,3,"[need, help]"
1883,ff93a689-8d1a-49c2-a5bc-54dd321707a3,1,i am getting a phone for my son and looking to...,0,ff93a689-8d1a-49c2-a5bc-54dd321707a3_0,26,"[get, phone, son, look, plan, iphon, se, packag]"
859,ff9ac61f-8f98-4516-9342-7aa1177514d9,7,data + talk,0,ff9ac61f-8f98-4516-9342-7aa1177514d9_0,3,"[data, talk]"
993,ff9ac61f-8f98-4516-9342-7aa1177514d9,9,less than 10 gb,1,ff9ac61f-8f98-4516-9342-7aa1177514d9_1,4,[gb]
1284,ff9ac61f-8f98-4516-9342-7aa1177514d9,14,10 gb,2,ff9ac61f-8f98-4516-9342-7aa1177514d9_2,2,[gb]
883,ff9ac61f-8f98-4516-9342-7aa1177514d9,20,data + talk,3,ff9ac61f-8f98-4516-9342-7aa1177514d9_3,3,"[data, talk]"
1062,ff9ac61f-8f98-4516-9342-7aa1177514d9,22,more than 10 gb,4,ff9ac61f-8f98-4516-9342-7aa1177514d9_4,4,[gb]
1149,ff9ac61f-8f98-4516-9342-7aa1177514d9,24,15 gb freedom data,5,ff9ac61f-8f98-4516-9342-7aa1177514d9_5,4,"[gb, freedom, data]"
3049,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,i purchased the cannes outdoor sofa however i...,1,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_1,17,"[purchas, cann, outdoor, sofa, care, instruct,..."
3049,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,are they macne washable,2,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_2,5,"[macn, washabl]"


In [11]:
# df[df["conversation_id"]=="1895298307224761"]

## split df into 2 dfs
- df containing only 1 word sentences
- df containing multiple sentences

**ignore df containing only 1 word sentences for now**

In [12]:
df1 = df[df["processed_text"].str.len()==1]
df2 = df[df["processed_text"].str.len()!=1]
df2.head(5)

Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id,word_count,processed_text
2608,001bede7-810d-4284-b289-c29ab8a12bba,2,i need to activate new sim,1,001bede7-810d-4284-b289-c29ab8a12bba_1,7,"[need, activ, new, sim]"
2608,001bede7-810d-4284-b289-c29ab8a12bba,2,but it's got getting activated online,2,001bede7-810d-4284-b289-c29ab8a12bba_2,7,"[got, get, activ, onlin]"
987,0024cce3-b115-4a79-88c7-604d046076df,5,account details,0,0024cce3-b115-4a79-88c7-604d046076df_0,2,"[account, detail]"
3463,0028d3f2-964b-4903-bd37-3bb364459504,1,i would like to cancel service,0,0028d3f2-964b-4903-bd37-3bb364459504_0,6,"[cancel, servic]"
2591,00560815-b4b9-4919-bd75-5dbd07aadec2,5,i have a balance of but my wife 's account ag...,0,00560815-b4b9-4919-bd75-5dbd07aadec2_0,16,"[balanc, wife, account, say, suspend]"


In [13]:
df2['word_count'].describe()

count    2825.000000
mean        9.804956
std         7.598446
min         1.000000
25%         5.000000
50%         8.000000
75%        13.000000
max       144.000000
Name: word_count, dtype: float64

# Clustering on multi-word sentences

## Encode text into embeddings 

- optionally scale for NMF

In [14]:
texts = list(df2['processed_text'])
model = SentenceTransformer(pretrained_model)


In [15]:
embeddings = model.encode(texts)

In [16]:
print(embeddings.shape)
print(np.amin(embeddings), np.amax(embeddings))

(2825, 768)
-4.325707 5.028679


In [17]:
# scale embeddings to have min value 0
scaled_embeddings = embeddings + np.abs(np.amin(embeddings))
print(np.amin(scaled_embeddings), np.amax(scaled_embeddings))

0.0 9.354385


## Run NMF on embeddings

In [18]:
# Run the nmf model
nmf = NMF(
    n_components=best_num_topics,
    init='nndsvd',
    max_iter=500,
    l1_ratio=0.0,
    solver='cd',
    alpha=0.0,
    tol=1e-4,
    random_state=42,
).fit(scaled_embeddings)


## Create topic groupings

In [19]:
docweights = nmf.transform(scaled_embeddings)

In [20]:
nmf.components_.shape

(200, 768)

### get top k topics

In [21]:
# can go along each row

top_k_topics = get_top_k_topics(docweights, top_k = top_k)
top_k_topics[0], top_k_topics[1], top_k_topics[2], top_k_topics[3]

(array([66, 31, 92]),
 array([ 27,  42, 100]),
 array([104, 177,  66]),
 array([177,   3, 100]))

In [22]:
# Creating a temp df with the sesion id and topic number to join on
# with top k topics
session_ids = df2['session_id'].tolist()

df_merge = pd.DataFrame({
    'session_id': session_ids,
})

# create 2*k columns, each wtih top k topics and their associated keywords
for i in range(top_k_topics.shape[1]):
    df_merge[f"topic_{i+1}"] = top_k_topics[:,i]
    # df_merge[f"topic_{i+1}_key_words"] = df_merge[f"topic_{i+1}"].map(topic_dict)
# df_merge["topic_key_words"] = df_merge["topic"].map(topic_dict)
df_merge.head(5)

Unnamed: 0,session_id,topic_1,topic_2,topic_3
0,001bede7-810d-4284-b289-c29ab8a12bba_1,66,31,92
1,001bede7-810d-4284-b289-c29ab8a12bba_2,27,42,100
2,0024cce3-b115-4a79-88c7-604d046076df_0,104,177,66
3,0028d3f2-964b-4903-bd37-3bb364459504_0,177,3,100
4,00560815-b4b9-4919-bd75-5dbd07aadec2_0,192,138,42


In [23]:
df_topics2 = pd.merge(
    df2,
    df_merge,
    on='session_id',
    how='left'
)
df_topics2.tail(5)

Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id,word_count,processed_text,topic_1,topic_2,topic_3
2820,ff9ac61f-8f98-4516-9342-7aa1177514d9,7,data + talk,0,ff9ac61f-8f98-4516-9342-7aa1177514d9_0,3,"[data, talk]",100,33,99
2821,ff9ac61f-8f98-4516-9342-7aa1177514d9,20,data + talk,3,ff9ac61f-8f98-4516-9342-7aa1177514d9_3,3,"[data, talk]",100,33,99
2822,ff9ac61f-8f98-4516-9342-7aa1177514d9,24,15 gb freedom data,5,ff9ac61f-8f98-4516-9342-7aa1177514d9_5,4,"[gb, freedom, data]",177,104,9
2823,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,i purchased the cannes outdoor sofa however i...,1,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_1,17,"[purchas, cann, outdoor, sofa, care, instruct,...",100,192,33
2824,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2,2,are they macne washable,2,ffbf37b3-2b93-4875-9cbb-d5f5dc0ed3e2_2,5,"[macn, washabl]",66,144,177


In [24]:
print(df_topics2["topic_1"].unique())
print(len(df_topics2["topic_1"].unique()))

[ 66  27 104 177 192  33  28 142 110  15  81 100 148   3  18  24 149 155
 191 144 136  99  22 138 127 161 133  14  73  92  71  42  13  34 175 128
  58  35  10 114 169 195  46  75   6 109  31  50]
48


### Topics that pass a certain threshold

In [32]:
df_topics2[f"topics_threshold_>={docweights_threshold}"] = threshold_docweights(docweights, threshold = docweights_threshold)


In [33]:
df_topics2.head(5)

Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id,word_count,processed_text,topic_1,topic_2,topic_3,topics_threshold_>=0.5,topics_threshold_>=0.75
0,001bede7-810d-4284-b289-c29ab8a12bba,2,i need to activate new sim,1,001bede7-810d-4284-b289-c29ab8a12bba_1,7,"[need, activ, new, sim]",66,31,92,"[12, 24, 26, 31, 33, 42, 46, 66, 92, 100]","[12, 31, 66, 92]"
1,001bede7-810d-4284-b289-c29ab8a12bba,2,but it's got getting activated online,2,001bede7-810d-4284-b289-c29ab8a12bba_2,7,"[got, get, activ, onlin]",27,42,100,"[22, 27, 42, 66, 81, 92, 99, 100, 101, 104, 11...","[27, 42, 66, 81, 100, 104, 148, 177]"
2,0024cce3-b115-4a79-88c7-604d046076df,5,account details,0,0024cce3-b115-4a79-88c7-604d046076df_0,2,"[account, detail]",104,177,66,"[8, 61, 66, 85, 99, 104, 136, 177, 192]","[8, 66, 99, 104, 136, 177]"
3,0028d3f2-964b-4903-bd37-3bb364459504,1,i would like to cancel service,0,0028d3f2-964b-4903-bd37-3bb364459504_0,6,"[cancel, servic]",177,3,100,"[3, 17, 18, 33, 42, 46, 86, 92, 100, 104, 147,...","[3, 42, 46, 92, 100, 104, 177]"
4,00560815-b4b9-4919-bd75-5dbd07aadec2,5,i have a balance of but my wife 's account ag...,0,00560815-b4b9-4919-bd75-5dbd07aadec2_0,16,"[balanc, wife, account, say, suspend]",192,138,42,"[29, 31, 35, 42, 48, 61, 71, 73, 101, 110, 116...","[35, 42, 48, 61, 71, 73, 101, 110, 138, 150, 1..."


### map topics to their keywords

**method 1 (doesnt work)**
1. filter df to just dfs that have topic that pass a certain threhsold
2. treat that as a document, and use n_gram counts
3. the top k most frequent n grams become the topic key words

**method 2 (current method)**
1. filter df to just top topic
2. treat that as document and use n gram counts
3. the top k most frequent n grams become the topic key words


In [34]:
def get_top_k_keywords(df: pd.DataFrame, n_gram_range: Tuple[int, int] = (2, 5), k: int = 5):
    texts = list(df["processed_text"])
    counts = Counter() 
    for text in texts:
        for i in range(n_gram_range[0], n_gram_range[1]):
            counts.update(nltk.ngrams(text, 2))

    return [c[0] for c in counts.most_common(k)]

In [53]:
# df_tmp = df_topics2[df_topics2[f"topics_threshold_>={docweights_threshold}"].apply(lambda x: 2 in x)]
# print(df_tmp.shape)
# df_tmp.head(5)

(65, 15)


Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id,word_count,processed_text,topic_1,topic_2,topic_3,topics_threshold_>=0.5,topics_threshold_>=0.75,topic_1_key_words,topic_2_key_words,topic_3_key_words
50,029499a8-9c86-401d-b66a-9ba12bf2c068,5,can i pay my bill before my auto payment date,4,029499a8-9c86-401d-b66a-9ba12bf2c068_4,10,"[pay, bill, auto, payment, date]",142,66,104,"[2, 10, 17, 24, 28, 33, 35, 42, 66, 99, 104, 1...","[2, 10, 28, 33, 66, 104, 142, 144, 175]","[(just, wonder), (just, got), (hey, just), (go...","[(just, wonder), (just, got), (hey, just), (go...","[(just, wonder), (just, got), (hey, just), (go..."
61,044a8201-6808-4d37-a41a-753161fca415,1,how much do i need to pay you to buy out my pl...,0,044a8201-6808-4d37-a41a-753161fca415_0,15,"[need, pay, buy, plan, right]",142,177,33,"[2, 12, 18, 28, 33, 42, 66, 71, 73, 100, 142, ...","[2, 28, 33, 42, 73, 142, 177]","[(just, wonder), (just, got), (hey, just), (go...","[(just, wonder), (just, got), (hey, just), (go...","[(just, wonder), (just, got), (hey, just), (go..."
95,07ab6e8d-6066-4b7f-86d5-0ade29db411d,4,i paid the bill it reflected on my bank statem...,0,07ab6e8d-6066-4b7f-86d5-0ade29db411d_0,16,"[paid, bill, reflect, bank, statement, bill]",104,66,28,"[2, 10, 15, 17, 18, 28, 33, 42, 66, 87, 99, 10...","[2, 28, 42, 66, 99, 104, 127, 142, 175]","[(just, wonder), (just, got), (hey, just), (go...","[(just, wonder), (just, got), (hey, just), (go...","[(just, wonder), (just, got), (hey, just), (go..."
132,0cee3efc-838f-4572-a196-b4b6fd651ea3,2,i paid my bill april 29 via my cibc account,0,0cee3efc-838f-4572-a196-b4b6fd651ea3_0,11,"[paid, bill, april, cibc, account]",104,66,28,"[2, 10, 15, 17, 18, 28, 33, 42, 66, 87, 99, 10...","[2, 28, 42, 66, 99, 104, 127, 142, 175]","[(just, wonder), (just, got), (hey, just), (go...","[(just, wonder), (just, got), (hey, just), (go...","[(just, wonder), (just, got), (hey, just), (go..."
162,0f4676f7-de28-4f32-8446-8afab5bd7d66,2,but i still pay for a plan on it,4,0f4676f7-de28-4f32-8446-8afab5bd7d66_4,11,"[pay, plan]",33,142,66,"[2, 18, 30, 33, 35, 62, 66, 71, 95, 100, 142, ...","[2, 33, 35, 66, 142]","[(just, wonder), (just, got), (hey, just), (go...","[(just, wonder), (just, got), (hey, just), (go...","[(just, wonder), (just, got), (hey, just), (go..."


In [None]:
# # get top k keywords for each topic
# topic_dict = {-1:""} # intitial condition
# for topic_num in tqdm(range(best_num_topics)):
#     # subset df_topics2 by if topic_num apprears in threshold topics
#     df_tmp = df_topics2[df_topics2[f"topics_threshold_>={docweights_threshold}"].apply(lambda x: 1 in x)]
#     if len(df_tmp):
#         topic_dict[topic_num] = get_top_k_keywords(df_tmp, n_gram_range=(2, 4), k=n_top_words)


In [62]:
# get top k keywords for each topic
topic_dict = {-1:""} # intitial condition
for topic_num in tqdm(sorted(df_topics2.topic_1.unique())):
    if topic_num != -1: # avoid cases where there's no topic (i.e. max topic has weight 0 in docweights)
        df_tmp = df_topics2[df_topics2["topic_1"]==topic_num]
        topic_dict[topic_num] = get_top_k_keywords(df_tmp, n_gram_range=(2, 4), k=n_top_words)



100%|██████████| 48/48 [00:00<00:00, 829.13it/s]


In [68]:
list(topic_dict.items())[:3]

[(-1, ''),
 (3,
  [('want', 'cancel'),
   ('cancel', 'plan'),
   ('need', 'cancel'),
   ('cancel', 'home'),
   ('home', 'internet')]),
 (6,
  [('want', 'chang'),
   ('chang', 'address'),
   ('chang', 'bill'),
   ('bill', 'address'),
   ('chang', 'bank')])]

In [69]:
# map topic dict to df_topic_2
for i in range(top_k_topics.shape[1]):
    df_topics2[f"topic_{i+1}_key_words"] = df_topics2[f"topic_{i+1}"].map(topic_dict)

df_topics2.head(5)

Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id,word_count,processed_text,topic_1,topic_2,topic_3,topics_threshold_>=0.5,topics_threshold_>=0.75,topic_1_key_words,topic_2_key_words,topic_3_key_words
0,001bede7-810d-4284-b289-c29ab8a12bba,2,i need to activate new sim,1,001bede7-810d-4284-b289-c29ab8a12bba_1,7,"[need, activ, new, sim]",66,31,92,"[12, 24, 26, 31, 33, 42, 46, 66, 92, 100]","[12, 31, 66, 92]","[(activ, sim), (speak, agent), (sim, card), (c...","[(want, activ), (activ, sim), (tri, activ), (r...","[(abl, pay), (possibl, sim), (sim, card), (car..."
1,001bede7-810d-4284-b289-c29ab8a12bba,2,but it's got getting activated online,2,001bede7-810d-4284-b289-c29ab8a12bba_2,7,"[got, get, activ, onlin]",27,42,100,"[22, 27, 42, 66, 81, 92, 99, 100, 101, 104, 11...","[27, 42, 66, 81, 100, 104, 148, 177]","[(got, payment), (ts, get), (get, bill), (got,...","[(pay, remain), (remain, balanc), (need, talk)...","[(data, talk), (upgrad, phone), (just, receiv)..."
2,0024cce3-b115-4a79-88c7-604d046076df,5,account details,0,0024cce3-b115-4a79-88c7-604d046076df_0,2,"[account, detail]",104,177,66,"[8, 61, 66, 85, 99, 104, 136, 177, 192]","[8, 66, 99, 104, 136, 177]","[(account, detail), (just, want), (phone, bill...","[(gb, freedom), (freedom, data), (cancel, serv...","[(activ, sim), (speak, agent), (sim, card), (c..."
3,0028d3f2-964b-4903-bd37-3bb364459504,1,i would like to cancel service,0,0028d3f2-964b-4903-bd37-3bb364459504_0,6,"[cancel, servic]",177,3,100,"[3, 17, 18, 33, 42, 46, 86, 92, 100, 104, 147,...","[3, 42, 46, 92, 100, 104, 177]","[(gb, freedom), (freedom, data), (cancel, serv...","[(want, cancel), (cancel, plan), (need, cancel...","[(data, talk), (upgrad, phone), (just, receiv)..."
4,00560815-b4b9-4919-bd75-5dbd07aadec2,5,i have a balance of but my wife 's account ag...,0,00560815-b4b9-4919-bd75-5dbd07aadec2_0,16,"[balanc, wife, account, say, suspend]",192,138,42,"[29, 31, 35, 42, 48, 61, 71, 73, 101, 110, 116...","[35, 42, 48, 61, 71, 73, 101, 110, 138, 150, 1...","[(mytab, balanc), (samsung, galaxi), (check, m...","[(transcript, url), (url, https://freedomcusto...","[(pay, remain), (remain, balanc), (need, talk)..."


In [71]:
df_topics2[df_topics2["topic_3"]==9]

Unnamed: 0,conversation_id,message_id,MESSAGE,sentence_id,session_id,word_count,processed_text,topic_1,topic_2,topic_3,topics_threshold_>=0.5,topics_threshold_>=0.75,topic_1_key_words,topic_2_key_words,topic_3_key_words
22,0150b1e8-7800-47c9-934c-1f8dabbb1a01,11,15 gb freedom data,2,0150b1e8-7800-47c9-934c-1f8dabbb1a01_2,4,"[gb, freedom, data]",177,104,9,"[9, 10, 13, 15, 18, 21, 27, 28, 33, 37, 46, 92...","[9, 13, 21, 27, 92, 104, 138, 177]","[(gb, freedom), (freedom, data), (cancel, serv...","[(account, detail), (just, want), (phone, bill...",
27,01793a58-8e8b-4c25-af7d-cd486259eed2,8,i want to cancel my payment,0,01793a58-8e8b-4c25-af7d-cd486259eed2_0,6,"[want, cancel, payment]",3,142,9,"[3, 9, 24, 42, 142, 150]","[3, 9, 142]","[(want, cancel), (cancel, plan), (need, cancel...","[(pay, bill), (want, pay), (add, line), (make,...",
59,036cd2b1-ee8f-411b-8025-7a55f09878f8,14,15 gb freedom data,2,036cd2b1-ee8f-411b-8025-7a55f09878f8_2,4,"[gb, freedom, data]",177,104,9,"[9, 10, 13, 15, 18, 21, 27, 28, 33, 37, 46, 92...","[9, 13, 21, 27, 92, 104, 138, 177]","[(gb, freedom), (freedom, data), (cancel, serv...","[(account, detail), (just, want), (phone, bill...",
137,0d084769-c927-4fe1-b26c-cf0bd7e8e90a,1,i want a new phone but your site is acting up,0,0d084769-c927-4fe1-b26c-cf0bd7e8e90a_0,11,"[want, new, phone, site, act]",14,15,9,"[9, 14, 15, 24, 28, 31, 42, 71, 126, 142, 144]","[9, 14, 15, 24, 42]","[(new, phone), (new, sim), (activ, new), (sim,...","[(sim, card), (phone, number), (account, numbe...",
219,12b62db5-ab6e-44e1-83f0-d5405365a4df,1,freedom mobile iam having trouble for over on...,0,12b62db5-ab6e-44e1-83f0-d5405365a4df_0,61,"[freedom, mobil, iam, have, troubl, year, get,...",73,177,9,"[9, 13, 24, 72, 73, 81, 105, 128, 142, 177]","[9, 13, 73, 142, 177]","[(freedom, mobil), (wind, mobil), (mobil, year...","[(gb, freedom), (freedom, data), (cancel, serv...",
227,1396070490425721,11,15 gb freedom data,2,1396070490425721_2,4,"[gb, freedom, data]",177,104,9,"[9, 10, 13, 15, 18, 21, 27, 28, 33, 37, 46, 92...","[9, 13, 21, 27, 92, 104, 138, 177]","[(gb, freedom), (freedom, data), (cancel, serv...","[(account, detail), (just, want), (phone, bill...",
228,1396070490425721,16,i am with freedom mobile for over 10 years,5,1396070490425721_5,11,"[freedom, mobil, year]",73,177,9,"[9, 13, 24, 72, 73, 81, 105, 128, 142, 177]","[9, 13, 73, 142, 177]","[(freedom, mobil), (wind, mobil), (mobil, year...","[(gb, freedom), (freedom, data), (cancel, serv...",
255,1630882257015503,3,i want to cancel my home internet,0,1630882257015503_0,7,"[want, cancel, home, internet]",3,142,9,"[3, 9, 24, 42, 142, 150]","[3, 9, 142]","[(want, cancel), (cancel, plan), (need, cancel...","[(pay, bill), (want, pay), (add, line), (make,...",
258,1630882257015503,16,i want to cancel,4,1630882257015503_4,4,"[want, cancel]",3,142,9,"[3, 9, 24, 42, 142, 150]","[3, 9, 142]","[(want, cancel), (cancel, plan), (need, cancel...","[(pay, bill), (want, pay), (add, line), (make,...",
280,1844269545616220,3,i need to update my address,2,1844269545616220_2,6,"[need, updat, address]",18,175,9,"[9, 18, 28, 31, 33, 65, 71, 73, 75, 81, 100, 1...","[9, 18, 28, 31, 71, 175]","[(need, chang), (chang, address), (updat, addr...","[(reset, pin), (need, reset), (reset, voicemai...",


# NMF on single-word sentences

Currently not doing

# Aggregate at conversation level text

In [None]:
# TODO: Combine df_topics_1 and df_topics_2

# df_topics_final 

In [None]:
# sort by conversation and sentence id to ensure order
df_conv_topics = df_topic_final.sort_values(by=["conversation_id", "sentence_id"])


In [None]:
# check 
# df_conv_topics[df_conv_topics["conversation_id"]=="1895298307224761"]

In [None]:
df_conv_topics["concat_text"] = df_conv_topics.groupby(["conversation_id"])["MESSAGE"].transform(
                lambda x: " ".join(x)
            )

# generate new dfs
df_conv_concat_text = df_conv_topics.groupby(["conversation_id"])["processed_text"].agg(sum).reset_index(name="preprocessed_concat_text")
df_combined_topics = df_conv_topics.groupby(["conversation_id"])["topic"].apply(list).reset_index(name="combined_topics")
df_combined_key_words = df_conv_topics.groupby(["conversation_id"])["topic_key_words"].apply(list).reset_index(name="combined_topic_key_words")

In [None]:
# join dfs
df_conv_topics = df_conv_topics.merge(df_conv_concat_text, on="conversation_id")\
                    .merge(df_combined_topics, on="conversation_id")\
                    .merge(df_combined_key_words, on="conversation_id")\
                    .drop_duplicates(subset=["conversation_id"])\
                    .reset_index()\
                    [["conversation_id", "concat_text", "preprocessed_concat_text", "combined_topics", "combined_topic_key_words"]]



In [None]:
df_conv_topics.tail(5)

In [None]:
# check 
# df_conv_topics[df_conv_topics["conversation_id"]=="1895298307224761"]

# Saving

In [None]:
# save results, vectorizer, and nmf model
df_topic_final.to_csv(f"nmf_topics/{save_folder}/freedom_msg_level_nmf_clusters.csv", index=False)
df_conv_topics.to_csv(f"nmf_topics/{save_folder}/freedom_conv_level_nmf_clusters.csv", index=False)
dump(tfidf_vectorizer, f"nmf_models/{save_folder}/msg_level_vectorizer.joblib")
dump(nmf, f"nmf_models/{save_folder}/msg_level_nmf.joblib")