# Conversations Data Analysis

In [None]:
import configparser
import os
import pickle
import re
import warnings

import emoji
import nltk
nltk.download('punkt')
import numpy as np
import pandas as pd
import plotly.express as px
from langdetect import detect

warnings.filterwarnings('ignore')

In [None]:
# Read the notebook configuration (data locations) from config.ini.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail fast here instead of running into a
# confusing KeyError on config["data"] further down.
config = configparser.ConfigParser()
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini not found in the working directory")

## Pull Raw Data - Download Customer Service conversations

In [None]:
# Load the raw conversations table from the CSV path given in the config
conversations_csv = config["data"]["conversations_file"]
df_conv = pd.read_csv(conversations_csv)

In [None]:
df_conv.head()

In [None]:
df_conv.info()

## Data preprocessing
### Transform created_at column to datetime

In [None]:
df_conv['created_at'] = pd.to_datetime(df_conv['created_at'], errors='coerce')
df_conv.info()

In [None]:
# Check if all tweet ids are unique - they should be
print("Row_count:", df_conv.shape[0])
print("N tweet id:", df_conv['tweet_id'].nunique())

### Order data by create date

In [None]:
df_conv.head()

In [None]:
# Put the tweets in chronological order and renumber the index 0..n-1
df_conv = df_conv.sort_values(by='created_at', ignore_index=True)
df_conv.head()

### Add conversation_id

In [None]:
# Make a new column called conversation_id.
# A tweet that replies to a known tweet inherits that tweet's conversation_id;
# every other tweet keeps its own tweet_id as conversation_id.
# Rows are visited in their current order (chronological, index 0..n-1 after
# the sort above) and the parent is resolved through a tweet_id -> row-position
# dictionary, so the pass is linear instead of scanning the whole frame once
# per row.
tweet_ids = df_conv['tweet_id'].tolist()
reply_targets = df_conv['in_response_to_tweet_id'].tolist()

# Row position of the first occurrence of each tweet_id
position_of_tweet = {}
for position, tweet_id in enumerate(tweet_ids):
    position_of_tweet.setdefault(tweet_id, position)

conversation_ids = list(tweet_ids)
whole = len(tweet_ids)
for i, prev_tweet in enumerate(reply_targets):
    if i % 50000 == 0:
        print("{} out of {} were preprocessed".format(i, whole))
    if pd.isna(prev_tweet):
        # Not a reply: the tweet starts its own conversation
        continue
    parent_position = position_of_tweet.get(prev_tweet)
    if parent_position is not None:
        # Take the parent's conversation_id as it stands right now, exactly
        # like the row-by-row lookup did
        conversation_ids[i] = conversation_ids[parent_position]

df_conv['conversation_id'] = conversation_ids

In [None]:
# Timing of the conversation_id cell above on the full dataset:
# started at 13:12, finished at 15:44 (about 2.5 hours)

In [None]:
df_conv.head()

A single conversation can involve more than two authors: for example, several users struggling with the same issue and one person from support. See the example below.

In [None]:
df_conv[df_conv['conversation_id']==119250]

### Add author_type

In [None]:
(df_conv['author_id'].value_counts().to_frame().reset_index()).sort_values('count', ascending=False)

In [None]:
# Purely numeric author ids are labelled "user", every other id "support"
df_conv['author_type'] = [
    "user" if author.isnumeric() else "support"
    for author in df_conv['author_id']
]

In [None]:
df_conv[["author_id", "author_type"]]

### Filter out links to previous tweets


In [None]:
# Work on a deep copy so the raw frame stays available for the before/after
# comparisons in the next cells
df_conv_filtered = df_conv.copy(deep=True)
# Strip @-mentions (followed by a space, or at the very end of the text).
# The pattern is a raw string: "\S" inside a normal literal is an invalid
# escape sequence, which Python reports as a warning.
MENTION_CLEANER = re.compile(r"@\S+ |@\S+$")
df_conv_filtered['text'] = df_conv_filtered['text'].apply(lambda s: MENTION_CLEANER.sub("", s))

In [None]:
# before filtering out links to previous tweets
selected_conv_id = 1278330
for idx, item in df_conv.loc[df_conv['conversation_id'] == selected_conv_id, :].iterrows():
    print(f"{idx} | {item['author_id']} | {item['text']}")

In [None]:
# after filtering out links to previous tweets
selected_conv_id = 1278330
for idx, item in df_conv_filtered.loc[df_conv_filtered['conversation_id'] == selected_conv_id, :].iterrows():
    print(f"{idx} | {item['author_id']} | {item['text']}")

In [None]:
# before filtering out links to previous tweets
selected_conv_id = 2763707
for idx, item in df_conv.loc[df_conv['conversation_id'] == selected_conv_id, :].iterrows():
    print(f"{idx} | {item['author_id']} | {item['text']}")

In [None]:
# after filtering out links to previous tweets
selected_conv_id = 2763707
for idx, item in df_conv_filtered.loc[df_conv_filtered['conversation_id'] == selected_conv_id, :].iterrows():
    print(f"{idx} | {item['author_id']} | {item['text']}")

### Add parameters linked to body length

In [None]:
# Length features of each utterance, computed on the text as it is at this
# point of the notebook: after @-mention stripping, before the emoji, link and
# whitespace cleaning cells further down.

# Number of characters in the utterance
df_conv_filtered['utterance_size'] = df_conv_filtered['text'].str.len()

# Number of tokens in the utterance. Tokens are obtained by splitting on single
# spaces, so consecutive spaces yield empty tokens that are counted as well.
df_conv_filtered['utterance_tokens_size'] = df_conv_filtered['text'].apply(lambda x: len(str(x).split(' ')))

In [None]:
# after filtering out links to previous tweets
selected_conv_id = 2763707
df_conv_filtered.loc[df_conv_filtered['conversation_id'] == selected_conv_id, :]

### Filter data

In [None]:
# Number of unique conversations before filtering
print("Number of unique conversations before filtering:", df_conv['conversation_id'].nunique())
# Number of tweets before filtering
print("Number of tweets before filtering:", df_conv['tweet_id'].nunique())

#### Filter out empty strings

In [None]:
# Apply filter
df_conv_filtered = df_conv_filtered[df_conv_filtered['text']!=""]
# Number of unique conversations after filtering
print("Number of unique conversations after filtering:", df_conv_filtered['conversation_id'].nunique())
# Number of tweets after filtering
print("Number of tweets after filtering:", df_conv_filtered['tweet_id'].nunique())

#### Filter out emoticons

In [None]:
df_conv_filtered_emot = df_conv_filtered['text'].apply(lambda s: emoji.replace_emoji(s, ''))

In [None]:
# Example before filtering
print(df_conv.loc[1768087, 'text'])

In [None]:
# Example after filtering
print(df_conv_filtered_emot[1768087])

In [None]:
# Apply filter
df_conv_filtered['text'] = df_conv_filtered_emot

#### Filter out empty and one char strings

In [None]:
# The length columns were computed before the emoji were removed from `text`,
# so refresh them first; otherwise tweets that consisted only of emoji (and are
# now empty) would slip through the length filter below.
df_conv_filtered['utterance_size'] = df_conv_filtered['text'].str.len()
df_conv_filtered['utterance_tokens_size'] = df_conv_filtered['text'].apply(lambda x: len(str(x).split(' ')))
# Apply filter: keep utterances longer than one character
df_conv_filtered = df_conv_filtered.loc[df_conv_filtered['utterance_size'] > 1, :]
# Number of unique conversations after filtering
print("Number of unique conversations after filtering empty strings:", df_conv_filtered['conversation_id'].nunique())
# Number of tweets after filtering
print("Number of tweets after filtering empty strings:", df_conv_filtered['tweet_id'].nunique())

#### Replace web links

In [None]:
# Swap every http(s) URL in the messages for a '[link]' placeholder
LINK_CLEANER = re.compile(r'\b(?:https?://)\S+', flags=re.IGNORECASE)
text_without_links = df_conv_filtered['text'].str.replace(LINK_CLEANER, '[link]', regex=True)
df_conv_filtered['text'] = text_without_links

#### Remove additional whitespaces and align punctuation

In [None]:
# Collapse every run of whitespace and/or U+FEFF characters into one space
WHITESPACES_CLEANER = re.compile(r'(\uFEFF|\s)+')
collapsed_text = df_conv_filtered['text'].str.replace(WHITESPACES_CLEANER, ' ', regex=True)
df_conv_filtered['text'] = collapsed_text

In [None]:
# Glue punctuation to the preceding word: "word ." becomes "word."
WHITESPACES_BEFORE_PUNCTUATION_CLEANER = re.compile(r'(\w)\s+([.,;:?!])')
aligned_text = df_conv_filtered['text'].str.replace(
    WHITESPACES_BEFORE_PUNCTUATION_CLEANER, r'\1\2', regex=True
)
df_conv_filtered['text'] = aligned_text

### Add language info

In [None]:
def detect_lang(x):
    """Return the language code detected for ``x``, or ``'unknown'``.

    Detection is best effort: whenever ``detect`` raises, the value is
    labelled ``'unknown'`` instead of aborting the whole ``apply``.

    Args:
        x: The text to classify.

    Returns:
        Whatever ``detect(x)`` returns, or the string ``'unknown'`` when
        detection fails.
    """
    try:
        return detect(x)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must still propagate so a long-running detection pass can be stopped.
        return 'unknown'

In [None]:
# Run language detection on every utterance
df_conv_filtered_lang = df_conv_filtered['text'].apply(detect_lang)

In [None]:
df_conv_filtered['utterance_lang'] = df_conv_filtered_lang

In [None]:
# The language of a conversation is the language detected for its longest
# utterance (largest utterance_size)
conv_lang = df_conv_filtered.groupby(['conversation_id']).apply(
    lambda grp: grp.sort_values('utterance_size', ascending=False)['utterance_lang'].iloc[0]
)
df_conv_lang = conv_lang.to_frame(name="conversation_lang")
# Number of conversations per detected language
df_conv_lang_counts = df_conv_lang.value_counts().reset_index(name='count')
df_conv_lang_counts.head()

In [None]:
fig = px.bar(df_conv_lang_counts, x='conversation_lang', y='count',
             title="Number of conversations in a given language",
            height=600,
            # text_auto=True
            )
fig.update_layout(xaxis_title='Conversation language',
                  yaxis_title='Count')
fig.show()

In [None]:
# Merge detected language info with other data 
df_conv_filtered = df_conv_filtered.merge(df_conv_lang, left_on='conversation_id', right_index=True)
df_conv_filtered.shape

## Read/write preprocessed data

In [None]:
## Save data to file
# df_conv_filtered.to_csv(f'{config["data"]["data_folder"]}/twitter_sample.csv')
# df_conv_filtered.to_parquet(f'{config["data"]["data_folder"]}/twitter_sample.parq')
# df_conv_filtered.to_csv(f'{config["data"]["data_folder"]}/twitter_entire_dataset.csv')
# df_conv_filtered.to_parquet(f'{config["data"]["data_folder"]}/twitter_entire_dataset.parq')

In [None]:
# Read data from file
print(os.getcwd())
df_conv_filtered = pd.read_parquet(f'{config["data"]["data_folder"]}/twitter_entire_dataset.parq')

df_conv_filtered['created_at'] = pd.to_datetime(df_conv_filtered['created_at'], errors='coerce')

In [None]:
df_conv_filtered.info()

## EDA
### Number of conversations over time

In [None]:
# Conversation id, creation timestamp and creation month (YYYY-MM) per tweet.
# .assign() returns a new frame, so the month column is not written into a
# slice of df_conv_filtered (chained assignment).
df_conv_created = df_conv_filtered[['conversation_id', 'created_at']].assign(
    created_month=df_conv_filtered['created_at'].dt.strftime('%Y-%m')
)
# df_conv_created

In [None]:
df_conv_vs_time = df_conv_created.groupby('created_month', as_index=False).agg(
                conversation_count=pd.NamedAgg(column="conversation_id", aggfunc="nunique"),
)
# df_conv_vs_time

In [None]:
fig = px.bar(df_conv_vs_time[(df_conv_vs_time['created_month']>='2014-01') & (df_conv_vs_time['created_month']<='2024-04')], x='created_month', y='conversation_count',
             title="Number of conversations over time",
            height=600,
            text_auto='.2s'
            )
fig.update_layout(xaxis_title='Created month',
                  yaxis_title='Count')

fig.show()

### Check number of characters per conversation and number of tweets per conversation

In [None]:
df_conv_length = df_conv_filtered.groupby('conversation_id').agg(
                conversation_length=pd.NamedAgg(column="tweet_id", aggfunc="count"),
                conversation_size=pd.NamedAgg(column="utterance_size", aggfunc="sum")
)
df_conv_length

In [None]:
df_conv_length.describe()

In [None]:
conversation_count_vs_length = (df_conv_length['conversation_length'].value_counts().to_frame().reset_index()).sort_values('count')
conversation_count_vs_length

### Distribution of number of tweets within conversations

In [None]:
fig = px.bar(conversation_count_vs_length, x='conversation_length', y='count',
             title="Number of conversations vs. conversation length",
            height=600,
            # text_auto=True
            )
fig.update_xaxes(range=[0, 50])
fig.update_layout(xaxis_title='Conversation length',
                  yaxis_title='Count')
fig.show()

### Distribution of number of characters (length) of conversations

In [None]:
fig = px.histogram(df_conv_length, x='conversation_size', 
             title="Distribution of conversation size",
            # nbins=100,
            height=600,
            # text_auto=True
            )
fig.update_xaxes(range=[0, 1000])
fig.update_layout(xaxis_title='Conversation size',
                  yaxis_title='Count')
fig.show()

### For each conversation length (number of tweets in conversation) how many come from the users and how many from the support

In [None]:
author_conversation = df_conv_filtered.groupby(['conversation_id', 'author_type'], as_index=True).agg(
                author_conversation_count=pd.NamedAgg(column="tweet_id", aggfunc="count")
).merge(df_conv_length, left_index=True, right_index=True).reset_index(level=1)
author_conversation

In [None]:
author_conversation_count = author_conversation.groupby(['conversation_length', 'author_type'], as_index=False).sum()
author_conversation_count

In [None]:
author_conversation_count = author_conversation_count.merge(conversation_count_vs_length, on='conversation_length')
author_conversation_count

In [None]:
author_conversation_count['author_conversation_rate'] = author_conversation_count['author_conversation_count'] / (author_conversation_count['conversation_length'] * author_conversation_count['count'])
author_conversation_count

In [None]:
fig = px.bar(author_conversation_count, x="conversation_length", y="author_conversation_rate", color="author_type", 
             hover_data=['author_conversation_count', 'count'],
             title="Author conversation rate vs. conversation length",
            height=600,
            )
fig.update_xaxes(range=[0, 20])
fig.show()

### Checking the distribution of number of tokens (words) in each tweet

In [None]:
fig = px.histogram(df_conv_filtered, x='utterance_tokens_size', 
             title="Distribution of number of tokens in message/utterance",
            height=600,
            )
fig.update_xaxes(range=[0, 100])
fig.update_layout(xaxis_title='Number of tokens',
                  yaxis_title='Count')
fig.show()

In [None]:
df_conv_filtered[['utterance_tokens_size']].describe(percentiles=np.arange(0.1, 1, 0.1))

In [None]:
df_conv_filtered[['utterance_tokens_size']].quantile(0.95)

### Checking number of conversations in each language

In [None]:
# Conversation language = language of the longest utterance in the conversation
conv_lang = df_conv_filtered.groupby(['conversation_id']).apply(
    lambda grp: grp.sort_values('utterance_size', ascending=False)['utterance_lang'].iloc[0]
)
df_conv_lang = conv_lang.to_frame(name="conversation_lang")
# Count conversations per language
df_conv_lang_counts = df_conv_lang.value_counts().reset_index(name='count')
df_conv_lang_counts.head()

In [None]:
fig = px.bar(df_conv_lang_counts, x='conversation_lang', y='count',
             title="Number of conversations in a given language",
            height=600,
            # text_auto=True
            )
fig.update_layout(xaxis_title='Conversation language',
                  yaxis_title='Count')
fig.show()

## Sample chats

### Manually selected chat

In [None]:
selected_conv_id = 119246

Selected conversation before preprocessing:

In [None]:
# for idx, item in df_conv.loc[df_conv['conversation_id']==selected_conv_id, :].iterrows():
#     print(f"{idx} | {item['author_type']} | {item['text']}")

The same conversation after preprocessing:

In [None]:
for idx, item in df_conv_filtered.loc[df_conv_filtered['conversation_id']==selected_conv_id, :].iterrows():
    print(f"| {item['author_type']} | {item['text']}")

In [None]:
df_conv_filtered.loc[df_conv_filtered['conversation_id']==selected_conv_id, :]

### Randomly selected chats

In [None]:
rng = np.random.default_rng(seed=11)

sample_conv_ids = rng.choice(df_conv_filtered['conversation_id'].unique(), 10, replace=False)
# sample_conv_ids

In [None]:
for conv_id in sample_conv_ids:
    print('-'*10 + str(conv_id) + '-'*10)
    for idx, item in df_conv_filtered.loc[df_conv_filtered['conversation_id']==conv_id, :].iterrows():
        print(f"{item['text']}")

# BERTopic

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech, TextGeneration
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

## Prepare conversation data

### Select time frames and English language

In [None]:
# Number of unique conversations before selection
df_conv_filtered['conversation_id'].nunique()

In [None]:
# Time span covered by the data. The timestamp column of this dataset is
# `created_at`; there is no `created_date` column.
df_conv_filtered['created_at'].min(), df_conv_filtered['created_at'].max()

In [None]:
begin_date = '2022-01-01'
end_date = '2024-01-01'

In [None]:
# Keep tweets of English conversations created in [begin_date, end_date).
# The timestamp column of this dataset is `created_at` (not `created_date`).
selection_mask = (df_conv_filtered['created_at'] >= begin_date) & \
                 (df_conv_filtered['created_at'] < end_date) & \
                 (df_conv_filtered['conversation_lang'] == 'en')
# Explicit copy: columns are added to the selection further down
df_conv_selected = df_conv_filtered[selection_mask].copy()

In [None]:
# Time span of the selection (the timestamp column is `created_at`)
df_conv_selected['created_at'].min(), df_conv_selected['created_at'].max()

In [None]:
# Number of unique conversations after selection
df_conv_selected['conversation_id'].nunique()

In [None]:
# Number of unique organizations.
# This dataset has no `author_domain` column; organizations are counted as the
# distinct support-side accounts (author_type == 'support').
df_conv_selected.loc[df_conv_selected['author_type'] == 'support', 'author_id'].nunique()

### Cut too long messages

In [None]:
# Max_tokens is set based on distribution of number of tokens in message/utterance
max_tokens = 120

# Build the `body` column used by the following cells from the cleaned tweet
# `text` (there is no pre-existing `body` column), keeping at most max_tokens
# space-separated tokens per message. .assign() returns a new frame instead of
# writing into a slice of df_conv_filtered.
df_conv_selected = df_conv_selected.assign(
    body=df_conv_selected['text'].apply(lambda x: ' '.join(str(x).split(' ')[:max_tokens]))
)

### Join messages into one conversation

In [None]:
# One row per conversation: number of messages, all message bodies joined with
# newlines, and the timestamp of the first message of the group.
# The message id and timestamp columns of this dataset are `tweet_id` and
# `created_at` (not `id` / `created_date`).
df_conversations = df_conv_selected.groupby(['conversation_id'], as_index=False).agg(
                conversation_length=pd.NamedAgg(column="tweet_id", aggfunc="count"),
                conversation_body=pd.NamedAgg(column="body", aggfunc=lambda x: '\n'.join(x.astype(str))), 
                conversation_date=pd.NamedAgg(column="created_at", aggfunc="first")
)

In [None]:
# Count number of tokens (words) in each conversation
df_conversations_tokens = df_conv_selected[['conversation_id', 'body']].groupby(['conversation_id'])['body']\
    .apply(lambda x: len(nltk.word_tokenize(' '.join(x.astype(str))))).to_frame().reset_index().rename(columns={'body': 'tokens_count'})

df_conversations_tokens

In [None]:
# Add info about number of tokens
df_conversations = df_conversations.merge(df_conversations_tokens, how='left', on='conversation_id')

In [None]:
fig = px.histogram(df_conversations, x='tokens_count', 
             title="Distribution of number of tokens in conversations",
            # nbins=100,
            height=600,
            # text_auto=True
            )
fig.update_xaxes(range=[0, 1500])
fig.show()

In [None]:
df_conversations.describe(percentiles=np.arange(0.1, 1, 0.1))[['tokens_count']]

### Remove duplicated conversations

In [None]:
# All duplicated conversations ('keep=False' marks all duplicates as True)

# .copy() so that the month column below is added to an independent frame and
# not to a slice of df_conversations
df_duplicated_convs = df_conversations.loc[df_conversations['conversation_body'].duplicated(keep=False), :].copy()
df_duplicated_convs['conversation_month'] = df_duplicated_convs['conversation_date'].dt.strftime('%Y-%m')

df_duplicated_convs#.to_csv('duplicated_conv.csv', sep='\t')

In [None]:
df_duplicated_convs_per_month = df_duplicated_convs.groupby('conversation_month', as_index=False)[['conversation_id']].count()

fig = px.bar(df_duplicated_convs_per_month[df_duplicated_convs_per_month['conversation_month']>='2023-01'], x='conversation_month', y='conversation_id',
             title="Number of duplicated conversations per month",
            height=600,
            text_auto=".2s"
            )
fig.update_layout(xaxis_title='Month',
                  yaxis_title='Count')
fig.show()

In [None]:
# Count duplicates by month
df_duplicated_convs_month_count = df_duplicated_convs.groupby(['conversation_body', 'conversation_month'], as_index=False).agg(
    conversations_count=pd.NamedAgg(column="conversation_id", aggfunc="count"),
)

df_duplicated_convs_month_count

In [None]:
# Count duplicates and represent duplicates by the first occurrence
df_duplicated_convs_count = df_duplicated_convs.groupby('conversation_body', as_index=False).agg(
    conversations_count=pd.NamedAgg(column="conversation_id", aggfunc="count"),
    conversation_id=pd.NamedAgg(column="conversation_id", aggfunc="first")
)[['conversation_id', 'conversation_body', 'conversations_count']].sort_values('conversations_count', ascending=False)

df_duplicated_convs_count

In [None]:
fig = px.histogram(df_conversations[df_conversations['conversation_body'].duplicated()], x='conversation_length', 
             title="Distribution of conversation_length in duplicated conversations",
            # nbins=100,
            height=600,
            # text_auto=True
            )
# fig.update_xaxes(range=[0, 1000])
fig.show()

In [None]:
# Define unique conversations
df_conversations_unique = df_conversations[~df_conversations['conversation_body'].duplicated()].reset_index(drop=True)
df_conversations_unique['conversation_month']= df_conversations_unique['conversation_date'].dt.strftime('%Y-%m')
df_conversations_unique = df_conversations_unique.merge(df_duplicated_convs_count, how='left', on=['conversation_id', 'conversation_body'])
df_conversations_unique = df_conversations_unique.fillna({'conversations_count': 1}).astype({'conversations_count': int})
df_conversations_unique

In [None]:
# Consistency check
df_conversations_unique.shape[0], df_conversations_unique['conversations_count'].sum(), df_conversations.shape[0]

### Define final data

In [None]:
# All unique conversations 2022-2023
conversations_data = df_conversations_unique['conversation_body'].tolist()
len(conversations_data)

In [None]:
timestamps = df_conversations_unique['conversation_date'].dt.strftime('%Y-%m').to_list()
len(timestamps)

In [None]:
df_conversations_unique['conversation_date'].min(), df_conversations_unique['conversation_date'].max()

In [None]:
# All conversations in 2023
df_conversations[(df_conversations['conversation_date'] >= '2023-01-01') & \
                 (df_conversations['conversation_date'] < '2024-01-01')].shape[0]

In [None]:
# Define a mask for unique conversations in 2023
conv_2023_mask = (df_conversations_unique['conversation_date'] >= '2023-01-01') & \
                 (df_conversations_unique['conversation_date'] < '2024-01-01')

In [None]:
# All unique conversations in 2023
conversations_data_2023 = df_conversations_unique.loc[conv_2023_mask, 'conversation_body'].tolist()
len(conversations_data_2023)

In [None]:
timestamps_2023 = df_conversations_unique.loc[conv_2023_mask, 'conversation_date'].dt.strftime('%Y-%m').to_list()
len(timestamps_2023)

In [None]:
df_conversations_unique.loc[conv_2023_mask, 'conversation_date'].min(), df_conversations_unique.loc[conv_2023_mask, 'conversation_date'].max()

## Define stopwords

Stopwords list downloaded from https://github.com/stopwords-iso/stopwords-en/blob/master/stopwords-en.txt

In [None]:
# One stopword per line. The file is read as plain text on purpose:
# pd.read_csv applies its default missing-value parsing, which turns entries
# such as "null", "nan" or "NA" into NaN (and then into the string 'nan' after
# astype(str)), silently corrupting the list.
with open('stopwords-en.txt', encoding='utf-8') as stopwords_file:
    stopwords_list = [line.strip() for line in stopwords_file if line.strip()]

In [None]:
# sorted(stopwords_list)

## Detailed BERTopic pipeline

### Step 1 - Embedding documents

##### Pre-calculate embeddings

In [None]:
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2") # It takes ~20 min to compute embeddings for ~30k chats
# conversations_embeddings = embedding_model.encode(conversations_data, show_progress_bar=True)

In [None]:
# conversations_embeddings.shape

In [None]:
embedding_model_gte_base = SentenceTransformer('thenlper/gte-base')
conversations_embeddings_gte_base_2022_2023 = embedding_model_gte_base.encode(conversations_data, show_progress_bar=True)

In [None]:
# Store sentences & embeddings on disc
with open("embeddings_gte_base_2022-2023.pkl", "wb") as fOut:
    pickle.dump({"conversations": conversations_data, "embeddings": conversations_embeddings_gte_base_2022_2023}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

# Load sentences & embeddings from disc
# with open("embeddings_gte_base_2022-2023.pkl", "rb") as fIn:
#     stored_data = pickle.load(fIn)
#     conversations_data = stored_data["conversations"]
#     conversations_embeddings_gte_base_2022_2023 = stored_data["embeddings"]

In [None]:
conversations_embeddings_gte_base_2022_2023.shape

In [None]:
len(conversations_data)

In [None]:
# Embeddings for conversations 2023: pick the rows flagged by the 2023 mask
mask_2023 = conv_2023_mask.to_numpy()
conversations_embeddings_gte_base_2023 = conversations_embeddings_gte_base_2022_2023[mask_2023]
conversations_embeddings_gte_base_2023.shape

In [None]:
len(conversations_data_2023)

### Step 2 - Reducing dimensionality of embeddings

In [None]:
# To prevent stochastic behavior set random_state
# umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

### Step 3 - Clustering reduced embeddings into topics

In [None]:
# hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

### Step 4 - Tokenization of topics

In [None]:
# vectorizer_model = CountVectorizer(stop_words=stopwords_list, min_df=10, ngram_range=(1, 2))

### Step 5 - Weight tokens, create topic representation

In [None]:
# ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

### Step 6 - (Optional) Fine-tune topic representations

In [None]:
# # KeyBERT
# keybert_model = KeyBERTInspired()

# # Part-of-Speech
# pos_model = PartOfSpeech("en_core_web_sm")

## MMR
# mmr_model = MaximalMarginalRelevance(diversity=0.3)

## GPT-3.5
# client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])  # read the key from the environment; never commit API keys (the key previously pasted here must be revoked)
# prompt = """
# I have a topic that contains the following documents: 
# [DOCUMENTS]
# The topic is described by the following keywords: [KEYWORDS]

# Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
# topic: <topic label>
# """
# openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)


# # All representation models
# representation_model = {
#     "KeyBERT": keybert_model,
#     # "OpenAI": openai_model,
#     "MMR": mmr_model,
#     "POS": pos_model
# }

### Training

In [None]:
# Baseline model
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words=stopwords_list, min_df=10, ngram_range=(1, 2))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
keybert_model = KeyBERTInspired()
representation_model = {
    "KeyBERT": keybert_model,
}

conv_topic_model_1 = BERTopic(
  # Pipeline models
  embedding_model=embedding_model_gte_base,           # Step 1 - Extract embeddings
  umap_model=umap_model,                     # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,               # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,         # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                 # Step 5 - Extract topic words
  representation_model=representation_model, # Step 6 - (Optional) Fine-tune topic represenations
    
  # Hyperparameters
  top_n_words=10, # 10 is default
  # nr_topics="auto",
  calculate_probabilities=True,
  verbose=True
)

In [None]:
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=80, min_samples=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words=stopwords_list, min_df=10, ngram_range=(1, 2))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
keybert_model = KeyBERTInspired()
representation_model = {
    "KeyBERT": keybert_model,
}

conv_topic_model_2 = BERTopic(
  # Pipeline models
  embedding_model = embedding_model_gte_base,     # Step 1 - Extract embeddings
  umap_model=umap_model,                     # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,               # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,         # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                 # Step 5 - Extract topic words
  representation_model=representation_model, # Step 6 - (Optional) Fine-tune topic represenations

  # Hyperparameters
  top_n_words=10, # 10 is default
  # nr_topics="auto",
  calculate_probabilities=True,
  verbose=True
)

In [None]:
# Third model variant.
# NOTE(review): the component settings below are the same as in the
# conv_topic_model_2 cell (min_cluster_size=80, min_samples=15) — confirm
# whether a different configuration was intended here.
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=80, min_samples=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words=stopwords_list, min_df=10, ngram_range=(1, 2))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
keybert_model = KeyBERTInspired()
representation_model = {
    "KeyBERT": keybert_model,
}

conv_topic_model_3 = BERTopic(
  # Pipeline models
  embedding_model=embedding_model_gte_base,       # Step 1 - Extract embeddings
  umap_model=umap_model,                     # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,               # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,         # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                 # Step 5 - Extract topic words
  representation_model=representation_model, # Step 6 - (Optional) Fine-tune topic representations

  # Hyperparameters
  top_n_words=10, # 10 is default
  # nr_topics="auto",
  calculate_probabilities=True,
  verbose=True
)

In [None]:
# Disable warning:
# The current process just got forked. Disabling parallelism to avoid deadlocks... To disable this warning, please explicitly set 
# TOKENIZERS_PARALLELISM=(true | false)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
conv_topics_1, conv_probs_1 = conv_topic_model_1.fit_transform(conversations_data_2023, conversations_embeddings_gte_base_2023)

In [None]:
conv_topics_2, conv_probs_2 = conv_topic_model_2.fit_transform(conversations_data_2023, conversations_embeddings_gte_base_2023)

In [None]:
conv_topics_3, conv_probs_3 = conv_topic_model_3.fit_transform(conversations_data_2023, conversations_embeddings_gte_base_2023)

In [None]:
conv_topics_over_time_3 = conv_topic_model_3.topics_over_time(conversations_data_2023, timestamps_2023)

### Results

In [None]:
conv_topic_info_1 = conv_topic_model_1.get_topic_info()
conv_topic_info_1

In [None]:
conv_topic_info_2 = conv_topic_model_2.get_topic_info()
conv_topic_info_2

In [None]:
conv_topic_info_3 = conv_topic_model_3.get_topic_info()
conv_topic_info_3

In [None]:
# conv_topic_model_1.save("conv_topic_model_1.pickle")

In [None]:
# conv_topic_model_2.save("conv_topic_model_2.pickle")

In [None]:
# conv_topic_model_3.save("conv_topic_model_3.pickle")

In [None]:
# Share of documents assigned to the outlier topic (-1). The row is selected by
# topic id rather than by position: when a model produces no outliers, row 0 is
# a regular topic and the share must be 0.
outlier_count_1 = conv_topic_info_1.loc[conv_topic_info_1['Topic'] == -1, 'Count'].sum()
print(f"outliers = {outlier_count_1 / conv_topic_info_1['Count'].sum() * 100:.4}%")

In [None]:
# Share of documents assigned to the outlier topic (-1). The row is selected by
# topic id rather than by position: when a model produces no outliers, row 0 is
# a regular topic and the share must be 0.
outlier_count_2 = conv_topic_info_2.loc[conv_topic_info_2['Topic'] == -1, 'Count'].sum()
print(f"outliers = {outlier_count_2 / conv_topic_info_2['Count'].sum() * 100:.4}%")

In [None]:
# Share of documents assigned to the outlier topic (-1). The row is selected by
# topic id rather than by position: when a model produces no outliers, row 0 is
# a regular topic and the share must be 0.
outlier_count_3 = conv_topic_info_3.loc[conv_topic_info_3['Topic'] == -1, 'Count'].sum()
print(f"outliers = {outlier_count_3 / conv_topic_info_3['Count'].sum() * 100:.4}%")

In [None]:
with pd.option_context('display.max_colwidth', None):
  display(conv_topic_info_1.set_index('Topic')[['Count', 'Name', 'Representation']].iloc[:30])

In [None]:
with pd.option_context('display.max_colwidth', None):
  display(conv_topic_info_2.set_index('Topic')[['Count', 'Name', 'Representation']].iloc[:30])

In [None]:
with pd.option_context('display.max_colwidth', None):
  display(conv_topic_info_3.set_index('Topic')[['Count', 'Name', 'Representation']].iloc[:11])

In [None]:
# Attach the conversation metadata to the per-document topic assignments.
# The merge is on the row index, so the 2023 subset of the unique conversations
# is renumbered 0..n-1 to line up with the documents passed to the model
# (conversations_data_2023 was built from the same mask, in the same order).
df_conversations_unique_2023 = df_conversations_unique.loc[conv_2023_mask].reset_index(drop=True)
conversations_data_topics_1 = conv_topic_model_1.get_document_info(conversations_data_2023).merge(df_conversations_unique_2023, how='left', left_index=True, right_index=True)
conversations_data_topics_1.shape

In [None]:
# Attach the conversation metadata to the per-document topic assignments.
# The merge is on the row index, so the 2023 subset of the unique conversations
# is renumbered 0..n-1 to line up with the documents passed to the model
# (conversations_data_2023 was built from the same mask, in the same order).
df_conversations_unique_2023 = df_conversations_unique.loc[conv_2023_mask].reset_index(drop=True)
conversations_data_topics_2 = conv_topic_model_2.get_document_info(conversations_data_2023).merge(df_conversations_unique_2023, how='left', left_index=True, right_index=True)
conversations_data_topics_2.shape

In [None]:
# Attach the conversation metadata to the per-document topic assignments.
# The merge is on the row index, so the 2023 subset of the unique conversations
# is renumbered 0..n-1 to line up with the documents passed to the model
# (conversations_data_2023 was built from the same mask, in the same order).
df_conversations_unique_2023 = df_conversations_unique.loc[conv_2023_mask].reset_index(drop=True)
conversations_data_topics_3 = conv_topic_model_3.get_document_info(conversations_data_2023).merge(df_conversations_unique_2023, how='left', left_index=True, right_index=True)
conversations_data_topics_3.shape

In [None]:
fig = px.bar(conv_topic_info_1, x='Name', y='Count',
             title="Number of conversation related to a given topic",
            height=800,
            # text_auto=True
            )
# fig.update_xaxes(range=[0, 100])
fig.update_layout(xaxis_title='Topic 1',
                  yaxis_title='Count')
fig.show()

In [None]:
fig = px.bar(conv_topic_info_2, x='Name', y='Count',
             title="Number of conversation related to a given topic",
            height=800,
            # text_auto=True
            )
# fig.update_xaxes(range=[0, 100])
fig.update_layout(xaxis_title='Topic 2',
                  yaxis_title='Count')
fig.show()

In [None]:
fig = px.bar(conv_topic_info_3, x='Name', y='Count',
             title="Number of conversation related to a given topic",
            height=1000,
            # text_auto=True
            )
# fig.update_xaxes(range=[0, 100])
fig.update_layout(xaxis_title='Topic Name',
                  yaxis_title='Count')
fig.show()

## Topic results over time

In [None]:
def topics_over_time(conversations_data_topics, conv_topic_info):
    """Count, for every month, how many conversations fall into each topic.

    Args:
        conversations_data_topics: DataFrame with one row per conversation and
            at least the columns 'conversation_month' and 'Topic'.
        conv_topic_info: DataFrame with the columns 'Topic', 'Name' and
            'Representation'. Every topic listed here appears in the result,
            with a frequency of 0 for months in which it does not occur.

    Returns:
        Long-format DataFrame with the columns 'Month', 'Topic', 'Frequency',
        'Name' and 'Representation' (one row per month/topic pair).
    """
    months = np.sort(conversations_data_topics['conversation_month'].unique())
    topics = conv_topic_info['Topic'].values

    # The empty frame seeds the index with every known topic so that topics
    # without any conversation still get a (zero-filled) row.
    monthly_counts = [pd.DataFrame(index=topics)]
    for month in months:
        in_month = conversations_data_topics['conversation_month'] == month
        counts = conversations_data_topics.loc[in_month, 'Topic'].value_counts()
        # Name the column explicitly. Renaming a column called 'Topic' only
        # works on pandas < 2.0; newer versions call the column 'count', which
        # left every month labelled 'count'.
        monthly_counts.append(counts.to_frame(name=month))

    topics_over_time_df = (
        pd.concat(monthly_counts, axis='columns')
        .fillna(0)
        .astype(int)
        .transpose()
        .rename_axis(index='Month', columns=None)
        .reset_index()
    )
    topics_over_time_melt_df = topics_over_time_df.melt(id_vars='Month', var_name="Topic", value_name='Frequency')
    topics_over_time_melt_df = topics_over_time_melt_df.merge(
        conv_topic_info[['Topic', 'Name', 'Representation']], how='left', on='Topic'
    )

    return topics_over_time_melt_df

In [None]:
conv_topics_over_time_3

In [None]:
conv_topics_over_time = topics_over_time(conversations_data_topics_3, conv_topic_info_3)
conv_topics_over_time

In [None]:
top_n = 10

fig = px.line(conv_topics_over_time[conv_topics_over_time['Topic']<top_n], 
              x='Month', y='Frequency', color='Name',
              markers=True,
              height=600)
fig.update_xaxes(showgrid=True)
fig.update_yaxes(showgrid=True)
fig.update_layout(legend_traceorder="normal") 
fig.update_layout(
    title={
        'text': f"<b>Topics over Time for All Customers</b>",
        'y': .95,
        'x': 0.40,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(
            size=22,
            color="Black")
    },
    template="simple_white",
    width=1250,
    height=600,
    hoverlabel=dict(
        # bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    ),
    legend=dict(
        title="<b>Global Topic Representation</b>",
    )
)
fig.show()

In [None]:
conv_topics_over_time_solana = topics_over_time(conversations_data_topics_3_solana, conv_topic_info_3)

In [None]:
top_n = 4
freq_topics = conv_topics_over_time_solana.groupby('Topic')['Frequency'].sum().sort_values(ascending=False)[:top_n]
freq_topics_over_time = conv_topics_over_time_solana[conv_topics_over_time_solana['Topic'].isin(freq_topics.index)]

fig = px.line(freq_topics_over_time, x='Month', y='Frequency', color='Name',
              markers=True,
              )
fig.update_xaxes(showgrid=True)
fig.update_yaxes(showgrid=True)
fig.update_layout(
    title={
        'text': f"<b>Topics over Time for Solana</b>",
        'y': .95,
        'x': 0.40,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(
            size=22,
            color="Black")
    },
    template="simple_white",
    width=1250,
    height=600,
    hoverlabel=dict(
        # bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    ),
    legend=dict(
        title="<b>Global Topic Representation</b>",
    )
)
fig.show()

## Representative conversations

##### Model 1

In [None]:
topic_nb_1 = 0

# Look the topic up by its id rather than by row position: `topic_nb_1 + 1` only
# points at the right row while the table starts with the outlier topic -1 and
# has one row per consecutive topic id.
topic_row_1 = conv_topic_info_1.set_index('Topic').loc[topic_nb_1]

print(topic_row_1['Representation'])
print(topic_row_1['Count'])
print("#"*50)

for item in topic_row_1['Representative_Docs']:
    print(item)
    print("#"*50)
    print("#"*50)

In [None]:
# Print a reproducible random sample of conversations assigned to topic `topic_nb_1`.
rng = np.random.default_rng(seed=12)
topic_conv_ids_1 = conversations_data_topics_1.loc[conversations_data_topics_1['Topic']==topic_nb_1, 'conversation_id']
# Sampling without replacement raises when more items are requested than exist,
# so cap the sample at the number of conversations in the topic.
selected_conv_id = rng.choice(topic_conv_ids_1, min(10, len(topic_conv_ids_1)), replace=False)

for conv_id in selected_conv_id:
    print('-'*10 + str(conv_id) + '-'*10)
    # Alternative message source: df_conv_filtered instead of df_conv_selected.
    for idx, item in df_conv_selected.loc[df_conv_selected['conversation_id']==conv_id, :].iterrows():
        print(f"| {item['author_type']} | {item['body']}")

##### Model 2

In [None]:
topic_nb_2 = 0

# Look the topic up by its id rather than by row position: `topic_nb_2 + 1` only
# points at the right row while the table starts with the outlier topic -1 and
# has one row per consecutive topic id.
topic_row_2 = conv_topic_info_2.set_index('Topic').loc[topic_nb_2]

print(topic_row_2['Representation'])
print(topic_row_2['Count'])
print("#"*50)

for item in topic_row_2['Representative_Docs']:
    print(item)
    print("#"*50)

In [None]:
# Print a reproducible random sample of conversations assigned to topic `topic_nb_2`.
rng = np.random.default_rng(seed=12)
topic_conv_ids_2 = conversations_data_topics_2.loc[conversations_data_topics_2['Topic']==topic_nb_2, 'conversation_id']
# Sampling without replacement raises when more items are requested than exist,
# so cap the sample at the number of conversations in the topic.
selected_conv_id = rng.choice(topic_conv_ids_2, min(10, len(topic_conv_ids_2)), replace=False)

for conv_id in selected_conv_id:
    print('-'*10 + str(conv_id) + '-'*10)
    # Alternative message source: df_conv_filtered instead of df_conv_selected.
    for idx, item in df_conv_selected.loc[df_conv_selected['conversation_id']==conv_id, :].iterrows():
        print(f"| {item['author_type']} | {item['body']}")

##### Model 3

In [None]:
topic_nb_3 = 0

# Look the topic up by its id rather than by row position: `topic_nb_3 + 1` only
# points at the right row while the table starts with the outlier topic -1 and
# has one row per consecutive topic id.
topic_row_3 = conv_topic_info_3.set_index('Topic').loc[topic_nb_3]

print(topic_row_3['Representation'])
print(topic_row_3['Count'])
print("#"*50)

for item in topic_row_3['Representative_Docs']:
    print(item)
    print("#"*50)

In [None]:
# Print a reproducible random sample of conversations assigned to topic `topic_nb_3`.
rng = np.random.default_rng(seed=12)
topic_conv_ids_3 = conversations_data_topics_3.loc[conversations_data_topics_3['Topic']==topic_nb_3, 'conversation_id']
# Sampling without replacement raises when more items are requested than exist,
# so cap the sample at the number of conversations in the topic.
selected_conv_id = rng.choice(topic_conv_ids_3, min(10, len(topic_conv_ids_3)), replace=False)

for conv_id in selected_conv_id:
    print('-'*10 + str(conv_id) + '-'*10)
    # Alternative message source: df_conv_filtered instead of df_conv_selected.
    for idx, item in df_conv_selected.loc[df_conv_selected['conversation_id']==conv_id, :].iterrows():
        print(f"| {item['author_type']} | {item['body']}")

## Custom labels

In [None]:
# Label the topics by one of the other topic representations, like KeyBERTInspired

# Builds {topic id: "a | b | c"} from the model's "KeyBERT" aspect: zip(*values)
# transposes the stored sequence, [0] keeps the first element of every entry
# (presumably the keywords of (keyword, score) pairs - confirm) and the first three are joined.
keybert_topic_labels = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in conv_topic_model_3.topic_aspects_["KeyBERT"].items()}
keybert_topic_labels

## Topic-Document Distribution

In [None]:
conv_topic_model_3.probabilities_.shape

In [None]:
conv_topic_model_3.probabilities_[17002].sum()

In [None]:
# Visualize the topic-document distribution for a single document
conv_topic_model_3.visualize_distribution(conv_topic_model_3.probabilities_[17002], custom_labels=True)

In [None]:
conversations_data_topics_3.loc[17002, 'Document']

In [None]:
conv_topic_model_3.topics_[17002]

In [None]:
conv_topic_model_3.probabilities_[17002]

In [None]:
conv_pred = conv_topic_model_3.transform(conversations_data_2023[17002])

In [None]:
conv_pred

## Visualize Topics

In [None]:
conv_topic_model_3.visualize_topics()

In [None]:
conv_topic_model_3.visualize_heatmap()

In [None]:
conv_topic_model_3.visualize_hierarchy()

In [None]:
conv_topic_model_3.visualize_barchart(top_n_topics = 16, n_words = 10, height=300)

In [None]:
conv_topic_model_3.visualize_topics_over_time(conv_topics_over_time_3, top_n_topics=10, height=600)

## Topic Reduction after Training

In [None]:
conv_topic_model_1.reduce_topics(conversations_data_2023, nr_topics=30)

In [None]:
conv_topic_model_1.visualize_heatmap()

In [None]:
conv_topic_model_1.visualize_barchart()

In [None]:
# Tabulate the "Main" and "KeyBERT" representations of one topic side by side.
topic_nb = 0
# Exploding both columns unpacks every cell into consecutive rows; the slicing
# below relies on those rows alternating (presumably word, then score - confirm
# against the output of get_topic(..., full=True)).
topic_rep = pd.DataFrame(conv_topic_model_1.get_topic(topic_nb, full=True)).explode(['Main', 'KeyBERT'], ignore_index=True)
# Even rows are kept as they are ...
rep_words = topic_rep.iloc[::2].reset_index(drop=True)
# ... odd rows are cast to float and their columns get a "_score" suffix.
rep_scors = topic_rep.iloc[1::2].add_suffix('_score').astype(float).reset_index(drop=True)
# Put both halves next to each other, aligned on the fresh 0..n-1 index.
topic_df = pd.concat([rep_words, rep_scors], axis=1)
topic_df

## Update Topic Representation after Training

In [None]:
conv_topic_model_1.update_topics(conversations_data_2023, n_gram_range=(1, 2))

In [None]:
conv_topic_model_1.visualize_heatmap()

In [None]:
conv_topic_model_1.visualize_barchart()

In [None]:
# Tabulate the "Main" and "KeyBERT" representations of one topic side by side
# (same steps as before, run again after update_topics).
topic_nb = 0
# Exploding both columns unpacks every cell into consecutive rows; the slicing
# below relies on those rows alternating (presumably word, then score - confirm
# against the output of get_topic(..., full=True)).
topic_rep = pd.DataFrame(conv_topic_model_1.get_topic(topic_nb, full=True)).explode(['Main', 'KeyBERT'], ignore_index=True)
# Even rows are kept as they are ...
rep_words = topic_rep.iloc[::2].reset_index(drop=True)
# ... odd rows are cast to float and their columns get a "_score" suffix.
rep_scors = topic_rep.iloc[1::2].add_suffix('_score').astype(float).reset_index(drop=True)
# Put both halves next to each other, aligned on the fresh 0..n-1 index.
topic_df = pd.concat([rep_words, rep_scors], axis=1)
topic_df

## Outlier reduction

In [None]:
len(conv_topics_2), (np.array(conv_topics_2) == -1).sum()

The default method for reducing outliers is by calculating the c-TF-IDF representations of outlier documents and assigning them to the best matching c-TF-IDF representations of non-outlier topics.

In [None]:
# The `threshold` parameter sets the minimum distance or similarity required when matching outlier documents with non-outlier topics. It controls how many outlier documents get assigned to non-outlier topics.

# Reduce outliers using the `c-tf-idf` strategy
new_conv_topics_2 = conv_topic_model_2.reduce_outliers(conversations_data_2023, conv_topics_2, strategy="c-tf-idf", threshold=0.05)

In [None]:
(np.array(new_conv_topics_2) == -1).sum()

Use the topic distributions, as calculated with `.approximate_distribution` to find the most frequent topic in each outlier document. You can use the `distributions_params` variable to tweak the parameters of `.approximate_distribution`.

In [None]:
# Reduce outliers using the `distributions` strategy
new_conv_topics_2d = conv_topic_model_2.reduce_outliers(conversations_data_2023, conv_topics_2, strategy="distributions", threshold=0.05)

In [None]:
(np.array(new_conv_topics_2d) == -1).sum()

Probabilities strategy uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document. To use this, make sure to calculate the `probabilities` beforehand by instantiating BERTopic with `calculate_probabilities=True`.

In [None]:
# Reduce outliers using the `probabilities` strategy
new_conv_topics_2p = conv_topic_model_2.reduce_outliers(conversations_data_2023, conv_topics_2, probabilities=conv_probs_2, strategy="probabilities", threshold=0.02)

In [None]:
(np.array(new_conv_topics_2p) == -1).sum()

Using the embeddings of each outlier document, find the best matching topic embedding using cosine similarity.

In [None]:
# Reduce outliers using the `embeddings` strategy
new_conv_topics_2e = conv_topic_model_2.reduce_outliers(conversations_data_2023, conv_topics_2, strategy="embeddings", embeddings=conversations_embeddings_gte_base_2023, threshold=0.9)

In [None]:
(np.array(new_conv_topics_2e) == -1).sum()

In [None]:
# Reduce the embeddings to 2D once up front so that the document plots below
# can reuse them, which allows us to iterate quickly later on.
conversations_reduced_embeddings = UMAP(n_neighbors=20, n_components=2, min_dist=0.0, metric='cosine', 
                                        random_state=42).fit_transform(conversations_embeddings_gte_base_2023)

In [None]:
conv_topic_model_2.visualize_documents(conversations_data_2023, reduced_embeddings=conversations_reduced_embeddings, 
                                       hide_document_hover=True, hide_annotations=True)

In [None]:
# When outlier documents are generated, they are not used when modeling the topic representations. These documents are completely ignored when finding good descriptions of topics. Thus, after having reduced the number of outliers in your topic model, you might want to update the topic representations with the documents that now belong to actual topics.

conv_topic_model_2.update_topics(conversations_data_2023, topics=new_conv_topics_2p)

In [None]:
conv_topic_model_2.visualize_documents(conversations_data_2023, reduced_embeddings=conversations_reduced_embeddings, 
                                       hide_document_hover=True, hide_annotations=True)

## Clusters similarity analysis

In [None]:
def jaccard_similarity(set_a, set_b):
    """Return the Jaccard similarity |A & B| / |A | B| of two collections.

    Args:
        set_a: First collection of hashable items (any iterable; duplicates
            are ignored).
        set_b: Second collection of hashable items.

    Returns:
        A float in [0, 1]: 0.0 for disjoint inputs, 1.0 for equal ones. Two
        empty inputs are treated as identical and yield 1.0 instead of
        raising ZeroDivisionError.
    """
    set_a, set_b = set(set_a), set(set_b)
    union_size = len(set_a | set_b)
    if union_size == 0:
        # Both inputs are empty: avoid 0 / 0 and report them as identical.
        return 1.0
    return len(set_a & set_b) / union_size

In [None]:
def similarity_matrix(data_topics_a, data_topics_b):
    """Compare two topic assignments by the overlap of their conversations.

    Args:
        data_topics_a: DataFrame with the columns 'Topic' and
            'conversation_id' for the first model.
        data_topics_b: DataFrame with the same columns for the second model.

    Returns:
        DataFrame indexed by the sorted topics of `data_topics_a`, with one
        column per sorted topic of `data_topics_b`; each cell holds the
        Jaccard similarity of the two topics' conversation-id sets.
    """
    topics_a = np.sort(data_topics_a['Topic'].unique())
    topics_b = np.sort(data_topics_b['Topic'].unique())
    similarities = pd.DataFrame(0.0, index=topics_a, columns=topics_b)

    # The conversation set of a B-topic does not depend on the A-topic it is
    # compared with, so build each one once instead of once per A-topic.
    conv_ids_b = {
        topic_b: set(data_topics_b.loc[data_topics_b['Topic'] == topic_b, 'conversation_id'])
        for topic_b in topics_b
    }

    for topic_a in topics_a:
        set_a = set(data_topics_a.loc[data_topics_a['Topic'] == topic_a, 'conversation_id'])
        for topic_b, set_b in conv_ids_b.items():
            similarities.loc[topic_a, topic_b] = jaccard_similarity(set_a, set_b)

    return similarities

In [None]:
topics_1 = np.sort(conversations_data_topics_1['Topic'].unique())
topics_1

In [None]:
topics_2 = np.sort(conversations_data_topics_2['Topic'].unique())
topics_2

In [None]:
topics_3 = np.sort(conversations_data_topics_3['Topic'].unique())
topics_3

In [None]:
similarity_matrix_1_2 = similarity_matrix(conversations_data_topics_1, conversations_data_topics_2)
similarity_matrix_1_3 = similarity_matrix(conversations_data_topics_1, conversations_data_topics_3)
similarity_matrix_2_3 = similarity_matrix(conversations_data_topics_2, conversations_data_topics_3)

In [None]:
fig = px.imshow(similarity_matrix_1_2, 
                labels=dict(x="Topic 2", y="Topic 1", color="Jaccard similarity"),
               text_auto='.2f',
                color_continuous_scale='Hot_r',
               height=800,)
fig.show()

In [None]:
fig = px.imshow(similarity_matrix_1_3, 
                labels=dict(x="Topic 3", y="Topic 1", color="Jaccard similarity"),
               text_auto='.2f',
                color_continuous_scale='Hot_r',
               height=800,)
fig.show()

In [None]:
fig = px.imshow(similarity_matrix_2_3, 
                labels=dict(x="Topic 3", y="Topic 2", color="Jaccard similarity"),
               text_auto='.2f',
                color_continuous_scale='Hot_r',
               height=800,)
fig.show()

# Large Language Models (LLMs)

In [None]:
import vertexai
from vertexai.language_models import TextGenerationModel
from vertexai.preview import generative_models
from vertexai.preview.generative_models import GenerativeModel, Part
from tqdm import tqdm

In [None]:
def format_conversation(conversations_data_topics, conv_id):
    """Render one conversation's document as a dash-prefixed list of lines.

    Args:
        conversations_data_topics: DataFrame with the columns
            'conversation_id' and 'Document'.
        conv_id: Id of the conversation to format; it must match exactly one
            row, otherwise `.item()` raises ValueError.

    Returns:
        The document text with '- ' in front of every line.
    """
    bullet = '- '
    is_requested = conversations_data_topics['conversation_id'] == conv_id
    document = conversations_data_topics.loc[is_requested, 'Document'].item()
    return bullet + document.replace('\n', '\n' + bullet)

In [None]:
from sklearn.preprocessing import normalize

def select_conversation_docs(conversations_data_topics, topic_nb, nb_of_convs=10, method='highest_score'):
    """Pick conversations of one topic and format them as a single prompt block.

    Args:
        conversations_data_topics: DataFrame with the columns 'Topic',
            'Probability', 'conversation_id' and 'Document'.
        topic_nb: Topic id whose conversations are selected.
        nb_of_convs: Number of conversations to select.
        method: Selection strategy:
            'highest_score' - the `nb_of_convs` rows with the highest
            'Probability' (fewer if the topic is smaller);
            'score_dist' - random draw without replacement, weighted by the
            L1-normalised 'Probability';
            'uniform' - random draw without replacement, all conversations
            equally likely.

    Returns:
        A string of "Conversation <k>" sections, one per selected
        conversation, each followed by the formatted conversation text.

    Raises:
        ValueError: If `method` is not one of the strategies above, or if a
            random strategy is asked for more conversations than it can draw.
    """
    known_methods = ('highest_score', 'score_dist', 'uniform')
    if method not in known_methods:
        # Previously an unknown method silently produced an empty string.
        raise ValueError(f"Unknown method {method!r}; expected one of {known_methods}")

    # Fixed seed keeps the random strategies reproducible between runs.
    rng = np.random.default_rng(seed=12)

    topic_conversations = conversations_data_topics.loc[
        conversations_data_topics['Topic'] == topic_nb, :
    ].sort_values('Probability', ascending=False)
    conv_ids = topic_conversations['conversation_id']

    if method == 'highest_score':
        selected_conv_id = conv_ids.iloc[:nb_of_convs]
    elif method == 'score_dist':
        # Normalise only here: the weights are not needed by the other
        # strategies, and normalising an empty topic raises.
        weights = normalize([topic_conversations['Probability'].values], norm="l1").ravel()
        selected_conv_id = rng.choice(conv_ids, nb_of_convs, replace=False, p=weights)
    else:
        # 'uniform': sample conversation ids, not whole DataFrame rows.
        selected_conv_id = rng.choice(conv_ids, nb_of_convs, replace=False)

    conversation_docs_str = ""
    for nb_of_conv, conv_id in enumerate(selected_conv_id, 1):
        conv_body = format_conversation(conversations_data_topics, conv_id)
        conversation_docs_str += f'Conversation {nb_of_conv}\n{conv_body}\n\n'

    return conversation_docs_str

In [None]:
def format_model_response(model_response):
    """Split raw model answers into one (pain point, explanation) row per bullet.

    Args:
        model_response: DataFrame whose last column holds the raw answer text;
            its index is carried over into the result.

    Returns:
        DataFrame with the former index as first column followed by
        'Pain point' and 'Explanation'. Lines that do not look like
        "- <title>: <text>" yield NaN in both columns.
    """
    def _glue_title_to_text(match):
        # "- title:" followed by a separate "- text" bullet becomes one bullet.
        return match.group(1) + match.group(2)

    raw_answers = model_response.iloc[:, -1]
    # Collapse runs of blank lines so every bullet sits on its own line.
    compact = raw_answers.str.replace(r'\n+', '\n', regex=True)
    merged = compact.str.replace(r'(-\s(?:\*\*)?.+:(?:\*\*)?)\s+-(.+)', _glue_title_to_text, regex=True)
    # One row per line; the original index value is repeated for each line.
    bullet_lines = merged.str.split('\n').explode()
    parts = bullet_lines.str.extract(r'-\s(?:\*\*)?(.+):(?:\*\*)?\s(.+)')
    return parts.rename(columns={0: 'Pain point', 1: 'Explanation'}).reset_index()

# GCP PaLM model (text-bison)

In [None]:
vertexai.init(project="helix-ds-metal-dev", location="us-central1")

In [None]:
parameters = {
    "candidate_count": 1,
    "max_output_tokens": 2048,
    "temperature": 0,
    "top_p": 0.8,
    "top_k": 40
}
# model = TextGenerationModel.from_pretrained("text-bison")
model = TextGenerationModel.from_pretrained("text-bison-32k")

## Example

In [None]:
conversation_1 = """
"""

In [None]:
conversation_1a = """ 
"""

In [None]:
response = model.predict(
    f"""What is the set of main pain points extracted from the below conversation:
    {conversation_1}
    """,
    **parameters
)
print(f"Response from Model:\n{response.text}")

In [None]:
response = model.predict(
    f"""What is the set of main pain points extracted from the below conversation:
    {conversation_1a}
    """,
    **parameters
)
print(f"Response from Model:\n{response.text}")

In [None]:
# CONTENT_FORMAT = '{"PAIN POINT": ..., "EXPLANATION": ...},\n{"PAIN POINT": ..., "EXPLANATION": ...},\n...'
# CONTENT_FORMAT = '{"PAIN POINT": ..., "EXPLANATION": ..., "KEYWORDS": ...},\n{"PAIN POINT": ..., "EXPLANATION": ..., "KEYWORDS": ...},\n...'
# CONTENT_FORMAT = '{"PAIN POINT": ..., "EXPLANATION": ..., "INSIGHTS": ...},\n{"PAIN POINT": ..., "EXPLANATION": ..., "INSIGHTS": ...},\n...'
CONTENT_FORMAT = '{"PAIN POINT": ..., "EXPLANATION": ..., "ACTIONABLE INSIGHTS": ...},\n{"PAIN POINT": ..., "EXPLANATION": ..., "ACTIONABLE INSIGHTS": ...},\n ...'
# CONTENT_FORMAT = '{"PAIN POINT": ..., "ACTIONABLE INSIGHTS": ..., "KEYWORDS": [...]},\n{"PAIN POINT": ..., "ACTIONABLE INSIGHTS": ..., "KEYWORDS": [...]},\n ...'

response = model.predict(
    f"""
You are a helpful, respectful and honest Equinix assistant who extracts information from customer conversations.

I have the following conversation:
{conversation_1}

Can you extract short but descriptive customer pain points and actionable insights from the above conversation? Provide the answer in the following format: {CONTENT_FORMAT}
""",
    
    **parameters
)
print(f"Response from Model:\n{response.text}")

## Extract pain points from found topics and documents

### All topics

#### Prompt built using multiple representative conversations

In [None]:
# Ask the PaLM model for the main pain point of every topic, using the
# highest-scoring conversations of the topic as context.
convs_nb = 20
model_response_multi_list = []

# Pair every topic id with its keywords straight from the table rows. The former
# `.loc[topic_nb + 1, 'KeyBERT']` lookup only hit the right row while the table
# started with the outlier topic -1 and had one row per consecutive topic id.
# (Swap 'KeyBERT' for 'Representation' to use the other keyword column.)
topic_keywords = zip(conv_topic_info_3['Topic'], conv_topic_info_3['KeyBERT'])

for topic_nb, conv_kyewords in tqdm(topic_keywords, total=len(conv_topic_info_3)):
    # NOTE(review): topic 47 gets fewer conversations, presumably to keep its
    # prompt within the model's input limit - confirm.
    current_convs_nb = 18 if topic_nb in [47] else convs_nb
    conv_docs = select_conversation_docs(conversations_data_topics_3, topic_nb, current_convs_nb)

#     prompt = f"""
# You are a helpful, respectful and honest assistant for extracting information from conversations.

# I have the following conversations:

# {conv_docs.strip()}

# The conversations are described by the following keywords: {', '.join(conv_kyewords)}

# Based on the above information, can you extract a short but highly descriptive pain points? Provide a maximum of 2 main pain points. Make sure it is in the following format:
# - <pain point> Explanation of this <pain point>
# """

    prompt = f"""
You are a helpful, respectful and honest assistant for extracting information from conversations.

I have the following conversations:

{conv_docs.strip()}

The conversations are described by the following keywords: {', '.join(conv_kyewords)}

Based on the above information, can you extract a short but highly descriptive main pain point? Make sure it is in the following format:
- <pain point>: Explanation of this <pain point>
"""

    # print(f"""Topic {topic_nb}: prompt length {len(prompt)}, {conv_kyewords}""")
    # print(prompt)

    prompt_response = model.predict(
        prompt,
        **parameters
    )
    model_response_multi_list.append(prompt_response.text)

In [None]:
# for idx, item in enumerate(model_response_multi_list, -1):
#     print(f"Topic {idx}:")
#     print(item)
    # print('\n')

In [None]:
# The responses were collected in the order of conv_topic_info_3['Topic'], so label
# them with those topic ids instead of assuming the ids are exactly -1, 0, 1, ...
# (slicing keeps this working when the loop was stopped early).
model_response_multi_topics = conv_topic_info_3['Topic'].values[:len(model_response_multi_list)]
model_response_multi_df = pd.DataFrame({'PaLM 2': model_response_multi_list}, index=model_response_multi_topics)
model_response_multi_df.index.name = 'Topic'

In [None]:
with pd.option_context('display.max_colwidth', None):
    display(model_response_multi_df)

In [None]:
palm_model_response_multi_df = format_model_response(model_response_multi_df)
# Strip Markdown bold markers. Match the literal "**" with regex=False: since
# pandas 2.0 `str.replace` no longer treats the pattern as a regex by default, so
# the escaped pattern r'\*\*' matched nothing there.
palm_model_response_multi_df['Pain point'] = palm_model_response_multi_df['Pain point'].str.replace('**', '', regex=False)

In [None]:
with pd.option_context('display.max_colwidth', None):
    display(palm_model_response_multi_df.iloc[:10])

#### Prompt to solve pain points

In [None]:
# Ask the PaLM model how the pain points of every topic were solved, using the
# highest-scoring conversations of the topic as context.
convs_nb = 20
model_response_multi_solve_list = []

# Pair every topic id with its keywords straight from the table rows. The former
# `.loc[topic_nb + 1, 'KeyBERT']` lookup only hit the right row while the table
# started with the outlier topic -1 and had one row per consecutive topic id.
# (Swap 'KeyBERT' for 'Representation' to use the other keyword column.)
topic_keywords = zip(conv_topic_info_3['Topic'], conv_topic_info_3['KeyBERT'])

for topic_nb, conv_kyewords in tqdm(topic_keywords, total=len(conv_topic_info_3)):
    # NOTE(review): topic 47 gets fewer conversations, presumably to keep its
    # prompt within the model's input limit - confirm.
    current_convs_nb = 18 if topic_nb in [47] else convs_nb
    conv_docs = select_conversation_docs(conversations_data_topics_3, topic_nb, current_convs_nb)

    solve_prompt = f"""
You are a helpful, respectful and honest assistant for extracting information from conversations.

I have the following conversations:

{conv_docs.strip()}

The conversations are described by the following keywords: {', '.join(conv_kyewords)}

Based on the above information, can you describe methods used to solve pain points? Provide a maximum of 2 main pain points. Make sure it is in the following format:
- pain point 
  Method used to solve this pain point
"""

    # print(f"""Topic {topic_nb}: prompt length {len(solve_prompt)}, {conv_kyewords}""")
    # print(solve_prompt)

    prompt_response = model.predict(
        solve_prompt,
        **parameters
    )
    model_response_multi_solve_list.append(prompt_response.text)

In [None]:
for idx, item in enumerate(model_response_multi_solve_list, -1):
    print(f"Topic {idx}:")
    print(item)
    print('\n')

#### Prompt to generate actionable insights

In [None]:
# Ask the PaLM model for actionable insights per topic, using the
# highest-scoring conversations of the topic as context.
convs_nb = 20
model_response_multi_insights_list = []

# Pair every topic id with its keywords straight from the table rows. The former
# `.loc[topic_nb + 1, 'KeyBERT']` lookup only hit the right row while the table
# started with the outlier topic -1 and had one row per consecutive topic id.
# (Swap 'KeyBERT' for 'Representation' to use the other keyword column.)
topic_keywords = zip(conv_topic_info_3['Topic'], conv_topic_info_3['KeyBERT'])

for topic_nb, conv_kyewords in tqdm(topic_keywords, total=len(conv_topic_info_3)):
    # NOTE(review): topic 47 gets fewer conversations, presumably to keep its
    # prompt within the model's input limit - confirm.
    current_convs_nb = 18 if topic_nb in [47] else convs_nb
    conv_docs = select_conversation_docs(conversations_data_topics_3, topic_nb, current_convs_nb)

    # The name `solve_prompt` is kept from the previous cell; here it holds the insights prompt.
    solve_prompt = f"""
You are a helpful, respectful and honest assistant for extracting information from conversations.

I have the following conversations:

{conv_docs.strip()}

The conversations are described by the following keywords: {', '.join(conv_kyewords)}

Based on the above information, can you generate actionable insights that can help make decisions?
"""

    # print(f"""Topic {topic_nb}: prompt length {len(solve_prompt)}, {conv_kyewords}""")
    # print(solve_prompt)

    prompt_response = model.predict(
        solve_prompt,
        **parameters
    )
    model_response_multi_insights_list.append(prompt_response.text)

In [None]:
for idx, item in enumerate(model_response_multi_insights_list, -1):
    print(f"Topic {idx}:")
    print(item)
    print('\n')

# GCP Gemini Pro

In [None]:
# Initialize Vertex AI
vertexai.init(project="helix-ds-metal-dev", location="us-central1")

# Load the model
gemini_model = GenerativeModel("gemini-1.0-pro-001")

# Generation config
gemini_parameters = {
    "max_output_tokens": 4096,
    "temperature": 0,
    "top_p": 1,
    # "top_k": 32
}

# Safety config
safety_config = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
}

## Example

In [None]:
conversation_1 = """
"""

In [None]:
conversation_1a = """ 
"""

In [None]:
gemini_response = gemini_model.generate_content(
    f"""What is the set of main pain points extracted from the below conversation:
    {conversation_1}
    """,
    generation_config=gemini_parameters,
    safety_settings=safety_config,
)
print(f"Response from Model:\n{gemini_response.text}")

In [None]:
# CONTENT_FORMAT = '{"PAIN POINT": ..., "EXPLANATION": ...},\n{"PAIN POINT": ..., "EXPLANATION": ...},\n...'
# CONTENT_FORMAT = '{"PAIN POINT": ..., "EXPLANATION": ..., "KEYWORDS": ...},\n{"PAIN POINT": ..., "EXPLANATION": ..., "KEYWORDS": ...},\n...'
# CONTENT_FORMAT = '{"PAIN POINT": ..., "EXPLANATION": ..., "INSIGHTS": ...},\n{"PAIN POINT": ..., "EXPLANATION": ..., "INSIGHTS": ...},\n...'
CONTENT_FORMAT = '{"PAIN POINT": ..., "EXPLANATION": ..., "ACTIONABLE INSIGHTS": ...},\n{"PAIN POINT": ..., "EXPLANATION": ..., "ACTIONABLE INSIGHTS": ...},\n ...'
# CONTENT_FORMAT = '{"PAIN POINT": ..., "ACTIONABLE INSIGHTS": ..., "KEYWORDS": [...]},\n{"PAIN POINT": ..., "ACTIONABLE INSIGHTS": ..., "KEYWORDS": [...]},\n ...'

gemini_response = gemini_model.generate_content(
    f"""
You are a helpful, respectful and honest assistant who extracts information from customer conversations.

Extract short but descriptive customer pain points and actionable insights from the following conversation:
{conversation_1}

Provide the answer in the following format:
{CONTENT_FORMAT}
""",

    generation_config=gemini_parameters,
    safety_settings=safety_config,
)
print(f"Response from Model:\n{gemini_response.text}")

## Extract pain points from found topics and documents

### All topics

#### Prompt built using multiple representative conversations

In [None]:
# Initialize Vertex AI
vertexai.init(project="helix-ds-metal-dev", location="us-central1")

# Load the model
gemini_model = GenerativeModel("gemini-pro")

# Generation config
gemini_parameters = {
    "max_output_tokens": 2048,
    "temperature": 0,
    "top_p": 1,
    # "top_k": 32
}

# Safety config
safety_config = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
}

In [None]:
# Ask the Gemini model for the main pain point of every topic, using the
# highest-scoring conversations of the topic as context.
convs_nb = 20
gemini_model_response_multi_list = []

# Pair every topic id with its keywords straight from the table rows. The former
# `.loc[topic_nb + 1, 'KeyBERT']` lookup only hit the right row while the table
# started with the outlier topic -1 and had one row per consecutive topic id.
# (Swap 'KeyBERT' for 'Representation' to use the other keyword column.)
topic_keywords = zip(conv_topic_info_3['Topic'], conv_topic_info_3['KeyBERT'])

for topic_nb, conv_kyewords in tqdm(topic_keywords, total=len(conv_topic_info_3)):
    # NOTE(review): topic 47 gets fewer conversations, presumably to keep its
    # prompt within the model's input limit - confirm.
    current_convs_nb = 18 if topic_nb in [47] else convs_nb
    conv_docs = select_conversation_docs(conversations_data_topics_3, topic_nb, current_convs_nb)

    prompt = f"""
You are a helpful, respectful and honest assistant for extracting information from conversations.

I have the following conversations:

{conv_docs.strip()}

The conversations are described by the following keywords: {', '.join(conv_kyewords)}

Based on the above information, can you extract a short but highly descriptive main pain point? Make sure it is in the following format:
- <pain point>: Explanation of this <pain point>
"""

#     prompt = f"""
# You are a helpful, respectful and honest assistant for extracting information from conversations.

# I have the following conversations:

# {conv_docs.strip()}

# The conversations are described by the following keywords: {', '.join(conv_kyewords)}

# Based on the above information, can you extract a short but highly descriptive pain points? Provide a maximum of 2 main pain points. Make sure it is in the following format:
# - <pain point> Explanation of this <pain point>
# """

    # print(f"""Topic {topic_nb}: prompt length {len(prompt)}, {conv_kyewords}""")
    # print(prompt)

    gemini_prompt_response = gemini_model.generate_content(
        prompt,
        generation_config=gemini_parameters,
        safety_settings=safety_config,
    )

    gemini_model_response_multi_list.append(gemini_prompt_response.text)

In [None]:
for idx, item in enumerate(gemini_model_response_multi_list, -1):
    print(f"Topic {idx}:")
    print(item)
    # print('\n')

##### 20 conversations per topic

In [None]:
# The responses were collected in the order of conv_topic_info_3['Topic'], so label
# them with those topic ids instead of assuming the ids are exactly -1, 0, 1, ...
# (slicing keeps this working when the loop was stopped early).
gemini_model_response_multi_topics = conv_topic_info_3['Topic'].values[:len(gemini_model_response_multi_list)]
gemini_model_response_multi_20_df = pd.DataFrame({'Gemini Pro': gemini_model_response_multi_list}, index=gemini_model_response_multi_topics)
gemini_model_response_multi_20_df.index.name = 'Topic'

In [None]:
with pd.option_context('display.max_colwidth', None):
    display(gemini_model_response_multi_20_df)

In [None]:
gemini_model_response_multi_20_formatted = format_model_response(gemini_model_response_multi_20_df)
# Strip Markdown bold markers. Match the literal "**" with regex=False: since
# pandas 2.0 `str.replace` no longer treats the pattern as a regex by default, so
# the escaped pattern r'\*\*' matched nothing there.
gemini_model_response_multi_20_formatted['Pain point'] = gemini_model_response_multi_20_formatted['Pain point'].str.replace('**', '', regex=False)
gemini_model_response_multi_20_formatted#.to_csv('gemini_model_response_multi_hs_20_2pp.csv', sep='\t', index=False)

In [None]:
with pd.option_context('display.max_colwidth', None):
    display(gemini_model_response_multi_20_formatted.iloc[:11])
    # display(pd.read_csv('gemini_model_response_multi_hs_20.csv', sep='\t')[:39])

In [None]:
top_n = 11
freq_topics = conv_topics_over_time_top_customers.groupby('Topic')['Frequency'].sum().sort_values(ascending=False)[:top_n]

gemini_model_response_multi_20_top_customers = gemini_model_response_multi_20_formatted[gemini_model_response_multi_20_formatted['Topic'].isin(freq_topics.index)]

In [None]:
with pd.option_context('display.max_colwidth', None):
    display(gemini_model_response_multi_20_top_customers)

##### 30 conversations per topic

In [None]:
# gemini_model_response_multi_30_df = pd.DataFrame(gemini_model_response_multi_list, index=range(-1, len(gemini_model_response_multi_list) -1)).rename(columns={0: 'Gemini Pro'})
# gemini_model_response_multi_30_df.index.name = 'Topic'

In [None]:
# with pd.option_context('display.max_colwidth', None):
#     display(gemini_model_response_multi_30_df)

In [None]:
# format_model_response(gemini_model_response_multi_30_df)#to_csv('gemini_model_response_multi_hs_30_2pp.csv', sep='\t', index=False)

In [None]:
# with pd.option_context('display.max_colwidth', None):
#     display(format_model_response(gemini_model_response_multi_30_df))
    # display(pd.read_csv('gemini_model_response_multi_hs_30.csv', sep='\t')[:39])

In [None]:
# with pd.option_context('display.max_colwidth', None):
#     display(pd.read_csv('gemini_model_response_multi_hs_30_2pp.csv', sep='\t')[:20])

#### Prompt to solve pain points

In [None]:
# Initialize Vertex AI
vertexai.init(project="helix-ds-metal-dev", location="us-central1")

# Load the model
gemini_model = GenerativeModel("gemini-pro")

# Generation config
gemini_parameters = {
    "max_output_tokens": 2048,
    "temperature": 0,
    "top_p": 1,
    # "top_k": 32
}

# Safety config
safety_config = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
}

In [None]:
# Ask the Gemini model how the pain points of every topic were solved, using the
# highest-scoring conversations of the topic as context.
convs_nb = 20
gemini_model_response_multi_solve_list = []

# Pair every topic id with its keywords straight from the table rows. The former
# `.loc[topic_nb + 1, 'KeyBERT']` lookup only hit the right row while the table
# started with the outlier topic -1 and had one row per consecutive topic id.
# (Swap 'KeyBERT' for 'Representation' to use the other keyword column.)
topic_keywords = zip(conv_topic_info_3['Topic'], conv_topic_info_3['KeyBERT'])

for topic_nb, conv_kyewords in tqdm(topic_keywords, total=len(conv_topic_info_3)):
    # NOTE(review): topic 47 gets fewer conversations, presumably to keep its
    # prompt within the model's input limit - confirm.
    current_convs_nb = 15 if topic_nb in [47] else convs_nb
    conv_docs = select_conversation_docs(conversations_data_topics_3, topic_nb, current_convs_nb)

    solve_prompt = f"""
You are a helpful, respectful and honest assistant for extracting information from conversations.

I have the following conversations:

{conv_docs.strip()}

The conversations are described by the following keywords: {', '.join(conv_kyewords)}

Based on the above information, can you describe methods used to solve pain points? Provide a maximum of 2 main pain points. Make sure it is in the following format:
- pain point 
  Method used to solve this pain point
"""

    # print(f"""Topic {topic_nb}: prompt length {len(solve_prompt)}, {conv_kyewords}""")
    # print(solve_prompt)

    gemini_prompt_response = gemini_model.generate_content(
        solve_prompt,
        generation_config=gemini_parameters,
        safety_settings=safety_config,
    )

    gemini_model_response_multi_solve_list.append(gemini_prompt_response.text)

In [None]:
# Dump every response under a "Topic N:" header. Labels are assigned by
# position, counting up from -1, and each entry is followed by blank lines.
for topic_label, response_text in enumerate(gemini_model_response_multi_solve_list, start=-1):
    print(f"Topic {topic_label}:", response_text, '\n', sep='\n')

In [None]:
# Collect the responses into a single-column frame ('Gemini Pro') whose index,
# named 'Topic', runs from -1 upwards by position in the list.
gemini_model_response_multi_20_solve_df = (
    pd.DataFrame(
        gemini_model_response_multi_solve_list,
        index=range(-1, len(gemini_model_response_multi_solve_list) - 1),
    )
    .rename(columns={0: 'Gemini Pro'})
    .rename_axis('Topic')
)

In [None]:
# Show the per-topic "solve" responses with pandas' column-width limit lifted
# (max_colwidth=None), so the full response text is visible. The option is
# only changed inside this `with` block.
with pd.option_context('display.max_colwidth', None):
    display(gemini_model_response_multi_20_solve_df)

#### Prompt to generate actionable insights

In [None]:
# Bind the Vertex AI SDK to the project and region used for the calls below.
vertexai.init(project="helix-ds-metal-dev", location="us-central1")

# Model handle used by the insights prompt loop that follows.
gemini_model = GenerativeModel("gemini-pro")

# Generation settings passed as `generation_config` to generate_content.
gemini_parameters = dict(
    max_output_tokens=2048,
    temperature=0,
    top_p=1,
    # top_k=32,
)

# Safety settings passed as `safety_settings` to generate_content:
# BLOCK_ONLY_HIGH for every listed category, except dangerous content,
# which is overridden to BLOCK_NONE right after.
safety_config = dict.fromkeys(
    (
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
    ),
    generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
)
safety_config[generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT] = (
    generative_models.HarmBlockThreshold.BLOCK_NONE
)

In [None]:
# For every topic label in freq_topics.index, ask Gemini for actionable
# insights based on the topic's conversations and keywords. Exactly one entry
# is appended per topic, in iteration order, so the list can be zipped back
# against freq_topics.index.
convs_nb = 20  # default conversation count handed to select_conversation_docs
gemini_model_response_multi_insights_list = []

for topic_nb in freq_topics.index:
    # Keywords are read from row label `topic_nb + 1`.
    # NOTE(review): presumably relies on conv_topic_info_3 having a default
    # 0..n-1 index with topics starting at -1 — confirm.
    # conv_kyewords = conv_topic_info_3.loc[topic_nb + 1, 'Representation']
    conv_kyewords = conv_topic_info_3.loc[topic_nb + 1, 'KeyBERT']
    # Topic 47 is given 15 conversations instead of the default.
    # NOTE(review): presumably to keep its prompt within the model's input
    # limit — confirm.
    current_convs_nb = 15 if topic_nb == 47 else convs_nb
    conv_docs = select_conversation_docs(conversations_data_topics_3, topic_nb, current_convs_nb)

    solve_prompt = f"""
You are a helpful, respectful and honest assistant for extracting information from conversations.

I have the following conversations:

{conv_docs.strip()}

The conversations are described by the following keywords: {', '.join(conv_kyewords)}

Based on the above information, can you generate actionable insights that can help make decisions?
"""

    print(f"""Topic {topic_nb}: prompt length {len(solve_prompt)}, {conv_kyewords}""")
    # print(solve_prompt)

    gemini_prompt_response = gemini_model.generate_content(
        solve_prompt,
        generation_config=gemini_parameters,
        safety_settings=safety_config,
    )

    # Per the Vertex AI SDK, `.text` raises ValueError when the response has no
    # usable text (for example when it was blocked by the safety filters).
    # Record a placeholder instead of aborting: the remaining topics are still
    # processed and the list keeps one entry per topic.
    try:
        response_text = gemini_prompt_response.text
    except ValueError as err:
        print(f"Topic {topic_nb}: no text in Gemini response ({err})")
        response_text = f"[No response text for topic {topic_nb}: {err}]"

    gemini_model_response_multi_insights_list.append(response_text)

In [None]:
# Dump every insights response under its topic label, pairing the labels in
# freq_topics.index with the responses by position.
for topic_label, response_text in zip(freq_topics.index, gemini_model_response_multi_insights_list):
    print(f"Topic {topic_label}:\n", response_text, '\n', sep='\n')