# Unveiling the Power of GenAI: Extracting Insights from Chats
Notebook developed for a workshop at Women in Tech Summit 2024

## Import libraries

In [None]:
import configparser
import os
import pickle
import re
import warnings

import emoji
import nltk
# nltk.download('punkt')
import numpy as np
import pandas as pd
import plotly.express as px
from langdetect import detect

warnings.filterwarnings('ignore')

## Source data

We use the Kaggle dataset "Customer Support on Twitter" for this workshop. You can find it here: https://www.kaggle.com/datasets/thoughtvector/customer-support-on-twitter.

To run the preprocessing part of our notebook, unzip the data and copy it to the data/raw_data folder.

In [None]:
df_conv = pd.read_csv("data/raw_data/csc_twitter/twcs/twcs.csv")

## Data preprocessing

In [None]:
def add_conv_id(df):
    '''
    Identify which tweets belong to the same conversation.

    Adds a ``conversation_id`` column to ``df`` in place and returns nothing.
    Every tweet starts as its own conversation (``conversation_id`` equals
    ``tweet_id``). Rows are then visited in their current order, and a tweet
    whose ``in_response_to_tweet_id`` matches the ``tweet_id`` of a row in
    ``df`` takes over the conversation id that row holds at that moment.
    A reply therefore resolves to the start of its thread only when the rows
    it replies to come earlier in ``df``.

    The parent row is found through a dictionary, so the pass is linear in the
    number of rows instead of scanning the whole frame once per tweet.
    Rows are addressed by position, so any index is accepted.
    '''
    tweet_ids = df['tweet_id'].tolist()
    parent_ids = df['in_response_to_tweet_id'].tolist()
    has_parent = df['in_response_to_tweet_id'].notna().tolist()
    whole = df.shape[0]

    # Position of the first row carrying each tweet_id (first match wins)
    first_row_of = {}
    for row, tweet_id in enumerate(tweet_ids):
        first_row_of.setdefault(tweet_id, row)

    conv_ids = list(tweet_ids)
    for i, (parent, replied) in enumerate(zip(parent_ids, has_parent)):
        if i % 100000 == 0:
            print("{} out of {} were preprocessed".format(i, whole))
        if not replied:
            continue
        parent_row = first_row_of.get(parent)
        if parent_row is not None:
            # Inherit whatever conversation the parent row belongs to right now
            conv_ids[i] = conv_ids[parent_row]

    df['conversation_id'] = pd.Series(conv_ids, index=df.index, dtype=df['tweet_id'].dtype)


def filter_data(df_filtered):
    '''
    Clean the ``text`` column and drop utterances left with at most one character.

    Cleaning steps, in order: collapse whitespace runs (and U+FEFF) into a
    single space, remove whitespace in front of punctuation, strip emoji,
    and finally replace http(s) links with ``[link]``.

    Rows are dropped based on the length of the text after emoji removal, so
    tweets that consisted only of emoji are removed as well. The
    ``utterance_size`` column is not consulted and is left unchanged. The input
    frame is not modified; a new frame is returned.
    '''
    # Remove additional whitespaces
    WHITESPACES_CLEANER = re.compile(r'(\uFEFF|\s)+')
    # Align punctuation
    WHITESPACES_BEFORE_PUNCTUATION_CLEANER = re.compile(r'(\w)\s+([.,;:?!])')
    # Replace web links
    LINK_CLEANER = re.compile(r'\b(?:https?://)\S+', flags=re.IGNORECASE)

    text = df_filtered['text'].str.replace(WHITESPACES_CLEANER, ' ', regex=True)
    text = text.str.replace(WHITESPACES_BEFORE_PUNCTUATION_CLEANER, r'\1\2', regex=True)

    # Filter out emoticons
    text = text.apply(lambda s: emoji.replace_emoji(s, ''))

    # Filter out empty and one char strings, measured on the cleaned text
    # (before links are shortened to the '[link]' placeholder)
    keep = text.str.len() > 1

    text = text.str.replace(LINK_CLEANER, '[link]', regex=True)

    return df_filtered.assign(text=text).loc[keep, :]


def detect_lang(x):
    '''
    Detect the language of the input text.

    Returns whatever ``detect`` returns for ``x``; if ``detect`` raises an
    exception, the string 'unknown' is returned instead.
    '''
    try:
        return detect(x)
    except Exception:
        # Best effort on purpose, but unlike a bare ``except`` this lets
        # KeyboardInterrupt / SystemExit stop a long-running apply()
        return 'unknown'

In [None]:
def preprocessing(df):
    '''
    Turn the raw tweet table into cleaned AppleSupport conversations.

    ``df`` is modified in place: ``created_at`` is parsed to datetime
    (values that cannot be parsed are coerced), rows are sorted by it with a
    fresh index, and the columns ``conversation_id`` and ``author_type``
    ("user" when ``author_id`` is numeric, "support" otherwise) are added.
    All further steps work on a deep copy, which is what gets returned.

    The returned frame keeps only conversations containing at least one tweet
    authored by AppleSupport. Its ``text`` has @mentions removed and is cleaned
    by ``filter_data``. Added columns: ``utterance_size`` and
    ``utterance_tokens_size`` (characters / space-separated tokens, measured
    after mention removal and before ``filter_data``), ``utterance_lang``, and
    ``conversation_lang`` (the language of the utterance with the largest
    ``utterance_size`` in the conversation).
    '''
    # Transform created_at column to datetime
    print("Transforming create_at column to datetime")
    df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

    # Order data by creation date
    print("Ordering data by create date")
    df.sort_values(by='created_at', ignore_index=True, inplace=True)

    # Add conversation_id
    print("Adding conversation_id")
    add_conv_id(df)

    # Add author_type
    print("Adding author_type")
    df['author_type'] = df['author_id'].map(lambda x: "user" if x.isnumeric() else "support")

    # Delete @mentions (the handles linking to previous tweets) from text body
    print("Deleting links to previous tweets from text body")
    df_filtered = df.copy(deep=True)
    # Raw string: "\S" in a normal string literal is an invalid escape sequence
    mention_pattern = re.compile(r"@\S+ |@\S+$")
    df_filtered['text'] = df_filtered['text'].apply(lambda s: mention_pattern.sub("", s))

    print("Adding columns related to text length")
    # Add number of characters in the utterance
    df_filtered['utterance_size'] = df_filtered['text'].str.len()
    # Add number of tokens in the utterance
    df_filtered['utterance_tokens_size'] = df_filtered['text'].apply(lambda x: len(str(x).split(' ')))

    print("Limiting data to AppleSupport")
    # Limit data to AppleSupport - one company, choosing based on number of records in the data
    conversation_id_apple = df_filtered.loc[df_filtered['author_id'] == 'AppleSupport', 'conversation_id'].unique()
    df_filtered = df_filtered[df_filtered['conversation_id'].isin(conversation_id_apple)]

    print("Filtering data")
    df_filtered = filter_data(df_filtered)

    print("Adding information about conversation language")
    # Add language info per utterance
    df_filtered['utterance_lang'] = df_filtered.loc[:, 'text'].apply(detect_lang)
    # A conversation gets the language of its utterance with the largest utterance_size
    conv_lang = df_filtered.groupby(['conversation_id']).apply(
        lambda x: x.sort_values('utterance_size', ascending=False).iloc[0]['utterance_lang']
    )
    df_conv_lang = conv_lang.to_frame().rename(columns={0: "conversation_lang"})
    df_filtered = df_filtered.merge(df_conv_lang, left_on='conversation_id', right_index=True)

    return df_filtered

In [None]:
df_conv_filtered = preprocessing(df_conv)

In [None]:
# Save preprocessed data to a file
# df_conv_filtered.to_csv("data/preprocessed_data/apple_support.csv")
# df_conv_filtered.to_parquet("data/preprocessed_data/apple_support.parq")

### Examples before and after preprocessing

In [None]:
df_conv = pd.read_csv("data/raw_data/csc_twitter/twcs/twcs.csv")
# Transform create_at column to datetime
df_conv['created_at'] = pd.to_datetime(df_conv['created_at'], errors='coerce')
# Order data by create date
df_conv.sort_values(by='created_at', ignore_index=True, inplace=True)

df_conv_filtered = pd.read_parquet("data/preprocessed_data/apple_support.parq")

In [None]:
# before preprocessing
for idx, item in df_conv.loc[df_conv['tweet_id'].isin([363663, 363661, 363662])].iterrows():
    print(f"{item['tweet_id']} | {item['author_id']} | {item['text']}")

In [None]:
# after preprocessing
selected_conv_id = 363663
for idx, item in df_conv_filtered.loc[df_conv_filtered['conversation_id'] == selected_conv_id, :].iterrows():
    print(f"{item['tweet_id']} | {item['author_id']} | {item['text']}")

In [None]:
# before preprocessing
for idx, item in df_conv.loc[df_conv['tweet_id'].isin([1700946, 1700945, 1700944, 1700943, 1700941, 1700942])].iterrows():
    print(f"{item['tweet_id']} | {item['author_id']} | {item['text']}")

In [None]:
# after preprocessing
selected_conv_id = 1700946
for idx, item in df_conv_filtered.loc[df_conv_filtered['conversation_id'] == selected_conv_id, :].iterrows():
    print(f"{item['tweet_id']} | {item['author_id']} | {item['text']}")

## Exploratory Data Analysis on preprocessed data

In [None]:
# # Read data from file
# df_conv_filtered = pd.read_parquet("data/preprocessed_data/apple_support.parq")
# # Transform create_at column to datetime
# df_conv_filtered['created_at'] = pd.to_datetime(df_conv_filtered['created_at'], errors='coerce')

In [None]:
df_conv_filtered.head()

In [None]:
df_conv_filtered['conversation_id'].nunique()

### Number of conversations over time

In [None]:
# Tag every tweet with the month it was created in
df_conv_created = df_conv_filtered[['conversation_id', 'created_at']].assign(
    created_month=df_conv_filtered['created_at'].dt.strftime('%Y-%m')
)
# Count the distinct conversations seen in each month
df_conv_vs_time = df_conv_created.groupby('created_month', as_index=False).agg(
    conversation_count=pd.NamedAgg(column="conversation_id", aggfunc="nunique")
)

In [None]:
fig = px.bar(df_conv_vs_time[(df_conv_vs_time['created_month']>='2014-01') & (df_conv_vs_time['created_month']<='2024-04')], x='created_month', y='conversation_count',
             title="Number of conversations over time",
            height=600,
            text_auto='.2s'
            )
fig.update_layout(xaxis_title='Created month',
                  yaxis_title='Count')

fig.show()

### Number of tweets per conversation

In [None]:
# Per conversation: number of tweets and total number of characters
df_conv_length = df_conv_filtered.groupby('conversation_id').agg(
    conversation_length=pd.NamedAgg(column="tweet_id", aggfunc="count"),
    conversation_size=pd.NamedAgg(column="utterance_size", aggfunc="sum")
)
# How many conversations there are for every conversation length
length_frequencies = df_conv_length[['conversation_length']].value_counts()
conversation_count_vs_length = length_frequencies.reset_index(name='count').sort_values('count')

In [None]:
fig = px.bar(
    conversation_count_vs_length, 
    x='conversation_length', 
    y='count',
    title="Number of conversations vs. conversation length",
    height=600
)
fig.update_xaxes(range=[0, 20])
fig.update_layout(
    xaxis_title='Conversation length',
    yaxis_title='Count'
)
fig.show()

## Preparing training data

In [None]:
# Select time frames and English language
begin_date = '2017-09-01'
end_date = '2017-12-31'
# NOTE(review): the upper bound is exclusive, so tweets created on 2017-12-31 are left out - confirm this is intended
selection_mask = (df_conv_filtered['created_at'] >= begin_date) & \
                 (df_conv_filtered['created_at'] < end_date) & \
                 (df_conv_filtered['conversation_lang'] == 'en')
# Take an explicit copy: the speaker prefix below is then written to an independent frame
# instead of a boolean-filtered slice of df_conv_filtered (chained assignment)
df_conv_selected = df_conv_filtered[selection_mask].copy()

# Add who's talking: user or support to tweet text
df_conv_selected['text'] = df_conv_selected['author_type'] + ': ' + df_conv_selected['text']

# Join messages into one conversation (one message per line, in row order)
df_conversations = df_conv_selected.groupby(['conversation_id'], as_index=False).agg(
    conversation_length=pd.NamedAgg(column="tweet_id", aggfunc="count"),
    conversation_body=pd.NamedAgg(column="text", aggfunc=lambda x: '\n'.join(x.astype(str))),
    conversation_date=pd.NamedAgg(column="created_at", aggfunc="first")
)

# Count number of tokens (words) in each conversation
df_conv_tokens = df_conv_selected[['conversation_id', 'utterance_tokens_size']]
df_conv_tokens = df_conv_tokens.groupby(['conversation_id'])['utterance_tokens_size'].sum()
df_conv_tokens = df_conv_tokens.to_frame().reset_index().rename(columns={'utterance_tokens_size': 'conv_tokens_count'})

# Add info about number of tokens
df_conversations = df_conversations.merge(df_conv_tokens, how='left', on='conversation_id')

In [None]:
fig = px.histogram(
    df_conversations, 
    x='conv_tokens_count', 
    title="Distribution of number of tokens in conversations",
    height=600
)
fig.update_xaxes(range=[0, 1500])
fig.show()

In [None]:
# Trim overly long conversations - max_tokens was picked from the distribution of the number of tokens per conversation
print(df_conversations[['conv_tokens_count']].quantile(0.99))
max_tokens = 198
# Split on single spaces, keep the first max_tokens pieces and glue them back together
conversation_words = df_conversations['conversation_body'].map(lambda body: str(body).split(' '))
df_conversations['conversation_body'] = conversation_words.map(lambda words: ' '.join(words[:max_tokens]))

### Remove duplicated conversations

In [None]:
# Every conversation whose body occurs more than once ('keep=False' flags all copies)
is_repeated_body = df_conversations['conversation_body'].duplicated(keep=False)
df_duplicated_convs = df_conversations[is_repeated_body]

# One row per repeated body: number of copies, represented by the id of the first copy
duplicate_summary = df_duplicated_convs.groupby('conversation_body', as_index=False).agg(
    conversations_count=pd.NamedAgg(column="conversation_id", aggfunc="count"),
    conversation_id=pd.NamedAgg(column="conversation_id", aggfunc="first")
)
summary_columns = ['conversation_id', 'conversation_body', 'conversations_count']
df_duplicated_conv_count = duplicate_summary[summary_columns].sort_values('conversations_count', ascending=False)

# Unique conversations: keep the first occurrence of each body
first_occurrences = df_conversations.drop_duplicates(subset='conversation_body').reset_index(drop=True)
first_occurrences['conversation_month'] = first_occurrences['conversation_date'].dt.strftime('%Y-%m')
# Attach the copy counts; bodies that were never repeated count as 1
df_conv_unique = first_occurrences.merge(
    df_duplicated_conv_count,
    how='left',
    on=['conversation_id', 'conversation_body']
)
df_conv_unique = df_conv_unique.fillna({'conversations_count': 1}).astype({'conversations_count': int})

# Save the final data
# df_conv_unique.to_parquet("data/preprocessed_data/apple_support_training_data.parq")

In [None]:
# Read final data
# df_conv_unique = pd.read_parquet("data/preprocessed_data/apple_support_training_data.parq")

# Get conversations text as list
conversations_data = df_conv_unique['conversation_body'].tolist()

# Get timestamps as list
timestamps = df_conv_unique['conversation_date'].dt.strftime('%Y-%m').to_list()

In [None]:
# Examples of final conversations which will be used for training
# (slicing instead of indexing 0..4, so fewer than five conversations do not raise IndexError)
for conversation in conversations_data[:5]:
    print('_' * 50)
    print(conversation)

## Topic modelling using BERTopic

For more info on BERTopic, check out their website: https://maartengr.github.io/BERTopic/algorithm/algorithm.html.

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech, TextGeneration
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

### Training

##### Detailed BERTopic pipeline
- Step 1 - Embedding documents using a Hugging Face Transformers model
- Step 2 - Reducing dimensionality of embeddings using UMAP: https://pair-code.github.io/understanding-umap/
- Step 3 - Clustering reduced embeddings into topics using HDBSCAN
- Step 4 - Tokenization of topics using CountVectorizer
- Step 5 - Weight tokens, create topic representation using C-TF-IDF
- Step 6 - (Optional) Fine-tune topic representations using KeyBERT

##### Calculate embeddings
Model used to calculate embeddings: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2.

In [None]:
# Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
conversations_embeddings = embedding_model.encode(conversations_data, show_progress_bar=True)

In [None]:
# embedding_model_gte_base = SentenceTransformer('thenlper/gte-base')
# conversations_embeddings_gte_base = embedding_model_gte_base.encode(conversations_data, show_progress_bar=True)

In [None]:
# Save embeddings to a file
# np.save("conversations_embeddings", conversations_embeddings)

In [None]:
# Load embeddings from a file
# conversations_embeddings = np.load("conversations_embeddings.npy")

##### Stopwords

Stopwords list downloaded from https://github.com/stopwords-iso/stopwords-en/blob/master/stopwords-en.txt

In [None]:
# Read the list as plain text, one stopword per line. pd.read_csv would parse entries such as
# "null", "nan" or "n/a" as missing values and they would end up in the list as the string "nan".
with open('stopwords-en.txt', encoding='utf-8-sig') as stopwords_file:
    stopwords_list = [word for word in (line.strip() for line in stopwords_file) if word]
# sorted(stopwords_list)

In [None]:
# Disable warning:
# The current process just got forked. Disabling parallelism to avoid deadlocks... To disable this warning, please explicitly set 
# TOKENIZERS_PARALLELISM=(true | false)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

Info about hyperparameter tuning: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html

In [None]:
# Reduce dimensionality: embeddings are projected to 5 components with the cosine metric;
# random_state is fixed at 42
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=700, min_samples=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Tokenize topics: unigrams and bigrams, custom stopword list, terms must appear in at least 10 documents
vectorizer_model = CountVectorizer(stop_words=stopwords_list, min_df=10, ngram_range=(1, 2))
# Extract topic words
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# Fine-tune topic representations
## KeyBERT
keybert_model = KeyBERTInspired()

## MMR - Diversify topic representations
# mmr_model = MaximalMarginalRelevance(diversity=0.3)

## GPT-3.5 - generate human-readable labels
## NOTE(review): the client line below uses `openai.OpenAI`, but `openai` is not imported in the
## cells shown here - add `import openai` before enabling it
# client = openai.OpenAI(api_key="")
# prompt = """
# I have a topic that contains the following documents:
# [DOCUMENTS]
# The topic is described by the following keywords: [KEYWORDS]

# Based on the information above, extract a short but highly descriptive topic label. Make sure it is in the following format:
# topic: <topic label>
# """
# openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

## All representation models (each key becomes an additional representation of the topics)
representation_model = {
    "KeyBERT": keybert_model,
    # "MMR": mmr_model,
    # "OpenAI": openai_model
}

## BERTopic model
conv_topic_model = BERTopic(
  # Pipeline models
  embedding_model=embedding_model,           # Step 1 - Extract embeddings
  umap_model=umap_model,                     # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,               # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,         # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                 # Step 5 - Extract topic words
  representation_model=representation_model, # Step 6 - (Optional) Fine-tune topic representations

  # Hyperparameters
  top_n_words=10, # 10 is default
  # nr_topics="auto",
  # Probabilities are read later through conv_topic_model.probabilities_
  calculate_probabilities=True,
  verbose=True
)

In [None]:
conv_topics, conv_probs = conv_topic_model.fit_transform(conversations_data, conversations_embeddings)

In [None]:
# conv_topic_model.save("conv_topic_model.pickle")

### BERTopic results

In [None]:
conv_topic_info = conv_topic_model.get_topic_info()
with pd.option_context('display.max_colwidth', None):
  display(conv_topic_info.set_index('Topic')[['Count', 'Name', 'Representation', 'KeyBERT']])

In [None]:
fig = px.bar(conv_topic_info, x='Name', y='Count',
             title="Number of conversation related to a given topic",
            height=800,
            # text_auto=True
            )
# fig.update_xaxes(range=[0, 100])
fig.update_layout(xaxis_title='Topic 1',
                  yaxis_title='Count')
fig.show()

In [None]:
conv_topic_model.visualize_barchart(top_n_topics = 12, n_words = 10, height=300)

### Topic results over time

In [None]:
conv_topics_over_time = conv_topic_model.topics_over_time(conversations_data, timestamps)

In [None]:
def topics_over_time(conversations_data_topics, conv_topic_info):
    """
    Count documents per topic and month and return the counts in long format.

    ``conversations_data_topics`` needs the columns ``conversation_month`` and
    ``Topic``; ``conv_topic_info`` needs ``Topic``, ``Name`` and
    ``Representation``. The result has one row per (month, topic) pair with
    the columns ``Month``, ``Topic``, ``Frequency``, ``Name`` and
    ``Representation``. Topics listed in ``conv_topic_info`` that have no
    documents in a month get a frequency of 0.
    """
    months = np.sort(conversations_data_topics['conversation_month'].unique())
    topics = conv_topic_info['Topic'].values

    # Start from an empty frame indexed by every known topic, so topics
    # without documents still get a row after the concat below
    monthly_counts = [pd.DataFrame(index=topics)]
    for month in months:
        in_month = conversations_data_topics['conversation_month'] == month
        topic_counts = conversations_data_topics.loc[in_month, 'Topic'].value_counts()
        # Name the column explicitly: the default name of the value_counts()
        # result differs between pandas versions ('Topic' vs. 'count')
        monthly_counts.append(topic_counts.to_frame(name=month))

    topics_over_time_df = (
        pd.concat(monthly_counts, axis='columns')
        .fillna(0)
        .astype(int)
        .transpose()
        .reset_index()
        .rename(columns={'index': 'Month'})
    )
    topics_over_time_melt_df = topics_over_time_df.melt(id_vars='Month', var_name="Topic", value_name='Frequency')
    topics_over_time_melt_df = topics_over_time_melt_df.merge(
        conv_topic_info[['Topic', 'Name', 'Representation']], how='left', on='Topic'
    )

    return topics_over_time_melt_df

In [None]:
conversations_data_topics = conv_topic_model.get_document_info(conversations_data).merge(df_conv_unique, how='left', left_index=True, right_index=True)
# Saving conversations_data_topics
conversations_data_topics.to_parquet("data/preprocessed_data/conversations_data_topics.parq")
plot_topics_over_time = topics_over_time(conversations_data_topics, conv_topic_info)

In [None]:
top_n = 12

fig = px.line(plot_topics_over_time[plot_topics_over_time['Topic']<top_n], 
              x='Month', y='Frequency', color='Name',
              markers=True,
              height=600)
fig.update_xaxes(showgrid=True)
fig.update_yaxes(showgrid=True)
fig.update_layout(legend_traceorder="normal") 
fig.update_layout(
    title={
        'text': f"<b>Topics over Time</b>",
        'y': .95,
        'x': 0.40,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(
            size=22,
            color="Black")
    },
    template="simple_white",
    width=1250,
    height=600,
    hoverlabel=dict(
        # bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    ),
    legend=dict(
        title="<b>Global Topic Representation</b>",
    )
)
fig.show()

### Representative conversations

In [None]:
topic_nb_1 = 0

# Look the topic up by its id rather than by row label: the "+ 1" offset is only
# correct when the outlier topic (-1) occupies the first row of conv_topic_info
topic_row = conv_topic_info.loc[conv_topic_info['Topic'] == topic_nb_1].iloc[0]

print(topic_row['Representation'])
print(topic_row['Count'])
print("#"*50)

for item in topic_row['Representative_Docs']:
    print(item)
    print("#"*50)

### Topic probability distribution

In [None]:
selected_id = 76849
print(conversations_data_topics.loc[selected_id, "conversation_body"])

In [None]:
# Visualize the topic-document distribution for a single document
conv_topic_model.visualize_distribution(conv_topic_model.probabilities_[selected_id], custom_labels=True)

### How similar are documents in different topics?

In [None]:
conv_topic_model.visualize_heatmap()

In [None]:
conv_topic_model.visualize_hierarchy()

In [None]:
conv_topic_model.visualize_topics()

In [None]:
# We reduce our embeddings to 2D, as this will allow us to iterate quickly later on
conversations_reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', 
                                        random_state=42).fit_transform(conversations_embeddings)

In [None]:
conv_topic_model.visualize_documents(conversations_data, reduced_embeddings=conversations_reduced_embeddings, 
                                       hide_document_hover=True, hide_annotations=True)

### Outlier reduction
Documentation on outlier reduction: https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html

In [None]:
# Derive both figures from the document-level assignments: row 0 of conv_topic_info
# is the outlier topic (-1) only when the model actually produced outliers
outlier_mask = np.array(conv_topics) == -1
print(f"Percentage of outliers: {outlier_mask.sum() / outlier_mask.size * 100:.4}%")
print("Number of outliers: ", outlier_mask.sum())

The default method for reducing outliers is by calculating the c-TF-IDF representations of outlier documents and assigning them to the best matching c-TF-IDF representations of non-outlier topics.

In [None]:
# You can use the `threshold` parameter to select the minimum distance or similarity when matching outlier documents with non-outlier topics.
# This lets you control how many outlier documents are assigned to non-outlier topics.

# Reduce outliers using the `c-tf-idf` strategy
new_conv_topics = conv_topic_model.reduce_outliers(conversations_data, conv_topics, strategy="c-tf-idf", threshold=0.05)

In [None]:
print("Number of outliers after reduction with c-tf-idf: ", (np.array(new_conv_topics) == -1).sum())

When outlier documents are generated, they are not used when modeling the topic representations. These documents are completely ignored when finding good descriptions of topics. Hence, after having reduced the number of outliers in your topic model, you might want to update the topic representations with the documents that now belong to actual topics.


In [None]:
conv_topic_model.update_topics(conversations_data, topics=new_conv_topics)

In [None]:
conv_topic_model.visualize_documents(conversations_data, reduced_embeddings=conversations_reduced_embeddings, 
                                       hide_document_hover=True, hide_annotations=True)

### Other possible improvements 
#### Outlier reduction - continuation
Use the topic distributions, as calculated with `.approximate_distribution` to find the most frequent topic in each outlier document. You can use the `distributions_params` variable to tweak the parameters of `.approximate_distribution`.

In [None]:
# Reduce outliers using the `distributions` strategy
# new_conv_topics_d = conv_topic_model.reduce_outliers(conversations_data, conv_topics, strategy="distributions", threshold=0.08)
# print("Number of outliers after reduction with `distributions` strategy: ", (np.array(new_conv_topics_d) == -1).sum())

The `probabilities` strategy uses the soft clustering performed by HDBSCAN to find the best matching topic for each outlier document. To use this, make sure to calculate the `probabilities` beforehand by instantiating BERTopic with `calculate_probabilities=True`.

In [None]:
# Reduce outliers using the `probabilities` strategy
# new_conv_topics_p = conv_topic_model.reduce_outliers(
#     conversations_data, 
#     conv_topics, 
#     probabilities=conv_probs, 
#     strategy="probabilities", 
#     threshold=0.02
# )
# print("Number of outliers after reduction with `probabilities` strategy: ", (np.array(new_conv_topics_p) == -1).sum())

Using the embedding of each outlier document, find the best matching topic embedding using cosine similarity.

In [None]:
# Reduce outliers using the `embeddings` strategy
# new_conv_topics_e = conv_topic_model.reduce_outliers(conversations_data, conv_topics, strategy="embeddings", embeddings=conversations_embeddings, threshold=0.5)
# print("Number of outliers after reduction with `embeddings` strategy: ", (np.array(new_conv_topics_e) == -1).sum())

#### Topic Reduction after Training
Documentation on topic reduction: https://maartengr.github.io/BERTopic/getting_started/topicreduction/topicreduction.html

In [None]:
# conv_topic_model.reduce_topics(conversations_data, nr_topics=30)

In [None]:
# conv_topic_model.visualize_heatmap()

In [None]:
# conv_topic_model.visualize_barchart()

#### Update Topic Representation after Training
Documentation on updating topic representations: https://maartengr.github.io/BERTopic/getting_started/topicrepresentation/topicrepresentation.html

In [None]:
# conv_topic_model.update_topics(conversations_data, n_gram_range=(1, 2))

In [None]:
# conv_topic_model.visualize_heatmap()

In [None]:
# conv_topic_model.visualize_barchart()

# Extracting labels, main issues and insights using LLM

## Introduction to Large Language Models (LLMs)

In [None]:
import json
import vertexai
from vertexai.preview import generative_models
from vertexai.preview.generative_models import GenerativeModel
from tqdm import tqdm
from IPython.display import display, Markdown
from sklearn.preprocessing import normalize

In [None]:
def format_conversation(conversations_data_topics: pd.DataFrame, conv_id: int) -> str:
    """
    Render the conversation with the given ID as a dash-prefixed list, one message per line.

    The text is taken from the ``Document`` column of the single row whose
    ``conversation_id`` equals ``conv_id``.
    """
    belongs_to_conversation = conversations_data_topics['conversation_id'] == conv_id
    document = conversations_data_topics.loc[belongs_to_conversation, 'Document'].item()
    # Every line of the document becomes its own "- " item
    bulleted_document = document.replace('\n', '\n- ')
    return '- ' + bulleted_document

In [None]:
def format_selected_conversations(conversations_data_topics: pd.DataFrame, topic_nb: int, nb_of_convs: int = 10, method: str = 'highest_score') -> str:
    """
    Selects conversations from a given topic number and combines them into one string. By default, it selects 10
    conversations using the highest probability score.

    Supported methods:
    - 'highest_score': the conversations with the highest ``Probability``,
    - 'score_dist': a random sample weighted by the L1-normalized ``Probability``,
    - 'uniform': a uniform random sample.

    Random sampling uses a generator seeded with 42 and draws without replacement; when the topic holds fewer
    than ``nb_of_convs`` conversations, all of them are drawn. A topic without conversations yields an empty
    string.

    Raises:
        ValueError: if ``method`` is not one of the supported methods.
    """
    supported_methods = ('highest_score', 'score_dist', 'uniform')
    if method not in supported_methods:
        raise ValueError(f"Unknown method {method!r}; expected one of {supported_methods}")

    in_topic = conversations_data_topics['Topic'] == topic_nb
    topic_conversations = conversations_data_topics.loc[in_topic, :].sort_values('Probability', ascending=False)
    if topic_conversations.empty:
        # Nothing to format (and the probabilities could not be normalized)
        return ""

    if method == 'highest_score':
        # Select conversations with the highest probability score
        selected_conv_id = topic_conversations.iloc[:nb_of_convs]['conversation_id']
    else:
        rng = np.random.default_rng(seed=42)
        # Sampling without replacement cannot draw more items than the topic holds
        sample_size = min(nb_of_convs, len(topic_conversations))
        if method == 'score_dist':
            # Select conversations based on their probability distribution
            weights = normalize([topic_conversations['Probability'].values], norm="l1").ravel()
        else:
            # Select conversations based on uniform distribution over all chats
            weights = None
        selected_conv_id = rng.choice(topic_conversations['conversation_id'], sample_size, replace=False, p=weights)

    return "".join(
        f'Chat {nb_of_conv}\n{format_conversation(conversations_data_topics, conv_id)}\n\n'
        for nb_of_conv, conv_id in enumerate(selected_conv_id, 1)
    )

In [None]:
def parse_json(input_str: str) -> dict:
    """
    Deserialize a ``str`` instance containing a JSON document to a Python object.

    Model replies usually wrap the document in a Markdown code fence. Only that wrapper
    (and an optional leading "json" tag) is removed; the document itself is left untouched,
    so the word "json" or backticks inside string values are preserved.

    Returns the decoded object: ``json.loads`` yields whatever the document encodes, so a
    JSON array comes back as a ``list``. When decoding fails, a message is printed and an
    empty ``dict`` is returned.
    """
    text = input_str.strip()

    # Prefer the content of the first complete fenced block, wherever it sits in the reply.
    fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        json_input = fenced.group(1)
    else:
        # No complete fence: drop a stray opening fence and/or a leading "json" tag only.
        json_input = re.sub(r'^(?:```)?\s*(?:json\b)?\s*', '', text, flags=re.IGNORECASE)

    try:
        json_data = json.loads(json_input)
    except json.JSONDecodeError as err:
        print(f"Input: {json_input}. This input could not be transformed into a Python object. "
              f"Error message: {err}")
        json_data = {}

    return json_data

## Gemini 1.0 Pro (Google LLM)

**Description**
- The best performing model with features for a wide range of text-only tasks.
- Supports only text as input.
- Supports supervised tuning.

**Specifications**
- Max total tokens (input and output): **32,760** (A token is approximately four characters. 100 tokens correspond to roughly 60-80 words.)
- Max output tokens: **8,192**
- Training data: **up to Feb 2023**

Documentation of Google models: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models

In [None]:
# Initialize Vertex AI
# NOTE(review): project id and region are hard-coded; presumably each participant must substitute their own — confirm.
vertexai.init(project="helix-ds-metal-dev", location="us-central1")

# Load the model
gemini_model = GenerativeModel("gemini-1.0-pro-001")

# Generation config
# Passed as `generation_config` to `generate_content` in the example cells that follow.
gemini_parameters = {
    "max_output_tokens": 2048, # default: 8192
    "temperature": 0.5, # default: 0.9
    "top_p": 0.8, # default: 1
}

# Safety config
# All four harm categories use the BLOCK_ONLY_HIGH threshold here.
safety_config = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
}

### Example

In [None]:
# Sample support chat, one "- speaker: message" line per turn; it is interpolated into prompt_1.
chat_1 = """
- user: iOS 11 is hot garbage. Ever since my phone updated, my cell data doesn’t work about 90% of the time. Might as well be an iPod
- support: We're here for you. Have you tried these steps: [link]
- user: Appreciate the feedback. Have tried those steps and the phone shows an active lte signal. Just seems to stop data transmission randomly
- support: Does this happen with certain apps or usage, or does it happen with all data usage? Do you notice how many bars are showing?
"""



In [None]:
# Prompt layout: role/context line, the task with the chat inlined, then the required output format.
prompt_1 = f"""
Context: You are a helpful, respectful and honest assistant who extracts information from chats between customers and support.
    
Extract short but descriptive customer issues from the following chat:
{chat_1}

Provide the issues in JSON format only. Valid fields are 'issue' and 'explanation'.
"""

# Show the final prompt exactly as it will be sent.
print(prompt_1)
# Optional: uncomment to count the tokens of the prompt.
# gemini_model.count_tokens(prompt_1)

In [None]:
# Send prompt_1 to the model with the generation and safety settings, then print the reply as plain text.
gemini_response = gemini_model.generate_content(
    prompt_1,
    generation_config=gemini_parameters,
    safety_settings=safety_config,
)
print(f"Response from Model:\n\n{gemini_response.text}")

In [None]:
# Render the latest reply as Markdown instead of plain text.
display(Markdown(f"Response from Model:\n\n{gemini_response.text}"))

In [None]:
# Same layout as prompt_1, but worded with "conversation(s)" instead of "chat(s)".
# NOTE(review): chat_2 is not defined in the cells visible here; it must exist before this cell runs — confirm.
prompt_2 = f"""
Context: You are a helpful, respectful and honest assistant who extracts information from conversations between customers and support.
    
Extract short but descriptive customer issues from the following conversation:
{chat_2}

Provide the issues in JSON format only. Valid fields are 'issue' and 'explanation'.
"""

# Show the final prompt exactly as it will be sent.
print(prompt_2)
# Optional: uncomment to count the tokens of the prompt.
# gemini_model.count_tokens(prompt_2)

In [None]:
# Send prompt_2 to the model with the generation and safety settings, then print the reply as plain text.
# Overwrites gemini_response from the previous example.
gemini_response = gemini_model.generate_content(
    prompt_2,
    generation_config=gemini_parameters,
    safety_settings=safety_config,
)
print(f"Response from Model:\n\n{gemini_response.text}")

In [None]:
# display(Markdown(f"Response from Model:\n{gemini_response.text}"))

### Create labels

**Task** 
- We have a group of similar chats that are represented by a set of keywords and we would like to describe these chats by a descriptive label (supplemented by a short description) using LLM.

**Problems to solve**
- The size of all chats can be larger than the size of the context window
- The LLM response must be parseable by code

In [None]:
# Initialize Vertex AI
# NOTE(review): project id and region are hard-coded; presumably each participant must substitute their own — confirm.
vertexai.init(project="helix-ds-metal-dev", location="us-central1")

# Load the model
# NOTE(review): this section loads "gemini-pro" while the other sections load "gemini-1.0-pro-001" — confirm this is intentional.
gemini_model = GenerativeModel("gemini-pro")

# Generation config
# Passed as `generation_config` to `generate_content` in the labelling loop.
gemini_parameters = {
    "max_output_tokens": 2048, # default: 8192
    "temperature": 0.5, # default: 0.9
    "top_p": 0.8, # default: 1
}

# Safety config
# All four harm categories use the BLOCK_NONE threshold here.
safety_config = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
}

In [None]:
# Create labels based on selected chats and existing keywords

# Number of chats requested per topic for the prompt.
convs_nb = 100
# Label-based slice: rows 1..5 (inclusive) of conv_topic_info.
# NOTE(review): presumably row 0 is skipped because it holds the outlier topic — confirm.
topics = conv_topic_info.loc[1:5, 'Topic']
# Raw model replies, one entry per topic, in iteration order.
gemini_model_response_labels_list = []

for topic_nb in tqdm(topics):    
    # assumes the row labelled topic_nb + 1 describes topic topic_nb — TODO confirm
    conv_kyewords = conv_topic_info.loc[topic_nb + 1, 'Representation']
    # Chats with the highest 'Probability' for this topic, concatenated into one string.
    conv_docs = format_selected_conversations(conversations_data_topics, topic_nb, convs_nb, method='highest_score')

    label_prompt = f"""
Context: You are a helpful, respectful and honest assistant who extracts information from chats between users and support.

I have the following chats:

{conv_docs.strip()}

These chats are described by the following keywords: {', '.join(conv_kyewords)}.

Based on the above information, create a label for these chats with a short and clear description. 
Provide the label in JSON format only. Valid fields are 'label' and 'description'. Use safe quote nesting if necessary.
"""

    try:
        gemini_prompt_response = gemini_model.generate_content(
            label_prompt,
            generation_config=gemini_parameters,
            safety_settings=safety_config,
        )
        response_text = gemini_prompt_response.text
    except ValueError as err:
        # NOTE(review): presumably raised when the reply carries no usable text (e.g. it was blocked) — confirm.
        # Substitute an empty fenced JSON object so the list still gets one entry for this topic.
        print(err)
        response_text = """```json
{}
```"""
    
    gemini_model_response_labels_list.append(response_text)

In [None]:
# print(f"""Topic {topic_nb}\nkeywords: {conv_kyewords},\n{gemini_model.count_tokens(label_prompt)}total characters: {len(label_prompt)}""")
# print(label_prompt)

In [None]:
# Show LLM results (in JSON format)

# idx is the position in the results list.
# NOTE(review): presumably it coincides with the topic number — confirm.
for idx, item in enumerate(gemini_model_response_labels_list):
    print(f"Topic {idx}:")
    # print(item)
    display(Markdown(item))

In [None]:
# Print chat examples

# The 5 chats of topic 0 with the highest 'Probability'.
print(format_selected_conversations(conversations_data_topics, topic_nb=0, nb_of_convs=5, method='highest_score'))

In [None]:
# Transform LLM results from JSON format to pd.DataFrame

df_labels_list = []
for idx, item in enumerate(gemini_model_response_labels_list):
    # parse_json returns {} when the reply cannot be decoded.
    label_data = parse_json(item)

    # index=[0] lets a dict of scalar values (and the empty fallback) become a one-row frame.
    df = pd.DataFrame(label_data, index=[0])
    # Tag the row with its position in the results list.
    df['topic'] = idx
    df_labels_list.append(df)
    
# One row per topic; fields missing from a reply show up as NaN.
df_labels = pd.concat(df_labels_list, ignore_index=True)

In [None]:
# Temporarily lift the column-width limit so long text cells are shown untruncated.
with pd.option_context('display.max_colwidth', None):
    display(df_labels.set_index('topic'))

### Extract issues

**Task** 
- We have a group of similar chats that are represented by a set of keywords and we would like to extract the main issues with a short explanation using LLM.

**Problems to solve**
- The size of all chats can be larger than the size of the context window
- The LLM response must be parseable by code

In [None]:
# Initialize Vertex AI
# NOTE(review): project id and region are hard-coded; presumably each participant must substitute their own — confirm.
vertexai.init(project="helix-ds-metal-dev", location="us-central1")

# Load the model
gemini_model = GenerativeModel("gemini-1.0-pro-001")

# Generation config
# Passed as `generation_config` to `generate_content` in the issue-extraction loop.
gemini_parameters = {
    "max_output_tokens": 2048, # default: 8192
    "temperature": 0.5, # default: 0.9
    "top_p": 0.8, # default: 1
}

# Safety config
# All four harm categories use the BLOCK_NONE threshold here.
safety_config = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
}

In [None]:
# Extract issues based on selected chats and existing keywords

# Number of chats requested per topic for the prompt; also used later to scale 'frequency'.
convs_nb = 100
# Label-based slice: rows 1..5 (inclusive) of conv_topic_info.
# NOTE(review): presumably row 0 is skipped because it holds the outlier topic — confirm.
topics = conv_topic_info.loc[1:5, 'Topic']
# Raw model replies, one entry per topic, in iteration order.
gemini_model_response_issues_list = []

for topic_nb in tqdm(topics):    
    # assumes the row labelled topic_nb + 1 describes topic topic_nb — TODO confirm
    conv_kyewords = conv_topic_info.loc[topic_nb + 1, 'Representation']
    # Unlike the labelling step, chats are drawn at random, weighted by their 'Probability'.
    conv_docs = format_selected_conversations(conversations_data_topics, topic_nb, convs_nb, method='score_dist')

    issue_prompt = f"""
Context: You are a helpful, respectful and honest assistant who extracts information from chats between users and support.

I have the following chats:

{conv_docs.strip()}

These chats are described by the following keywords: {', '.join(conv_kyewords)}.

Based on the above information, extract short but highly descriptive issues. Provide up to 3 top issues along with the frequency number of these issues, 
in JSON format only. Valid fields are 'issue', 'explanation' and 'frequency'. Use safe quote nesting if necessary.
"""
    
    try:
        gemini_prompt_response = gemini_model.generate_content(
            issue_prompt,
            generation_config=gemini_parameters,
            safety_settings=safety_config,
        )
        response_text = gemini_prompt_response.text
    except ValueError as err:
        # NOTE(review): presumably raised when the reply carries no usable text (e.g. it was blocked) — confirm.
        # Substitute an empty fenced JSON array so the list still gets one entry for this topic.
        print(err)
        response_text = """```json
[]
```"""
    
    gemini_model_response_issues_list.append(response_text)

In [None]:
# print(f"""Topic {topic_nb}\nkeywords: {conv_kyewords},\n{gemini_model.count_tokens(issue_prompt)}total characters: {len(issue_prompt)}""")
# print(issue_prompt)

In [None]:
# Show LLM results (in JSON format)

# idx is the position in the results list.
# NOTE(review): presumably it coincides with the topic number — confirm.
for idx, item in enumerate(gemini_model_response_issues_list):
    print(f"Topic {idx}:")
    # print(item)
    display(Markdown(item))

In [None]:
# Print chat examples

# 5 chats of topic 1, drawn at random with weights given by 'Probability' (fixed seed inside the helper).
print(format_selected_conversations(conversations_data_topics, topic_nb=1, nb_of_convs=5, method='score_dist'))

In [None]:
# Transform LLM results from JSON format to pd.DataFrame

df_issues_list = []
for idx, item in enumerate(gemini_model_response_issues_list):
    # parse_json returns {} when the reply cannot be decoded; that yields an empty frame below.
    issue_data = parse_json(item)

    df = pd.DataFrame(issue_data)
    # Tag every issue row with the position of its reply in the results list.
    df['topic'] = idx
    df_issues_list.append(df)

df_issues = pd.concat(df_issues_list, ignore_index=True)

# Scale the reported frequency by the number of chats requested per topic.
# The column is absent when no reply contained it (e.g. every reply fell back to an empty result),
# and the model may return numbers as strings, so coerce first; unparseable values become NaN.
if 'frequency' in df_issues.columns:
    df_issues['frequency'] = pd.to_numeric(df_issues['frequency'], errors='coerce') / convs_nb

In [None]:
# Temporarily lift the column-width limit so long text cells are shown untruncated.
with pd.option_context('display.max_colwidth', None):
    display(df_issues.set_index('topic'))

### Generate methods to solve issues

**Task** 
- We have a group of similar chats that are represented by a set of keywords and we would like to obtain methods to solve the main issue raised in these chats using LLM.

**Problems to solve**
- The size of all chats can be larger than the size of the context window
- The LLM response must be parseable by code

In [None]:
# Initialize Vertex AI
# NOTE(review): project id and region are hard-coded; presumably each participant must substitute their own — confirm.
vertexai.init(project="helix-ds-metal-dev", location="us-central1")

# Load the model
gemini_model = GenerativeModel("gemini-1.0-pro-001")

# Generation config
# Passed as `generation_config` to `generate_content` in the solution loop; top_p is lower (0.3) than in the earlier sections (0.8).
gemini_parameters = {
    "max_output_tokens": 2048, # default: 8192
    "temperature": 0.5, # default: 0.9
    "top_p": 0.3, # default: 1
}

# Safety config
# Dangerous content uses BLOCK_NONE; the other three categories use BLOCK_ONLY_HIGH.
safety_config = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
}

In [None]:
# Obtain methods to solve the main issue based on selected chats and existing keywords

# Number of chats requested per topic for the prompt.
convs_nb = 100
# Label-based slice: rows 1..5 (inclusive) of conv_topic_info.
# NOTE(review): presumably row 0 is skipped because it holds the outlier topic — confirm.
topics = conv_topic_info.loc[1:5, 'Topic']
# Raw model replies, one entry per topic, in iteration order.
gemini_model_response_solve_list = []

for topic_nb in tqdm(topics):    
    # assumes the row labelled topic_nb + 1 describes topic topic_nb — TODO confirm
    conv_kyewords = conv_topic_info.loc[topic_nb + 1, 'Representation']
    # Chats with the highest 'Probability' for this topic, concatenated into one string.
    conv_docs = format_selected_conversations(conversations_data_topics, topic_nb, convs_nb, method='highest_score')

    solve_prompt = f"""
Context: You are a helpful, respectful and honest assistant who extracts information from chats between users and support.

I have the following chats:

{conv_docs.strip()}

These chats are described by the following keywords: {', '.join(conv_kyewords)}.

Based on the above information, describe the main issue and generate a list of methods to solve it. Provide the result in JSON format, 
where valid fields are only 'issue' and 'solution_methods'. Use safe quote nesting if necessary.
"""
    
    try:
        gemini_prompt_response = gemini_model.generate_content(
            solve_prompt,
            generation_config=gemini_parameters,
            safety_settings=safety_config,
        )
        response_text = gemini_prompt_response.text
    except ValueError as err:
        # NOTE(review): presumably raised when the reply carries no usable text (e.g. it was blocked) — confirm.
        # Substitute an empty fenced JSON object so the list still gets one entry for this topic.
        print(err)
        response_text = """```json
{}
```"""
    
    gemini_model_response_solve_list.append(response_text)

In [None]:
# print(f"""Topic {topic_nb}\nkeywords: {conv_kyewords},\n{gemini_model.count_tokens(solve_prompt)}total characters: {len(solve_prompt)}""")
# print(solve_prompt)

In [None]:
# Show LLM results (in JSON format)

# idx is the position in the results list.
# NOTE(review): presumably it coincides with the topic number — confirm.
# This section prints the raw text; the Markdown rendering is left commented out.
for idx, item in enumerate(gemini_model_response_solve_list):
    print(f"Topic {idx}:")
    # display(Markdown(item))
    print(item)

In [None]:
# Print chat examples

# The 5 chats of topic 1 with the highest 'Probability'.
print(format_selected_conversations(conversations_data_topics, topic_nb=1, nb_of_convs=5, method='highest_score'))

In [None]:
# Transform LLM results from JSON format to pd.DataFrame

df_solve_list = []
for idx, item in enumerate(gemini_model_response_solve_list):
    # parse_json returns {} when the reply cannot be decoded; that yields an empty frame below.
    solve_data = parse_json(item)

    # NOTE(review): presumably 'solution_methods' is a list, so each method becomes its own row
    # with 'issue' repeated; a reply made only of scalar values would make this constructor raise — confirm.
    df = pd.DataFrame(solve_data)
    # Tag every row with the position of its reply in the results list.
    df['topic'] = idx
    df_solve_list.append(df)
    
df_solves = pd.concat(df_solve_list, ignore_index=True)

In [None]:
# Temporarily lift the column-width limit so long text cells are shown untruncated.
with pd.option_context('display.max_colwidth', None):
    display(df_solves.set_index('topic'))

### Generate actionable insights

**Task** 
- We have a group of similar chats that are represented by a set of keywords and we would like to obtain actionable insights about the main issue raised in these chats using LLM.

**Problems to solve**
- The size of all chats can be larger than the size of the context window
- The LLM response must be parseable by code

In [None]:
# Initialize Vertex AI
# NOTE(review): project id and region are hard-coded; presumably each participant must substitute their own — confirm.
vertexai.init(project="helix-ds-metal-dev", location="us-central1")

# Load the model
gemini_model = GenerativeModel("gemini-1.0-pro-001")

# Generation config
# Passed as `generation_config` to `generate_content` in the insights loop; this section uses temperature 0.7 and top_p 0.4.
gemini_parameters = {
    "max_output_tokens": 2048, # default: 8192
    "temperature": 0.7, # default: 0.9
    "top_p": 0.4, # default: 1
}

# Safety config
# Dangerous content uses BLOCK_NONE; the other three categories use BLOCK_ONLY_HIGH.
safety_config = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
}

In [None]:
# Generate actionable insights about the main issue based on selected chats and existing keywords

# Number of chats requested per topic for the prompt.
convs_nb = 100
# Label-based slice: rows 1..5 (inclusive) of conv_topic_info.
# NOTE(review): presumably row 0 is skipped because it holds the outlier topic — confirm.
topics = conv_topic_info.loc[1:5, 'Topic']
# Raw model replies, one entry per topic, in iteration order.
gemini_model_response_insights_list = []

for topic_nb in tqdm(topics):    
    # assumes the row labelled topic_nb + 1 describes topic topic_nb — TODO confirm
    conv_kyewords = conv_topic_info.loc[topic_nb + 1, 'Representation']
    # Chats with the highest 'Probability' for this topic, concatenated into one string.
    conv_docs = format_selected_conversations(conversations_data_topics, topic_nb, convs_nb, method='highest_score')

    # NOTE(review): the prompt spells the field 'actinable_insights' (sic). The model is asked to use
    # that exact key, so correcting the spelling would rename the field in the replies.
    insights_prompt = f"""
Context: You are a helpful, respectful and honest assistant who extracts information from chats between users and support.

I have the following chats:

{conv_docs.strip()}

These chats are described by the following keywords: {', '.join(conv_kyewords)}.

Taking into account the above information, generate a list of actionable insights about the main issue that can help decision makers. 
Provide the result in JSON format, where valid fields are only 'issue' and 'actinable_insights'. Use safe quote nesting if necessary.
"""
    
    try:
        gemini_prompt_response = gemini_model.generate_content(
            insights_prompt,
            generation_config=gemini_parameters,
            safety_settings=safety_config,
        )
        response_text = gemini_prompt_response.text
    except ValueError as err:
        # NOTE(review): presumably raised when the reply carries no usable text (e.g. it was blocked) — confirm.
        # Substitute an empty fenced JSON object so the list still gets one entry for this topic.
        print(err)
        response_text = """```json
{}
```"""
    
    gemini_model_response_insights_list.append(response_text)


In [None]:
# print(f"""Topic {topic_nb}\nkeywords: {conv_kyewords},\n{gemini_model.count_tokens(insights_prompt)}total characters: {len(insights_prompt)}""")
# print(insights_prompt)

In [None]:
# Show LLM results (in JSON format)

# idx is the position in the results list.
# NOTE(review): presumably it coincides with the topic number — confirm.
for idx, item in enumerate(gemini_model_response_insights_list):
    print(f"Topic {idx}:")
    display(Markdown(item))
    # print(item)

In [None]:
# Print chat examples

# The 5 chats of topic 0 with the highest 'Probability'.
print(format_selected_conversations(conversations_data_topics, topic_nb=0, nb_of_convs=5, method='highest_score'))

In [None]:
# Transform LLM results from JSON format to pd.DataFrame

df_insights_list = []
for idx, item in enumerate(gemini_model_response_insights_list):
    # print(f"Topic {idx}")
    # parse_json returns {} when the reply cannot be decoded; that yields an empty frame below.
    insights_data = parse_json(item)

    # NOTE(review): presumably the insights field is a list, so each insight becomes its own row
    # with 'issue' repeated; a reply made only of scalar values would make this constructor raise — confirm.
    df = pd.DataFrame(insights_data)
    # Tag every row with the position of its reply in the results list.
    df['topic'] = idx
    df_insights_list.append(df)

df_insights = pd.concat(df_insights_list, ignore_index=True)

In [None]:
# Temporarily lift the column-width limit so long text cells are shown untruncated.
with pd.option_context('display.max_colwidth', None):
    display(df_insights.set_index('topic'))