# Exploration of Qual AF transcript data

- Should Chatbot and User transcripts be considered separately?
- Is there any structure to the conversations?
- Do you want to find matching question-answer pairs?

## Potential preprocessing needed
- Combining audio transcripts and text into one column



In [11]:
import pandas as pd
from dsp_interview_transcripts import PROJECT_DIR, logger
from matplotlib import pyplot as plt

# Path to the transcripts CSV, built relative to PROJECT_DIR
DATA_PATH = PROJECT_DIR / "data/qual_af_transcripts.csv"

In [None]:
# Read the transcripts CSV into a DataFrame, then log where it came from,
# its shape and its column index.
data_df = pd.read_csv(DATA_PATH)
for message in (
    f"Loaded in data from {DATA_PATH}",
    f"Data shape: {data_df.shape}",
    f"Data columns: {data_df.columns}",
):
    logger.info(message)

In [None]:
# Print how many distinct (non-null) values each column holds
for column_name in data_df.columns:
    unique_count = data_df[column_name].nunique()
    print(f"{column_name}: {unique_count}")

In [4]:
# Impute missing ``text`` values with the ``transcript`` column, fall back to
# an empty string when both are missing, then store each message's length
# (in characters) as ``text_length``.
text_filled = data_df["text"].fillna(data_df["transcript"]).fillna("")
data_df = data_df.assign(
    text=text_filled,
    text_length=text_filled.map(len),
)


In [None]:
# Display the first row of the DataFrame
data_df.head(1)

In [None]:
# Count the messages in each conversation (non-null ``uuid`` values per group)
number_of_messages = (
    data_df
    .groupby("conversation")
    .agg(counts=("uuid", "count"))
    .reset_index()
)

# Distribution of conversation lengths; the axes are labelled so the figure
# can be read on its own.
plt.figure(figsize=(4, 2))
plt.hist(number_of_messages["counts"], bins=25)
plt.xlabel('Number of messages per conversation')
plt.ylabel('Conversations')
plt.grid(axis='y', linestyle=':')
plt.show()

In [None]:
# Count messages per (conversation, role) pair (non-null ``uuid`` values)
number_of_messages = (
    data_df
    .groupby(["conversation", "role"])["uuid"]
    .count()
    .reset_index(name="counts")
)

# Overlay one semi-transparent histogram per role showing how many messages
# that role sent in each conversation
plt.figure(figsize=(4, 2))
for role_name in ("BOT", "USER"):
    is_role = number_of_messages["role"] == role_name
    plt.hist(
        number_of_messages.loc[is_role, "counts"],
        label=role_name,
        alpha=0.5,
        bins=25,
    )
plt.xlabel('Number of messages per conversation')
plt.ylabel('Conversations')
plt.legend()
plt.grid(axis='y', linestyle=':')
plt.show()

In [None]:
# Transform from long to wide format: one row per conversation, one column
# per role. A role that sent nothing in a conversation has no row in the long
# table, so the pivot yields NaN there. The true count is 0, and leaving NaN
# would make the scatter plot silently drop that conversation.
number_of_messages_bot_vs_user = (
    number_of_messages
    .pivot(index="conversation", columns="role", values="counts")
    .fillna(0)
    .astype(int)
    .reset_index()
)

# Scatter plot of BOT vs USER message counts, one point per conversation
plt.figure(figsize=(3, 3))
plt.scatter(
    number_of_messages_bot_vs_user["BOT"],
    number_of_messages_bot_vs_user["USER"],
    alpha=0.4,
)
plt.xlabel("BOT messages")
plt.ylabel("USER messages")
plt.grid(linestyle=':')
plt.show()

In [None]:
# Histogram of message lengths, drawn separately for each role

plt.figure(figsize=(4, 2))
for role_name in ("BOT", "USER"):
    role_lengths = data_df.loc[data_df["role"] == role_name, "text_length"]
    plt.hist(role_lengths, bins=25, alpha=0.5, label=role_name)
plt.xlabel('Length of messages')
plt.ylabel('Messages')
plt.legend()
plt.grid(axis='y', linestyle=':')
plt.show()


In [None]:
# The 20 shortest non-empty USER messages
is_nonempty_user = (data_df["role"] == "USER") & (data_df["text_length"] > 0)
(
    data_df
    .loc[is_nonempty_user, ["role", "text", "text_length"]]
    .sort_values("text_length", ascending=True)
    .head(20)
)

In [None]:
# The 20 longest non-empty USER messages
is_nonempty_user = (data_df["role"] == "USER") & (data_df["text_length"] > 0)
(
    data_df
    .loc[is_nonempty_user, ["role", "text", "text_length"]]
    .sort_values("text_length", ascending=False)
    .head(20)
)

In [None]:
# To what extent are the bot messages the same from conversation to conversation?

# Count how many times each distinct BOT message text occurs
# (non-null ``uuid`` values per text)
is_bot = data_df["role"] == "BOT"
bot_messages = (
    data_df[is_bot]
    .groupby("text")["uuid"]
    .count()
    .reset_index(name="counts")
)
bot_messages.sort_values("counts", ascending=False).head(20)

In [None]:
# Let's do a quick BERTopic analysis on the user messages
from bertopic import BERTopic
