In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns

# Set style for better-looking plots.
# The top-level seaborn import is commented out, so a bare `sns.set_style(...)`
# raises NameError on a fresh kernel. Import seaborn here when it is installed
# and otherwise fall back to plain matplotlib grid lines so the cell always runs.
try:
    import seaborn as sns
except ImportError:
    plt.rcParams['axes.grid'] = True
else:
    sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Read the three cleaned CSV exports into DataFrames
conversations_df = pd.read_csv('data_clean/data_conversations.csv')
participants_df = pd.read_csv('data_clean/data_participants.csv')
persona_vectors_df = pd.read_csv('data_clean/persona_vectors.csv')

# Report (rows, columns) for each frame as a quick sanity check on the load
for shape_label, loaded_frame in (
    ("Conversations shape:", conversations_df),
    ("Participants shape:", participants_df),
    ("Persona vectors shape:", persona_vectors_df),
):
    print(shape_label, loaded_frame.shape)

In [None]:
# 1. User interactions: count conversation rows whose role is 'user', per participant
is_user_turn = conversations_df['role'] == 'user'
user_interactions = (
    conversations_df.loc[is_user_turn]
    .groupby('participant_id')
    .size()
    .reset_index(name='num_interactions')
)

# Metrics 2 and 3 both aggregate persona_vectors rows by the same key
vectors_by_participant = persona_vectors_df.groupby('firebase_id')

# 2. Vectors generated: row count per participant
#    (each row in persona_vectors represents one vector generation)
vectors_generated = (
    vectors_by_participant
    .size()
    .reset_index(name='num_vectors_generated')
)

# 3. Distinct system prompts seen for each participant
unique_prompts = (
    vectors_by_participant['system_prompt']
    .nunique()
    .reset_index(name='num_unique_prompts')
)

print("Metrics calculated successfully")

In [None]:
# Build one engagement row per participant by left-joining each metric table
# onto the participant roster.
metric_cols = ['num_interactions', 'num_vectors_generated', 'num_unique_prompts']

engagement_df = (
    participants_df[['firebase_id', 'prolific_id', 'condition_name']]
    .merge(user_interactions, left_on='firebase_id', right_on='participant_id', how='left')
    .merge(vectors_generated, on='firebase_id', how='left')
    .merge(unique_prompts, on='firebase_id', how='left')
    # 'participant_id' is the right-hand join key carried over by the first
    # merge; 'firebase_id' already identifies the row, so drop it.
    .drop(columns='participant_id', errors='ignore')
    # Participants with no rows in a metric table get NaN from the left joins;
    # that means "none", so fill with 0. Only the metric columns are filled:
    # a frame-wide fillna(0) would also overwrite a missing prolific_id or
    # condition_name with the number 0.
    .fillna(dict.fromkeys(metric_cols, 0))
    # The counts became float because of the NaNs; restore integer dtype.
    .astype(dict.fromkeys(metric_cols, int))
)

print(f"Total participants: {len(engagement_df)}")
engagement_df.head()

In [None]:
# Write the engagement table to disk without the index column.
# to_csv does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails here.
output_path = Path("model_data/user_engagement.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
engagement_df.to_csv(output_path, index=False)