In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import sys
import seaborn as sns
import numpy as np
from scipy import ndimage
import json

# https://huggingface.co/datasets/allenai/WildChat-1M

sys.path.append('../../shared')
from file_analyzer import FileAnalyzer


# Load and parse the JSON file manually to handle nested structure
def load_wildchat_data(file_path):
    """
    Load the wildChat.json file and convert it to a pandas DataFrame.
    This function handles the complex nested structure that causes issues with pd.read_json().
    """
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    # Extract the rows data - each row has a 'row' key containing the actual data
    rows_data = [item['row'] for item in data['rows']]
    
    # Convert to DataFrame using json_normalize to handle nested structures
    df = pd.json_normalize(rows_data)
    
    return df

# Load the data using our custom function
df = load_wildchat_data('../../../data/wildChat.json')

# Display basic info about the loaded data
print("Data loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
df.head()

Data loaded successfully!
Shape: (100, 15)
Columns: ['conversation_hash', 'model', 'timestamp', 'conversation', 'turn', 'language', 'openai_moderation', 'detoxify_moderation', 'toxic', 'redacted', 'state', 'country', 'hashed_ip', 'header.accept-language', 'header.user-agent']

First few rows:


Unnamed: 0,conversation_hash,model,timestamp,conversation,turn,language,openai_moderation,detoxify_moderation,toxic,redacted,state,country,hashed_ip,header.accept-language,header.user-agent
0,c9ec5b440fbdd2a269333dd241f32f64,gpt-4-0314,2023-04-09T00:02:53Z,[{'content': 'Hey there! Are you familiar with...,1,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.000205891352379694, 'in...",False,False,Texas,United States,22fd87ba9b98f3d379b23c7b52961f2d4a8505127e58b3...,"en-US,en;q=0.9,es;q=0.8",Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
1,34f1581760df304d539e2fe4653b40d3,gpt-4-0314,2023-04-09T00:03:20Z,[{'content': 'Crea una imagen de una mujer cor...,2,Spanish,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.00717086344957352, 'ins...",False,False,A Coruña,Spain,58369722cd0bdf7fc027a67491ba65b74576df6994c36c...,"es-ES,es;q=0.9,en;q=0.8",Mozilla/5.0 (Linux; Android 12; 21061119DG) Ap...
2,cf1267ca6b2f6fccc9c36652a00059a1,gpt-4-0314,2023-04-09T00:04:52Z,"[{'content': 'Old age PT hx of DM, HTN, dyslip...",1,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.000225802257773466, 'in...",False,False,Mecca Region,Saudi Arabia,8133108d1c433c180c6be8302dc5a6681f2bec980190a1...,"en-US,en;q=0.9",Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
3,7f1c97a4f873cda8106b010d040be078,gpt-4-0314,2023-04-09T00:06:29Z,[{'content': 'calcula la mediana de followers:...,1,Catalan,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.000339018501108512, 'in...",False,False,Barcelona,Spain,846e43fb5fbb4b8cfbafa17083387aad62e58f5fb23482...,"es,es-ES;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0....",Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
4,e98d3e74c57f9a65261df393d9124ac2,gpt-4-0314,2023-04-09T00:06:49Z,[{'content': 'Hey there! Are you familiar with...,1,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.000206426018849015, 'in...",False,False,Texas,United States,22fd87ba9b98f3d379b23c7b52961f2d4a8505127e58b3...,"en-US,en;q=0.9,es;q=0.8",Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...


In [12]:
print(df.conversation)

0     [{'content': 'Hey there! Are you familiar with...
1     [{'content': 'Crea una imagen de una mujer cor...
2     [{'content': 'Old age PT hx of DM, HTN, dyslip...
3     [{'content': 'calcula la mediana de followers:...
4     [{'content': 'Hey there! Are you familiar with...
                            ...                        
95    [{'content': 'What are some rhetorical devices...
96    [{'content': 'Hi', 'country': 'Canada', 'hashe...
97    [{'content': 'I’m going to shift realities ton...
98    [{'content': 'Hi', 'country': 'Canada', 'hashe...
99    [{'content': 'Write me a short book aimed at y...
Name: conversation, Length: 100, dtype: object


In [13]:
# Function to display conversations in a readable format
def display_conversation(conversation_data, max_length=500):
    """
    Display a conversation in a readable format.
    
    Args:
        conversation_data: List of conversation turns
        max_length: Maximum length for content display (to avoid overwhelming output)
    """
    if not conversation_data:
        print("No conversation data available.")
        return
    
    print("=" * 80)
    print("CONVERSATION")
    print("=" * 80)
    
    for i, turn in enumerate(conversation_data):
        role = turn.get('role', 'unknown')
        content = turn.get('content', '')
        timestamp = turn.get('timestamp', 'No timestamp')
        country = turn.get('country', 'Unknown')
        language = turn.get('language', 'Unknown')
        
        print(f"\n--- Turn {i+1}: {role.upper()} ---")
        print(f"Country: {country} | Language: {language}")
        print(f"Timestamp: {timestamp}")
        print("-" * 40)
        
        # Truncate content if too long
        if len(content) > max_length:
            print(f"{content[:max_length]}...")
            print(f"[Content truncated - full length: {len(content)} characters]")
        else:
            print(content)
        print()

# Function to display a specific conversation by index
def show_conversation_by_index(df, index=0):
    """
    Display a specific conversation from the DataFrame.
    
    Args:
        df: DataFrame containing conversation data
        index: Index of the conversation to display
    """
    if index >= len(df):
        print(f"Index {index} is out of range. DataFrame has {len(df)} rows.")
        return
    
    conversation = df.iloc[index]['conversation']
    print(f"Conversation {index + 1} of {len(df)}")
    display_conversation(conversation)

# Display the first conversation
show_conversation_by_index(df, 0)


Conversation 1 of 100
CONVERSATION

--- Turn 1: USER ---
Country: United States | Language: English
Timestamp: None
----------------------------------------
Hey there! Are you familiar with reality shifting? So, I’m refining a foolproof method for reality shifting and want to pick a destination. Want to help me? I’m thinking something pretty personalized. There are a few things that are required of my destination. 1. The quest. I have to have a clear overarching goal in my reality, and don’t make it too crazy. It should be more along the lines of “save the president’s daughter” or “escape this weird wacky sinister place” NOT “get an artifact that li...
[Content truncated - full length: 1373 characters]


--- Turn 2: ASSISTANT ---
Country: None | Language: English
Timestamp: 2023-04-09T00:02:53Z
----------------------------------------
Hey there! I'm more than happy to help you plan your reality-shifting adventure, and I've got just the destination in mind for you based on your requirem

In [14]:
# Additional utility functions for exploring conversation data

def get_conversation_summary(df):
    """Get a summary of all conversations in the dataset."""
    print("=" * 60)
    print("CONVERSATION DATASET SUMMARY")
    print("=" * 60)
    print(f"Total conversations: {len(df)}")
    
    # Count conversations by language
    if 'language' in df.columns:
        lang_counts = df['language'].value_counts()
        print(f"\nConversations by language:")
        for lang, count in lang_counts.items():
            print(f"  {lang}: {count}")
    
    # Count conversations by model
    if 'model' in df.columns:
        model_counts = df['model'].value_counts()
        print(f"\nConversations by model:")
        for model, count in model_counts.items():
            print(f"  {model}: {count}")
    
    # Count conversations by country
    if 'country' in df.columns:
        country_counts = df['country'].value_counts().head(10)
        print(f"\nTop 10 countries:")
        for country, count in country_counts.items():
            print(f"  {country}: {count}")
    
    # Analyze conversation lengths
    conversation_lengths = df['conversation'].apply(len)
    print(f"\nConversation length statistics:")
    print(f"  Average turns per conversation: {conversation_lengths.mean():.1f}")
    print(f"  Min turns: {conversation_lengths.min()}")
    print(f"  Max turns: {conversation_lengths.max()}")

def show_conversation_preview(df, num_conversations=3):
    """Show a preview of multiple conversations."""
    print("=" * 80)
    print(f"PREVIEW OF FIRST {num_conversations} CONVERSATIONS")
    print("=" * 80)
    
    for i in range(min(num_conversations, len(df))):
        conversation = df.iloc[i]['conversation']
        print(f"\n🔹 CONVERSATION {i+1}")
        print(f"   Model: {df.iloc[i].get('model', 'Unknown')}")
        print(f"   Language: {df.iloc[i].get('language', 'Unknown')}")
        print(f"   Country: {df.iloc[i].get('country', 'Unknown')}")
        print(f"   Turns: {len(conversation)}")
        
        # Show first turn only for preview
        if conversation:
            first_turn = conversation[0]
            role = first_turn.get('role', 'unknown')
            content = first_turn.get('content', '')
            preview = content[:200] + "..." if len(content) > 200 else content
            print(f"   First turn ({role}): {preview}")
        print()

# Run the summary and preview
get_conversation_summary(df)
print("\n" + "="*80 + "\n")
show_conversation_preview(df, 3)


CONVERSATION DATASET SUMMARY
Total conversations: 100

Conversations by language:
  English: 59
  Chinese: 16
  Spanish: 7
  Russian: 6
  Latin: 2
  French: 2
  Catalan: 1
  Nolang: 1
  Yoruba: 1
  Turkish: 1
  Japanese: 1
  Italian: 1
  Hungarian: 1
  Maori: 1

Conversations by model:
  gpt-4-0314: 74
  gpt-3.5-turbo-0301: 26

Top 10 countries:
  United States: 30
  China: 12
  Germany: 10
  Morocco: 7
  Spain: 6
  Russia: 4
  Türkiye: 3
  Canada: 3
  South Korea: 2
  Brazil: 2

Conversation length statistics:
  Average turns per conversation: 4.5
  Min turns: 2
  Max turns: 32


PREVIEW OF FIRST 3 CONVERSATIONS

🔹 CONVERSATION 1
   Model: gpt-4-0314
   Language: English
   Country: United States
   Turns: 2
   First turn (user): Hey there! Are you familiar with reality shifting? So, I’m refining a foolproof method for reality shifting and want to pick a destination. Want to help me? I’m thinking something pretty personalized....


🔹 CONVERSATION 2
   Model: gpt-4-0314
   Language: Sp

In [15]:
# Function to display full conversation without truncation
def show_full_conversation(df, index=0):
    """
    Display a complete conversation without any content truncation.
    
    Args:
        df: DataFrame containing conversation data
        index: Index of the conversation to display
    """
    if index >= len(df):
        print(f"Index {index} is out of range. DataFrame has {len(df)} rows.")
        return
    
    conversation = df.iloc[index]['conversation']
    row_data = df.iloc[index]
    
    print("=" * 100)
    print(f"COMPLETE CONVERSATION {index + 1} of {len(df)}")
    print("=" * 100)
    print(f"Model: {row_data.get('model', 'Unknown')}")
    print(f"Language: {row_data.get('language', 'Unknown')}")
    print(f"Country: {row_data.get('country', 'Unknown')}")
    print(f"State: {row_data.get('state', 'Unknown')}")
    print(f"Timestamp: {row_data.get('timestamp', 'Unknown')}")
    print(f"Toxic: {row_data.get('toxic', 'Unknown')}")
    print(f"Redacted: {row_data.get('redacted', 'Unknown')}")
    print("=" * 100)
    
    for i, turn in enumerate(conversation):
        role = turn.get('role', 'unknown')
        content = turn.get('content', '')
        timestamp = turn.get('timestamp', 'No timestamp')
        country = turn.get('country', 'Unknown')
        language = turn.get('language', 'Unknown')
        toxic = turn.get('toxic', False)
        
        print(f"\n{'='*20} TURN {i+1}: {role.upper()} {'='*20}")
        print(f"Country: {country} | Language: {language} | Toxic: {toxic}")
        print(f"Timestamp: {timestamp}")
        print("-" * 80)
        print(content)
        print("-" * 80)

# Example usage - show the first conversation in full
print("To view a complete conversation, use:")
print("show_full_conversation(df, 0)  # Shows first conversation")
print("show_full_conversation(df, 1)  # Shows second conversation")
print("\nOr use the truncated version:")
print("show_conversation_by_index(df, 0)  # Shows first conversation with truncation")
show_full_conversation(df, 0)

To view a complete conversation, use:
show_full_conversation(df, 0)  # Shows first conversation
show_full_conversation(df, 1)  # Shows second conversation

Or use the truncated version:
show_conversation_by_index(df, 0)  # Shows first conversation with truncation
COMPLETE CONVERSATION 1 of 100
Model: gpt-4-0314
Language: English
Country: United States
State: Texas
Timestamp: 2023-04-09T00:02:53Z
Toxic: False
Redacted: False

Country: United States | Language: English | Toxic: False
Timestamp: None
--------------------------------------------------------------------------------
Hey there! Are you familiar with reality shifting? So, I’m refining a foolproof method for reality shifting and want to pick a destination. Want to help me? I’m thinking something pretty personalized. There are a few things that are required of my destination. 1. The quest. I have to have a clear overarching goal in my reality, and don’t make it too crazy. It should be more along the lines of “save the president’