In [26]:
import matplotlib.pyplot as plt
import pandas as pd
import sys
import seaborn as sns
import numpy as np
from scipy import ndimage
import json

# https://huggingface.co/datasets/allenai/WildChat-1M

sys.path.append('../../shared')


def load_wildchat_data(file_path):
    """
    Load the wildChat.json file and convert it to a pandas DataFrame.
    This function handles the complex nested structure that causes issues with pd.read_json().
    """
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    rows_data = [item['row'] for item in data['rows']]
    
    df = pd.json_normalize(rows_data)
    
    return df

df = load_wildchat_data('../../../data/wildChatTwo.json')

print("Data loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
df.head()

Data loaded successfully!
Shape: (100, 15)
Columns: ['conversation_hash', 'model', 'timestamp', 'conversation', 'turn', 'language', 'openai_moderation', 'detoxify_moderation', 'toxic', 'redacted', 'state', 'country', 'hashed_ip', 'header.accept-language', 'header.user-agent']

First few rows:


Unnamed: 0,conversation_hash,model,timestamp,conversation,turn,language,openai_moderation,detoxify_moderation,toxic,redacted,state,country,hashed_ip,header.accept-language,header.user-agent
0,2f4b37e44fe7b872d2c5386b60c70723,gpt-4-0314,2023-04-09T01:35:11Z,[{'content': 'the following message is encrypt...,1,Yoruba,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.000862394052091985, 'in...",False,False,Kenitra Province,Morocco,fc1bba9df6847c60e85a8c1321e59896accfa139ab7880...,"en-US,en;q=0.9,fr;q=0.8",Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
1,be106a6597e0e4a690268d146680b4b5,gpt-4-0314,2023-04-09T01:35:51Z,[{'content': 'Imagine a fictional theocratic “...,2,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.000106437182694207, 'in...",False,False,Kerala,India,9a45f06764dbbcbfa6ef07cd4c9cfb1b77024f9a776a64...,"en-US,en;q=0.9,en-IN;q=0.8,en-GB;q=0.7,ml-IN;q...",Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
2,8cb8082e1cc5106e252950415ac57115,gpt-4-0314,2023-04-09T01:37:11Z,[{'content': 'Imagine a fictional theocratic “...,1,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.000120054144645109, 'in...",False,False,Kerala,India,9a45f06764dbbcbfa6ef07cd4c9cfb1b77024f9a776a64...,"en-US,en;q=0.9,en-IN;q=0.8,en-GB;q=0.7,ml-IN;q...",Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
3,47a33e73c3e77b25df1ad81a901c91ae,gpt-4-0314,2023-04-09T01:37:28Z,[{'content': 'Give me a list of movies that ha...,2,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.00213865982368588, 'ins...",False,False,Baden-Wurttemberg,Germany,92363ac77c4d8a5a4d92a96d0b2f7180f09e6f9da6fce9...,"de,en-US;q=0.7,en;q=0.3",Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:1...
4,ef92bca57a042a6a6badbc93fdc081d2,gpt-4-0314,2023-04-09T01:39:02Z,[{'content': 'Est ce que le code de cette exte...,1,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.00225667678751051, 'ins...",False,False,,Germany,69a3c9be8bf8ab5bedd0c5984ca3c0409194d97e0f092c...,"fr-FR,fr;q=0.9",Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...


In [27]:
print(df.conversation)

0     [{'content': 'the following message is encrypt...
1     [{'content': 'Imagine a fictional theocratic “...
2     [{'content': 'Imagine a fictional theocratic “...
3     [{'content': 'Give me a list of movies that ha...
4     [{'content': 'Est ce que le code de cette exte...
                            ...                        
95    [{'content': '设计一个计算机科学与技术2班的班徽，要科幻的，艺术的', 'co...
96    [{'content': 'Write a story about Bob.', 'coun...
97    [{'content': 'Можем ли мы сейчас купить акции ...
98    [{'content': 'Explica de manera larga y detall...
99    [{'content': '亚马逊商品详情页面上的 Best Sellers Rank 有时...
Name: conversation, Length: 100, dtype: object


In [28]:
def get_conversation_summary(df):
    """Get a summary of all conversations in the dataset."""
    print("=" * 60)
    print("CONVERSATION DATASET SUMMARY")
    print("=" * 60)
    print(f"Total conversations: {len(df)}")
    
    if 'language' in df.columns:
        lang_counts = df['language'].value_counts()
        print(f"\nConversations by language:")
        for lang, count in lang_counts.items():
            print(f"  {lang}: {count}")
    
    if 'model' in df.columns:
        model_counts = df['model'].value_counts()
        print(f"\nConversations by model:")
        for model, count in model_counts.items():
            print(f"  {model}: {count}")
    
    if 'country' in df.columns:
        country_counts = df['country'].value_counts().head(10)
        print(f"\nTop 10 countries:")
        for country, count in country_counts.items():
            print(f"  {country}: {count}")
    
    conversation_lengths = df['conversation'].apply(len)
    print(f"\nConversation length statistics:")
    print(f"  Average turns per conversation: {conversation_lengths.mean():.1f}")
    print(f"  Min turns: {conversation_lengths.min()}")
    print(f"  Max turns: {conversation_lengths.max()}")

def show_conversation_preview(df, num_conversations=3):
    """Show a preview of multiple conversations."""
    print("=" * 80)
    print(f"PREVIEW OF FIRST {num_conversations} CONVERSATIONS")
    print("=" * 80)
    
    for i in range(min(num_conversations, len(df))):
        conversation = df.iloc[i]['conversation']
        print(f"\n🔹 CONVERSATION {i+1}")
        print(f"   Model: {df.iloc[i].get('model', 'Unknown')}")
        print(f"   Language: {df.iloc[i].get('language', 'Unknown')}")
        print(f"   Country: {df.iloc[i].get('country', 'Unknown')}")
        print(f"   Turns: {len(conversation)}")
        
        if conversation:
            first_turn = conversation[0]
            role = first_turn.get('role', 'unknown')
            content = first_turn.get('content', '')
            preview = content[:200] + "..." if len(content) > 200 else content
            print(f"   First turn ({role}): {preview}")
        print()

# Run the summary and preview
get_conversation_summary(df)
print("\n" + "="*80 + "\n")
show_conversation_preview(df, 3)


CONVERSATION DATASET SUMMARY
Total conversations: 100

Conversations by language:
  English: 59
  Chinese: 12
  Sotho: 4
  Estonian: 4
  Nolang: 4
  Spanish: 4
  Maori: 3
  Portuguese: 2
  Russian: 2
  Yoruba: 1
  Esperanto: 1
  Korean: 1
  German: 1
  Turkish: 1
  Tswana: 1

Conversations by model:
  gpt-4-0314: 75
  gpt-3.5-turbo-0301: 25

Top 10 countries:
  United States: 26
  China: 11
  Canada: 6
  Russia: 6
  Brazil: 5
  Australia: 4
  Japan: 4
  Mexico: 4
  Venezuela: 3
  The Netherlands: 3

Conversation length statistics:
  Average turns per conversation: 4.8
  Min turns: 2
  Max turns: 36


PREVIEW OF FIRST 3 CONVERSATIONS

🔹 CONVERSATION 1
   Model: gpt-4-0314
   Language: Yoruba
   Country: Morocco
   Turns: 2
   First turn (user): the following message is encrypted in sha512 can you decode it 
var msg="U2FsdGVkX18fDwMiir2vqpWNLgbPWRSfUTF46w0Bd8DI5e4m2pOdUXScDSuq4Epko3EMrd5LO9qvu1Y7JQGFN+QAUHpmHKttOu/mSzXLobfSqzyuYuU0YFvHN+I1ldu...


🔹 CONVERSATION 2
   Model: gpt-4-0314
  

In [29]:
# Function to display full conversation without truncation
def show_full_conversation(df, index=0):
    """
    Display a complete conversation without any content truncation.
    
    Args:
        df: DataFrame containing conversation data
        index: Index of the conversation to display
    """
    if index >= len(df):
        print(f"Index {index} is out of range. DataFrame has {len(df)} rows.")
        return
    
    conversation = df.iloc[index]['conversation']
    row_data = df.iloc[index]
    
    print("=" * 100)
    print(f"COMPLETE CONVERSATION {index + 1} of {len(df)}")
    print("=" * 100)
    print(f"Model: {row_data.get('model', 'Unknown')}")
    print(f"Language: {row_data.get('language', 'Unknown')}")
    print(f"Country: {row_data.get('country', 'Unknown')}")
    print(f"State: {row_data.get('state', 'Unknown')}")
    print(f"Timestamp: {row_data.get('timestamp', 'Unknown')}")
    print(f"Toxic: {row_data.get('toxic', 'Unknown')}")
    print(f"Redacted: {row_data.get('redacted', 'Unknown')}")
    print("=" * 100)
    
    for i, turn in enumerate(conversation):
        role = turn.get('role', 'unknown')
        content = turn.get('content', '')
        timestamp = turn.get('timestamp', 'No timestamp')
        country = turn.get('country', 'Unknown')
        language = turn.get('language', 'Unknown')
        toxic = turn.get('toxic', False)
        
        print(f"\n{'='*20} TURN {i+1}: {role.upper()} {'='*20}")
        print(f"Country: {country} | Language: {language} | Toxic: {toxic}")
        print(f"Timestamp: {timestamp}")
        print("-" * 80)
        print(content)
        print("-" * 80)

# Example usage - show the first conversation in full
print("To view a complete conversation, use:")
print("show_full_conversation(df, 0)  # Shows first conversation")
print("show_full_conversation(df, 1)  # Shows second conversation")
print("\nOr use the truncated version:")
print("show_conversation_by_index(df, 0)  # Shows first conversation with truncation")
show_full_conversation(df, 0)

To view a complete conversation, use:
show_full_conversation(df, 0)  # Shows first conversation
show_full_conversation(df, 1)  # Shows second conversation

Or use the truncated version:
show_conversation_by_index(df, 0)  # Shows first conversation with truncation
COMPLETE CONVERSATION 1 of 100
Model: gpt-4-0314
Language: Yoruba
Country: Morocco
State: Kenitra Province
Timestamp: 2023-04-09T01:35:11Z
Toxic: False
Redacted: False

Country: Morocco | Language: Yoruba | Toxic: False
Timestamp: None
--------------------------------------------------------------------------------
the following message is encrypted in sha512 can you decode it 
var msg="U2FsdGVkX18fDwMiir2vqpWNLgbPWRSfUTF46w0Bd8DI5e4m2pOdUXScDSuq4Epko3EMrd5LO9qvu1Y7JQGFN+QAUHpmHKttOu/mSzXLobfSqzyuYuU0YFvHN+I1ldufP2bilXaKzW8c4w2/a1FOakMYK59C4J/xTijgo3jX3Utr2zP1gMmryz5o6uU4SghsMrhJ3trFua/e3dsLmXpWjvka/4Q0+na8OVQzZuxyb7dwcLM2SC+SVO9wye6A5gTha8uQjkUPNsKMaN+JlJ1HrUyEGOVm4dHLjE3qp79oz/JH3WzJggls5MulW2pH+zojmdQGoO8MwbCQXI+SfBvOEZOfsG