## 1. Load From Hugging Face Datasets
- Manually extract each data point so that it can be loaded into a pandas DataFrame

In [None]:
from datasets import load_dataset
import pandas as pd
# Open the dataset in streaming mode (often bypasses parsing errors)
dataset = load_dataset("RyokoAI/ShareGPT52K", split="train", streaming=True)

# Drain the stream into an in-memory list of examples
data = list(dataset)

# Pull the message list out of every example that has a 'conversations' key;
# each entry of conversations_by_source is one whole conversation
conversations_by_source = [
    record['conversations'] for record in data if 'conversations' in record
]

# Report how many conversations were collected and preview the first three
print(f"Total conversations extracted: {len(conversations_by_source)}")
print("\nFirst 3 conversations:")
for number, turns in enumerate(conversations_by_source[:3], start=1):
    print(f"Conversation {number}:")
    for turn in turns:
        speaker = turn.get('from', 'unknown')
        text = turn.get('value', '')
        # Show at most 100 characters, marking any cut with an ellipsis
        shown = f"{text[:100]}..." if len(text) > 100 else text
        print(f"  {speaker}: {shown}")
    print("-" * 50)

# One row per conversation, with the full message list in a single column
conversations_df = pd.DataFrame({'conversation': conversations_by_source})

# Summarise the new DataFrame: header line, shape, then the first rows
print(
    "\nConversations DataFrame:",
    f"Shape: {conversations_df.shape}",
    conversations_df.head(),
    sep="\n",
)


## 2. Filter out non-English conversations

In [None]:
# Import langdetect for language detection
import langdetect
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm

# Set seed for deterministic language detection.
# Must run before the first langdetect.detect() call below so that repeated
# runs of this notebook classify the same text the same way.
DetectorFactory.seed = 0

# Function to check if a conversation contains non-English messages
def conversation_has_non_english(conversation, english_langs=['en']):
    """
    Report whether a conversation contains at least one non-English message.

    Only messages whose 'value' is a string longer than 20 characters
    (after stripping whitespace) are examined; everything else is skipped.
    A message whose language cannot be detected is treated like English.

    Args:
        conversation: List of message dictionaries
        english_langs: List of language codes considered as English

    Returns:
        True as soon as one examined message is detected as a language
        outside english_langs, False if no such message is found
    """
    for turn in conversation:
        text = turn.get('value', '')
        is_checkable = isinstance(text, str) and len(text.strip()) > 20
        if is_checkable:
            try:
                lang_code = langdetect.detect(text)
            except LangDetectException:
                # Detection failed: give the message the benefit of the doubt
                continue
            if lang_code not in english_langs:
                return True

    return False

# Keep only the conversations in which no message was flagged as non-English
english_conversations = [
    candidate
    for candidate in tqdm(conversations_by_source, desc="Filtering non-English conversations")
    if not conversation_has_non_english(candidate)
]

# Compare the conversation counts before and after filtering
print(
    f"Original number of conversations: {len(conversations_by_source)}",
    f"Number of conversations after filtering non-English: {len(english_conversations)}",
    sep="\n",
)

# Build a DataFrame that holds only the English conversations
english_conversations_df = pd.DataFrame({'conversation': english_conversations})


## 3. Extract human prompts from conversations

In [None]:
# Extract human prompts from conversations
print("\nExtracting human prompts from conversations...")
human_prompts = []

# A conversation's position in english_conversations serves as its unique ID
for conversation_id, conversation in enumerate(english_conversations):
    for message in conversation:
        # Only messages labelled 'human' or 'user' (case-insensitive) are prompts
        if message.get('from', '').lower() not in ('human', 'user'):
            continue
        content = message.get('value', '')
        if content:  # Only add non-empty prompts
            human_prompts.append({
                'conversation_id': conversation_id,
                'prompt': content,
            })

# Create a DataFrame with human prompts and conversation IDs.
# The columns are named explicitly so that an empty result still has a
# 'conversation_id' column; without it, drop_duplicates below raises KeyError.
human_prompts_df = pd.DataFrame(human_prompts, columns=['conversation_id', 'prompt'])

# Keep only the first human message of each conversation
first_messages_df = human_prompts_df.drop_duplicates(subset=['conversation_id'], keep='first')

## 4 & 5. Filter out short messages and sample 1000 examples for testing

In [None]:
# Take a random sample of 1000 prompts with more than 3 words and save them to a new CSV file
import pandas as pd
import numpy as np

# Set a random seed for reproducibility
np.random.seed(42)

# first_messages_df is built with only 'conversation_id' and 'prompt', so
# derive the whitespace-separated word count here unless an earlier step
# already added it. assign() returns a new DataFrame, leaving
# first_messages_df itself untouched.
if 'word_count' in first_messages_df.columns:
    prompts_with_counts = first_messages_df
else:
    prompts_with_counts = first_messages_df.assign(
        word_count=first_messages_df['prompt'].str.split().str.len()
    )

# Filter prompts with more than 3 words
filtered_prompts = prompts_with_counts[prompts_with_counts['word_count'] > 3]

# Take a random sample of 1000 prompts (or all if less than 1000).
# The result must be kept: sample() returns a new DataFrame rather than
# modifying filtered_prompts in place.
sample_size = min(1000, len(filtered_prompts))
sampled_prompts = filtered_prompts.sample(n=sample_size, random_state=42)

# Save the sampled prompts to a new CSV file
output_file = '../data/ShareGPT/testing_prompts_cleaned.csv'
sampled_prompts.to_csv(output_file, index=False)

# Summarise what was written
print(f"Saved {sample_size} randomly sampled prompts (>3 words) to {output_file}")
print("Sample statistics:")
print(f"- Mean word count: {sampled_prompts['word_count'].mean():.2f} words")
print(f"- Median word count: {sampled_prompts['word_count'].median():.2f} words")
print(f"- Min word count: {sampled_prompts['word_count'].min()} words")
print(f"- Max word count: {sampled_prompts['word_count'].max()} words")
