Load and prepare some other datasets for use in the Classify tool.

## 1. Reddit comment toxicity

https://storage.googleapis.com/kaggle-data-sets/1796725/2930769/compressed/ruddit_comments_score.csv.zip

In [1]:
import pandas as pd
# Preview the raw Ruddit CSV. The read is left as the cell's final bare
# expression so the notebook renders the resulting DataFrame as output.
pd.read_csv('~/Downloads/ruddit_comments_score.csv')

Unnamed: 0,comment_id,body,score
0,cza1q49,> The difference in average earnings between m...,-0.083
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022
2,cza23qx,[deleted],0.167
3,cza2bw8,The assertion is that women get paid less for ...,-0.146
4,cza2iji,You said in the OP that's not what they're mea...,-0.083
...,...,...,...
5961,f0i0mqp,They should only censor things that talk badly...,0.064
5962,f80wlxq,> and one of them is a woman. \n\nOH SHIT we b...,0.458
5963,f8uksbp,how is this flared as US politics,-0.292
5964,fa6nc1r,People in Hong Kong must decide if they are go...,0.333


In [16]:
# Build the Reddit toxicity dataset for the Classify tool:
#   1. a 100-example JSON file consumed by the app, and
#   2. a markdown analysis file (extremes, median, and a 300-row CSV sample).
import json
import os
import random

json_path = '../src/lib/data/datasets/reddit_toxicity.json'
markdown_path = '../static/datasets/reddit_toxicity_analysis.md'

# Read the data, round numeric columns to 2 decimals, and shuffle
# reproducibly so that head(n) below is a fixed random sample.
df = pd.read_csv('~/Downloads/ruddit_comments_score.csv').round(2).sample(frac=1, random_state=1234)

# Create the examples list (scores are already between -1 and 1).
# NOTE(review): read_csv parses bodies such as "NA" or "null" as NaN by
# default, which json.dump would emit as a bare NaN token - confirm the
# data contains no such rows.
examples = [
    {"text": body, "truth": score}
    for body, score in zip(df['body'], df['score'])
]

# Limit to 100 random examples (seeded so the JSON file is reproducible)
random.seed(1234)
if len(examples) > 100:
    examples = random.sample(examples, 100)

# Create the JSON structure
dataset = {
    "description": "This dataset contains Reddit comments scored for their toxicity. Scores range from -1 (most supportive) to 1 (most toxic) - rounded to 2 decimal places. CONTENT WARNING: Some comments are inappropriate or offensive. The ground truth scores were determined through human annotation. From Hada et al. 2021: <https://aclanthology.org/2021.acl-long.210v2.pdf>",
    "train_data_url": "https://github.com/organisciak/ai-class/tree/main/static/datasets/reddit_toxicity_analysis.md",
    "examples": examples
}

# Save JSON file. The encoding is explicit so the result does not depend on
# the platform's default locale.
os.makedirs(os.path.dirname(json_path), exist_ok=True)
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, indent=4)

# Create analysis markdown file.
# Sort from most toxic (highest score) to least toxic (lowest score).
sorted_data = df.sort_values('score', ascending=False)
ranked = sorted_data[['body', 'score']]

least_toxic_table = ranked.iloc[::-1].head(20).to_markdown(index=False)
most_toxic_table = ranked.head(20).to_markdown(index=False)

# Take a 20-row window centred on the median position; clamp the start at 0
# so a frame with fewer than 20 rows does not produce a negative slice index.
median_start = max(0, len(ranked) // 2 - 10)
median_table = ranked.iloc[median_start:median_start + 20].to_markdown(index=False)

# df is already shuffled, so its first 300 rows are a random sample.
random_csv = df[['body', 'score']].head(300).to_csv(index=False).strip()

markdown_content = f"""# Reddit Comment Toxicity Analysis

## Examples: Least Toxic Comments
{least_toxic_table}

## Examples: Most Toxic Comments
{most_toxic_table}

## Median Comments
{median_table}

## 300 Random Examples

```csv
{random_csv}
```
"""

# Save markdown file. Create the target directory first (as is done for the
# JSON file), and write UTF-8 because comment bodies may contain non-ASCII
# characters that a locale-default encoding cannot represent.
os.makedirs(os.path.dirname(markdown_path), exist_ok=True)
with open(markdown_path, 'w', encoding='utf-8') as f:
    f.write(markdown_content)
