In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
from collections import Counter

In [2]:
# Load the dataset

df = pd.read_csv('../data_raw/ahole-small.csv')
df.head()

Unnamed: 0,title,post_id,over_18,subreddit,link_flair_text,self_text
0,AITA for cutting communications with my ex?,b3jk0h,False,AmItheAsshole,No A-holes here,"So me and my ex are both high school seniors, ..."
1,aita for thinking my girlfriend is dating me o...,b3jpqu,False,AmItheAsshole,Asshole,"so, hi. on mobile, second time poster, english..."
2,AITA For looking at my partners phone?,b3jsz3,False,AmItheAsshole,Not the A-hole,Backstory: about 3 years ago my wife (fiancee ...
3,AITA for taking an unvaccinated child to a fri...,b3k5l9,False,AmItheAsshole,Asshole,Ok so here’s the thing. My friends daughter is...
4,AITA when I give up on trying to follow the ru...,b3kbde,False,AmItheAsshole,Not the A-hole,"So. Lately, I've been extremely depressed. I'v..."


In [3]:
# Check data quality issues
df.isnull().sum()
df['post_id'].nunique()
df['title'].nunique()
df['over_18'].value_counts()
df['subreddit'].value_counts()
df['link_flair_text'].value_counts().head(10)


link_flair_text
Not the A-hole     362237
Asshole            101730
No A-holes here     46261
Everyone Sucks      28694
Not enough info     13530
UPDATE               6062
not the a-hole       4768
TL;DR                3808
asshole              1803
META                 1287
Name: count, dtype: int64

In [4]:
# Step 2: Filter out rows with invalid self_text
# Keep only rows with valid, non-empty text content

invalid_text_mask = (
    df['self_text'].isna() | 
    (df['self_text'] == '') | 
    (df['self_text'] == '[removed]') | 
    (df['self_text'] == '[deleted]')
)

df_clean = df[~invalid_text_mask].copy()

In [5]:
# Step 3: Filter to binary classification categories
# Keep only "Not the A-hole" and "Asshole" for training
main_categories_binary = ['Not the A-hole', 'Asshole']
df_binary = df_clean[df_clean['link_flair_text'].isin(main_categories_binary)].copy()

In [6]:
# Step 4: Create binary labels for classification
def create_binary_label(flair_text):
    """Convert flair text to binary label: 0 = Not the A-hole, 1 = Asshole"""
    if flair_text == 'Not the A-hole':
        return 0
    elif flair_text == 'Asshole':
        return 1
    else:
        return None

df_binary['label'] = df_binary['link_flair_text'].apply(create_binary_label)

In [7]:
# Step 5: Create train/test splits

def split(sample_data):
    #Shuffle samples
    randomized = sample_data.sample(frac=1)
    num_samples = randomized.shape[0]
    #percent training data
    percent_training = .80
    num_training = int(num_samples * percent_training)
    train = randomized.iloc[:num_training]
    test = randomized.iloc[num_training:, :]
    X_train_bin = train.drop("label", axis=1)
    y_train_bin = train["label"]
    X_test_bin = test.drop("label", axis=1)
    y_test_bin = test["label"]
    return X_train_bin, y_train_bin, X_test_bin, y_test_bin

X_train_bin, y_train_bin, X_test_bin, y_test_bin = split(df_binary)
X_train_bin.head()

Unnamed: 0,title,post_id,over_18,subreddit,link_flair_text,self_text
1359557,WIBTA if I tell people to pick up their garbage,caagbv,False,AmItheAsshole,Not the A-hole,So in accordance with the increasing awareness...
1165415,AITA for not taking the full blame for ruining...,ueau1g,False,AmItheAsshole,Not the A-hole,"My aunt is going for surgery today, so yesterd..."
18601,AITA for telling my 26 year old daughter to gr...,gc3nhk,False,AmItheAsshole,Asshole,My daughter takes commissions from people some...
1176311,AITA for how I reacted towards a stranger who ...,qni2r3,False,AmItheAsshole,Not the A-hole,I was in a local cafe waiting for a smoothie I...
1239853,AITA for not picking up my estranged husband a...,von20c,False,AmItheAsshole,Not the A-hole,"We do not get along. Days before the surgery, ..."


In [8]:
# Step 6: Export standardized datasets
# Create output directory structure
import os
os.makedirs('../data_raw/standardized', exist_ok=True)

# Export binary classification datasets
def export_split(X, y, split_name):
    """Export a split to CSV"""
    df_export = X.copy()
    df_export['label'] = y
    output_path = f'../data_raw/standardized/binary_{split_name}.csv'
    df_export.to_csv(output_path, index=False)
    return output_path

# Export binary splits
export_split(X_train_bin, y_train_bin, 'train')
export_split(X_test_bin, y_test_bin, 'test')

'../data_raw/standardized/binary_test.csv'

In [9]:
# Export summary statistics
summary = {
    'original_dataset': {
        'total_rows': len(df),
        'columns': df.columns.tolist(),
        'note': 'over_18 column removed from final datasets'
    },
    'cleaned_dataset': {
        'total_rows': len(df_clean),
        'rows_removed': len(df) - len(df_clean),
        'removal_reason': 'Missing/removed/deleted self_text'
    },
    'binary_dataset': {
        'total_rows': len(df_binary),
        'categories': ['Not the A-hole', 'Asshole'],
        'train_rows': len(X_train_bin),
        'label_distribution': {
            '0 (Not the A-hole)': int((y_train_bin == 0).sum()),
            '1 (Asshole)': int((y_train_bin == 1).sum())
        },
        'note': 'Test set includes extra samples from "No A-holes here" and "Everyone Sucks" categories'
    }
}

print(summary)

{'original_dataset': {'total_rows': 1767258, 'columns': ['title', 'post_id', 'over_18', 'subreddit', 'link_flair_text', 'self_text'], 'note': 'over_18 column removed from final datasets'}, 'cleaned_dataset': {'total_rows': 470534, 'rows_removed': 1296724, 'removal_reason': 'Missing/removed/deleted self_text'}, 'binary_dataset': {'total_rows': 382046, 'categories': ['Not the A-hole', 'Asshole'], 'train_rows': 305636, 'label_distribution': {'0 (Not the A-hole)': 243085, '1 (Asshole)': 62551}, 'note': 'Test set includes extra samples from "No A-holes here" and "Everyone Sucks" categories'}}
