In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

# Replace 'path_to_csv.csv' with the path to your actual CSV file
df = pd.read_csv('../data/toxic/toxic_comments.csv')


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for column in label_columns:
    print(df[column].value_counts())
    print("\n")


toxic
0    144277
1     15294
Name: count, dtype: int64


severe_toxic
0    157976
1      1595
Name: count, dtype: int64


obscene
0    151122
1      8449
Name: count, dtype: int64


threat
0    159093
1       478
Name: count, dtype: int64


insult
0    151694
1      7877
Name: count, dtype: int64


identity_hate
0    158166
1      1405
Name: count, dtype: int64




In [5]:
# Calculate average token length
average_token_length = df['comment_text'].str.len().mean()

# Calculate median token length
median_token_length = df['comment_text'].str.len().median()

# Calculate quantiles
quantiles = df['comment_text'].str.len().quantile([0.25, 0.5, 0.75, 0.95])

# Print results
print("Average token length:", average_token_length)
print("Median token length:", median_token_length)
print("Quantiles for token length:")
print("25th percentile (Q1):", quantiles[0.25])
print("Median (Q2):", quantiles[0.5])
print("75th percentile (Q3):", quantiles[0.75])
print("95th percentile:", quantiles[0.95])


Average token length: 394.0732213246768
Median token length: 205.0
Quantiles for token length:
25th percentile (Q1): 96.0
Median (Q2): 205.0
75th percentile (Q3): 435.0
95th percentile: 1355.0


In [6]:
defined_token_length = 200  # Define the maximum token length you want to keep

df = df[df['comment_text'].str.len() <= defined_token_length]


In [7]:
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for column in label_columns:
    print(df[column].value_counts())
    print("\n")


toxic
0    68355
1    10150
Name: count, dtype: int64


severe_toxic
0    77336
1     1169
Name: count, dtype: int64


obscene
0    72653
1     5852
Name: count, dtype: int64


threat
0    78169
1      336
Name: count, dtype: int64


insult
0    73036
1     5469
Name: count, dtype: int64


identity_hate
0    77545
1      960
Name: count, dtype: int64




In [8]:
# Filter and rename columns
df = df[['comment_text', 'toxic']]
df.columns = ['text', 'label']

In [9]:
df.head()

Unnamed: 0,text,label
1,D'aww! He matches this background colour I'm s...,0
4,"You, sir, are my hero. Any chance you remember...",0
5,"""\n\nCongratulations from me as well, use the ...",0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,Your vandalism to the Matt Shirvington article...,0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 78505 entries, 1 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    78505 non-null  object
 1   label   78505 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ MB


In [11]:
df['label'].value_counts()

label
0    68355
1    10150
Name: count, dtype: int64

In [12]:
# Shuffle before downsampling
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Target count for both classes
downsample_count = 15294  

# Function to downsample
downsampled_dfs = []
for label_value in df['label'].unique():  # Get unique values of the label column
    downsampled_dfs.append(df[df['label'] == label_value].head(downsample_count))

# Combine downsampled dfs and shuffle
df = pd.concat(downsampled_dfs)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


In [13]:
df['label'].value_counts()

label
0    15294
1    10150
Name: count, dtype: int64

In [14]:
df.head()

Unnamed: 0,text,label
0,Does anyone have any logical reason it should ...,0
1,I don't think this page is protected. Did you ...,0
2,bbq \n\nbe a man and lets discuss it-maybe ove...,0
3,I am going to rip off your tests and shove the...,1
4,Why are you unable to see what Homeontherange ...,0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25444 entries, 0 to 25443
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    25444 non-null  object
 1   label   25444 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 397.7+ KB


In [16]:
# Split the data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save as jsonl
def save_as_jsonl(dataframe, filename):
    records = dataframe.to_dict(orient='records')
    with open(filename, 'w') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')

# Convert to JSON Lines and save
save_as_jsonl(train_df, '../data/toxic/train.jsonl')
save_as_jsonl(test_df, '../data/toxic/test.jsonl')
