In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

# Load the raw cyberbullying tweets CSV (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='../data/bullying/cyberbullying_tweets.csv')


In [2]:
# Summarise the frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47692 entries, 0 to 47691
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   tweet_text          47692 non-null  object
 1   cyberbullying_type  47692 non-null  object
dtypes: object(2)
memory usage: 745.3+ KB


In [3]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [4]:
# Count the rows per class in the 'cyberbullying_type' column to see how
# balanced the raw data is before any downsampling
df['cyberbullying_type'].value_counts()

cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64

In [5]:
# Number of rows to keep per class: 100 for each of the five bullying
# categories (500 in total) and 500 for the non-bullying class.
downsample = {
    'religion': 100,
    'age': 100,
    'gender': 100,
    'ethnicity': 100,
    'other_cyberbullying': 100,
    'not_cyberbullying': 500
}

# For every class, keep only its first `count` rows in their existing order.
# NOTE(review): head() is deterministic but is not a random sample of the
# class -- confirm that taking the leading rows is intended.
downsampled_dfs = [
    df.loc[df['cyberbullying_type'].eq(category)].head(count)
    for category, count in downsample.items()
]

# Stack the per-class subsets, shuffle the rows with a fixed seed so the
# classes are interleaved reproducibly, and renumber the index from 0.
df = (
    pd.concat(downsampled_dfs)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [6]:
# Verify the per-class row counts after downsampling
df['cyberbullying_type'].value_counts()

cyberbullying_type
not_cyberbullying      500
other_cyberbullying    100
age                    100
religion               100
gender                 100
ethnicity              100
Name: count, dtype: int64

In [7]:
# Encode the classes as a binary target: 'not_cyberbullying' -> 0, every
# other class -> 1. A vectorised comparison replaces the row-wise
# apply/lambda; the explicit int64 cast keeps the integer dtype.
df['label'] = df['cyberbullying_type'].ne('not_cyberbullying').astype('int64')

# Show the resulting class balance
df['label'].value_counts()


label
0    500
1    500
Name: count, dtype: int64

In [8]:
# Drop the categorical labels; only the binary 'label' column is kept.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution no longer raises
# KeyError because the column is already gone.
df = df.drop(columns='cyberbullying_type', errors='ignore')

In [9]:
# Give the tweet column the generic name 'text'
df = df.rename({'tweet_text': 'text'}, axis='columns')

In [10]:
# Inspect the first 100 rows of the prepared frame (columns: text, label)
df.head(100)

Unnamed: 0,text,label
0,Kids Love😘❤ @ Mohamad Bin Zayed City مدينة محم...,0
1,@PoliticalAnt @Lithobolos And of course I woul...,0
2,@GemmaBurnsX @_georgepumphrey my iphone is fuc...,0
3,ITS BEAUTIFULLLLLLL :'( :'( :'( :'(,0
4,RT @BuzzFeedUK: When you accidentally open you...,1
...,...,...
95,"@vex0rian i was there, i remember the screams.",1
96,I can't believe people are still surprised tha...,0
97,I’ve asked around and other Muslims haven’t ev...,1
98,@Vandaliser @sajid_fairooz @IsraeliRegime Ther...,1


In [11]:
# Split the data: hold out 20% of the rows as the test set. The fixed
# random_state makes the split reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save as jsonl
def save_as_jsonl(dataframe, filename):
    """Write a DataFrame to ``filename`` as JSON Lines (one JSON object per row).

    Args:
        dataframe: pandas DataFrame to serialise. A raw tweet column named
            ``tweet_text`` (or ``tweet text``), if present, is written under
            the key ``text``. The caller's frame is not modified because
            ``rename`` returns a new frame.
        filename: Path of the output file; an existing file is overwritten.
    """
    # The original mapping only listed 'tweet text' (with a space), which
    # never matches the dataset's 'tweet_text' column, so the rename was a
    # silent no-op. Both spellings are mapped; absent labels are ignored.
    dataframe = dataframe.rename(
        columns={'tweet_text': 'text', 'tweet text': 'text'}
    )
    records = dataframe.to_dict(orient='records')
    # Explicit encoding so the written file does not depend on the platform's
    # default locale.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(record) + '\n' for record in records)

# Write each split to its own JSON Lines file (train first, then test)
for split_df, out_path in (
    (train_df, '../data/bullying/train.jsonl'),
    (test_df, '../data/bullying/test.jsonl'),
):
    save_as_jsonl(split_df, out_path)
