In [1]:
import os

from pathlib import Path
import torch
import pandas as pd

import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer, get_scheduler
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
import tqdm

import torch                    
import torchvision

In [2]:
path = os.getcwd()
df = pd.read_json(path + '/archive/MMHS150K_GT.json', orient='index')
df.sample(5)

Unnamed: 0,img_url,labels,tweet_url,tweet_text,labels_str
2005-01-30 09:10:30.128549888,http://pbs.twimg.com/media/D10f9kNX4AA2kih.jpg,"[0, 0, 0]",https://twitter.com/user/status/11070762301285...,@ZoeyDollaz Nigga got cheated on &amp; knocked...,"[NotHate, NotHate, NotHate]"
2005-02-04 16:29:32.948983809,http://pbs.twimg.com/tweet_video_thumb/D17A0R0...,"[0, 0, 1]",https://twitter.com/user/status/11075345729489...,@thekaibaby @SoThatsJasmine I would cry real n...,"[NotHate, NotHate, Racist]"
2005-05-01 05:58:14.558068736,http://pbs.twimg.com/media/D3kER_zW4AEwtOJ.jpg,"[0, 0, 2]",https://twitter.com/user/status/11149270945580...,this faggot fucking host is absolutely not sea...,"[NotHate, NotHate, Sexist]"
2005-01-26 10:17:30.221715456,http://pbs.twimg.com/ext_tw_video_thumb/110673...,"[0, 0, 0]",https://twitter.com/user/status/11067346502217...,That nigga @asvpShade was wild back in the day...,"[NotHate, NotHate, NotHate]"
2005-02-11 14:02:54.131052544,http://pbs.twimg.com/ext_tw_video_thumb/110813...,"[0, 0, 0]",https://twitter.com/user/status/11081305741310...,nigga told me to basically kill myself and the...,"[NotHate, NotHate, NotHate]"


In [3]:
# labels: 0 - NotHate, 1 - Racist, 2 - Sexist, 3 - Homophobe, 4 - Religion, 5 - OtherHate

# extract tweet id from tweet url
df['tweet_id'] = df['tweet_url'].apply(lambda x: x.split('/')[-1])

# create a new column for each label
df['is_hateful'] = df['labels'].apply(lambda x: x !=[0,0,0])
# create a new column for each label
df['is_racist'] = df['labels'].apply(lambda x: 1 in x)
df['is_sexist'] = df['labels'].apply(lambda x: 2 in x)
df['is_homophobe'] = df['labels'].apply(lambda x: 3 in x)
df['is_religion'] = df['labels'].apply(lambda x: 4 in x)
df['is_other_hate'] = df['labels'].apply(lambda x: 5 in x)

# remove the labels column
df = df.drop(columns=['labels'])
# remove the labels_str column
df = df.drop(columns=['labels_str'])

# reset index => remove date and get a new index
df = df.reset_index(drop=True)

print(df.is_hateful.value_counts())
df.head()

is_hateful
True     91933
False    57890
Name: count, dtype: int64


Unnamed: 0,img_url,tweet_url,tweet_text,tweet_id,is_hateful,is_racist,is_sexist,is_homophobe,is_religion,is_other_hate
0,http://pbs.twimg.com/tweet_video_thumb/D3gi9MH...,https://twitter.com/user/status/11146793537140...,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,1114679353714016256,True,True,False,True,True,False
1,http://pbs.twimg.com/ext_tw_video_thumb/106301...,https://twitter.com/user/status/10630200488166...,My horses are retarded https://t.co/HYhqc6d5WN,1063020048816660480,True,False,False,False,False,True
2,http://pbs.twimg.com/media/D2OzhzHUwAADQjd.jpg,https://twitter.com/user/status/11089273680753...,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,1108927368075374593,False,False,False,False,False,False
3,http://pbs.twimg.com/ext_tw_video_thumb/111401...,https://twitter.com/user/status/11145585346356...,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,1114558534635618305,True,True,False,False,False,False
4,http://pbs.twimg.com/media/Dl30pGIU8AAVGxO.jpg,https://twitter.com/user/status/10352524802155...,“EVERYbody calling you Nigger now!” https://t....,1035252480215592966,True,True,False,False,False,False


In [7]:
# Now I want to create a subset of the data with equal number of hateful and non-hateful tweets
# for hateful, to make it easier, we will consider only the tweets multiple labels 
# (i.e., n_labels > 1)
df_hateful = df[df['is_hateful'] == True]
df_non_hateful = df[df['is_hateful'] == False]

num_of_each_class = min(len(df_hateful), len(df_non_hateful), 10000)
print("Number of sample in each class:", num_of_each_class)

df_hateful = df_hateful.sample(n=num_of_each_class, random_state=42)
df_non_hateful = df_non_hateful.sample(n=num_of_each_class, random_state=42)

df_sample = pd.concat([df_hateful, df_non_hateful], axis=0)

df_sample = df_sample.drop(columns=['is_racist', 'is_sexist', 'is_homophobe', 'is_religion', 'is_other_hate'])

# save the data for later use 
df_sample.to_csv(path + '/data/MMHS150K_GT.csv', index=False)

df_sample.head()

Number of sample in each class: 10000


Unnamed: 0,img_url,tweet_url,tweet_text,tweet_id,is_hateful
35353,http://pbs.twimg.com/media/D1hhczLWkAI8q7f.jpg,https://twitter.com/user/status/11057408704632...,@iveylee091200 @nowthisnews Not without a figh...,1105740870463225856,True
108931,http://pbs.twimg.com/tweet_video_thumb/DqcrO0Q...,https://twitter.com/user/status/10558601851708...,@Iainsh @NadsLFC @WiggzLFC @_TheBoss Lieing tw...,1055860185170886657,True
16627,http://pbs.twimg.com/media/D3z9oe-XoAEtOzo.jpg,https://twitter.com/user/status/11160456857434...,HIT A SWITCH ON A FAKE NIGGA LIKE A STATION ht...,1116045685743411205,True
103737,http://pbs.twimg.com/ext_tw_video_thumb/106172...,https://twitter.com/user/status/10627646069496...,@ChinkyyChink Me acting like i wouldn't let my...,1062764606949613570,True
130333,http://pbs.twimg.com/media/DoS5pJvUgAA-7n0.jpg,https://twitter.com/user/status/10461651356504...,a whole dyke mood 😂 #LGBTQTwiter https://t.co/...,1046165135650410501,True


In [None]:
# we need to restrure the image folder
# to be able to use the torchvision.transforms
# not_hateful images need to be in the folder not_hateful
# hateful images need to be in the folder hateful
from os.path import join
import os
import shutil

# create the folders
os.makedirs("data/images/not_hateful", exist_ok=True)
os.makedirs("data/images/hateful", exist_ok=True)

# clear data folders
shutil.rmtree("data/images/not_hateful")
shutil.rmtree("data/images/hateful")

# create the folders
os.makedirs("data/images/not_hateful", exist_ok=True)
os.makedirs("data/images/hateful", exist_ok=True)

# for image in images:
for image in df_sample['tweet_id']:
    if not os.path.exists(join("archive/img_resized", f"{image}.jpg")):
        print(f"Image {image} not found")
        continue
    # if the image is hateful
    if df_sample[df_sample['tweet_id'] == image]['is_hateful'].values[0]:
        # move the image to the hateful folder
        source = join("archive/img_resized", f"{image}.jpg")
        destination = join("data/images/hateful", f"{image}.jpg")
    else:
        # move the image to the not hateful folder
        source = join("archive/img_resized", f"{image}.jpg")
        destination = join("data/images/not_hateful", f"{image}.jpg")
                
    shutil.copy(source, destination)