# Merge obtained data sets into one big file

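Merge the data sets from the *code/data/filtered* directory into one big data set. You need to create the *code/data/dataset* folder if it does not exist yet. Note that the input files are read from *data/filtered/* and the final cell writes the merged file to *dataset/data.csv*, both relative to the directory the notebook is run from, so check that the output location matches the folder you expect.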

In [1]:
import json
import os

import pandas as pd

In [2]:
# Load the five filtered source data sets.
# The two JSON files are parsed with json.load; the three CSV files are read
# straight into DataFrames.
# The JSON files are opened with an explicit UTF-8 encoding so that decoding
# does not depend on the platform's default locale encoding.
with open("data/filtered/abusive_hateful_spam_normal.json", encoding="utf-8") as f:
    data1 = json.load(f)

data2 = pd.read_csv("data/filtered/hatespeech_profane_offensive.csv")
# This file is read with ";" as the field separator.
data3 = pd.read_csv("data/filtered/lol_cyberbullying.csv", sep=";")
with open("data/filtered/racism_sexism_benevolent.json", encoding="utf-8") as f1:
    data4 = json.load(f1)
data5 = pd.read_csv("data/filtered/toxic_obscene_threat_insult_identity.csv")

In [3]:
# convert first json to pandas
# NOTE: this rebinds data1 from the loaded JSON mapping to a DataFrame, so the
# loading cell has to be re-run before this cell can be executed again.
pairs = [(entry["text"], entry["label"]) for entry in data1.values()]
texts = [text for text, _label in pairs]
labels = [label for _text, label in pairs]
data1 = pd.DataFrame({"text": texts, "label": labels})
data1

Unnamed: 0,text,label
0,fucks sake go away stupid anon — ^ https://t....,abusive
1,Damn dean just put Corbin to sleep. That Match...,abusive
2,@TheRealCamerota THAT BEER BUYING FREAKING IDI...,abusive
3,what idiot called them antacids and not afterb...,abusive
4,RT @gogglepossum: Don't you hate people that p...,abusive
...,...,...
13759,"In @RDispatches, @sunnivie asks whether ""relig...",hateful
13760,Pop Modern Silicone Cake Putty Spatula Bakewar...,spam
13761,Tired of empty jargon in the wide world of bio...,spam
13762,You could win a big screen TV! Enter now! #swe...,spam


In [4]:
# convert second json to pandas
# NOTE: this rebinds data4 from the loaded JSON mapping to a DataFrame, so the
# loading cell has to be re-run before this cell can be executed again.
texts, labels = [], []
for record in data4.values():
    # A record labelled "both" is emitted twice: once as racism, once as sexism.
    if record["label"] == "both":
        record_labels = ["racism", "sexism"]
    else:
        record_labels = [record["label"]]
    for single_label in record_labels:
        texts.append(record["text"])
        labels.append(single_label)

data4 = pd.DataFrame({"text": texts, "label": labels})



In [5]:
# Display the converted frame; records labelled "both" appear twice,
# once with the label "racism" and once with "sexism".
data4

Unnamed: 0,text,label
0,These girls are the equivalent of the irritati...,racism
1,Drasko they didn't cook half a bird you idiot ...,racism
2,Hopefully someone cooks Drasko in the next ep ...,racism
3,of course you were born in serbia...you're as ...,racism
4,So Drasko just said he was impressed the girls...,racism
...,...,...
6035,A man is as good as the woman he is with\n\n#A...,benevolent
6036,These woman are missed. My mother and her sist...,benevolent
6037,RT @GemmaAnneStyles: Happy #womensday to all m...,benevolent
6038,RT @ConstanceQueen8: #ADayWithoutWomen Trump W...,benevolent


In [6]:
# Gather the texts and labels of all five sources into two parallel lists,
# keeping the source order data1, data2, data3, data4, data5.
texts = []
labels = []

# data1 and data2 both provide "text" and "label" columns.
for frame in (data1, data2):
    texts.extend(frame["text"].tolist())
    labels.extend(frame["label"].tolist())

# data3 has no label column: every message gets the same fixed label.
# NOTE(review): the label is spelled "cyberbulling" (sic); it is kept as is
# because it ends up in the written data set.
cyber = data3["messages"].tolist()
texts.extend(cyber)
labels.extend(["cyberbulling"] * len(cyber))

# data4 and data5 again provide "text" and "label" columns.
for frame in (data4, data5):
    texts.extend(frame["text"].tolist())
    labels.extend(frame["label"].tolist())


In [7]:
# Assemble the merged frame from the two parallel lists and preview it.
columns = {"text": texts, "label": labels}
df = pd.DataFrame(columns)
df.head()

Unnamed: 0,text,label
0,fucks sake go away stupid anon — ^ https://t....,abusive
1,Damn dean just put Corbin to sleep. That Match...,abusive
2,@TheRealCamerota THAT BEER BUYING FREAKING IDI...,abusive
3,what idiot called them antacids and not afterb...,abusive
4,RT @gogglepossum: Don't you hate people that p...,abusive


In [8]:
# Keep only the rows whose label is not missing.
df = df.dropna(subset=["label"])

In [9]:
# Report the distinct labels present in the merged frame.
# A separate name is used on purpose: rebinding `labels` to a set would
# overwrite the list that `df` was built from and make the cell that creates
# `df` impossible to re-run correctly.
unique_labels = set(df["label"].tolist())
print(f"{len(unique_labels)} unique labels: {unique_labels}")

15 unique labels: {'sexism', 'abusive', 'spam', 'benevolent', 'threat', 'obscene', 'hateful', 'insult', 'offensive', 'profane', 'toxic', 'identity', 'hate', 'racism', 'cyberbulling'}


In [10]:
# Write the merged data set as a semicolon-separated CSV without the index.
output_path = "dataset/data.csv"
# to_csv does not create missing directories, so make sure the target folder
# exists instead of relying on it having been created by hand.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, sep=";", index=False)