# DataSets
## Preparation
First, import libraries and load all the datasets we currently have

In [16]:
import pandas as pd

In [17]:
auto_labeled_df = pd.read_csv("../resources/auto_labled_data.csv")
fox_df = pd.read_csv("../resources/fox_news.csv", sep=';')
ethos_df = pd.read_csv("../resources/Ethos_Dataset_Binary.csv", sep=';')

olid_labels = pd.read_csv("../resources/OLIDv1.0/labels-levela.csv")
olid_texts = pd.read_csv("../resources/OLIDv1.0/testset-levela.tsv", sep='\t')

### Prepare auto_labeled_data

In [18]:
def classification_auto_labeled(x):
    if x == 0:
        return "hate_speech"
    if x == 1:
        return "offensive_language"
    if x == 2:
        return "neither"
    
auto_labeled_df = auto_labeled_df.drop(columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'])
auto_labeled_df['class'] = auto_labeled_df['class'].apply(classification_auto_labeled)

### Prepare Ethos

In [19]:
def classification_ethos(x):
    if x == 0:
        return 'hate_speech'
    else:
        return 'neither'

ethos_df = ethos_df.rename(columns={'isHate': 'class', 'comment': 'tweet'})
ethos_df['class'] = ethos_df['class'].apply(classification_ethos)

### Prepare fox_news

In [20]:
def classification_foxnews(x):
    if x == 0:
        return "neither"
    if x == 1:
        return "offensive_language"


fox_df['class'] = fox_df['class'].apply(classification_foxnews)

### Prepare OLID dataset

In [21]:
olid_df = pd.merge(olid_labels, olid_texts, on="id")
olid_df.sample(10)

Unnamed: 0,id,tweet_x,tweet_y
406,70324,OFF,Alright let me get right with God bc Mother Na...
813,73105,NOT,#ChequersPlan What an absolute farce Brexit is...
42,30900,NOT,@USER @USER @USER Are you referring to how the...
347,47696,OFF,#LiberalLogic #Kavanaugh looks predatory. The...
606,66771,NOT,@USER Ah it's a she and she is called George 🐅...
13,64376,NOT,#GreatAwakening #QAnon #PatriotsUnited #WWG1WG...
7,65507,OFF,@USER Do you get the feeling he is kissing @US...
668,43453,NOT,#NuevaFotoDePerfil Pic by: @USER You are awes...
597,41609,NOT,@USER Chicago's gun control is so effective it...
744,62788,OFF,#OITNB. She is the worst public defender. Trai...


In [22]:
olid_df = olid_df.rename(columns={"tweet_x": "class", "tweet_y": "tweet"})

In [23]:
def classification_olid(x):
    if x == 'NOT':
        return 'neither'
    if x == 'OFF':
        return 'offensive_language'

olid_df['class'] = olid_df['class'].apply(classification_olid)
olid_df = olid_df.drop(columns=['id'])

## Combine the data

### Verify dataframes
Now we can combine the datasets. Verify that all dataframes provide us with the columns _class_ and _tweet_

In [28]:
auto_labeled_df.sample(3)

49566

In [25]:
ethos_df.sample(3)

Unnamed: 0,tweet,class
973,"Archeological evidence is clear, europeans hav...",hate_speech
11,Who do you think you are cunt? wtf! Stay stead...,neither
661,Why the fuck would you ask Trump?,hate_speech


In [26]:
fox_df.sample(3)

Unnamed: 0,class,tweet
1147,neither,LeftandRightareWrong @insideout26529 Because a...
725,offensive_language,whybotherwithliberals How embarrassing for the...
671,neither,Bernstein @chickabalter Funny when white racis...


In [27]:
olid_df.sample(3)

Unnamed: 0,class,tweet
433,offensive_language,@USER Damn I felt this shit. Why you so loud lol
371,offensive_language,And have a bitch thinking you niggas have mone...
660,neither,#Denver's venues are STACKED with epic shows t...


### Finally combine it

In [29]:
dfs = [auto_labeled_df, ethos_df, fox_df, olid_df]
result = pd.concat(dfs)
result.sample(10)

Unnamed: 0,class,tweet
14276,offensive_language,RT @BRANDONBARAY: update: u are still a bitch
3624,offensive_language,@JawShoeeAhhh bitch you can't even say no lol
10914,offensive_language,I swear I worry about me and nothing's but me....
453,offensive_language,"""HOW DO YOU KNOW MY TWITTER PASSWORD"" bitch pl..."
10838,offensive_language,"I really just did some hoe shyt, but just so u..."
5807,offensive_language,@dantoomey2 forgot someone niglet
22941,offensive_language,Wish i had a bad bitch tonight where they at?
22220,hate_speech,Told my dad to go buy cookies for the graduati...
16103,hate_speech,RT @LANURSE1: Here is a #MichaelBrown supporte...
14217,hate_speech,RT @AriesSpears: These young cats being drafte...


Save the result

In [32]:
result.to_csv('../resources/combined_data.csv', index=False, header=True)