In [17]:
import pandas as pd
import numpy as np

In [62]:
def major_vote(votes):
    """Collapse one tweet's annotator votes into a single label.

    Parameters
    ----------
    votes : array-like of str
        One label per annotator. Missing votes are ignored; they may be given
        as the string "nan" or as a float NaN (which stringifies to "nan").

    Returns
    -------
    str
        * The most frequent label when exactly one label has the top count.
        * Two labels tied for the top count: "both" if "both" is among them
          or if the tie is between "racism" and "sexism"; otherwise "racism"
          or "sexism" if one of them is tied; otherwise "neither".
        * Three labels tied: "both" under the same condition, else "neither".
        * Any other case, including no usable votes at all: "neither".
    """
    # Coerce to a NumPy string array so that plain lists and object arrays
    # holding real NaN are filtered the same way as pre-cast string arrays.
    votes = np.asarray(votes).astype(np.str_)
    votes = votes[votes != "nan"]

    if votes.size == 0:
        # Nothing to count: taking the maximum of an empty counts array
        # would raise, so fall back to the default label.
        return "neither"

    labels, counts = np.unique(votes, return_counts = True)
    most = labels[counts == counts.max()]

    if len(most) == 1:
        return most[0]

    # A tie involving "both", or between "racism" and "sexism", is read as "both".
    mixed = "both" in most or ("racism" in most and "sexism" in most)
    if len(most) in (2, 3) and mixed:
        return "both"

    if len(most) == 2:
        # Tie between one hateful label and some other label: keep the hateful one.
        if "racism" in most:
            return "racism"
        if "sexism" in most:
            return "sexism"

    return "neither"
    

In [70]:
# Load the NAACL SRW 2016 annotations. The file has no header row, so the
# columns are numbered: 0 holds the tweet ID and 1 the label (see output).
df1 = pd.read_csv("data/NAACL_SRW_2016.csv", header = None)
df1

Unnamed: 0,0,1
0,572342978255048705,racism
1,572341498827522049,racism
2,572340476503724032,racism
3,572334712804384768,racism
4,572332655397629952,racism
...,...,...
16902,576359685843861505,none
16903,576612926838046720,none
16904,576771329975664640,none
16905,560595245814267905,none


In [71]:
# Discard the rows labelled "none"; only the remaining labels are kept.
df1 = df1.loc[df1[1].ne("none")]

In [12]:
# Display df1 for inspection (the saved output shows only racism/sexism rows).
df1

Unnamed: 0,0,1
0,572342978255048705,racism
1,572341498827522049,racism
2,572340476503724032,racism
3,572334712804384768,racism
4,572332655397629952,racism
...,...,...
5343,570244879265206272,sexism
5344,571013698111860736,sexism
5345,571304517930774528,sexism
5346,575664315627216896,sexism


In [28]:
# Load the NLP+CSS 2016 annotations: one row per tweet with a TweetID column,
# an Expert column and many Amateur_* columns (see output).
# - The file is tab-separated; the separator is written as the "\t" escape so
#   it stays visible and is not silently altered by editors or formatters.
# - Every column is read as text, which keeps the tweet IDs as strings.
# - Annotators who did not vote on a tweet show up as NaN.
df2 = pd.read_csv("data/NLP_CSS_2016.csv", sep = "\t", index_col = False, dtype = np.str_)
df2

Unnamed: 0,TweetID,Expert,Amateur_0,Amateur_1,Amateur_2,Amateur_3,Amateur_4,Amateur_5,Amateur_6,Amateur_7,...,Amateur_1079,Amateur_1080,Amateur_1081,Amateur_1082,Amateur_1083,Amateur_1084,Amateur_1085,Amateur_1086,Amateur_1087,Amateur_1088 Amateur_1089
0,597576902212063232,neither,neither,neither,neither,,,,,,...,,,,,,,,,,
1,565586175864610817,neither,neither,neither,neither,,,,,,...,,,,,,,,,,
2,563881580209246209,neither,neither,neither,neither,,,,,,...,,,,,,,,,,
3,595380689534656512,neither,neither,sexism,neither,,,,,,...,,,,,,,,,,
4,563757610327748608,neither,neither,neither,neither,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6904,569176414999588864,neither,neither,neither,neither,,,,,,...,,,,,,,,,,
6905,571437409843306496,sexism,sexism,sexism,sexism,,,,,,...,,,,,,,,,,
6906,569743990897057792,neither,neither,neither,neither,,,,,,...,,,,,,,,,,
6907,563027947423272960,neither,neither,neither,neither,,,,,,...,,,,,,,,,,


In [63]:
# Column 0 of df2 is the tweet ID; every remaining column is one annotator's vote.
ids = df2.iloc[:, 0].tolist()

# Reduce each row's votes (cast to str, so missing votes become "nan")
# to a single majority label.
decisions = [
    major_vote(df2.iloc[row, 1:].to_numpy().astype(np.str_))
    for row in range(len(df2))
]

In [57]:
# Scratch check on the first row of df2: cast its vote columns to str and
# drop the "nan" placeholders left by missing votes.
# NOTE(review): the saved output below still shows nan entries and
# dtype=object, which this code should not produce; it looks stale, so re-run to confirm.
votes = df2.iloc[0, 1:].to_numpy().astype(np.str_)
votes[votes != "nan"]

array(['neither', 'neither', 'neither', ..., nan, nan, nan], dtype=object)

In [66]:
# Pair every tweet ID with its majority-vote label, using the same
# numbered column layout as df1 (0 = tweet ID, 1 = label).
df = pd.DataFrame({0: ids, 1: decisions})
df

Unnamed: 0,0,1
0,597576902212063232,neither
1,565586175864610817,neither
2,563881580209246209,neither
3,595380689534656512,neither
4,563757610327748608,neither
...,...,...
6904,569176414999588864,neither
6905,571437409843306496,sexism
6906,569743990897057792,neither
6907,563027947423272960,neither


In [84]:
# Drop the tweets whose majority label is "neither".
df = df.loc[df[1].ne("neither")]

In [92]:
# Pool the tweet IDs of both sources; df1's IDs are cast to str first so they
# compare equal to the IDs in df.
all_ids = np.concatenate([df1[0].to_numpy().astype(np.str_), df[0].to_numpy()])
unq, occ = np.unique(all_ids, return_counts = True)
print(f"All: {len(all_ids)}, unique: {len(unq)}")

# IDs that occur more than once in the pooled list.
duplicates = unq[occ > 1]

All: 6869, unique: 6796


In [96]:
# Remove from df every tweet whose ID was flagged as occurring more than once.
df = df.loc[~df[0].isin(duplicates)]
df

Unnamed: 0,0,1
9,571030421103910912,sexism
14,603286576131506177,link
17,572068636338364417,sexism
25,575501717103644672,sexism
33,575393332328722432,sexism
...,...,...
6885,603639577920839681,sexism
6887,563321657373118465,sexism
6902,595282956970655744,sexism
6903,575480549332008962,sexism


In [101]:
# Stack the df1 rows on top of the de-duplicated majority-vote rows in df.
# No ignore_index is passed, so each frame keeps its own row labels
# (visible in the index of the output).
df_full = pd.concat([df1, df])
df_full

Unnamed: 0,0,1
0,572342978255048705,racism
1,572341498827522049,racism
2,572340476503724032,racism
3,572334712804384768,racism
4,572332655397629952,racism
...,...,...
6885,603639577920839681,sexism
6887,563321657373118465,sexism
6902,595282956970655744,sexism
6903,575480549332008962,sexism


In [104]:
# Keep only rows carrying one of the four known labels; any other value
# (e.g. the "link" label seen earlier) is treated as an annotation mistake.
df_full = df_full.loc[df_full[1].isin(["neither", "racism", "both", "sexism"])]
df_full

Unnamed: 0,0,1
0,572342978255048705,racism
1,572341498827522049,racism
2,572340476503724032,racism
3,572334712804384768,racism
4,572332655397629952,racism
...,...,...
6885,603639577920839681,sexism
6887,563321657373118465,sexism
6902,595282956970655744,sexism
6903,575480549332008962,sexism


In [105]:
# Persist the merged racism/sexism labels. Only the path is given, so pandas'
# default CSV options apply.
df_full.to_csv("data/filtered/racism_sexism.csv")

In [123]:
# Load the benevolent-sexism tweets; the file has no header row, so columns
# are numbered from 0.
# NOTE(review): the file is named .tsv but is read with the default separator;
# presumably it has a single ID column. Confirm.
df_ben = pd.read_csv("data/benevolent_sexist.tsv.txt", header = None)
# The hostile tweets are not loaded: all of them are already included in the
# previous dataset.

In [128]:
# Column 0 of df_ben as a NumPy string array, used as the tweet IDs below.
benevolent_ids = df_ben[0].to_numpy().astype(np.str_)

In [130]:
# Rebuild df_ben in the same two-column layout as the other frames: the ID in
# column 0 and the constant label "benevolent" (broadcast to every row) in column 1.
df_ben = pd.DataFrame({0: benevolent_ids, 1: "benevolent"})
df_ben

Unnamed: 0,0,1
0,839880162586071040,benevolent
1,839630746625142784,benevolent
2,839630739335495681,benevolent
3,839665182985097216,benevolent
4,839765815494795264,benevolent
...,...,...
7200,839852292778176513,benevolent
7201,839927532803747844,benevolent
7202,840076198235406336,benevolent
7203,839859663973806084,benevolent


In [132]:
# Append the benevolent rows to the racism/sexism rows and write the combined
# table to disk.
# NOTE(review): no duplicate-ID check is done for this merge; confirm the
# benevolent IDs do not overlap with IDs already in df_full.
df_new = pd.concat([df_full, df_ben])
df_new.to_csv("data/filtered/racism_sexism_benevolent.csv")

Unnamed: 0,0,1
0,572342978255048705,racism
1,572341498827522049,racism
2,572340476503724032,racism
3,572334712804384768,racism
4,572332655397629952,racism
...,...,...
6885,603639577920839681,sexism
6887,563321657373118465,sexism
6902,595282956970655744,sexism
6903,575480549332008962,sexism
