# Error analysis

## Rieger+2021 data

In [3]:
# Load predictions
# The predictions file is parsed with np.loadtxt; the resulting array is later
# attached to `data` as the 'pred' column, so its length has to equal the
# number of rows that survive the filtering/preprocessing cells below.
import numpy as np

preds_path = '../output/bert/ws_neutral_annotated_test/rieger2021_predictions.txt'
preds = np.loadtxt(preds_path)
# Number of predictions, shown so it can be compared against len(data) later on
print(len(preds))
preds

5162


array([0., 1., 1., ..., 0., 1., 1.])

In [4]:
# Load data and gold labels
import pandas as pd

annotations_fpath = '/home/mamille3/data/hate_speech/rieger2021/Datensatz mit mf_ide2.csv'
# The value -99 is read as missing (NaN)
annotations = pd.read_csv(annotations_fpath, na_values=-99)
annotations.info()
annotations

# Rename annotations to English (from codebook).
# The mapping is by column *name* rather than by position, so a reordered or
# extended CSV cannot silently attach an English name to the wrong variable.
# Identity entries are listed too, so that every column the rest of the
# notebook relies on is checked for presence.
de_to_en = {
    'id': 'id',
    'src': 'src',
    'spam': 'spam',
    'pers_bel': 'pers_ins',
    'pers_beladr1': 'pers_ins_tar1',
    'pers_belart1': 'pers_ins_ref1',
    'pers_beladr2': 'pers_ins_tar2',
    'pers_belart2': 'pers_ins_ref2',
    'allg_bel': 'gen_ins',
    'allg_beladr1': 'gen_ins_tar1',
    'allg_beladr2': 'gen_ins_tar2',
    'gewalt': 'viol',
    'gewalt_adr': 'viol_tar',
    'stereo': 'stereo',
    'stereo2': 'stereo2',
    'desinfo': 'disinfo',
    'desinfo_bzg': 'disinfo_ref',
    'ingroup': 'ingroup',
    'mf_ide': 'ih_ide',
}
missing_cols = [col for col in de_to_en if col not in annotations.columns]
if missing_cols:
    # Fail loudly instead of continuing with mislabeled variables
    raise KeyError(f'Expected codebook columns not found in annotations: {missing_cols}')

# Columns without a codebook entry keep their original name
en_cols = [de_to_en.get(col, col) for col in annotations.columns.tolist()]
len(en_cols)

annotations.columns = en_cols

# Comment texts are stored separately, in sheet 'Tabelle5' of an Excel workbook
text_fpath = '/home/mamille3/data/hate_speech/rieger2021/Kiening_Kommentare.xlsx'
texts = pd.read_excel(text_fpath, sheet_name='Tabelle5')

# Discard every column whose name starts with 'Unnamed'
unnamed_cols = [name for name in texts.columns if name.startswith('Unnamed')]
texts = texts.drop(columns=unnamed_cols)

texts.info()
texts

# Attach each comment's text to its annotations by joining CommentID to id
data = texts.merge(annotations, left_on='CommentID', right_on='id')
data.info()

# Prints whether the source column from the texts equals the one from the annotations
sources_match = data['Source'].equals(data['src'])
print(sources_match)

# Index by comment ID and drop the duplicated key/source columns plus 'filter_$'
redundant_cols = ['id', 'src', 'filter_$']
data = data.set_index('CommentID')
data = data.drop(columns=redundant_cols)

# Readable names for the numeric ih_ide codes
ideology_names = {
    0: 'none discernible',
    1: 'National Socialist',
    2: 'white supremacy/white ethnostate',
}
ideology_codes = data['ih_ide'].astype('category')
data['inhuman_ideology'] = ideology_codes.cat.rename_categories(ideology_names)
data['inhuman_ideology'].value_counts()

# Replace numeric codes with names.
# Codes 1..N are assigned in list order; -9 is the 'undetermined' code.
demo_names = [
    'ethnicity', 'religion', 'country_of_origin', 'gender', 'political_views',
    'sexual_orientation', 'disability', 'gender_identity', 'other',
]
demo_categories = dict(enumerate(demo_names, start=1))
demo_categories[-9] = 'undetermined'

identity_names = [
    'black people', 'muslims', 'jews', 'lgbtq', 'migrants',
    'people_with_disabilities', 'social_elites_media', 'political_opponents',
    'latin_americans', 'women', 'criminals', 'asians', 'other',
]
identities = dict(enumerate(identity_names, start=1))
identities[-9] = 'undetermined'

# Columns recoded with demo_categories vs. columns recoded with identities
ref_cols = ['pers_ins_ref1', 'pers_ins_ref2']
tar_cols = ['gen_ins_tar1', 'gen_ins_tar2', 'viol_tar', 'disinfo_ref']

for cols, code_names in ((ref_cols, demo_categories), (tar_cols, identities)):
    for col in cols:
        as_category = data[col].astype('category')
        data[col] = as_category.cat.rename_categories(code_names)

# Source platform codes -> short platform names
source_names = dict(enumerate(['td', '4chan_pol', '8chan_pol'], start=1))
data['Source'] = data['Source'].astype('category').cat.rename_categories(source_names)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5981 entries, 0 to 5980
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            5981 non-null   int64  
 1   src           5981 non-null   int64  
 2   spam          5981 non-null   int64  
 3   pers_bel      5981 non-null   int64  
 4   pers_beladr1  142 non-null    float64
 5   pers_belart1  137 non-null    float64
 6   pers_beladr2  14 non-null     float64
 7   pers_belart2  14 non-null     float64
 8   allg_bel      5981 non-null   int64  
 9   allg_beladr1  570 non-null    float64
 10  allg_beladr2  65 non-null     float64
 11  gewalt        5981 non-null   int64  
 12  gewalt_adr    218 non-null    float64
 13  stereo        5981 non-null   int64  
 14  stereo2       27 non-null     float64
 15  desinfo       5981 non-null   int64  
 16  desinfo_bzg   458 non-null    float64
 17  ingroup       5981 non-null   int64  
 18  mf_ide        5981 non-null 

In [8]:
# A comment counts as white supremacist when its ideology is one of these two categories
ws_ideologies = ['white supremacy/white ethnostate', 'National Socialist']
data['white_supremacist'] = data['inhuman_ideology'].isin(ws_ideologies)

In [9]:
# Select positive and negative examples:
#   positives - rows flagged white_supremacist
#   negatives - rows with gen_ins, viol and pers_ins all equal to 0 and not flagged
selection = 'white_supremacist or (gen_ins==0 and viol==0 and pers_ins==0 and not white_supremacist)'
data = data.query(selection)
len(data)

5302

In [10]:
import nltk
import re

def tokenize_lowercase(inp):
    """Return ``inp`` as a lowercased string of space-separated NLTK word tokens.

    The input is converted with ``str()`` first, so non-string values are accepted.
    """
    tokens = nltk.word_tokenize(str(inp))
    joined = ' '.join(tokens)
    return joined.lower()

def remove_special(text):
    """Strip '>' characters, long digit runs, and URLs from ``text``.

    Removed, in this order:
      1. every '>' character,
      2. every run of seven or more consecutive digits,
      3. URLs: whitespace-delimited tokens containing '.com', '.org' or '.edu'
         after at least one character, or anything starting with 'http://'
         or 'https://'.

    Surrounding whitespace is left as-is, so removals can leave double spaces.
    """
    patterns = (
        r'\d{7,}',
        r'\S+(?:\.com|\.org|\.edu)\S*|https?:\/\/\S*',  # Remove URLs
    )
    cleaned = text.replace('>', '')
    for pattern in patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

def process_rieger2021(text):
    """Preprocess one Rieger+ 2021 comment (4chan, 8chan, t_D data).

    The value is cast to ``str``, cleaned with ``remove_special``, then
    tokenized and lowercased with ``tokenize_lowercase``.
    """
    cleaned = remove_special(str(text))
    return tokenize_lowercase(cleaned)



In [11]:
# Build the model-input text and the binary gold label in one non-mutating
# pipeline. Using .assign on the filtered frame (instead of setting columns on
# the result of query/dropna) avoids chained-assignment warnings, and the
# list-form `subset` is accepted by all pandas versions.
data = (
    data
    .dropna(subset=['Text'])                                               # rows without text
    .assign(text=lambda df: df['Text'].map(process_rieger2021))            # preprocessed text
    .loc[lambda df: df['text'] != '']                                      # rows empty after preprocessing
    .assign(label=lambda df: df['white_supremacist'].astype(int))          # 1 = white supremacist
)
len(data)

5162

In [13]:
# Attach the predictions as column 'pred'; `preds` is a plain array, so values
# are matched to rows by position and the lengths have to agree
data = data.assign(pred=preds)

In [14]:
# Confusion matrix (rows = gold label, columns = predicted label)
from sklearn.metrics import confusion_matrix

# Pin the label order explicitly: without `labels`, the matrix shape depends on
# which classes actually occur, and the hard-coded 2x2 row/column names below
# would then fail or mislabel the cells.
cm = confusion_matrix(data['label'], data['pred'], labels=[0, 1])
pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])

Unnamed: 0,pred_0,pred_1
true_0,1727,3067
true_1,17,351


In [22]:
# Look at false positives (majority of errors)
pd.set_option('display.max_colwidth', None)  # show full comment text

false_positives = data.query('label==0 and pred==1')
# A fixed random_state makes the displayed sample reproducible across re-runs;
# capping n avoids an error when there are fewer than 30 false positives.
fp = false_positives.sample(
    n=min(30, len(false_positives)),
    random_state=0,
)[['text', 'Source', 'label', 'pred']]
fp

Unnamed: 0_level_0,text,Source,label,pred
CommentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1471,"thanks , a great read . even better , was this one it linked to . '' my speech at the conference was about the fact , that foreigners ’ culture plays a significant role concerning integration , crime and religious extremism . i emphasized , that people from a muslim culture find it difficult , if not impossible , to create a successful life in denmark. ``",td,0,1.0
2332,same way the msm all say the exact same language when a new hot button issue comes up . `` modern day lynching . '' `` easter worshippers . '' etc,td,0,1.0
1709,delayed again,4chan_pol,0,1.0
4760,this is related,4chan_pol,0,1.0
724,"when obama is thrown in prison , can we send her back ?",td,0,1.0
4891,lmao does n't show anything outside the levant,td,0,1.0
2732,( ( ( meme flag ) ) ) burns down mosque in israel same day passover rothchilds kill lincoln same day jews sink titanic sloppy job mossad,4chan_pol,0,1.0
4099,"he died in berlin . hitler 's dentist was captured after the war by the americans . two of his dental techs were captured by the russians . each independently reconstructed the details of hitler 's dental work from memory . their reconstructions meshed completely , and they fit with the details of the dental work on the jaw fragments attributed to hitler and to the skull x-rays taken of him by his doctor after the july 20 bomb attempt . bottom line , there is solid forensic proof beyond a reasonable doubt that hitler died in berlin . for a detailed account of the forensic exam go to .",td,0,1.0
80,lurk moar,4chan_pol,0,1.0
3786,for,8chan_pol,0,1.0
