## Movie Annotation 

In [90]:
import pandas as pd

In [91]:
def cleanse_half_sentences(df):
    lst = df.utterance.values

    sent = ''
    new_l = []
    for t in lst:
        if t[-1] not in ['.','?','!']:
            sent+=t+' '
        else:
            sent+=t
            new_l.append({'utterance':sent})
            sent = ''
    result_df = pd.DataFrame(new_l)
    return result_df

In [92]:
path_1 = 'Mrs_Doubtfire1.csv'
path_2 = 'blackkklansman.csv'

In [93]:
df_mrs_doubtfire = pd.read_csv(path_1, index_col=0)
df_blackkklansman = pd.read_csv(path_2, index_col=0)

In [94]:
df_mrs_doubtfire.rename(columns={'0':'utterance'}, inplace=True)
df_blackkklansman.rename(columns={'0':'utterance'}, inplace=True)

In [95]:
df_mrs_doubtfire = cleanse_half_sentences(df_mrs_doubtfire)
df_blackkklansman = cleanse_half_sentences(df_blackkklansman)

In [118]:
df_mrs_doubtfire.to_csv('Mrs_Doubtfire_cleansed_s.csv')

In [120]:
df_blackkklansman.to_csv('blackkklansman_cleansed_s.csv')

In [96]:
df_mrs_doubtfire.shape

(2045, 1)

In [97]:
df_mrs_doubtfire.head(1)

Unnamed: 0,utterance
0,"Figaro, Figaro, Figaro, Figaro, Figaro, Figaro..."


In [98]:
df_blackkklansman.head(1)

Unnamed: 0,utterance
0,﻿Have you seen Dr. Meade?


In [100]:
df_blackkklansman.shape

(2543, 1)

## Keywords

In [101]:
path = r'C:\Users\Niklas\Documents\GitHub\movie_hatespeech_detection\keywords\all_hate_keywords.csv'

In [102]:
df_keywords = pd.read_csv(path, index_col=0)

In [103]:
df_keywords.shape

(444, 2)

In [104]:
df_keywords = df_keywords.drop_duplicates(subset='term')

In [105]:
df_keywords.head()

Unnamed: 0,term,source
0,Paedo,hatebase
1,Hodgie,hatebase
2,sand ape,hatebase
3,islamization,hatebase
4,porki,hatebase


In [132]:
def annotate_df(df, df_keywords):
    result = []
    def f(row):
        for term in df_keywords.term.values:
            if term in row['utterance']:
                result.append({'bool': 1, 'index': row.name})
                
    df['annotation_algo'] = df.apply(f, axis=1)
    result_df = pd.DataFrame(result)
    df = df.reset_index()
    result_df = df.merge(result_df, on='index', how='outer')
    result_df = result_df.fillna(value=0)
    result_df = result_df.drop_duplicates()
    result_df.drop(columns=['annotation_algo', 'index'], inplace=True)
    return result_df

## Annotation of Mrs. Doubtfire

In [133]:
df_anno_doubtfire = annotate_df(df_mrs_doubtfire, df_keywords)

In [134]:
df_anno_doubtfire.head(1)

Unnamed: 0,utterance,bool
0,"Figaro, Figaro, Figaro, Figaro, Figaro, Figaro...",0.0


In [135]:
df_anno_doubtfire['bool'].value_counts()

0.0    2039
1.0       6
Name: bool, dtype: int64

In [136]:
df_anno_doubtfire.shape

(2045, 2)

In [137]:
df_anno_doubtfire[df_anno_doubtfire['bool']==1].utterance

114                        What if you're married to one?
843     It's basically egg whites, crème fraîche, powd...
948                                   All spick-and-span.
1234    You can't imagine what it was like being marri...
1686         Make mine not spicy. I'm allergic to pepper.
1969    You just sat there in that courtroom and let t...
Name: utterance, dtype: object

## Annotation of KKK

In [138]:
df_anno_kkk = annotate_df(df_blackkklansman, df_keywords)

In [139]:
df_anno_kkk.head(1)

Unnamed: 0,utterance,bool
0,﻿Have you seen Dr. Meade?,0.0


In [140]:
df_anno_kkk['bool'].value_counts()

0.0    2466
1.0      77
Name: bool, dtype: int64

In [141]:
df_anno_kkk.shape

(2543, 2)

In [143]:
[df_anno_kkk['bool']==1].index

Int64Index([  25,   52,   62,  200,  265,  276,  281,  290,  384,  387,  480,
             486,  488,  509,  564,  687,  689,  734,  739,  741,  752,  757,
             771,  784,  807,  839,  845, 1008, 1072, 1103, 1108, 1126, 1143,
            1168, 1181, 1259, 1260, 1264, 1276, 1373, 1386, 1388, 1397, 1405,
            1423, 1488, 1541, 1546, 1556, 1624, 1634, 1695, 1698, 1707, 1709,
            1713, 1792, 1798, 1873, 1876, 1949, 1950, 1969, 2033, 2148, 2152,
            2159, 2213, 2338, 2339, 2347, 2348, 2364, 2522, 2534, 2539, 2543],
           dtype='int64')

In [144]:
df_anno_kkk.to_csv('blackkklansman_algo_annotation.csv')

25 - undecided
52 - 0
62 - 0
200 - 0
265 - 0

