In [1]:
import pandas as pd

# Loading Dataset

In [4]:
def load_dataset(path):
    """Read the CSV file at ``path`` into a DataFrame.

    The first column of the file is used as the row index.
    """
    return pd.read_csv(path, index_col=0)

In [5]:
# Load the movie utterances and preview the first five rows.
movies_df = load_dataset(path='data/all_movies.csv')
movies_df.head(5)

Unnamed: 0,movie_name,index,utterance,count_utterances,tag
0,12 Angry Men,0,"Thank you very much, sir.",2065,racism
1,12 Angry Men,1,You did a wonderful job. Wonderful.,2065,racism
2,12 Angry Men,2,Shh. Shh.,2065,racism
3,12 Angry Men,3,"We did it, Pete. We did it.",2065,racism
4,12 Angry Men,4,"For a moment, we had our doubts.",2065,racism


In [6]:
# Sanity check: (number of rows, number of columns) of the loaded utterance table.
movies_df.shape

(359036, 5)

# Word Embedding

In [7]:
# Word-embedding annotations (Fox News variant). The generic `label` column is
# renamed so it stays distinguishable once the annotation frames are merged.
annotations_word_embedding_fox_news_df = load_dataset(
    'annotations/annotations_word_embedding_fox_news.csv'
).rename(columns={'label': 'we_label_fox_news'})
annotations_word_embedding_fox_news_df

Unnamed: 0,movie_name,index,we_label_fox_news
0,12 Angry Men,0,not hate speech
1,12 Angry Men,1,not hate speech
2,12 Angry Men,2,not hate speech
3,12 Angry Men,3,not hate speech
4,12 Angry Men,4,hate speech
...,...,...,...
359031,You,1767,not hate speech
359032,You,1768,not hate speech
359033,You,1769,not hate speech
359034,You,1770,not hate speech


In [8]:
# Word-embedding annotations (Twitter variant). The generic `label` column is
# renamed so it stays distinguishable once the annotation frames are merged.
annotations_word_embedding_twitter_df = load_dataset(
    'annotations/annotations_word_embedding_twitter.csv'
).rename(columns={'label': 'we_label_twitter'})
annotations_word_embedding_twitter_df

Unnamed: 0,movie_name,index,we_label_twitter
0,12 Angry Men,0,neither
1,12 Angry Men,1,neither
2,12 Angry Men,2,offensive language
3,12 Angry Men,3,neither
4,12 Angry Men,4,neither
...,...,...,...
359031,You,1767,offensive language
359032,You,1768,neither
359033,You,1769,neither
359034,You,1770,offensive language


# BERT

In [9]:
# BERT annotations (Fox News variant). The generic `label` column is renamed so
# it stays distinguishable once the annotation frames are merged.
annotations_bert_fox_news_df = load_dataset(
    'annotations/annotations_bert_fox_news.csv'
).rename(columns={'label': 'bert_label_fox_news'})
annotations_bert_fox_news_df

Unnamed: 0,index,movie_name,bert_label_fox_news
0,0,1917,hate speech
1,1,1917,not hate speech
2,2,1917,not hate speech
3,3,1917,not hate speech
4,4,1917,not hate speech
...,...,...,...
361051,920,X-Men,not hate speech
361052,921,X-Men,not hate speech
361053,922,X-Men,not hate speech
361054,923,X-Men,not hate speech


In [10]:
# BERT annotations (Twitter variant). The generic `label` column is renamed so
# it stays distinguishable once the annotation frames are merged.
annotations_bert_twitter_df = load_dataset(
    'annotations/annotations_bert_twitter.csv'
).rename(columns={'label': 'bert_label_twitter'})
annotations_bert_twitter_df

Unnamed: 0,index,movie_name,bert_label_twitter
0,0,1917,neither
1,1,1917,neither
2,2,1917,neither
3,3,1917,neither
4,4,1917,neither
...,...,...,...
361051,920,X-Men,neither
361052,921,X-Men,neither
361053,922,X-Men,offensive language
361054,923,X-Men,neither


# Joining Datasets

In [None]:
# Attach both word-embedding labels to every utterance. These are inner joins
# (the pandas default) on the (index, movie_name) pair, so utterances without
# a matching annotation row are dropped.
df_results = (
    movies_df
    .merge(annotations_word_embedding_fox_news_df, on=['index', 'movie_name'])
    .merge(annotations_word_embedding_twitter_df, on=['index', 'movie_name'])
)

In [None]:
# Add the BERT labels with left joins, so every row already in df_results is
# kept even when a BERT annotation is missing for it.
# NOTE(review): this cell reassigns df_results from itself; re-running it
# without re-running the previous cell merges the BERT columns a second time.
df_results = (
    df_results
    .merge(annotations_bert_twitter_df, on=['index', 'movie_name'], how='left')
    .merge(annotations_bert_fox_news_df, on=['index', 'movie_name'], how='left')
)

In [None]:
# Preview the first rows of the joined frame to confirm the label columns were attached.
df_results.head()

In [None]:
# Display the joined frame (utterances plus all annotation labels).
df_results

## Analyse Results

In [None]:
def plot_label_by_tag_distribution(df, title, label):
    """Plot, as grouped bars, how often each label value occurs within each tag.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'tag'`` column and the column named by ``label``.
    title : str
        Title passed to the plot.
    label : str
        Name of the column whose values are counted per tag.

    Returns
    -------
    Whatever ``DataFrame.plot`` returns (a matplotlib Axes with the default
    plotting backend).
    """
    # Count from the frame that was passed in. The previous version read the
    # notebook-global ``df_results`` and silently ignored the ``df`` argument.
    # ``unstack`` moves the label values into columns (one row per tag); a
    # tag/label combination that never occurs is left as NaN.
    counts_by_tag = df.groupby('tag')[label].value_counts().unstack()
    return counts_by_tag.plot(kind='bar', title=title)

In [None]:
def plot_label_by_tag_distribution_percentage(df, title, label):
    """Plot, as stacked bars, the share of each label value within each tag.

    Every bar segment is annotated with its share rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'tag'`` column and the column named by ``label``.
    title : str
        Title passed to the plot.
    label : str
        Name of the column whose value shares are computed per tag.

    Returns
    -------
    The axes object returned by ``DataFrame.plot``, after annotation.
    """
    # Compute shares from the frame that was passed in. The previous version
    # read the notebook-global ``df_results`` and ignored the ``df`` argument.
    shares_by_tag = df.groupby('tag')[label].value_counts(normalize=True).unstack()
    ax = shares_by_tag.plot(kind='bar', stacked=True, title=title)
    for patch in ax.patches:
        left, bottom = patch.get_xy()
        share = patch.get_height()
        # Anchor the text at the horizontal centre and top edge of the segment.
        anchor = (round(left + patch.get_width() / 2, 2), round(bottom + share, 2))
        ax.annotate(f'{round(share, 2)}', anchor, ha='center')
    # Anchor the legend at x=1.5 in axes coordinates, i.e. right of the plot area.
    ax.legend(bbox_to_anchor=(1.5, 1))
    return ax

# Word embedding

## Fox News

In [None]:
# The word-embedding Fox News labels live in 'we_label_fox_news' (renamed when
# the annotations were loaded); df_results has no 'label_fox_news' column.
plot_label_by_tag_distribution(df_results, title='Label by tag distribution', label='we_label_fox_news')

In [None]:
# The word-embedding Fox News labels live in 'we_label_fox_news' (renamed when
# the annotations were loaded); df_results has no 'label_fox_news' column.
plot_label_by_tag_distribution_percentage(df_results, title='Label by tag distribution (Percentage)', label='we_label_fox_news')

## Hate speech examples

In [None]:
# Print up to 40 utterances that the word-embedding Fox News annotations mark
# as hate speech. The column is 'we_label_fox_news' (renamed at load time);
# 'label_fox_news' does not exist in df_results.
hatespeech_sentences = df_results.loc[
    df_results['we_label_fox_news'] == 'hate speech', 'utterance'
].values
for utterance in hatespeech_sentences[:40]:
    print(utterance)
    print('---')

# Word embedding

## Twitter

In [None]:
# The word-embedding Twitter labels live in 'we_label_twitter' (renamed when
# the annotations were loaded); df_results has no 'label_twitter' column.
plot_label_by_tag_distribution(df_results, title='Label by tag distribution', label='we_label_twitter')

In [None]:
# The word-embedding Twitter labels live in 'we_label_twitter' (renamed when
# the annotations were loaded); df_results has no 'label_twitter' column.
plot_label_by_tag_distribution_percentage(df_results, title='Label by tag distribution (Percentage)', label='we_label_twitter')

In [None]:
# Print up to 40 utterances that the word-embedding Twitter annotations mark
# as hate speech. The column is 'we_label_twitter' (renamed at load time);
# 'label_twitter' does not exist in df_results.
hatespeech_sentences = df_results.loc[
    df_results['we_label_twitter'] == 'hate speech', 'utterance'
].values
for utterance in hatespeech_sentences[:40]:
    print(utterance)
    print('---')

## Intersections

### Both are "hate speech"

In [None]:
# Rows that both word-embedding annotation sets mark as hate speech. The
# columns are 'we_label_twitter' / 'we_label_fox_news' (renamed at load time);
# the un-prefixed names do not exist in df_results.
df_intersections = df_results[
    (df_results['we_label_twitter'] == 'hate speech')
    & (df_results['we_label_fox_news'] == 'hate speech')
]

In [None]:
# Display the rows selected into df_intersections.
df_intersections

### Fox News is "hate speech" and Twitter is "offensive language"

In [None]:
# Rows marked "offensive language" by the word-embedding Twitter annotations
# and "hate speech" by the word-embedding Fox News annotations. The columns
# are 'we_label_twitter' / 'we_label_fox_news' (renamed at load time); the
# un-prefixed names do not exist in df_results.
df_intersections = df_results[
    (df_results['we_label_twitter'] == 'offensive language')
    & (df_results['we_label_fox_news'] == 'hate speech')
]

In [None]:
# Display the rows selected into df_intersections.
df_intersections