In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/98/87/ef312eef26f5cecd8b17ae9654cdd8d1fae1eb6dbd87257d6d73c128a4d0/transformers-4.3.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 7.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/fd/5b/44baae602e0a30bcc53fbdbc60bd940c15e143d252d658dfdefce736ece5/tokenizers-0.10.1-cp36-cp36m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 37.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 50.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=99433

#### Initialize bert classification model

In [3]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                        trainable=True, 
                                                        num_labels=2)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [121]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [4]:
type(tokenizer)

transformers.models.bert.tokenization_bert.BertTokenizer

#### Read hate dataset and convert it into train and test

In [7]:
df = pd.read_csv("data/fox_h.csv")
df = df.drop(columns=['Unnamed: 0'])
df['text'] = df['text'].str.strip()

In [19]:
df = df.replace(r'@[a-zA-Z0-9]* ', '', regex=True)

In [20]:
df.head()

Unnamed: 0,text,label
0,barryswallows Merkel would never say NO,1
1,PostApocalypticHero Expect more and more women...,1
2,californiamojo Groping people in public wasn't...,0
3,"MikeSte Merkel, possible the only person in ch...",1
4,"scientist They know very well, no means NO! Th...",1


In [21]:
def get_dataset(df, seed, test_size):
    return train_test_split(df, test_size=test_size, random_state=seed, shuffle=True)

In [22]:
train, test = get_dataset(df, 11, 0.2)

In [23]:
train.label.value_counts()

0    1772
1     672
Name: label, dtype: int64

In [24]:
test.label.value_counts()

0    414
1    198
Name: label, dtype: int64

In [25]:
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']


In [26]:
train.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
2809,painterdave01 wrong it is all on e.b.t cards &...,0
2200,WhitWhitneyWhite You have never been a slave a...,1
650,BeauCharles Don't suppose she watched the atta...,0
1501,philtbn007 There are others who stand with you...,0
1207,obamasmom The days of the Silver backs lying a...,1


In [27]:
test.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
200,Freedomscout Oh it's going to be a fun place a...,0
2144,marco Moooooochel ought to be wiping not swipi...,1
1965,frommie I think the maximum age to abort a lib...,0
353,SB117 Fourth time; What forced compliance of p...,0
458,earnestlyjoshing US citizens are just collater...,1


In [28]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
  train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)

  validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)
  
  return train_InputExamples, validation_InputExamples

  train_InputExamples, validation_InputExamples = convert_data_to_examples(train, 
                                                                           test, 
                                                                           'DATA_COLUMN', 
                                                                           'LABEL_COLUMN')
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [29]:
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.batch(32)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)



#### Compile and fit model

In [32]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])



In [33]:
hist = model.fit(train_data, epochs=10, validation_data=validation_data)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7fa00c2ac2a0> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7fa00c2ac2a0> is not a module, class, method, function, traceback, frame, or code object

Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
preds = model.predict(validation_data)



In [35]:
preds

TFSequenceClassifierOutput([('logits', array([[ 3.7125561, -4.3825994],
                                    [-4.4424734,  3.9916768],
                                    [ 5.6841173, -5.489478 ],
                                    ...,
                                    [ 5.690194 , -5.69071  ],
                                    [-4.471932 ,  4.170722 ],
                                    [ 5.833737 , -5.615808 ]], dtype=float32))])

In [None]:
len(preds[0])

#### Classification report for validation data

In [36]:
print(classification_report(test['LABEL_COLUMN'],np.argmax(preds[0],axis=1)))

              precision    recall  f1-score   support

           0       0.97      0.96      0.97       414
           1       0.92      0.94      0.93       198

    accuracy                           0.96       612
   macro avg       0.95      0.95      0.95       612
weighted avg       0.96      0.96      0.96       612



In [64]:
df_class_rep = pd.DataFrame(classification_report(test['LABEL_COLUMN'],np.argmax(preds[0],axis=1), target_names=['Non Hate-Speech','Hate-Speech'], output_dict=True)).transpose().to_csv('classification_reports/classification_mat_bert_fox_10epochs.csv')

### Read movie subtitle cleansed data and pass for prediction.

In [37]:
df1 = pd.read_csv('data/all_movies_combined.csv')

In [38]:
df1.head()

Unnamed: 0.1,Unnamed: 0,index,utterance,movie_name,tag,count_utterances
0,0,0,I got here as fast as legally possible.,1917,friendship,1480
1,1,1,Do you have my will?,1917,friendship,1480
2,2,2,"Yes, sir.",1917,friendship,1480
3,3,3,I think we should take him to the hospital.,1917,friendship,1480
4,4,4,"No, George wants to stay at home.",1917,friendship,1480


In [39]:
# function to predict class using dataframe of utterances
def get_annotated_sents(df, name):
  pred_sents_bk = df['utterance'].to_list()
  last=0
  label=0
  df_pred = pd.DataFrame(columns=['sentence','label'])
  for i in range(32,len(pred_sents_bk),32):
    tf_batch = tokenizer(pred_sents_bk[last:i], max_length=128, padding=True, truncation=True, return_tensors='tf')
    tf_outputs = model(tf_batch)
    tf_predictions = tf_outputs[0]
    labels = ['0','1','2']
    label = tf.argmax(tf_predictions, axis=1)
    label = label.numpy()
    
    for j in range(32):
      df_pred.loc[len(df_pred.index)] = [pred_sents_bk[last+j],label[j]]

    last=i

  df_pred.to_csv(name)
  return df_pred


In [40]:
pred_tw = get_annotated_sents(df1, 'output/fox_pred_movies.csv')

In [41]:
pred_tw.label.value_counts()

0    310619
1     50437
Name: label, dtype: int64

In [42]:
pred_tw.head()

Unnamed: 0,sentence,label
0,I got here as fast as legally possible.,1
1,Do you have my will?,0
2,"Yes, sir.",0
3,I think we should take him to the hospital.,0
4,"No, George wants to stay at home.",0


In [43]:
df2=pd.concat([df1, pred_tw], axis=1, join="inner")

In [44]:
df2.head()

Unnamed: 0.1,Unnamed: 0,index,utterance,movie_name,tag,count_utterances,sentence,label
0,0,0,I got here as fast as legally possible.,1917,friendship,1480,I got here as fast as legally possible.,1
1,1,1,Do you have my will?,1917,friendship,1480,Do you have my will?,0
2,2,2,"Yes, sir.",1917,friendship,1480,"Yes, sir.",0
3,3,3,I think we should take him to the hospital.,1917,friendship,1480,I think we should take him to the hospital.,0
4,4,4,"No, George wants to stay at home.",1917,friendship,1480,"No, George wants to stay at home.",0


In [45]:
df2.to_csv("output/concat_movie_data_10epochs_fox.csv")

In [46]:
df3=df2.drop(columns=['Unnamed: 0','sentence'])

In [47]:
df3.head()

Unnamed: 0,index,utterance,movie_name,tag,count_utterances,label
0,0,I got here as fast as legally possible.,1917,friendship,1480,1
1,1,Do you have my will?,1917,friendship,1480,0
2,2,"Yes, sir.",1917,friendship,1480,0
3,3,I think we should take him to the hospital.,1917,friendship,1480,0
4,4,"No, George wants to stay at home.",1917,friendship,1480,0


In [63]:
df4=df2.groupby(['movie_name','tag','label']).count()
df4.to_csv('output/groupby_movie_label_fox_bert_10epochs.csv')

In [62]:
df4.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 0,index,utterance,count_utterances,sentence
movie_name,tag,label,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Electric Dreams,hate_speech,0,1157,1157,1157,1157,1157
Electric Dreams,hate_speech,1,131,131,131,131,131
Miami Vice,hate_speech,0,982,982,982,982,982
Miami Vice,hate_speech,1,193,193,193,193,193
Pandora,hate_speech,0,2013,2013,2013,2013,2013


In [88]:
df5=df2.drop(columns=['Unnamed: 0',	'index',	'utterance',	'count_utterances',	'sentence'])
df5['label'] = df5['label'].map({0: 'Non Hate-Speech', 1: 'Hate-Speech'})
df5=df5.groupby(['tag','label']).count().reset_index()

In [89]:
df5

Unnamed: 0,tag,label,movie_name
0,friendship,Hate-Speech,22242
1,friendship,Non Hate-Speech,145589
2,hate_speech,Hate-Speech,3749
3,hate_speech,Non Hate-Speech,24203
4,racism,Hate-Speech,24446
5,racism,Non Hate-Speech,140827


In [104]:
df6=df1.drop(columns=['Unnamed: 0',	'index',	'utterance', 'count_utterances'])
df6=df6.groupby(['tag']).count().reset_index()
df6

Unnamed: 0,tag,movie_name
0,friendship,167831
1,hate_speech,27952
2,racism,165296


In [120]:
df7=df5.merge(df6,on=['tag','tag'])
df7['% data'] = round((df7['movie_name_x']/df7['movie_name_y'])*100,2)
df7
df7=df7.drop(columns=['movie_name_y'])
df7=df7.rename(columns={'movie_name_x':'movie sents'})
df7.groupby(['tag','label']).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,movie sents,% data
tag,label,Unnamed: 2_level_1,Unnamed: 3_level_1
friendship,Hate-Speech,22242,13.25
friendship,Non Hate-Speech,145589,86.75
hate_speech,Hate-Speech,3749,13.41
hate_speech,Non Hate-Speech,24203,86.59
racism,Hate-Speech,24446,14.79
racism,Non Hate-Speech,140827,85.2
