In [9]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [10]:
!pip install transformers



#### Initialize bert classification model

In [32]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                        trainable=True, 
                                                        num_labels=3)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
model.summary()

Model: "tf_bert_for_sequence_classification_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_113 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  2307      
Total params: 109,484,547
Trainable params: 109,484,547
Non-trainable params: 0
_________________________________________________________________


In [12]:
type(tokenizer)

transformers.models.bert.tokenization_bert.BertTokenizer

#### Read hate dataset and convert it into train and test

In [13]:
df = pd.read_csv("data/filtered_tweets_3categ.csv")
df = df.drop(columns=['Unnamed: 0'])
df['tweet'] = df['tweet'].str.strip()

In [14]:
df.head()

Unnamed: 0,tweet,hate_speech_label
0,As a woman you shouldn't complain about cleani...,2
1,boy dats cold...tyga dwn bad for cuffin dat ho...,1
2,You ever fuck a bitch and she start to cry? Yo...,1
3,she look like a tranny,1
4,The shit you hear about me might be true or it...,1


In [15]:
def get_dataset(df, seed, test_size):
    return train_test_split(df, test_size=test_size, random_state=seed, shuffle=True)

In [16]:
train, test = get_dataset(df, 11, 0.2)

In [17]:
train.hate_speech_label.value_counts()

1    15264
2     3312
0     1146
Name: hate_speech_label, dtype: int64

In [18]:
test.hate_speech_label.value_counts()

1    3831
2     821
0     279
Name: hate_speech_label, dtype: int64

In [19]:
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']


In [20]:
train.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
2575,now that ur tour over come kick it with ya nig...,1
2865,some people do shit to get shot at and then be...,1
8731,Drop dead gorgeous a bitch ain't dying for a n...,1
10560,I lose OVO followers every year. 100 bitches u...,1
9697,His bitch ass is the same way so it don't matter.,1


In [21]:
test.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
15922,Dez is caked up with some bitch or something c...,1
6395,at least im not black bitch your parents left ...,0
23683,faggot http:,0
6647,whatever you fucking pussy,1
12418,Lmfao all bitches is hoes then,1


In [22]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
  train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)

  validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)
  
  return train_InputExamples, validation_InputExamples

  train_InputExamples, validation_InputExamples = convert_data_to_examples(train, 
                                                                           test, 
                                                                           'DATA_COLUMN', 
                                                                           'LABEL_COLUMN')
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [29]:
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.batch(32)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)



#### Compile and fit model

In [33]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy'), recall_m, precision_m,f1_m])



In [34]:
hist = model.fit(train_data, epochs=10, validation_data=validation_data)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
preds = model.predict(validation_data)

In [38]:
preds

TFSequenceClassifierOutput([('logits',
                             array([[-3.2243176 ,  6.0604    , -3.2032282 ],
                                    [-0.17221999,  4.3795977 , -4.516703  ],
                                    [ 1.652204  ,  0.7522583 , -3.2626994 ],
                                    ...,
                                    [-3.5031388 ,  6.0820823 , -3.0246224 ],
                                    [-3.7296169 ,  5.690911  , -2.3252714 ],
                                    [-1.8072023 ,  1.2436776 ,  0.49140668]], dtype=float32))])

In [77]:
len(preds[0])

4931

#### Classification report for validation data

In [39]:
print(classification_report(test['LABEL_COLUMN'],np.argmax(preds[0],axis=1)))

              precision    recall  f1-score   support

           0       0.41      0.42      0.41       279
           1       0.93      0.93      0.93      3831
           2       0.82      0.81      0.82       821

    accuracy                           0.88      4931
   macro avg       0.72      0.72      0.72      4931
weighted avg       0.88      0.88      0.88      4931



In [50]:
df_class_rep = pd.DataFrame(classification_report(test['LABEL_COLUMN'],np.argmax(preds[0],axis=1), target_names=['Hate Speech','Offensive Language','Neither'], output_dict=True)).transpose().to_csv('classification_reports/classification_mat_bert_tweet_10epochs.csv')

#### Read movie subtitle cleansed data and pass for prediction.

In [40]:
df1 = pd.read_csv('data/all_movies_combined.csv')

In [41]:
df1.head()

Unnamed: 0.1,Unnamed: 0,index,utterance,movie_name,tag,count_utterances
0,0,0,I got here as fast as legally possible.,1917,friendship,1480
1,1,1,Do you have my will?,1917,friendship,1480
2,2,2,"Yes, sir.",1917,friendship,1480
3,3,3,I think we should take him to the hospital.,1917,friendship,1480
4,4,4,"No, George wants to stay at home.",1917,friendship,1480


In [45]:
# function to predict class using dataframe of utterances
def get_annotated_sents(df, name):
  pred_sents_bk = df['utterance'].to_list()
  last=0
  label=0
  df_pred = pd.DataFrame(columns=['sentence','label'])
  for i in range(32,len(pred_sents_bk),32):
    tf_batch = tokenizer(pred_sents_bk[last:i], max_length=128, padding=True, truncation=True, return_tensors='tf')
    tf_outputs = model(tf_batch)
    tf_predictions = tf_outputs[0]
    labels = ['0','1','2']
    label = tf.argmax(tf_predictions, axis=1)
    label = label.numpy()
    
    for j in range(32):
      df_pred.loc[len(df_pred.index)] = [pred_sents_bk[last+j],label[j]]

    last=i

  df_pred.to_csv(name)
  return df_pred


In [43]:
pred_tw = get_annotated_sents(df1, 'output/tweet_pred_movies_10epochs.csv')

In [44]:
pred_tw.label.value_counts()

2    263279
1     87158
0     10619
Name: label, dtype: int64

In [46]:
pred_tw.head()

Unnamed: 0,sentence,label
0,I got here as fast as legally possible.,2
1,Do you have my will?,2
2,"Yes, sir.",2
3,I think we should take him to the hospital.,2
4,"No, George wants to stay at home.",2


In [47]:
df2=pd.concat([df1, pred_tw], axis=1, join="inner")

In [48]:
df2.head()

Unnamed: 0.1,Unnamed: 0,index,utterance,movie_name,tag,count_utterances,sentence,label
0,0,0,I got here as fast as legally possible.,1917,friendship,1480,I got here as fast as legally possible.,2
1,1,1,Do you have my will?,1917,friendship,1480,Do you have my will?,2
2,2,2,"Yes, sir.",1917,friendship,1480,"Yes, sir.",2
3,3,3,I think we should take him to the hospital.,1917,friendship,1480,I think we should take him to the hospital.,2
4,4,4,"No, George wants to stay at home.",1917,friendship,1480,"No, George wants to stay at home.",2


In [49]:
df2.to_csv("output/concat_movie_data_twitter_10epochs.csv")

In [51]:
df4=df2.groupby(['movie_name','tag','label']).count()
df4.to_csv('output/groupby_movie_label_tweet_bert_10epochs.csv')

In [52]:
df4

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 0,index,utterance,count_utterances,sentence
movie_name,tag,label,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Electric Dreams,hate_speech,0,28,28,28,28,28
Electric Dreams,hate_speech,1,305,305,305,305,305
Electric Dreams,hate_speech,2,955,955,955,955,955
Miami Vice,hate_speech,0,47,47,47,47,47
Miami Vice,hate_speech,1,234,234,234,234,234
...,...,...,...,...,...,...,...
You,friendship,1,608,608,608,608,608
You,friendship,2,1149,1149,1149,1149,1149
afterMath,hate_speech,0,30,30,30,30,30
afterMath,hate_speech,1,235,235,235,235,235


In [65]:
df5=df2.drop(columns=['Unnamed: 0',	'index',	'utterance',	'count_utterances',	'sentence'])
df5['label'] = df5['label'].map({0: 'Hate-Speech', 1: 'Offensive Language', 2:'Neither'})
df5
df5=df5.groupby(['tag','label']).count()
df5

Unnamed: 0_level_0,Unnamed: 1_level_0,movie_name
tag,label,Unnamed: 2_level_1
friendship,Hate-Speech,4121
friendship,Neither,124199
friendship,Offensive Language,39511
hate_speech,Hate-Speech,996
hate_speech,Neither,20741
hate_speech,Offensive Language,6215
racism,Hate-Speech,5502
racism,Neither,118339
racism,Offensive Language,41432


In [66]:
df5=df5.reset_index()

In [67]:
df6=df1.drop(columns=['Unnamed: 0',	'index',	'utterance', 'count_utterances'])
df6=df6.groupby(['tag']).count().reset_index()
df6

Unnamed: 0,tag,movie_name
0,friendship,167831
1,hate_speech,27952
2,racism,165296


In [68]:
df7=df5.merge(df6,on=['tag','tag'])
df7['% data'] = round((df7['movie_name_x']/df7['movie_name_y'])*100,2)
df7
df7=df7.drop(columns=['movie_name_y'])
df7=df7.rename(columns={'movie_name_x':'movie sents'})
df7.groupby(['tag','label']).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,movie sents,% data
tag,label,Unnamed: 2_level_1,Unnamed: 3_level_1
friendship,Hate-Speech,4121,2.46
friendship,Neither,124199,74.0
friendship,Offensive Language,39511,23.54
hate_speech,Hate-Speech,996,3.56
hate_speech,Neither,20741,74.2
hate_speech,Offensive Language,6215,22.23
racism,Hate-Speech,5502,3.33
racism,Neither,118339,71.59
racism,Offensive Language,41432,25.07
