In [None]:
#!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
#!pip install sentencepiece

In [None]:
#import necessary packages
import gc
import re
import string
import operator
from collections import defaultdict

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import matplotlib.pyplot as plt
import seaborn as sns

import tokenization
from wordcloud import STOPWORDS

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, Callback

SEED = 1337

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer

In [None]:
# Load the labelled training tweets and the test tweets from the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv'))

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,Tweet_ID,tweet,type
0,ID_0022DWKP,Had a dream i got raped last night. By a guy i...,sexual_violence
1,ID_00395QYM,he thought the word raped means sex and told m...,sexual_violence
2,ID_003EOSSF,She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...,sexual_violence
3,ID_004BBHOD,I was sexually abused for 3 years at age 4 to ...,sexual_violence
4,ID_004F7516,Chessy Prout can do better by telling the trut...,sexual_violence


In [None]:
# Meta-features, part 1: whitespace-token statistics, added to both frames.
for frame in (df_train, df_test):
    # Split every tweet once on whitespace; non-string cells are coerced with str().
    words = frame['tweet'].apply(lambda text: str(text).split())

    # word_count: number of whitespace-separated tokens
    frame['word_count'] = words.apply(len)

    # unique_word_count: number of distinct (case-sensitive) tokens
    frame['unique_word_count'] = words.apply(lambda tokens: len(set(tokens)))

    # stop_word_count: lower-cased tokens that appear in wordcloud's STOPWORDS
    frame['stop_word_count'] = frame['tweet'].apply(
        lambda text: sum(1 for w in str(text).lower().split() if w in STOPWORDS)
    )


In [None]:
# Meta-features, part 2: URL count, mean word length and character count,
# added to both frames. Non-string cells are coerced with str().
for frame in (df_train, df_test):
    tweet_text = frame['tweet'].apply(str)

    # url_count: tokens containing 'http'. Every token containing 'https' also
    # contains 'http', so a single substring test is sufficient.
    frame['url_count'] = tweet_text.apply(
        lambda text: sum('http' in w for w in text.lower().split())
    )

    # mean_word_length: average token length. A tweet with no tokens gets 0.0
    # instead of the NaN (and RuntimeWarning) that np.mean([]) would produce.
    frame['mean_word_length'] = tweet_text.apply(
        lambda text: float(np.mean([len(w) for w in text.split()])) if text.split() else 0.0
    )

    # char_count: length of the raw tweet string
    frame['char_count'] = tweet_text.apply(len)

In [None]:
# punctuation_count: number of characters that belong to string.punctuation.
for frame in (df_train, df_test):
    frame['punctuation_count'] = frame['tweet'].apply(
        lambda text: sum(1 for ch in str(text) if ch in string.punctuation)
    )

In [None]:
# Replace the string class labels in 'type' with integer ids.
# The fitted encoder is kept in `le` so the ids can be mapped back later.
le = LabelEncoder()
le.fit(df_train['type'])
df_train['type'] = le.transform(df_train['type'])

In [None]:
# Class balance of the encoded labels.
df_train['type'].value_counts()

4    32648
1     5946
3      651
2      217
0      188
Name: type, dtype: int64

In [None]:
## text cleaning
def clean(tweet):
    """Normalise one raw tweet before tokenisation.

    Steps, in order:
      1. Decode the ``&gt;``, ``&lt;`` and ``&amp;`` character entity references.
      2. Remove ``t.co`` short links.
      3. Surround punctuation / special characters with spaces.
      4. Collapse runs of two or more dots into a single spaced ellipsis
         (`` ... ``) and space out single dots (`` . ``).
      5. Expand a fixed list of acronyms and unify the casing of a few words.

    Args:
        tweet: Raw tweet text. Non-string values are converted with ``str()``,
            mirroring the ``str(x)`` coercion used by the meta-feature cells.

    Returns:
        str: The cleaned tweet.
    """
    if not isinstance(tweet, str):
        tweet = str(tweet)

    # Character entity references ("&amp;" last, as in the original order)
    for entity, char in (("&gt;", ">"), ("&lt;", "<"), ("&amp;", "&")):
        tweet = tweet.replace(entity, char)

    # Urls (the dot in "t.co" is escaped so it only matches a literal dot)
    tweet = re.sub(r"https?://t\.co/[A-Za-z0-9]+", "", tweet)

    # Words with punctuations and special characters.
    # '.' is deliberately NOT in this list: spacing out every dot here would
    # turn "..." into " .  .  . " and the ellipsis handling below could never
    # match.
    punctuations = '@#!?+&*[]-%:/();$=><|{}^' + "'`"
    for p in punctuations:
        tweet = tweet.replace(p, f' {p} ')

    # ... and .. : any run of 2+ dots becomes one spaced ellipsis, a lone dot
    # becomes a spaced full stop.
    tweet = re.sub(
        r"\.{2,}|\.",
        lambda match: ' ... ' if len(match.group()) > 1 else ' . ',
        tweet,
    )

    # Acronyms, followed by grouping of same words without embeddings.
    # Applied in this fixed order.
    replacements = (
        (r"MH370", "Malaysia Airlines Flight 370"),
        (r"mÌ¼sica", "music"),
        (r"okwx", "Oklahoma City Weather"),
        (r"arwx", "Arkansas Weather"),
        (r"gawx", "Georgia Weather"),
        (r"scwx", "South Carolina Weather"),
        (r"cawx", "California Weather"),
        (r"tnwx", "Tennessee Weather"),
        (r"azwx", "Arizona Weather"),
        (r"alwx", "Alabama Weather"),
        (r"wordpressdotcom", "wordpress"),
        (r"usNWSgov", "United States National Weather Service"),
        (r"Suruc", "Sanliurfa"),
        (r"Bestnaijamade", "bestnaijamade"),
        (r"SOUDELOR", "Soudelor"),
    )
    for pattern, replacement in replacements:
        tweet = re.sub(pattern, replacement, tweet)

    return tweet

In [None]:
# Add a 'tweet_cleaned' column with the output of clean() for both splits.
for frame in (df_train, df_test):
    frame['tweet_cleaned'] = frame['tweet'].apply(clean)

In [None]:
%%time
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=True)

CPU times: user 9.55 s, sys: 1.97 s, total: 11.5 s
Wall time: 16.5 s


In [None]:
class ClassificationReport(Callback):
    """Keras callback that records weighted precision, recall and F1 per epoch.

    After every epoch the model predicts on the stored training and validation
    inputs. The three sklearn scores (``average='weighted'``) are appended to
    the ``train_*_scores`` / ``val_*_scores`` lists and printed.

    Args:
        train_data: ``(X_train, y_train)`` pair. The empty-tuple default is
            kept for interface compatibility but cannot be unpacked, so a
            pair must be supplied in practice.
        validation_data: ``(X_val, y_val)`` pair, same remark as above.
    """

    def __init__(self, train_data=(), validation_data=()):
        # Zero-argument super() runs Callback.__init__ itself;
        # super(Callback, self) would skip it and start one class further up.
        super().__init__()

        self.X_train, self.y_train = train_data
        self.train_precision_scores = []
        self.train_recall_scores = []
        self.train_f1_scores = []

        self.X_val, self.y_val = validation_data
        self.val_precision_scores = []
        self.val_recall_scores = []
        self.val_f1_scores = []

    def _predict_labels(self, X):
        """Turn the model output for ``X`` into one hard label per sample."""
        scores = np.asarray(self.model.predict(X, verbose=0))
        if scores.ndim > 1 and scores.shape[-1] > 1:
            # Multi-class head: one column per class -> index of the best class.
            # Rounding here would yield an indicator matrix that does not
            # match integer targets.
            return scores.argmax(axis=-1)
        # Single-output (binary) head: threshold at 0.5 via rounding.
        return np.round(scores).ravel()

    @staticmethod
    def _weighted_scores(y_true, y_pred):
        """Return (precision, recall, f1), each computed with average='weighted'."""
        return (
            precision_score(y_true, y_pred, average='weighted'),
            recall_score(y_true, y_pred, average='weighted'),
            f1_score(y_true, y_pred, average='weighted'),
        )

    def on_epoch_end(self, epoch, logs=None):
        """Score the train and validation sets, store the results and print them.

        ``logs`` is accepted for Keras compatibility and not used; its default
        is ``None`` rather than a shared mutable dict.
        """
        train_precision, train_recall, train_f1 = self._weighted_scores(
            self.y_train, self._predict_labels(self.X_train)
        )
        self.train_precision_scores.append(train_precision)
        self.train_recall_scores.append(train_recall)
        self.train_f1_scores.append(train_f1)

        val_precision, val_recall, val_f1 = self._weighted_scores(
            self.y_val, self._predict_labels(self.X_val)
        )
        self.val_precision_scores.append(val_precision)
        self.val_recall_scores.append(val_recall)
        self.val_f1_scores.append(val_f1)

        print('\nEpoch: {} - Training Precision: {:.6} - Training Recall: {:.6} - Training F1: {:.6}'.format(epoch + 1, train_precision, train_recall, train_f1))
        print('Epoch: {} - Validation Precision: {:.6} - Validation Recall: {:.6} - Validation F1: {:.6}'.format(epoch + 1, val_precision, val_recall, val_f1))

In [None]:
#the model
class GBVDetector:
    """Stratified K-fold BERT classifier for the integer-encoded ``type`` label.

    ``train`` fits one Keras model per fold and keeps them in ``self.models``;
    ``predict`` averages the class probabilities of all stored models.

    Args:
        bert_layer: Hub/Keras BERT layer. It must expose
            ``resolved_object.vocab_file`` and ``resolved_object.do_lower_case``
            and return ``(pooled_output, sequence_output)`` when called on
            ``[word_ids, mask, segment_ids]``.
        max_seq_length: Total sequence length including ``[CLS]`` and ``[SEP]``.
        lr: Learning rate for the SGD optimizer.
        epochs: Epochs per fold.
        batch_size: Batch size used by ``fit``.
        num_classes: Width of the output layer (default 5, the previous
            hard-coded value).
        n_splits: Number of stratified folds (default 5, the previous
            hard-coded value).

    Raises:
        ValueError: If ``max_seq_length`` is smaller than 2.
    """

    def __init__(self, bert_layer, max_seq_length=128, lr=0.0001, epochs=15, batch_size=32,
                 num_classes=5, n_splits=5):

        if max_seq_length < 2:
            # encode() always emits [CLS] and [SEP]; a shorter length would
            # make the truncation slice negative.
            raise ValueError('max_seq_length must be at least 2 to fit [CLS] and [SEP]')

        # BERT and Tokenization params
        self.bert_layer = bert_layer

        self.max_seq_length = max_seq_length
        vocab_file = self.bert_layer.resolved_object.vocab_file.asset_path.numpy()
        do_lower_case = self.bert_layer.resolved_object.do_lower_case.numpy()
        self.tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

        # Learning control params
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.n_splits = n_splits

        self.models = []   # one fitted model per fold, in fold order
        self.scores = {}   # fold number -> History.history dict returned by fit()


    def encode(self, texts):
        """Convert an iterable of strings into the three BERT input arrays.

        Each text is tokenized, truncated to ``max_seq_length - 2`` tokens,
        wrapped in ``[CLS]`` / ``[SEP]`` and right-padded with id 0.

        Returns:
            tuple: ``(token_ids, attention_masks, segment_ids)``, three arrays
            with one row of length ``max_seq_length`` per text. Masks are 1 on
            real tokens and 0 on padding; segment ids are all 0.
        """
        all_tokens = []
        all_masks = []
        all_segments = []

        for text in texts:
            text = self.tokenizer.tokenize(text)
            text = text[:self.max_seq_length - 2]
            input_sequence = ['[CLS]'] + text + ['[SEP]']
            pad_len = self.max_seq_length - len(input_sequence)

            tokens = self.tokenizer.convert_tokens_to_ids(input_sequence)
            tokens += [0] * pad_len
            pad_masks = [1] * len(input_sequence) + [0] * pad_len
            segment_ids = [0] * self.max_seq_length

            all_tokens.append(tokens)
            all_masks.append(pad_masks)
            all_segments.append(segment_ids)

        return np.array(all_tokens), np.array(all_masks), np.array(all_segments)


    def build_model(self):
        """Build and compile a classifier head on top of ``self.bert_layer``.

        The first position of the sequence output feeds a ``num_classes``-way
        softmax layer. Softmax (not sigmoid) is used because the classes are
        mutually exclusive and ``sparse_categorical_crossentropy`` expects a
        probability distribution over classes.

        NOTE(review): every call reuses the single ``self.bert_layer``
        instance, so the models built for different folds share that layer and
        only the Dense head is new per fold - confirm this is intended.
        """
        input_word_ids = Input(shape=(self.max_seq_length,), dtype=tf.int32, name='input_word_ids')
        input_mask = Input(shape=(self.max_seq_length,), dtype=tf.int32, name='input_mask')
        segment_ids = Input(shape=(self.max_seq_length,), dtype=tf.int32, name='segment_ids')

        pooled_output, sequence_output = self.bert_layer([input_word_ids, input_mask, segment_ids])
        clf_output = sequence_output[:, 0, :]
        out = Dense(self.num_classes, activation='softmax')(clf_output)

        model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
        optimizer = SGD(learning_rate=self.lr, momentum=0.8)
        model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        return model


    def train(self, X):
        """Fit one model per stratified fold of ``X``.

        Args:
            X: DataFrame with a ``tweet_cleaned`` text column and an integer
               ``type`` label column. Any index is accepted.

        Each fitted model is appended to ``self.models`` and the training
        history of fold ``k`` is stored in ``self.scores[k]``.
        """
        texts = X['tweet_cleaned'].str.lower()
        labels = X['type']

        skf = StratifiedKFold(n_splits=self.n_splits)
        for fold, (trn_idx, val_idx) in enumerate(skf.split(texts, labels)):

            print('\nFold {}\n'.format(fold))

            # split() yields positional indices, so rows are selected with
            # .iloc; .loc would only work for a default 0..n-1 index.
            X_trn_encoded = self.encode(texts.iloc[trn_idx])
            y_trn = labels.iloc[trn_idx].to_numpy()
            X_val_encoded = self.encode(texts.iloc[val_idx])
            y_val = labels.iloc[val_idx].to_numpy()

            # Model
            model = self.build_model()
            history = model.fit(X_trn_encoded, y_trn, validation_data=(X_val_encoded, y_val), epochs=self.epochs, batch_size=self.batch_size)

            self.models.append(model)
            self.scores[fold] = history.history


    def predict(self, X):
        """Return the class probabilities for ``X`` averaged over all fold models.

        Args:
            X: DataFrame with a ``tweet_cleaned`` column.

        Returns:
            Array with one row per input row and one column per class.

        Raises:
            RuntimeError: If no model has been trained yet. An all-zero
                array would otherwise be returned silently.
        """
        if not self.models:
            raise RuntimeError('No trained models available; call train() before predict().')

        X_test_encoded = self.encode(X['tweet_cleaned'].str.lower())

        y_pred = None
        for model in self.models:
            fold_pred = np.asarray(model.predict(X_test_encoded)) / len(self.models)
            y_pred = fold_pred if y_pred is None else y_pred + fold_pred

        return y_pred

In [None]:
# Preview the first five training rows with the added feature columns.
df_train.head(n=5)

Unnamed: 0,Tweet_ID,tweet,type,word_count,unique_word_count,stop_word_count,url_count,mean_word_length,char_count,punctuation_count,tweet_cleaned
0,ID_0022DWKP,Had a dream i got raped last night. By a guy i...,4,45,40,26,0,3.533333,203,3,Had a dream i got raped last night . By a guy...
1,ID_00395QYM,he thought the word raped means sex and told m...,4,21,20,9,0,3.809524,100,0,he thought the word raped means sex and told m...
2,ID_003EOSSF,She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...,4,24,23,13,0,3.375,104,0,She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...
3,ID_004BBHOD,I was sexually abused for 3 years at age 4 to ...,4,61,50,29,0,3.557377,280,9,I was sexually abused for 3 years at age 4 to ...
4,ID_004F7516,Chessy Prout can do better by telling the trut...,4,52,44,22,0,4.365385,278,11,Chessy Prout can do better by telling the trut...


In [None]:
#train
clf = GBVDetector(bert_layer, max_seq_length=128, lr=0.0001, epochs=1, batch_size=64)

clf.train(df_train)


Fold 0


Fold 1


Fold 2


Fold 3


Fold 4



In [None]:
# Per-class scores for the test tweets, averaged over the fold models.
y_pred = clf.predict(X=df_test)

In [None]:
# Display the averaged per-class scores returned by clf.predict (one row per test tweet).
y_pred

array([[0.30198131, 0.25112323, 0.22739491, 0.40421669, 0.93243764],
       [0.55376435, 0.27838741, 0.30839808, 0.48802634, 0.82680137],
       [0.58609284, 0.37785996, 0.29285473, 0.33619948, 0.84248178],
       ...,
       [0.42325608, 0.2223551 , 0.33703112, 0.28072935, 0.96654077],
       [0.43994587, 0.23350284, 0.24356303, 0.20479442, 0.95520465],
       [0.39235706, 0.37539535, 0.25974469, 0.42665473, 0.93552123]])

In [None]:
# Name the score columns from the fitted LabelEncoder instead of a hand-typed
# list: le.classes_[i] is the original label that was encoded as integer i,
# so column i of y_pred is guaranteed to carry the matching class name.
pred = pd.DataFrame(y_pred, columns=le.classes_)

In [None]:
# For every test tweet, take the column label with the highest score.
maxValueIndex = pred.idxmax(axis='columns')

In [None]:
# Wrap the predicted class names in a single-column frame with a fresh 0..n-1 index.
series = pd.DataFrame({'type': np.array(maxValueIndex)})

In [None]:
# Preview the first five predicted labels.
series.head(n=5)

Unnamed: 0,type
0,sexual_violence
1,sexual_violence
2,sexual_violence
3,sexual_violence
4,sexual_violence


In [None]:
# Preview the per-class scores of the first twenty test tweets.
pred.head(n=20)

Unnamed: 0,class_0,class_1,class_2,class_3,class_4
0,0.301981,0.251123,0.227395,0.404217,0.932438
1,0.553764,0.278387,0.308398,0.488026,0.826801
2,0.586093,0.37786,0.292855,0.336199,0.842482
3,0.42391,0.233902,0.326646,0.296982,0.979658
4,0.311474,0.46869,0.390199,0.63721,0.855967
5,0.370248,0.277378,0.224915,0.220904,0.975708
6,0.354629,0.447552,0.310475,0.487167,0.852003
7,0.255479,0.53716,0.454964,0.66046,0.54101
8,0.377214,0.264445,0.24313,0.240897,0.978513
9,0.459658,0.265291,0.262312,0.302525,0.970999


In [None]:
# Persist the raw per-class scores without the index column.
pred.to_csv(path_or_buf='predicted.csv', index=False)

In [None]:
# Build the submission from df_test itself: `ss` is never defined in this
# notebook, and the predictions in `series` were computed row-by-row from
# df_test, so taking the IDs from the same frame keeps each ID paired with
# its own prediction.
# NOTE(review): assumes Test.csv has a 'Tweet_ID' column like Train.csv - confirm.
submit = pd.DataFrame(data={
    'Tweet_ID': df_test['Tweet_ID'].to_numpy(),
    'type': series['type'].to_numpy(),
})

In [None]:
# Write the submission file without the index column.
submit.to_csv(path_or_buf='koubrah.csv', index=False)

In [None]:
# Display the submission frame (Tweet_ID and predicted type).
submit

Unnamed: 0,Tweet_ID,type
0,ID_0095QL4S,sexual_violence
1,ID_00DREW5O,sexual_violence
2,ID_00E9F5X9,sexual_violence
3,ID_00G9OSKZ,sexual_violence
4,ID_00HU96U6,sexual_violence
...,...,...
15576,ID_ZZR1D21T,sexual_violence
15577,ID_ZZSQF54Y,sexual_violence
15578,ID_ZZTN5126,sexual_violence
15579,ID_ZZWS0XZZ,sexual_violence


In [None]:
# Preview the submission frame. `ss` is never defined in this notebook, so the
# original `ss.head()` fails with NameError on a fresh kernel.
submit.head()

Unnamed: 0,Tweet_ID,type
0,ID_0095QL4S,sexual_violence
1,ID_00DREW5O,sexual_violence
2,ID_00E9F5X9,sexual_violence
3,ID_00G9OSKZ,sexual_violence
4,ID_00HU96U6,sexual_violence


In [None]:
# Write the model predictions as the final submission. The original wrote the
# undefined `ss` frame here, which would replace the prediction file with the
# contents of `ss`.
submit.to_csv('koubrah.csv', index=False)