In [1]:
import os
import numpy as np 
import pandas as pd
import seaborn
import seaborn as sns
import shap # for SHAP value
import random
import warnings
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from bs4 import BeautifulSoup
import re

# The Natural Language Toolkit, or more commonly NLTK, is a suite of libraries and programs for symbolic and 
# statistical natural language processing for English written in the Python programming language.
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


#TQDM is a progress bar library with good support for nested loops and Jupyter/IPython notebooks.
from tqdm import tqdm


# Use Keras Tensorflow deeplearning library

from tensorflow import set_random_seed

from keras.utils import to_categorical
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential


from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

from sklearn.metrics import roc_curve,auc,make_scorer, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix



import eli5 # for permutation importance
from eli5.sklearn import PermutationImportance

from pdpbox import pdp, info_plots # flor partial plots 

from scipy import interp

from itertools import cycle

from joblib import dump, load

os.environ['PYTHONHASHSEED']=str(seed_value)

Using TensorFlow backend.


In [2]:
os.getcwd()
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

In [3]:
set_random_seed(123)
random.seed(123)

In [4]:
np.random.seed(123)
pd.options.mode.chained_assignment = None  #hide any pandas warnings
%matplotlib inline

In [5]:
train2 = pd.read_csv("input/train.csv")
train2 = train2.rename(columns={'class':'Sentiment','tweet':'Phrase'})
train2 = train2.drop(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'],axis=1).copy()

target_t2 = train2.Sentiment.values
y_target_t2 = to_categorical(target_t2)

# Save the texts before tokenizing (must use the same random seed)
X_train_t2, X_val_t2, y_train_t2, y_val_t2 = train_test_split(train2,y_target_t2,
                                                              test_size=0.2,
                                                              random_state=123,
                                                              stratify=y_target_t)

In [6]:
# Train dataset: Need only class as "Sentiment" and text as 'Phrase'
train = train.rename(columns={'class':'Sentiment','tweet':'Phrase'})
# train = train.drop(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'],axis=1).copy()

In [8]:
# Need to Save the split texts before cleaning and tokenizing
# Collect dependent values and convert to ONE-HOT encoding
# Output using to_categorical
target_t = train.Sentiment.values
y_target_t = to_categorical(target_t)

# Save the texts before tokenizing (must use the same random seed)
X_train_t, X_val_t, y_train_t, y_val_t = train_test_split(train,y_target_t,
                                                          test_size=0.2,
                                                          random_state=42,
                                                          stratify=y_target_t)

In [9]:
def clean_sentences(df):
    tweets = []
    
#     for sent in tqdm(df['Phrase']):
    for sent in df['Phrase']:
        # remove non-alphabetic characters
        tweet_text = re.sub("[^a-zA-Z]"," ", str(sent))
        
        #remove html content
        tweet_text = BeautifulSoup(tweet_text).get_text()
        
        # tokenize
        words = word_tokenize(tweet_text.lower())
        
        # lemmatize each word to its lemma
        lemma_words = [lemmatizer.lemmatize(i) for i in words]
        
        tweets.append(lemma_words)
        
    return(tweets)

In [10]:
# cleaned tweets for both train and test set retrieved

train_sentences = clean_sentences(train)

test_sentences = clean_sentences(test)

In [11]:
# Collect dependent values and convert to ONE-HOT encoding
# Output using to_categorical
target = train.Sentiment.values
y_target = to_categorical(target)
num_classes = y_target.shape[1]

In [12]:
# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_sentences,
                                                  y_target,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y_target)

In [13]:
# Getting the no of unique words and max length of a tweet available in the list of cleaned tweets
# It is needed for initializing tokenizer of keras and subsequent padding

# Build an unordered collection of unique elements.
unique_words = set()
len_max = 0

# for sent in tqdm(X_train):
for sent in X_train:

    unique_words.update(sent)
    
    if(len_max<len(sent)):
        len_max=len(sent)

# length of the list of unique_words gives the number of unique words

print(len(list(unique_words)))
print(len_max)

28701
53


In [14]:
# Actual tokenizer of keras and convert to sequences

tokenizer = Tokenizer(num_words=len(list(unique_words)))
tokenizer.fit_on_texts(list(X_train))

# texts_to_sequences
# ARGUMENTS: list of texts to turn to sequences
# RETURN: list of sequences (one per text input)

X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(test_sentences)

In [15]:
# Padding is done to equalize the lengths of all input tweets.
# LTSM networks need all inputs to be same length.
# Therefore, tweets lesser than max length will be made equal using extra zeros at end. This is padding.
# Also, you always have to give a three-dimensional array as an input to your LSTM network

X_train = sequence.pad_sequences(X_train, maxlen=len_max)
X_val = sequence.pad_sequences(X_val, maxlen=len_max)
X_test = sequence.pad_sequences(X_test, maxlen=len_max)

In [16]:
# Early stopping to prevent overfitting deep learning neural network models
# This is a method that allows you to specify an arbitrary large number of training epochs.
# This stops training once the model performance stops improving on a hold out validation dataset


early_stopping = EarlyStopping(min_delta = 0.001, mode = 'max', monitor = 'val_acc', patience = 2)
callback = [early_stopping]

In [17]:
# re-run model everytime new parameter changes (must run sequential to initialize epoch)
# Model with just acc as metric

model = Sequential()
model.add(Embedding(len(list(unique_words)),300,input_length=len_max))
model.add(LSTM(128,dropout=0.5,recurrent_dropout=0.5,return_sequences=True))
model.add(LSTM(64,dropout=0.5,recurrent_dropout=0.5,return_sequences=False))
model.add(Dense(100,activation='relu')) #try elu
model.add(Dropout(0.5))
model.add(Dense(num_classes,activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])
model.summary()

W0802 16:45:01.245801 4617016768 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0802 16:45:01.249207 4617016768 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0802 16:45:01.252202 4617016768 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0802 16:45:01.332825 4617016768 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0802 16:45:01.339013 4617016768 deprecati

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 53, 300)           8610300   
_________________________________________________________________
lstm_1 (LSTM)                (None, 53, 128)           219648    
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               6500      
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 8,886,159
Trainable params: 8,886,159
Non-trainable params: 0
_________________________________________________________________


In [18]:
# re-run model everytime new parameter changes (must run sequential to initialize epoch)
# Model with no metrics

model_1 = Sequential()
model_1.add(Embedding(len(list(unique_words)),300,input_length=len_max))
model_1.add(LSTM(128,dropout=0.5,recurrent_dropout=0.5,return_sequences=True))
model_1.add(LSTM(64,dropout=0.5,recurrent_dropout=0.5,return_sequences=False))
model_1.add(Dense(100,activation='relu')) #try elu
model_1.add(Dropout(0.5))
model_1.add(Dense(num_classes,activation='softmax'))
model_1.compile(optimizer='adam',
              loss='categorical_crossentropy')
model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 53, 300)           8610300   
_________________________________________________________________
lstm_3 (LSTM)                (None, 53, 128)           219648    
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               6500      
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 303       
Total params: 8,886,159
Trainable params: 8,886,159
Non-trainable params: 0
_________________________________________________________________


In [19]:
# create pandas df for y_train
# Describe classifier hate (0), offensive (1), neither or neutral (2)

y_df = pd.DataFrame(y_train)
y_df.sum()

0     1144.0
1    15348.0
2     3330.0
dtype: float32

In [20]:
# Sum up the no. of records for each classifier
hate_num = y_df[0].sum()
offensive_num = y_df[1].sum()
neutral_num = y_df[2].sum()

In [21]:
# Do inverse ratio so that hate and neutral has a higher weight
print(f' K Hate:{1 - hate_num / len(y_df)}')
print(f' S Hate:{1 - (hate_num / len(y_df))}')

print(f'K neutral: {1 - neutral_num / len(y_df)}')
print(f'S neutral: {1 - (neutral_num / len(y_df))}')

# Inverse ratio will give offensive a lower weight
print(f'K Offensive: {1 - offensive_num / len(y_df)}')
print(f'S Offensive: {1 - (offensive_num / len(y_df))}')

 K Hate:0.9422863485016648
 S Hate:0.9422863485016648
K neutral: 0.8320048431036222
S neutral: 0.8320048431036222
K Offensive: 0.22570880839471297
S Offensive: 0.22570880839471297


In [22]:
# fit the model adjusting for epochs, batch, and weight

model.fit(
    X_train, y_train, 
    validation_data=(X_val,y_val),
    epochs=15, #may not run all due to callback
    batch_size=256, #faster with larger batch_size but it's generalizing
    verbose=1,
    callbacks=callback, #stops training once the model stops improving. Prevents overfitting.
    class_weight={0: .94*100, 1: .23, 2: .83*10} #use inverse ratio to set hate with highest weight (somewhat arbitrary)
) 

W0802 16:45:02.742547 4617016768 deprecation.py:323] From /anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 19822 samples, validate on 4956 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15


<keras.callbacks.History at 0x1c43139198>

In [23]:
model_1.fit(
    X_train, y_train, 
    validation_data=(X_val,y_val),
    epochs=15, #may not run all due to callback
    batch_size=256, #faster with larger batch_size but it's generalizing
    verbose=1,
    callbacks=callback, #stops training once the model stops improving. Prevents overfitting.
    class_weight={0: .94*100, 1: .23, 2: .83*10} #use inverse ratio to set hate with highest weight (somewhat arbitrary)
) 

Train on 19822 samples, validate on 4956 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1c43114748>

In [24]:
# Predict validation sentiment!

y_pred = model.predict(X_val)
# print(y_pred)

y_pred_1 = model_1.predict(X_val)

In [25]:
# Keras and Sklearn read arrays differently
# Create function to convert keras array to show only one highest sentiment result per list
def keras_output_sklearn(y):
    
    result = []
    
    for element in y:
        result.append(np.argmax(element))
        

    return result

In [26]:
# Report for model with 'acc' metric
report = classification_report(keras_output_sklearn(y_val), keras_output_sklearn(y_pred))
print(report)

              precision    recall  f1-score   support

           0       0.16      0.69      0.26       286
           1       0.96      0.67      0.79      3838
           2       0.68      0.89      0.77       832

   micro avg       0.70      0.70      0.70      4956
   macro avg       0.60      0.75      0.61      4956
weighted avg       0.87      0.70      0.75      4956



In [27]:
# Report for model with no metrics 
report_1 = classification_report(keras_output_sklearn(y_val), keras_output_sklearn(y_pred_1))
print(report_1)

              precision    recall  f1-score   support

           0       0.20      0.63      0.30       286
           1       0.96      0.72      0.82      3838
           2       0.65      0.91      0.76       832

   micro avg       0.75      0.75      0.75      4956
   macro avg       0.60      0.75      0.63      4956
weighted avg       0.86      0.75      0.78      4956



In [28]:
# Saving model with 'acc' metric
model.save('../models/model_acc.h5')

# Saving model with no metrics
model_1.save('../models/model_no.h5')