In [1]:
# from __future__ import print_function
%matplotlib inline
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


# score metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import classification_report
from sklearn import metrics

from keras.models import model_from_json

Using TensorFlow backend.


## Importing data

In [2]:
import gc
gc.collect()

# Load the CQC documents table; the first CSV column becomes the index.
df = pd.read_csv("CQC_documents_df_revised_tableau_v1.csv", index_col=0)

# Binarise the overall rating: 'Good' / 'Outstanding' -> 0, every other value -> 1.
is_good_or_outstanding = df['rating_overall'].isin(['Good', 'Outstanding'])
df['rating_overall'] = np.where(is_good_or_outstanding, 0, 1)

## Setting up model hyperparameters

In [3]:
# set hyperparameters:
# Used as Tokenizer(num_words=...) and as the Embedding input dimension below.
max_features = 300
# Every document is padded/truncated to this many token ids (see pad_sequences calls).
maxlen = 40000
# Mini-batch size for the model.fit calls that pass `batch_size`.
batch_size = 32
# Width of each embedding vector.
embedding_dims = 200
# Number of Conv1D filters in each n-gram branch.
filters = 192
# NOTE(review): in the visible notebook this is only referenced from commented-out code.
kernel_size = 3
# NOTE(review): in the visible notebook this is only referenced from commented-out code.
hidden_dims = 256
# Epoch count for the model.fit calls that pass `epochs`.
epochs = 20

# Quick sanity check of the loaded frame and of the (already binarised) label column.
print('Shape of dataset ',df.shape)
print('No. of unique classes',len(set(df['rating_overall'])))


# Re-encode the class labels as consecutive integer ids, numbered in sorted label order.
macronum = sorted(set(df['rating_overall']))
macro_to_id = {note: number for number, note in enumerate(macronum)}


def fun(i):
    """Return the integer id that `macro_to_id` assigns to label `i`."""
    return macro_to_id[i]


df['rating_overall'] = df['rating_overall'].map(fun)
# Labels as a plain NumPy vector, in the row order of `df`.
labels = np.array(df['rating_overall'])

# cleaning up the string
def clean_str(string):
    """Normalise a raw text string.

    Removes every backslash, single quote and double quote, then trims
    surrounding whitespace and lower-cases the result.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # One translation table deletes all three unwanted characters in a single pass.
    unwanted = str.maketrans('', '', '\\\'"')
    without_quotes = string.translate(unwanted)
    return without_quotes.strip().lower()

# Extract the plain text of every document, in row order.
# Iterating over the column values is positional, so `texts` stays aligned with `labels`
# even when the DataFrame index (read from the CSV) is not exactly 0..n-1; the previous
# `column[idx]` lookup was label-based and would fail or misalign in that case.
texts = []
for raw_doc in df.full_text_limited_nlpprocess:
    text = BeautifulSoup(raw_doc)
    # NOTE(review): str(bytes) produces the "b'...'" repr, so after clean_str the leading
    # "b" and escape sequences (e.g. "\n" -> "n", non-ASCII -> "xe2x80...") leak into the
    # tokens. Left unchanged here because the Welsh documents are preprocessed the same
    # way and both places must be changed together.
    texts.append(clean_str(str(text.get_text().encode())))

# changing text to sequences
# The tokenizer is fitted on the CQC texts only and is reused later for the Welsh documents.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# creating word_index of all vocab, vocab size set to len + 1
# NOTE(review): word_index holds every word seen during fitting, while `num_words` limits
# which ids texts_to_sequences emits - confirm against the Keras Tokenizer documentation.
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

print('Number of Unique Tokens',len(word_index))
print('Shape of Label Tensor:', labels.shape)

Shape of dataset  (927, 54)
No. of unique classes 2
Number of Unique Tokens 32362
Shape of Label Tensor: (927,)


In [4]:
# Bring every sequence to exactly `maxlen` ids. The Keras defaults are spelled out:
# shorter sequences are zero-padded at the front, longer ones lose their first tokens.
data = pad_sequences(sequences, maxlen=maxlen, padding='pre', truncating='pre')
print('Shape of Data Tensor:', data.shape)

Shape of Data Tensor: (927, 40000)


## Loading HIW data
Load the Welsh dataset here; it is used for predictions at the end of the notebook.

In [5]:
df_Welsh = pd.read_csv('Welsh_documents_df_revised_tableau_v1.csv', index_col = 0)

# A dedicated list keeps the training `texts` built earlier from being overwritten.
# Iterating over the column values is positional, so it does not depend on the index labels.
texts_Welsh = []
for raw_doc in df_Welsh.full_text_limited_nlpprocess:
    text = BeautifulSoup(raw_doc)
    # Same extraction + cleaning as applied to the training documents.
    texts_Welsh.append(clean_str(str(text.get_text().encode())))

# Encode with the tokenizer that was fitted on the training corpus.
sequences_Welsh = tokenizer.texts_to_sequences(texts_Welsh)

# Pad/truncate once to `maxlen`; padding the padded array a second time changes nothing.
x_Welsh = pad_sequences(sequences_Welsh, maxlen=maxlen)

## Split data into train and test sets

In [6]:
# Row positions of the train / test documents are stored in the first column of each CSV.
df_train_idx = pd.read_csv('df_train_indices.csv', index_col=0)
df_test_idx = pd.read_csv('df_test_indices.csv', index_col=0)

train_rows = np.array(df_train_idx.iloc[:, 0].values)
test_rows = np.array(df_test_idx.iloc[:, 0].values)

# Select the same rows from the padded sequences and from the label vector.
x_train, y_train = data[train_rows], labels[train_rows]
x_val, y_val = data[test_rows], labels[test_rows]

In [7]:
print(len(x_train), 'train sequences')
print(len(x_val), 'test sequences')

# x_train / x_val are row selections of `data`, which is already padded to `maxlen`,
# so padding them again would only copy the arrays. Verify the width instead.
assert x_train.shape[1] == maxlen and x_val.shape[1] == maxlen
print('x_train shape:', x_train.shape)
print('x_test shape:', x_val.shape)


649 train sequences
278 test sequences
x_train shape: (649, 40000)
x_test shape: (278, 40000)


In [15]:
# Libraries for tuning with tuner
from kerastuner.tuners import RandomSearch
# from kerastuner.engine.hyperparameters import HyperParameters
import time
from kerastuner import HyperModel
from kerastuner.tuners import RandomSearch, Hyperband, BayesianOptimization
from tensorflow import keras
from sklearn.metrics import confusion_matrix
from tensorflow import keras

# libraries for training
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import Conv1D
from tensorflow.python.keras.layers import MaxPooling1D
from tensorflow.python.keras.layers import GlobalMaxPooling1D
from tensorflow.python.keras.layers import Concatenate
from tensorflow.keras import layers

import kerastuner as kt
import tensorflow as tf

In [16]:
# Tuner output directory: the current Unix timestamp rendered as a string.
LOG_DIR = str(int(time.time()))


def build_model(hp):
    """Build and compile the n-gram CNN for one Keras Tuner trial.

    The network embeds the token ids, applies dropout, runs three parallel
    Conv1D + global-max-pooling branches (kernel sizes 2, 3 and 4), concatenates
    them and feeds the result through a tunable stack of Dense layers ending in
    a single sigmoid unit.

    Tuned hyperparameters: 'dropout', 'filters', 'n_layers', 'units_<i>' and
    'learning_rate'.

    Parameters
    ----------
    hp : Keras Tuner hyperparameter container supplied by the tuner.

    Returns
    -------
    A compiled `tf.keras.Model` (Adam, binary cross-entropy, accuracy metric).
    """
    inputs = tf.keras.Input(shape=(maxlen,), dtype='int32')
    embedded = Embedding(max_features, embedding_dims, input_length=maxlen)(inputs)

    # The same 'dropout' hyperparameter drives both Dropout layers.
    dropout_rate = hp.Float('dropout', 0, 0.5, step=0.1, default=0.5)
    embedded = Dropout(dropout_rate)(embedded)

    # All three convolution branches share one 'filters' hyperparameter.
    n_filters = hp.Int('filters', 32, 256, step=32)
    pooled_branches = []
    for ngram_width in (2, 3, 4):
        conv = Conv1D(filters=n_filters,
                      kernel_size=ngram_width,
                      padding='valid',
                      activation='relu',
                      strides=1)(embedded)
        pooled_branches.append(GlobalMaxPooling1D()(conv))
    merged = tf.keras.layers.concatenate(pooled_branches, axis=1)

    # Between 1 and 5 hidden Dense layers, each with its own tunable width.
    n_dense = hp.Int("n_layers", 1, 5)
    for i in range(n_dense):
        n_units = hp.Int('units_' + str(i), min_value=32, max_value=512, step=32)
        merged = Dense(units=n_units, activation='relu')(merged)

    merged = Dropout(dropout_rate)(merged)
    merged = Dense(1)(merged)
    outputs = Activation('sigmoid')(merged)

    model = tf.keras.Model(inputs, outputs)
    learning_rate = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [19]:
# Random search over the space declared in build_model: 5 trials, each trained 3 times,
# ranked by validation accuracy. Results are written under LOG_DIR.
tuner = RandomSearch (
        build_model, 
        objective = "val_accuracy",
        max_trials = 5,
        executions_per_trial = 3,
        directory = LOG_DIR)

# NOTE(review): (x_val, y_val) drives the tuning here and is also the set the later cells
# report metrics on, so those metrics are not from untouched data.
# NOTE(review): EarlyStopping is created without `monitor`; presumably it watches the Keras
# default (val_loss) rather than the val_accuracy objective - confirm.
tuner.search(x = x_train,
            y = y_train,
            epochs = 50,
            batch_size = 32,
            validation_data = (x_val, y_val),
            verbose = 2,
            callbacks=[keras.callbacks.EarlyStopping(patience=2)]
            )

In [None]:
# best model parameter
# {'dropout': 0.4, 'filters': 192, 'n_layers': 3, 'units_0': 320, 
#  'learning_rate': 0.0004894779804921969, 'units_1': 32, 'units_2': 32}

In [20]:
from keras import optimizers
from keras import callbacks


from keras.layers import Input, Dense, concatenate, Activation
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
# from keras.layers import Dense, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, GlobalMaxPooling1D
from keras.models import Sequential
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
# The backend is left as selected by `%matplotlib inline` in the first cell. Switching to the
# non-interactive 'agg' backend here would stop `plt.show()` from rendering figures in the
# notebook; `savefig` works with either backend.
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

# from keras.layers import Conv1D, GlobalMaxPooling1D
# import pickle
# from collections import defaultdict
# import sys
# import os
# os.environ['KERAS_BACKEND']='theano'

In [None]:
print('Build model...')
tweet_input = Input(shape=(maxlen,), dtype='int32')

# Embed the token ids, then apply dropout to the embedded sequence.
tweet_encoder = Embedding(max_features, embedding_dims, input_length=maxlen)(tweet_input)
tweet_encoder = Dropout(0.4)(tweet_encoder)

# One Conv1D + global-max-pooling branch per n-gram width (bi-, tri- and four-grams).
ngram_branches = []
for ngram_width in (2, 3, 4):
    branch = Conv1D(filters=filters, kernel_size=ngram_width, padding='valid',
                    activation='relu', strides=1)(tweet_encoder)
    ngram_branches.append(GlobalMaxPooling1D()(branch))
merged = concatenate(ngram_branches, axis=1)

# Dense head; widths, dropout and learning rate are the values noted from the tuner's best trial.
for units in (320, 32, 32):
    merged = Dense(units, activation='relu')(merged)

merged = Dropout(0.4)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
optimizer = optimizers.Adam(lr=0.0004894779804921969)
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print() only
# added a stray "None" line to the output.
model.summary()

# Train with early stopping (patience of 2 epochs) against the validation split.
hist = model.fit(x_train, y_train,
                 batch_size=32,
                 epochs=10, # 30
                 validation_data=(x_val, y_val),
                 verbose=2,
                 callbacks=[callbacks.EarlyStopping(patience=2)])

Build model...
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 40000)        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 40000, 200)   60000       input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 40000, 200)   0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 39999, 192)   76992       dropout_1[0][0]                  
_____________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 649 samples, validate on 278 samples
Epoch 1/10
 - 655s - loss: 0.6032 - accuracy: 0.7196 - val_loss: 0.5919 - val_accuracy: 0.7266
Epoch 2/10


In [None]:
# Training vs. validation loss per epoch for the fit above; also saved as a PNG.
fig1, loss_ax = plt.subplots()
loss_ax.plot(hist.history['loss'], 'r', linewidth=3.0)
loss_ax.plot(hist.history['val_loss'], 'b', linewidth=3.0)
loss_ax.legend(['Training loss', 'Validation Loss'], fontsize=18)
loss_ax.set_xlabel('Epochs ', fontsize=16)
loss_ax.set_ylabel('Loss', fontsize=16)
loss_ax.set_title('Loss Curves : CNN_ngram (maxlen=40000, features=300)', fontsize=16)
fig1.savefig('cnn_ngram_40000_300_loss.png')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch for the fit above; also saved as a PNG.
fig2, acc_ax = plt.subplots()
acc_ax.plot(hist.history['accuracy'], 'r', linewidth=3.0)
acc_ax.plot(hist.history['val_accuracy'], 'b', linewidth=3.0)
acc_ax.legend(['Training Accuracy', 'Validation Accuracy'], fontsize=18)
acc_ax.set_xlabel('Epochs ', fontsize=16)
acc_ax.set_ylabel('Accuracy', fontsize=16)
acc_ax.set_title('Accuracy Curves : CNN_ngram (maxlen=40000, features=300)', fontsize=16)
fig2.savefig('cnn_ngram_accuracy40000_300.png')
plt.show()

In [None]:
# Predicted probabilities for the validation set. They are computed once: the trailing
# second model.predict call repeated the full inference pass and produced the same array.
y_pred = np.asarray(model.predict(x_val))

# Hard 0/1 labels at the 0.5 threshold, shared by all three reports below.
y_pred_label = np.where(y_pred > 0.5, 1, 0)

print(metrics.accuracy_score(y_val, y_pred_label))

print(classification_report(y_val, y_pred_label))

# Print confusion matrix using predictions
print(confusion_matrix(y_val, y_pred_label))

In [None]:
# serialize model to JSON
model_json = model.to_json()
with open("model_CNN_ngram_40000_300.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model_CNN_ngram_40000_300.h5")
print("Saved model to disk")

In [None]:
from sklearn.model_selection import StratifiedKFold
seed = 42
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []
# 10-fold stratified cross-validation on the training split. A fresh model is built for
# every fold and trained ONLY on that fold's training rows, so the score computed on
# x_train[test] comes from rows the model has not been fitted on. (Fitting on the whole
# of x_train, as before, leaked every held-out fold into training.)
# NOTE(review): this loop rebinds `model` and `hist`; later cells see the last fold's model.
for train, test in kfold.split(x_train, y_train):
    tweet_input = Input(shape=(maxlen,), dtype='int32')

    tweet_encoder = Embedding(max_features, embedding_dims, input_length=maxlen)(tweet_input)
    tweet_encoder = Dropout(0.30000000000000004)(tweet_encoder)

    # One Conv1D + global-max-pooling branch per n-gram width.
    ngram_branches = []
    for ngram_width in (2, 3, 4):
        branch = Conv1D(filters=filters, kernel_size=ngram_width, padding='valid',
                        activation='relu', strides=1)(tweet_encoder)
        ngram_branches.append(GlobalMaxPooling1D()(branch))
    merged = concatenate(ngram_branches, axis=1)

    for units in (448, 32, 32):
        merged = Dense(units, activation='relu')(merged)

    merged = Dropout(0.30000000000000004)(merged)
    merged = Dense(1)(merged)
    output = Activation('sigmoid')(merged)
    model = Model(inputs=[tweet_input], outputs=[output])
    optimizer = optimizers.Adam(lr=0.00013978522409411077)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    # summary() prints the table itself and returns None.
    model.summary()

    # Fit on this fold's training rows; early stopping still watches (x_val, y_val).
    hist = model.fit(x_train[train], y_train[train],
                     batch_size=32,
                     epochs=50,
                     validation_data=(x_val, y_val),
                     verbose=2,
                     callbacks=[callbacks.EarlyStopping(patience=2)])

    # Score on the held-out fold.
    scores = model.evaluate(x_train[test], y_train[test], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)

print("Training Accuracy (cross-validation): %.2f%% " % (np.mean(cvscores)))
print("Training Accuracy (standard deviation): %.2f%% " % (np.std(cvscores)))

In [None]:
from keras.models import model_from_json
# load json and create model
# The context manager closes the file even if reading raises.
with open('model_CNN_ngram_40000_300.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json) # loaded model is the one to use with method compile
# load weights into new model
loaded_model.load_weights("model_CNN_ngram_40000_300.h5")
print("Loaded model from disk")

In [None]:
# Add this model's 0/1 predictions as a new column of the shared predictions table.
# NOTE: the CSV is read and then written back to the same path.
welsh_predictions_csv = 'Welsh_predictions.csv'
predictions_Welsh = pd.read_csv(welsh_predictions_csv, index_col=0)

# Sigmoid outputs for the Welsh documents, thresholded at 0.5.
Welsh_predict = loaded_model.predict(x_Welsh)
Welsh_predict = np.where(Welsh_predict > 0.5, 1, 0)

predictions_Welsh['CNN_ngram_40000_300'] = Welsh_predict
predictions_Welsh.to_csv(welsh_predictions_csv)

In [None]:
# Attach the predictions to the Welsh documents table and write it back.
# NOTE: this overwrites the source CSV in place with the extra column added.
welsh_documents_csv = 'Welsh_documents_df_revised_tableau_v1.csv'
df_Welsh = pd.read_csv(welsh_documents_csv, index_col=0)

df_Welsh['CNN_ngram_40000_300'] = np.where(Welsh_predict > 0.5, 1, 0)

df_Welsh.to_csv(welsh_documents_csv)
# Show the first two rows as a quick check.
df_Welsh[:2]

## SHAP

In [None]:
import shap
# Show a small sample of (word, id) pairs; echoing the whole `word_index` dictionary
# floods the notebook output with the entire vocabulary.
list(word_index.items())[:20]

In [None]:
# Length of one encoded training sequence (the row at position 1).
len(x_train[1])

In [None]:
def prepare_explanation_words(encoded_x, n_samples=10, vocab=None):
    """Map encoded sequences back to their words for use as SHAP feature labels.

    Parameters
    ----------
    encoded_x : 2-D array-like of int
        Encoded sequences, one row per document.
    n_samples : int, optional
        Maximum number of leading rows to decode (default 10, as before). Inputs
        with fewer rows are decoded in full instead of raising IndexError.
    vocab : dict, optional
        Word -> id mapping. Defaults to the notebook-level `word_index`.

    Returns
    -------
    numpy.ndarray
        Array of strings with one row per decoded sequence. Ids that are not in
        the vocabulary (including the padding id 0) become "NONE".
    """
    if vocab is None:
        vocab = word_index
    # Invert the word -> id mapping once.
    num2word = {idx: word for word, idx in vocab.items()}
    n_rows = min(n_samples, len(encoded_x))
    return np.array([
        [num2word.get(token_id, "NONE") for token_id in encoded_x[row]]
        for row in range(n_rows)
    ])

In [None]:
# Decode validation sequences back to words and display the resulting array.
prepare_explanation_words(x_val)

In [None]:

# kernel_explainer = shap.KernelExplainer(pipeline.model.predict, encoded_x_train[:10])
# kernel_shap_values = kernel_explainer.shap_values(encoded_x_test[:1])

# x_test_words = prepare_explanation_words(pipeline, encoded_x_test)
# y_pred = pipeline.predict(x_test[:1])
# print('Actual Category: %s, Predict Category: %s' % (y_test[0], y_pred[0]))

# shap.force_plot(kernel_explainer.expected_value[0], kernel_shap_values[0][0], x_test_words[0])

In [None]:
# Model-agnostic SHAP explainer around model.predict, with the first 50 training
# sequences as the background sample.
# NOTE(review): each sequence has `maxlen` features, so this is presumably very slow - confirm.
kernel_explainer = shap.KernelExplainer(model.predict, x_train[:50])

In [None]:
# SHAP values for the first validation document only.
kernel_shap_values = kernel_explainer.shap_values(x_val[:1])

In [None]:
# Words of the decoded validation sequences, used as feature labels in the SHAP plots below.
x_test_words = prepare_explanation_words(x_val)

In [None]:
import shap
shap.initjs()
# `model` is a functional `Model`, which has no `predict_classes` (that helper exists only
# on `Sequential`). Threshold the sigmoid output at 0.5 to get the class label instead.
y_pred = (model.predict(x_val[:1]) > 0.5).astype('int32')
print('Actual Category: %s, Predict Category: %s' % (y_val[0], y_pred[0]))

In [None]:
# Force plot for the first validation document, labelled with that document's words.
# NOTE(review): indexing [0][0] assumes shap_values returned a list with one array per
# model output - confirm for the installed shap version.
shap.force_plot(kernel_explainer.expected_value[0], kernel_shap_values[0][0], x_test_words[0])

In [None]:
# explainer = shap.GradientExplainer(model.predict, x_train[:10])

In [None]:
# import shap
# explainer = shap.KernelExplainer(model.predict, x_train[:10])
# shap_values = explainer.shap_values(x_val[:10], nsamples=10)
# # shap_values = explainer.shap_values(X.iloc[299,:], nsamples=500)


In [None]:
# kernel_shap_values were computed for x_val[:1], so pass the same single row as the
# feature matrix (previously 10 rows were passed for 1 row of SHAP values).
shap.summary_plot(kernel_shap_values, x_val[:1], feature_names=x_test_words[0],)

In [None]:
# `explainer` / `shap_values` are only created in commented-out cells, so this cell raised
# NameError on a fresh run. Use the KernelExplainer results that are actually computed,
# with the matching single validation row as features.
shap.force_plot(kernel_explainer.expected_value[0], kernel_shap_values[0], x_val[:1])

In [None]:
from hc_utils.shap_deep import TFDeepExplainer

In [None]:
from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model

print('Build model...')
tweet_input = Input(shape=(maxlen,), dtype='int32')

# Randomly initialised embedding followed by dropout.
embedded = Embedding(max_features, embedding_dims, input_length=maxlen)(tweet_input)
tweet_encoder = Dropout(0.2)(embedded)

# Parallel Conv1D + global-max-pooling branches for kernel sizes 2, 3 and 4.
pooled_branches = []
for ngram_width in (2, 3, 4):
    conv = Conv1D(filters=filters, kernel_size=ngram_width, padding='valid',
                  activation='relu', strides=1)(tweet_encoder)
    pooled_branches.append(GlobalMaxPooling1D()(conv))
merged = concatenate(pooled_branches, axis=1)

# Single hidden layer, dropout, then one sigmoid output unit.
merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)

model = Model(inputs=[tweet_input], outputs=[output])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train the model built in the previous cell using the notebook-level batch_size / epochs.
# No early stopping here, so all `epochs` are run.
hist = model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_val, y_val))

In [None]:
# Loss curves for the baseline n-gram CNN trained above. The title previously said "RNN"
# although the model has no recurrent layer, and the PNG name was shared with the later
# GloVe runs, which overwrote this figure.
fig1 = plt.figure()
plt.plot(hist.history['loss'],'r',linewidth=3.0)
plt.plot(hist.history['val_loss'],'b',linewidth=3.0)
plt.legend(['Training loss', 'Validation Loss'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Loss',fontsize=16)
plt.title('Loss Curves : CNN_ngram (baseline)',fontsize=16)
fig1.savefig('cnn_ngram_baseline_loss.png')
plt.show()

In [None]:
# Accuracy curves for the baseline n-gram CNN trained above. The title previously said
# "RNN" although the model has no recurrent layer, and the PNG name was shared with the
# later GloVe runs, which overwrote this figure.
fig2=plt.figure()
plt.plot(hist.history['accuracy'],'r',linewidth=3.0)
plt.plot(hist.history['val_accuracy'],'b',linewidth=3.0)
plt.legend(['Training Accuracy', 'Validation Accuracy'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Accuracy',fontsize=16)
plt.title('Accuracy Curves : CNN_ngram (baseline)',fontsize=16)
fig2.savefig('cnn_ngram_baseline_accuracy.png')
plt.show()

In [None]:
# score metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import classification_report
from sklearn import metrics

# Sigmoid probabilities for the validation set, then hard 0/1 labels at the 0.5 cut-off.
y_pred = np.asarray(model.predict(x_val))
y_pred_label = np.where(y_pred > 0.5, 1, 0)

print(metrics.accuracy_score(y_val, y_pred_label))

print(classification_report(y_val, y_pred_label))

# Print confusion matrix using predictions
print(confusion_matrix(y_val, y_pred_label))

In [None]:
# set parameters:
# NOTE: these rebind the notebook-level hyperparameters for the GloVe experiments below.
max_features = 200
maxlen = 40000
batch_size = 32
embedding_dims = 100
filters = 100
kernel_size = 3
hidden_dims = 256
epochs = 30

# Location of the pre-trained GloVe vectors (100-d). Kept in one place so it can be
# adjusted per machine instead of being buried in the loading code.
GLOVE_PATH = 'C:/Users/MA069ja/cs224u/data/glove.6B/glove.6B.100d.txt'

# Parse the GloVe text file: the first field of each line is the word, the rest is its vector.
# The context manager closes the file even if parsing a line raises.
embeddings_index = {}
with open(GLOVE_PATH, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

# One row per tokenizer id (row 0 is the padding id). Rows start as uniform random
# values and are replaced by the GloVe vector where the word is known.
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dims))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from GloVe keep their random initialisation (they are NOT zeroed).
        embedding_matrix[i] = embedding_vector


In [None]:
# Expected to be (len(word_index) + 1, embedding_dims), as allocated in the previous cell.
embedding_matrix.shape

In [None]:
# Number of rows the Embedding layer needs: len(word_index) + 1.
vocab_size

In [None]:
from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model

tweet_input = Input(shape=(maxlen,), dtype='int32')

# Embedding initialised from the GloVe matrix and frozen (trainable=False).
glove_embedding = Embedding(vocab_size, embedding_dims, input_length=maxlen,
                            weights=[embedding_matrix], trainable=False)
tweet_encoder = Dropout(0.2)(glove_embedding(tweet_input))

# Parallel Conv1D + global-max-pooling branches for kernel sizes 2, 3 and 4.
pooled_branches = []
for ngram_width in (2, 3, 4):
    conv = Conv1D(filters=filters, kernel_size=ngram_width, padding='valid',
                  activation='relu', strides=1)(tweet_encoder)
    pooled_branches.append(GlobalMaxPooling1D()(conv))
merged = concatenate(pooled_branches, axis=1)

# Single hidden layer, dropout, then one sigmoid output unit.
merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)

model = Model(inputs=[tweet_input], outputs=[output])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train the frozen-GloVe model built in the previous cell. `batch_size` and `epochs` are
# the values rebound in the GloVe parameter cell. No early stopping is used.
hist = model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_val, y_val))

In [None]:
# Loss curves for the n-gram CNN with frozen GloVe embeddings. The title previously said
# "RNN" although the model has no recurrent layer, and the PNG name collided with the
# other experiments' figures.
fig1 = plt.figure()
plt.plot(hist.history['loss'],'r',linewidth=3.0)
plt.plot(hist.history['val_loss'],'b',linewidth=3.0)
plt.legend(['Training loss', 'Validation Loss'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Loss',fontsize=16)
plt.title('Loss Curves : CNN_ngram (GloVe, frozen)',fontsize=16)
fig1.savefig('cnn_ngram_glove_frozen_loss.png')
plt.show()

In [None]:
# Accuracy curves for the n-gram CNN with frozen GloVe embeddings. The title previously
# said "RNN" although the model has no recurrent layer, and the PNG name collided with
# the other experiments' figures.
fig2=plt.figure()
plt.plot(hist.history['accuracy'],'r',linewidth=3.0)
plt.plot(hist.history['val_accuracy'],'b',linewidth=3.0)
plt.legend(['Training Accuracy', 'Validation Accuracy'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Accuracy',fontsize=16)
plt.title('Accuracy Curves : CNN_ngram (GloVe, frozen)',fontsize=16)
fig2.savefig('cnn_ngram_glove_frozen_accuracy.png')
plt.show()

In [None]:
# score metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import classification_report
from sklearn import metrics

# Sigmoid probabilities for the validation set, then hard 0/1 labels at the 0.5 cut-off.
y_pred = np.asarray(model.predict(x_val))
y_pred_label = np.where(y_pred > 0.5, 1, 0)

print(metrics.accuracy_score(y_val, y_pred_label))

print(classification_report(y_val, y_pred_label))

# Print confusion matrix using predictions
print(confusion_matrix(y_val, y_pred_label))

In [None]:
from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model

# --- Build: n-gram CNN with GloVe-initialised embeddings that are fine-tuned (trainable=True)
tweet_input = Input(shape=(maxlen,), dtype='int32')

tweet_encoder = Embedding(vocab_size, embedding_dims, input_length=maxlen,
                          weights=[embedding_matrix], trainable=True)(tweet_input)
tweet_encoder = Dropout(0.2)(tweet_encoder)

# One Conv1D + global-max-pooling branch per n-gram width.
ngram_branches = []
for ngram_width in (2, 3, 4):
    branch = Conv1D(filters=filters, kernel_size=ngram_width, padding='valid',
                    activation='relu', strides=1)(tweet_encoder)
    ngram_branches.append(GlobalMaxPooling1D()(branch))
merged = concatenate(ngram_branches, axis=1)

merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

# --- Train
hist = model.fit(x_train, y_train,
                 batch_size=batch_size,
                 epochs=epochs,
                 validation_data=(x_val, y_val))

# --- Plot. Titles previously said "RNN" although the model has no recurrent layer, and
# the PNG names collided with the other experiments' figures.
fig1 = plt.figure()
plt.plot(hist.history['loss'],'r',linewidth=3.0)
plt.plot(hist.history['val_loss'],'b',linewidth=3.0)
plt.legend(['Training loss', 'Validation Loss'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Loss',fontsize=16)
plt.title('Loss Curves : CNN_ngram (GloVe, trainable)',fontsize=16)
fig1.savefig('cnn_ngram_glove_trainable_loss.png')
plt.show()

fig2=plt.figure()
plt.plot(hist.history['accuracy'],'r',linewidth=3.0)
plt.plot(hist.history['val_accuracy'],'b',linewidth=3.0)
plt.legend(['Training Accuracy', 'Validation Accuracy'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Accuracy',fontsize=16)
plt.title('Accuracy Curves : CNN_ngram (GloVe, trainable)',fontsize=16)
fig2.savefig('cnn_ngram_glove_trainable_accuracy.png')
plt.show()

# --- Evaluate on the validation split
y_pred = np.asarray(model.predict(x_val))
# score metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import classification_report
from sklearn import metrics

# Hard 0/1 labels at the 0.5 threshold, computed once and shared by the reports below.
y_pred_label = np.where(y_pred > 0.5, 1, 0)

print(metrics.accuracy_score(y_val, y_pred_label))

print(classification_report(y_val, y_pred_label))

# Print confusion matrix using predictions
print(confusion_matrix(y_val, y_pred_label))


In [None]:
# print('Build model...')
# model = Sequential()

# # we start off with an efficient embedding layer which maps
# # our vocab indices into embedding_dims dimensions
# model.add(Embedding(len(word_index) + 1,
#                     embedding_dims,
#                     input_length=maxlen,
# #                    weights=[embedding_matrix],
# #                    trainable = True
#                    ))
# model.add(Dropout(0.2))

# # we add a Convolution1D, which will learn filters
# # word group filters of size filter_length:
# model.add(Conv1D(filters,
#                  kernel_size,
#                  padding='valid',
#                  activation='relu',
#                  strides=1))
# # we use max pooling:
# model.add(GlobalMaxPooling1D())
# model.add(LSTM(100))
# # We add a vanilla hidden layer:
# model.add(Dense(hidden_dims))
# model.add(Dropout(0.2))
# model.add(Activation('relu'))

# # We project onto a single unit output layer, and squash it with a sigmoid:
# model.add(Dense(1))
# model.add(Activation('sigmoid'))

# model.compile(loss='binary_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy'])
# hist = model.fit(x_train, y_train,
#           batch_size=batch_size,
#           epochs=epochs,
#           validation_data=(x_val, y_val))

In [None]:
SEED = 10
#import spacy
from tqdm import tqdm_notebook, tnrange
from tqdm.auto import tqdm

tqdm.pandas(desc='Progress')
from collections import Counter
from textblob import TextBlob
from nltk import word_tokenize

# cross validation and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from torch.optim.optimizer import Optimizer
from unidecode import unidecode

from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
from multiprocessing import  Pool
from functools import partial
import numpy as np
from sklearn.decomposition import PCA
import torch as t
import torch.nn as nn
import torch.nn.functional as F

from keras.layers import *
from keras.models import *
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.initializers import *
from keras.optimizers import *
import keras.backend as K
from keras.callbacks import *
import tensorflow as tf

def model_train_cv(x_train, y_train, nfold, model_obj, batch_size=512, epochs=5, seed=None):
    """Collect out-of-fold predictions from stratified k-fold cross-validation.

    For every fold a deep copy of `model_obj` is fitted on the fold's training
    rows, and its predictions for the held-out rows are stored.

    Parameters
    ----------
    x_train : numpy.ndarray
        Feature matrix, indexed by row.
    y_train : array-like
        Labels; converted to a NumPy array.
    nfold : int
        Number of stratified folds.
    model_obj : object
        Estimator exposing `fit(x, y, batch_size=, epochs=, validation_data=)`
        and `predict(x)` returning a 2-D array whose first column is the score.
        NOTE(review): it must survive `copy.deepcopy` - confirm for Keras models.
    batch_size, epochs : int, optional
        Passed to `fit`; defaults are the previously hard-coded 512 and 5.
    seed : int, optional
        Random state for the fold shuffling. Defaults to the notebook-level `SEED`.

    Returns
    -------
    numpy.ndarray
        1-D array with one out-of-fold prediction per row of `x_train`.
    """
    # Local import: `copy` is not imported by the notebook cells shown, so the bare
    # `copy.deepcopy` call raised NameError.
    import copy

    if seed is None:
        seed = SEED
    y_train = np.array(y_train)
    splitter = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)

    # matrix for the out-of-fold predictions
    train_oof_preds = np.zeros(x_train.shape[0])
    for i, (train_idx, valid_idx) in enumerate(splitter.split(x_train, y_train)):
        print(f'Fold {i + 1}')
        x_train_fold, y_train_fold = x_train[train_idx], y_train[train_idx]
        x_val_fold, y_val_fold = x_train[valid_idx], y_train[valid_idx]

        # A fresh copy per fold, so state learned in one fold does not leak into the next.
        clf = copy.deepcopy(model_obj)
        clf.fit(x_train_fold, y_train_fold,
                batch_size=batch_size, epochs=epochs,
                validation_data=(x_val_fold, y_val_fold))

        # storing OOF predictions (first output column)
        train_oof_preds[valid_idx] = clf.predict(x_val_fold)[:, 0]
    return train_oof_preds


In [None]:
# 5-fold out-of-fold predictions on the training split, using the current `model` as the template.
model_train_cv(x_train, y_train, 5, model)

In [None]:
# Multi-width CNN (kernel sizes 2-6) on top of the GloVe-initialised embedding.
# `MAX_SEQUENCE_LENGTH` and `embedding_layer` were never defined in this notebook, so the
# cell raised NameError. It now uses `maxlen` and builds the embedding layer from
# `vocab_size` / `embedding_matrix`.
# NOTE(review): trainable=True is an assumption - set to False to keep GloVe frozen.
sequence_input = Input(shape=(maxlen,), dtype='int32')
embedding_layer = Embedding(vocab_size, embedding_dims, weights=[embedding_matrix],
                            input_length=maxlen, trainable=True)
embedded_sequences = embedding_layer(sequence_input)

convs = []
filter_sizes = [2,3,4,5,6]
for filter_size in filter_sizes:
    l_conv = Conv1D(filters=200,
                    kernel_size=filter_size,
                    activation='relu')(embedded_sequences)
    l_pool = GlobalMaxPooling1D()(l_conv)
    convs.append(l_pool)
l_merge = concatenate(convs, axis=1)
x = Dropout(0.1)(l_merge)
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
# A single sigmoid unit matches the 1-D 0/1 label vector used with binary_crossentropy
# everywhere else in the notebook; the previous Dense(2) output did not match that shape.
preds = Dense(1, activation='sigmoid')(x)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.summary()