##Mounting Google Drive to Notebook##

In [1]:
# mounting for google drive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

KeyboardInterrupt: ignored

##Import Functions##

In [0]:
# helper and visualization imports 
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

# embedding imports
from gensim.models import KeyedVectors

# model imports
from keras import backend as K
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate, Activation
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.activations import softsign
from tensorflow.contrib.tpu import keras_to_tpu_model

# metrics imports
from math import sqrt
from sklearn.metrics import confusion_matrix

# preprocessing imports 
import pylab
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.utils import shuffle

##Constant Definitions##

In [0]:
# imports for the embedding files
EMBEDDING_FILES = [
    '/content/gdrive/My Drive/crawl-300d-2M.gensim',
    '/content/gdrive/My Drive/glove.840B.300d.gensim'
]

# LSTM model parameters
BATCH_SIZE = 512
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 4
MAX_LEN = 220

# LSTM training and testing attributes
IDENTITY_COLUMNS = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
                    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
TEXT_COLUMN = 'comment_text'
TARGET_COLUMN = 'target'
RATING_COLUMN = 'rating'
TR_COLUMN = ['target', 'rating']
RATING = ['approved', 'rejected']

# preprocessing constants
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'

CONTRACTIONS_DICT = { "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have",
                     "'cause": "because", "could've": "could have", "couldn't": "could not", 
                     "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not",
                     "don't": "do not", "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not", 
                     "haven't": "have not", "he'd": "he had", "he'd've": "he would have", "he'll": "he will",
                     "he'll've": "he will have", "he's": "he is", "how'd": "how did", "how'd'y": "how do you",
                     "how'll": "how will", "how's": "how has", "I'd": "I had", "I'd've": "I would have",
                     "I'll": "I will", "I'll've": "I will have", "I'm": "I am", "I've": "I have", "isn't": "is not",
                     "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                     "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                     "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
                     "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
                     "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock",
                     "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                     "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she had",
                     "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
                     "she's": "she is", "should've": "should have", "shouldn't": "should not",
                     "shouldn't've": "should not have", "so've": "so have", "so's": "so is",
                     "that'd": "that would", "that'd've": "that would have", "that's": "that is",
                     "there'd": "there would", "there'd've": "there would have", "there's": "there is",
                     "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                     "they'll've": "they will have", "they're": "they are", "they've": "they have",
                     "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
                     "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                     "weren't": "were not", "what'll": "what will", "what'll've": "what will have", 
                     "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", 
                     "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have",
                     "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                     "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
                     "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
                     "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", 
                     "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
                     "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                     "you'll've": "you will have", "you're": "you are", "you've": "you have"}

SPECIAL_CHARS = {'~',':','+','[','\\','@','^','{','%','(','-','"','*','|',',','&'
                ,'<','}','.','_','=',']','!','>',';','?','~','#','$',')','/','∞'
                ,'θ','÷','α','•','à','−','β','∅','³','π','‘','₹','´','°','£','€'}

#STOP_WORDS = set(stopwords.words('english')) 

##Function Definitions##

In [0]:
#----------------------------------LSTM Embedding/Model Functions------------------------------------
# function to build the embedded matrix 
def build_matrix(word_index, path):
    embedding_index = KeyedVectors.load(path, mmap='r')
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        for candidate in [word, word.lower()]:
            if candidate in embedding_index:
                embedding_matrix[i] = embedding_index[candidate]
                break
    return embedding_matrix
    
# function to build the LSTM model
def build_model(embedding_matrix, num_aux_targets):
    words = Input(shape=(None,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(1, activation='sigmoid')(hidden)
    aux_result = Dense(num_aux_targets, activation='sigmoid')(hidden)
    
    model = Model(inputs=words, outputs=[result, aux_result])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model


#----------------------------------Rating Conversion Function for NN-----------------------------------
# function used to convert ratings in to binary digitd for neural network training
def rating_conversion(rating_list):
  new_rating = []
  for rating in rating_list:
    if rating == 'approved': new_rating.append(1)
    else: new_rating.append(0)
  return new_rating


#---------------------------------------Keras Function Metrics------------------------------------------
# NN recall metric 
def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

# NN precision metric 
def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

# NN F1 metric 
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

# NN RMSE metric 
def rmse_m(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))


#---------------------------------------Regression Analysis--------------------------------------------
# This is an average of the square differences between the truth and prediction
def mean_squared_error(truth, pred):
  a = (pred - truth) * (pred - truth)
  return np.average(a)

# This is an square root of the average of the square differences between the truth and prediction
def root_mean_squared_error(truth, pred):
  return sqrt(mean_squared_error(pred, truth))

# This is an average in the differences in the prediction and truth 
def mean_absolute_error(truth, pred):
  return np.average(np.absolute(pred - truth))

# This is the MSE relative to the variance of the truth
def relative_mean_squared_error(truth, pred):
  return mean_squared_error(truth, pred) / np.var(truth)

# This is an accuracy measure 
def r2_score(truth, pred):
  return 1 - relative_mean_squared_error(truth, pred)


#---------------------------------------Preprocessing Functions-----------------------------------------
# function to remove special characters 
def special_char_prune(text):
    filtered_chars = [c for c in text if not c in SPECIAL_CHARS]
    filtered_text = ''.join(filtered_chars)
    return filtered_text

# function to map contractions 
def contraction_pruning(text):
    no_special = special_char_prune(text)
    word_tokens = no_special.split()
    filtered_sentence = ""
    for token in word_tokens:
        word = token.lower()
        if word in CONTRACTIONS_DICT: filtered_sentence = filtered_sentence + " " + CONTRACTIONS_DICT[word]
        else: filtered_sentence = filtered_sentence + " " + word
    return filtered_sentence.strip()

# function to remove stopwords
def stop_word_pruning(text):
    word_tokens = word_tokenize(text) 
    no_stop = [w for w in word_tokens if not w in STOP_WORDS]
    filtered_sentence = ' '.join(no_stop)
    return filtered_sentence    

# this creates bins for target scores 
def target_round(a):
    if (a <= 0.33):
        c = 0.0
    elif (a <= 0.66):
        c = 0.5
    else: c = 1.0
    return c

##Testing and Training Set Building##

In [0]:
# read file extension
file_name = 'file_name' # name of file in google drive 
file_path = '/content/gdrive/My Drive/' + file_name + '.csv'

x_split = 1700000
y_split = 104874

# These are the slips for the data
# 1700000 104874
# 90000 25000

In [0]:
# This is the testing and traing data I use for the simple NN
# imports for the training and testing sets 
df = pd.read_csv(file_path)
train_NN_df = df.iloc[:x_split, :]

# training data for the neural network 
train_x_NN = np.asarray(train_NN_df[TARGET_COLUMN].tolist(), dtype=np.float32).reshape(x_split)
train_y_NN = np.asarray(rating_conversion(train_NN_df[RATING_COLUMN].tolist()))

In [0]:
# This is the testing and traing data I use for the LSTM 
# Import and split data into test and train
df = pd.read_csv(file_path)
train_df = df.iloc[:x_split, :]
test_df = df.iloc[x_split:, :]

# Create training splits for text and auxilary columns 
x_train = train_df[TEXT_COLUMN].astype(str)
y_train = train_df[TARGET_COLUMN].values
y_aux_train = train_df[AUX_COLUMNS].values
x_test = test_df[TEXT_COLUMN].astype(str)

for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
    train_df[column] = np.where(train_df[column] >= 0.5, True, False)

# tokenize text to remove bad characters
tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE, lower=False)
tokenizer.fit_on_texts(list(x_train) + list(x_test))

# resequence characters back into text 
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

# initialize the sample weights of the LSTM 
sample_weights = np.ones(len(x_train), dtype=np.float32)
sample_weights += train_df[IDENTITY_COLUMNS].sum(axis=1)
sample_weights += train_df[TARGET_COLUMN] * (~train_df[IDENTITY_COLUMNS]).sum(axis=1)
sample_weights += (~train_df[TARGET_COLUMN]) * train_df[IDENTITY_COLUMNS].sum(axis=1) * 5
sample_weights /= sample_weights.mean()

# building of the embedding matrix 
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# truth sets for the LSTM and NN 
df = pd.read_csv(file_path)
test_app_df = df.iloc[x_split:, :]

LSTM_TRUTH = np.asarray(test_app_df[TARGET_COLUMN].tolist()).reshape(y_split)
NN_TRUTH = np.asarray(rating_conversion(test_app_df[RATING_COLUMN].tolist())).reshape(y_split)

##Model Training##

In [0]:
# LSTM Model summary
model = build_model(embedding_matrix, y_aux_train.shape[-1])
model.summary()











Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 600)    66556800    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, None, 600)    0           embedding_1[0][0]                
_____________________________________________________________________

In [0]:
# LSTM Model
checkpoint_predictions = []
weights = []

model = build_model(embedding_matrix, y_aux_train.shape[-1])
for global_epoch in range(EPOCHS):
  model.fit(x_train, [y_train, y_aux_train], batch_size=BATCH_SIZE, epochs=1,
              verbose=2, sample_weight=[sample_weights.values, np.ones_like(sample_weights)])
  checkpoint_predictions.append(model.predict(x_test, batch_size=2048)[0].flatten())
  weights.append(2 ** global_epoch)



Epoch 1/1
 - 44s - loss: 0.7292 - dense_7_loss: 0.5574 - dense_8_loss: 0.1719 - dense_7_acc: 0.5859 - dense_8_acc: 0.7973
Epoch 1/1
 - 39s - loss: 0.6587 - dense_7_loss: 0.5135 - dense_8_loss: 0.1453 - dense_7_acc: 0.6057 - dense_8_acc: 0.8003
Epoch 1/1
 - 39s - loss: 0.6453 - dense_7_loss: 0.5043 - dense_8_loss: 0.1409 - dense_7_acc: 0.6085 - dense_8_acc: 0.8004
Epoch 1/1
 - 39s - loss: 0.6350 - dense_7_loss: 0.4969 - dense_8_loss: 0.1382 - dense_7_acc: 0.6095 - dense_8_acc: 0.8005


In [0]:
# NN model
model_NN = Sequential()
model_NN.add(Dense(1, input_dim=1, activation='sigmoid')) 

# compile NN model
model_NN.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['acc', f1_m, precision_m, recall_m]) 
model_NN.fit(train_x_NN, train_y_NN, batch_size=64, nb_epoch=5, verbose=1)

##Model Evaluation##

In [0]:
## LSTM Predictions for submission 
predictions = np.average(checkpoint_predictions, weights=weights, axis=0)
submission = pd.DataFrame.from_dict({
    'id': test_df.id,
    'prediction': predictions
})

In [0]:
# LSTM predictions
lstm_pred = submission.sort_index()
lstm_pred = np.array(lstm_pred['prediction'].tolist())

# LSTM prediction metrics 
MSE_LSTM = mean_squared_error(LSTM_TRUTH, lstm_pred)
RMSE_LSTM = root_mean_squared_error(LSTM_TRUTH, lstm_pred)
MAE_LSTM = mean_absolute_error(LSTM_TRUTH, lstm_pred)
rMSE_LSTM = relative_mean_squared_error(LSTM_TRUTH, lstm_pred)
R2_LSTM = r2_score(LSTM_TRUTH, lstm_pred)

# print all metrics 
print('MSE of LSTM: ' + str(MSE_LSTM) + '\n' + 
      'RMSE of LSTM: ' + str(RMSE_LSTM) + '\n' + 
      'MAE of LSTM: ' + str(MAE_LSTM) + '\n' + 
      'rMSE of LSTM: ' + str(rMSE_LSTM) + '\n' + 
      'R2 of LSTM: ' + str(R2_LSTM))

In [0]:
# visualization for QQ-Plot of LSTM predication vs. truth
fig = plt.figure()
plt.scatter(LSTM_TRUTH, lstm_pred)
fig.suptitle('LSTM Scatter for CM', fontsize=20)
plt.xlabel('LSTM TRUTH', fontsize=16)
plt.ylabel('LSTM Prediction', fontsize=16)
plt.show()

In [0]:
# NN preditions
nn_pred = model_NN.predict(lstm_pred).reshape(y_split)
nn_pred = np.around(nn_pred)

# keras model pred
_, accuracy, f1_score, precision, recall = model_NN.evaluate(lstm_pred, NN_TRUTH, verbose=0)

# print keras metrics
print('Accuracy: ' + str(accuracy) + '\n' + 
      'F1_score: ' + str(f1_score) + '\n' + 
      'Precision: ' + str(precision) + '\n' + 
      'Recall: ' + str(recall))

In [0]:
# confusion matrix of NN predictions 
cm = confusion_matrix(NN_TRUTH, nn_pred)
print(cm)

###LSTM as Classifier

In [0]:
LSTM_TOXIC_TRUTH = []
for elem in LSTM_TRUTH:
  if elem >= 0.5: LSTM_TOXIC_TRUTH.append(1)
  else: LSTM_TOXIC_TRUTH.append(0)

LSTM_TOXIC_PRED = []
for elem in submission['prediction']:
  if elem >= 0.5: LSTM_TOXIC_PRED.append(1)
  else: LSTM_TOXIC_PRED.append(0)

In [0]:
lstm_class_df = pd.DataFrame({'LSTM_TRUTH':LSTM_TOXIC_TRUTH, 
                   'LSTM_PRED':LSTM_TOXIC_PRED})

In [0]:
lstm_class_df.to_csv('/content/gdrive/My Drive/lstm_classification.csv')

##Analysis##

In [0]:
# Analysis of comment distribution
df = pd.read_csv(file_path)
df_analysis = df[TR_COLUMN]
df_group = df_analysis.groupby(RATING_COLUMN)

# grouped based on rating
df1 = df_group.get_group(RATING[0])
df2 = df_group.get_group(RATING[1])

# visualization for counts of rating based on target score
fig = plt.figure()
plt.hist([df1[TARGET_COLUMN], df2[TARGET_COLUMN]],bins=10, range=(0,1), stacked=True, color = ['g','r'])
fig.suptitle('Comment Rating Distribution', fontsize=20)
plt.xlabel('Comment Counts', fontsize=16)
plt.ylabel('Comment Target Score', fontsize=16)
plt.show()

###Sample Data###

In [0]:
# read file extension
file_name_s = 'file_name' # name of file in google drive 
file_path_s = '/content/gdrive/My Drive/' + file_name_s + '.csv'

save_name_s = 'file 1 name'

# load main df 
df = pd.read_csv(file_path_s)

# create list of target bins 
target_list = []
for i in range(len(df)):
    target_list.append(round(df.loc[i, 'target']))
    if (i % 250000 == 0): print(i)

# add new column to df
df['target_rounded'] = target_list

# groupby target bin then sample 
df_group = df.groupby('rating')
df_zero_sample = df_group.get_group(0.0).sample(90000)
df_half_sample = df_group.get_group(0.5).sample(16000)
df_one_sample = df_group.get_group(1.0).sample(9000)

# combine new dfs, shuffle, and drop target bin column 
df_combined = pd.concat([df_zero_sample, df_half_sample, df_one_sample], ignore_index=True)
df_shuffled = shuffle(df_combined)
df_final = df_shuffled.drop(columns=['target_rounded'])

# save to csv
df_final.to_csv(save_name_s + '.csv', index=False)

###Preprocessing Data###

In [0]:
# this is the name of the sampled file 
pre_file_name = 'file name' 
pre_path = '/content/gdrive/My Drive/' + pre_file_name + '.csv'

# these are what the file will be saved as 
save_name1 = 'file 1 name'
save_name2 = 'file 2 name'

# load df from path 
df1 = pd.read_csv(pre_path)
df2 = pd.read_csv(pre_path)

# apply functions to comments 
df1[TEXT_COLUMN] = df1.apply(lambda x: contraction_pruning(x[TEXT_COLUMN]), axis=1)
df2[TEXT_COLUMN] = df1.apply(lambda x: stop_word_pruning(x[TEXT_COLUMN]), axis=1)

# save to csv's
df1.to_csv(save_name1 + '.csv', index=False)
df2.to_csv(save_name2 + '.csv', index=False)

##Save File for Submission##

In [0]:
# submission for Kaggle competition 
submission.to_csv('/content/gdrive/My Drive/Model_1.csv')