## Installing libraries

In [29]:
!pip install scikit-multilearn
!pip install contractions
# !pip install mlxtend



In [30]:
%pip install mlxtend --upgrade



In [31]:
import pandas as pd
import numpy as np
import re
import nltk.corpus
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords, words
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import contractions
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from mlxtend.evaluate import bias_variance_decomp

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Importing Dataset


In [35]:
test_label = pd.read_csv("test_labels.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

In [36]:
# Merging test and train to form one huge dataset
test_data = pd.merge(test, test_label)
dataset = pd.concat([test_data, train])
dataset.drop(columns=['id'], inplace=True)
dataset.drop_duplicates(inplace=True, ignore_index=True)
dataset.drop(dataset.index[dataset['toxic'] == -1], inplace = True)
dataset.reset_index(inplace = True)

In [None]:
dd= dataset.loc[dataset['severe_toxic'] == 1]
dataset= pd.concat([dataset, dd])
dd= dataset.loc[dataset['threat'] == 1]
dataset= pd.concat([dataset, dd])


## Text cleaning

In [34]:
# Text cleaning
#converting to lower case
dataset['comment_text_cleaned'] = dataset['comment_text'].str.lower()
#removing special characters
dataset['comment_text_cleaned'] = dataset['comment_text_cleaned'].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", str(elem)))
#removing numbers
dataset['comment_text_cleaned'] = dataset['comment_text_cleaned'].apply(lambda elem: re.sub(r"\d+", "", str(elem)))
# Removing stop words
stop = stopwords.words('english')
dataset['comment_text_cleaned'] = dataset['comment_text_cleaned'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
# Replacing contractions with their full forms
dataset['comment_text_cleaned'] = dataset['comment_text_cleaned'].apply(lambda x: contractions.fix(x))
#Tokenizing
dataset['comment_text_cleaned'] = dataset['comment_text_cleaned'].apply(lambda x: word_tokenize(x))
#Lemmitization
def word_lemmatizer(text):
    lem_text = [WordNetLemmatizer().lemmatize(i) for i in text]
    return lem_text
dataset['comment_text_cleaned'] = dataset['comment_text_cleaned'].apply(lambda x: word_lemmatizer(x))

KeyboardInterrupt: ignored

In [None]:
# Splitting into train test sets
X = dataset.drop(columns=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
y = dataset[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].copy()

X_train, X_test_and_val, y_train, y_test_and_val = train_test_split(X,y, train_size=0.8)
X_val, X_test, y_val, y_test = train_test_split(X_test_and_val,y_test_and_val, train_size=0.5)

In [None]:
train_tokens = pd.Series(X_train['comment_text_cleaned']).values
w2v_model = Word2Vec(train_tokens, size= 200)

def buildWordVector(tokens, size):
  vec = np.zeros(size).reshape((1, size))
  count = 0.
  for word in tokens:
    try:
      vec += w2v_model[word].reshape((1, size))
      count += 1.
    except KeyError:
      continue
  if count != 0:
    vec /= count
  return vec

In [None]:
train_vecs_w2v = np.concatenate([buildWordVector(z, 200) for z in train_tokens])
# train_vecs_w2v = scaler.fit_transform(train_vecs_w2v)
train_vecs_w2v = scale(train_vecs_w2v)

val_tokens = pd.Series(X_val['comment_text_cleaned']).values
val_vecs_w2v = np.concatenate([buildWordVector(z, 200) for z in val_tokens])
# val_vecs_w2v = scaler.transform(val_vecs_w2v)
val_vecs_w2v = scale(val_vecs_w2v)

## Neural NETWORK

In [10]:
model = Sequential()
model.add(Dense(160, input_dim = train_vecs_w2v.shape[1], activation='sigmoid', kernel_initializer='he_uniform'))
model.add(Dense(y_train.shape[1], activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy',)
model.fit(train_vecs_w2v, y_train, epochs=10, verbose=0) #epochs=100

<keras.callbacks.History at 0x7f6fb72601d0>

In [11]:
predictions = model.predict(val_vecs_w2v)
predictions = predictions.round()
predictions

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]], dtype=float32)

## Evaluation

In [12]:
def score(y_true, y_pred, label):
  prec = precision_score(y_true, y_pred,average='micro')
  re = recall_score(y_true, y_pred,average='micro')
  print('Results for label:', label)
  print('Precision score:', prec)
  print('Recall score:', re)
  print('Final Score:', re*0.6 + prec*0.4, '\n')
  return [re*0.6 + prec*0.4, prec, re]

In [13]:
score(y_val, predictions,'Neural Network with Word2Vec')

Results for label: Neural Network with Word2Vec
Precision score: 0.7149082568807339
Recall score: 0.624024024024024
Final Score: 0.660377717166708 



[0.660377717166708, 0.7149082568807339, 0.624024024024024]

## Hyperparameter tuning



In [14]:
pip install keras-tuner --upgrade


Collecting keras-tuner
  Downloading keras_tuner-1.1.2-py3-none-any.whl (133 kB)
[?25l[K     |██▌                             | 10 kB 16.1 MB/s eta 0:00:01[K     |█████                           | 20 kB 3.6 MB/s eta 0:00:01[K     |███████▍                        | 30 kB 4.2 MB/s eta 0:00:01[K     |█████████▉                      | 40 kB 3.7 MB/s eta 0:00:01[K     |████████████▎                   | 51 kB 3.5 MB/s eta 0:00:01[K     |██████████████▊                 | 61 kB 4.2 MB/s eta 0:00:01[K     |█████████████████▏              | 71 kB 3.8 MB/s eta 0:00:01[K     |███████████████████▋            | 81 kB 3.8 MB/s eta 0:00:01[K     |██████████████████████          | 92 kB 4.2 MB/s eta 0:00:01[K     |████████████████████████▌       | 102 kB 4.3 MB/s eta 0:00:01[K     |███████████████████████████     | 112 kB 4.3 MB/s eta 0:00:01[K     |█████████████████████████████▍  | 122 kB 4.3 MB/s eta 0:00:01[K     |███████████████████████████████▉| 133 kB 4.3 MB/s eta 0:00:

In [15]:
import keras_tuner as kt
from tensorflow import keras

In [22]:
def build_model(hp):
  model = Sequential()
  model.add(keras.layers.Dense(units=hp.Int("units", min_value=32, max_value=512, step=32),activation=hp.Choice("activation", ["relu", "sigmoid"]),))
  model.add(Dense(y_train.shape[1], activation='sigmoid'))
  model.compile(loss='binary_crossentropy')
  # Tune whether to use dropout.
  if hp.Boolean("dropout"):
    model.add(keras.layers.Dropout(rate=0.25))
  model.add(keras.layers.Dense(6, activation="sigmoid"))
  learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),loss='binary_crossentropy',metrics=["accuracy"])

  return model


In [23]:
tuner = kt.RandomSearch(build_model,objective='val_loss',max_trials=5)

INFO:tensorflow:Reloading Oracle from existing project ./untitled_project/oracle.json


In [24]:
tuner.search(train_vecs_w2v, y_train, epochs=50, validation_data=(val_vecs_w2v, y_val))
best_model = tuner.get_best_models()[0]

Trial 5 Complete [00h 14m 23s]
val_loss: 0.06154822185635567

Best val_loss So Far: 0.06106605380773544
Total elapsed time: 01h 08m 51s
INFO:tensorflow:Oracle triggered exit


In [25]:
best_model

<keras.engine.sequential.Sequential at 0x7f6fb7a75890>

In [28]:
predictions = best_model.predict(val_vecs_w2v)
predictions = predictions.round()
predictions
score(y_val, predictions,'Neural Network with Word2Vec')

Results for label: Neural Network with Word2Vec
Precision score: 0.7960469779432827
Recall score: 0.5563563563563564
Final Score: 0.6522326049911269 



[0.6522326049911269, 0.7960469779432827, 0.5563563563563564]