# Toxicity Classification Machine Learning Project
### A Machine Learning Project by *Khanh Phan, Peter Nguyen, Henry Dinh, and John Zhao*

* This is the final project for the Data Analytics Bootcamp
* There are 3 main parts to this project:

    i. Cleaning dataset to make it trainable
    
    ii. Model training to make predictions on other test sets
    
    iii. Loading that model onto a front-end webpage

## Import Dependencies

In [7]:
# Import dependencies
import os
import gc
import re
import operator
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from collections import defaultdict

# Machine learning models
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVR
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import load_model

# Gensim (Word2Vec)
import gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

# NLTK 
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import word_tokenize

## Read in CSV files
    - Training set data
    - Testing set data
    - Submission set data

In [8]:
# We are limiting the training dataset to 100,000 rows due to time/computing constraints
train_df = pd.read_csv('resources/train.csv', nrows = 100000)
test_df = pd.read_csv('resources/test.csv')
submission = pd.read_csv('resources/submission.csv')

## Load in Word2Vec model
    - Found here: https://github.com/eyaler/word2vec-slim/blob/master/GoogleNews-vectors-negative300-SLIM.bin.gz

In [9]:
# Load embedding with pre-made Word2Vec model
w2v = gensim.models.KeyedVectors.load_word2vec_format('resources/GoogleNews-vectors-negative300-SLIM.bin.gz', 
                                                      binary = True)

## View Dataframes

In [40]:
# Training dataset
train_df.head(10)

Unnamed: 0,target,comment_text
0,0.0,"this is so cool . it is like , ' would you ..."
1,0.0,thank you ! ! this would make my life a lot ...
2,0.0,this is such an urgent design problem ; kudos...
3,0.0,is this something i will be able to install on...
4,0.893617,haha you guys are a bunch of losers .
5,0.666667,you are a sh * tty comment .
6,0.457627,hahahahahahahahhha suck it .
7,0.0,ffffuuuuuuuuuuuuuuu
8,0.0,the ranchers seem motivated by mostly by greed...
9,0.0,it was a great show . not a combo i would of ...


In [11]:
# Testing dataset
test_df.head()

Unnamed: 0,id,comment_text
0,7000000,Jeff Sessions is another one of Trump's Orwell...
1,7000001,I actually inspected the infrastructure on Gra...
2,7000002,No it won't . That's just wishful thinking on ...
3,7000003,Instead of wringing our hands and nibbling the...
4,7000004,how many of you commenters have garbage piled ...


In [12]:
# Submission dataset
submission.head()

Unnamed: 0,id,prediction
0,7000000,0.0
1,7000001,0.0
2,7000002,0.0
3,7000003,0.0
4,7000004,0.0


In [13]:
# Take a look at the dimensions of our data sets
print(train_df.shape)
print(test_df.shape)

(100000, 45)
(97320, 2)


## Data Clean-Up
    - Select relevant columns
    - Remove contractions and replace with the words they are made out of
    - Delete special characters
    - Aggregate all functions into one 'cleaning' function

In [14]:
# Set relevant columns in dataframe and set new dataframe as 'train_df'
# Since our test dataset only looks at 'comment_text', we will set 'target' & 'comment_text' as our training columns
train_df = train_df[['target', 'comment_text']]
train_df.head()

Unnamed: 0,target,comment_text
0,0.0,"This is so cool. It's like, 'would you want yo..."
1,0.0,Thank you!! This would make my life a lot less...
2,0.0,This is such an urgent design problem; kudos t...
3,0.0,Is this something I'll be able to install on m...
4,0.893617,haha you guys are a bunch of losers.


In [15]:
# List and set contractions as words without apostrophes
contraction_list = {"ur": "you are", "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }


In [16]:
# Contraction function to replace contractions
def contractions(text):
    
    # Find special characters that could be used as apostrophes 
    special = ["’", "‘", "´", "`"]
    
    # Replace these 'apostrophes' with an actual apostrophe
    for s in special:
        text = text.replace(s, "'")
    
    # Replace contraction with words in our contraction list
    words = [contraction_list[word] if word in contraction_list else word for word in text.split(" ")]
    return ' '.join(words)

In [17]:
# Define all special characters
chars = "/~!@#$%^&*)('.,][_+=?><\:;|}{" + '""'

# Create a special characters function to remove any special characters
def sp_char(text):
    for c in chars:
        text = text.replace(c, f' {c} ')
    return text

In [18]:
# Create function to clean the testing dataset
def cleaning(text):
    
    # Lowercase the text
    text = text.lower()
    
    # Replace the contractions
    text = contractions(text)
    
    # Remove any special characters
    text = sp_char(text)
    
    # Tokenize the words with word_tokenize from NLTK
    # tokens = word_tokenize(text) 
    
    # Return tokenized text
    return text

In [19]:
# Apply the text cleaning function to the training dataset
train_df["comment_text"] = train_df["comment_text"].progress_apply(lambda text: cleaning(text))

100%|██████████| 100000/100000 [00:02<00:00, 49085.51it/s]


In [20]:
# Apply the text cleaning function to the testing dataset
test_df["comment_text"] = test_df["comment_text"].progress_apply(lambda text: cleaning(text))

100%|██████████| 97320/97320 [00:01<00:00, 50150.30it/s]


In [21]:
epochs=50
batch_size=128
max_words=100000
max_seq_size=256

In [22]:
transformer = Tokenizer(lower = True, filters='', num_words=max_words)
transformer.fit_on_texts(list(train_df["comment_text"].values) + list(test_df["comment_text"].values))

In [23]:
t_x = transformer.texts_to_sequences(train_df["comment_text"].values)
t_x = pad_sequences(t_x, maxlen = max_seq_size)

In [24]:
x_prediction = transformer.texts_to_sequences(test_df["comment_text"])
x_prediction = pad_sequences(x_prediction, maxlen = max_seq_size)

In [25]:
def build_embedding_matrix(word_index, total_vocab, embedding_size):
    matrix = np.zeros((total_vocab, embedding_size))
    for word, index in tqdm(word_index.items()):
        try:
            matrix[index] = w2v[word]
        except KeyError:
            pass
    return matrix

In [26]:
word_index = transformer.word_index
total_vocab = len(word_index) + 1
embedding_size = 300
w2v = build_embedding_matrix(transformer.word_index, total_vocab, embedding_size)

100%|██████████| 150625/150625 [00:00<00:00, 325257.10it/s]


In [27]:
y = (train_df['target'].values > 0.5).astype(int)
X_train, X_test, y_train, y_test = train_test_split(t_x, y, random_state=6)

In [28]:
model = keras.Sequential()
model.add(keras.layers.Embedding(total_vocab, embedding_size, weights=[w2v], trainable=False))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(1, activation='linear'))

In [29]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

Instructions for updating:
Use tf.cast instead.


In [30]:
model.fit(X_train, y_train, batch_size=batch_size, epochs=3)

Instructions for updating:
Use tf.cast instead.
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x15b2615f8>

In [31]:
model.save('model.h5')

In [32]:
score = model.evaluate(X_test, y_test, batch_size=batch_size)



In [33]:
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.05037543543875218
Test accuracy: 0.94728


In [34]:
y_predict = model.predict(x_prediction)

In [39]:
y_train[8:]

array([0, 0, 1, ..., 0, 0, 0])

In [35]:
x_prediction

array([[   0,    0,    0, ...,  656, 1640,    1],
       [   0,    0,    0, ...,    4,  965,    1],
       [   0,    0,    0, ..., 6521, 1685,    1],
       ...,
       [   0,    0,    0, ...,  297,    1,   32],
       [   0,    0,    0, ..., 2840,  130,    1],
       [   0,    0,    0, ...,   16,  736,    1]], dtype=int32)

In [34]:
submission["prediction"] = y_predict

In [35]:
submission.head()

Unnamed: 0,id,prediction
0,7000000,0.059665
1,7000001,0.03162
2,7000002,0.036909
3,7000003,0.010817
4,7000004,0.052839


In [36]:
submission.to_csv("submission.csv", index=False)