## Dataset Description
You are provided with a large number of Wikipedia comments which have been labeled by human raters for toxic behavior. The types of toxicity are:

- toxic
- severe_toxic
- obscene
- threat
- insult
- identity_hate

You must create a model which predicts a probability of each type of toxicity for each comment.

File descriptions:
- train.csv - the training set, contains comments with their binary labels
- test.csv - the test set, you must predict the toxicity probabilities for these comments. To deter hand labeling, the test set contains some comments which are not included in scoring.
- sample_submission.csv - a sample submission file in the correct format
- test_labels.csv - labels for the test data; value of -1 indicates it was not used for scoring; (Note: file added after competition close!)

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

import re
import string
from pickle import dump, load
import zipfile
import os

from collections import Counter

from nltk.corpus import stopwords
from nltk.probability import FreqDist

from wordcloud import WordCloud

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, MaxPool1D, Flatten, Dropout, Dense, concatenate
from keras.optimizers import Adam
from keras.optimizers.schedules import ExponentialDecay
from keras.utils import plot_model
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

def extract_zip_files(zip_file_path, extract_to_dir):
    """Unpack every member of a zip archive and report the outcome on stdout.

    Failures are reported with a printed message rather than raised, so a
    missing or corrupt archive does not abort the notebook.

    Args:
        zip_file_path: path of the archive to open.
        extract_to_dir: directory the archive members are extracted into.
    """
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as archive:
            archive.extractall(extract_to_dir)
        outcome = f"Contents of {zip_file_path} extracted successfully to {extract_to_dir}."
    except FileNotFoundError:
        outcome = f"Error: The file {zip_file_path} does not exist."
    except zipfile.BadZipFile:
        outcome = f"Error: The file {zip_file_path} is not a valid zip file."
    except Exception as exc:
        # best-effort helper: anything else is reported, not propagated
        outcome = f"An unexpected error occurred: {exc}"
    print(outcome)
        
# read a whole text file into one string
def load_doc(filename):
    """Return the full contents of *filename*, decoded as UTF-8."""
    with open(filename, encoding='utf-8') as handle:
        return handle.read()

def clean_doc(doc):
    """Turn raw text into a list of cleaned tokens.

    The text is split on whitespace; each piece has ASCII punctuation
    removed, is kept only if what remains is purely alphabetic, and is then
    lower-cased. English stop words and single-character tokens are dropped.

    Args:
        doc: the text to tokenise.

    Returns:
        The surviving tokens, in their original order.
    """
    # translation table that deletes every character in string.punctuation
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    cleaned = []
    for raw in doc.split():
        word = raw.translate(strip_punctuation)
        # the alphabetic check happens before lower-casing, as a filter on the stripped piece
        if not word.isalpha():
            continue
        word = word.lower()
        if word in stop_words or len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned

# save a dataset to a file
def save_dataset(dataset, filename):
    """Pickle *dataset* to *filename* and print a confirmation line.

    Args:
        dataset: any picklable object.
        filename: destination path; an existing file is overwritten.
    """
    # the context manager closes (and flushes) the handle even if dump() fails
    with open(filename, 'wb') as file:
        dump(dataset, file)
    print('Saved: %s' % filename)
    
# load a clean dataset
def load_output(filename):
    """Return the object stored in the pickle file *filename*.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    this notebook wrote itself (e.g. via ``save_dataset``).
    """
    # the context manager guarantees the handle is closed after reading
    with open(filename, 'rb') as file:
        return load(file)

# Load data:

In [None]:
# unpack the top-level competition archive into ./toxic_files
zip_file_path, extract_to_dir = 'jigsaw-toxic-comment-classification-challenge.zip', './toxic_files'
extract_zip_files(zip_file_path=zip_file_path, extract_to_dir=extract_to_dir)

In [None]:
# unpack the nested archives, in order: training set, test comments, test labels
for zip_file_path, extract_to_dir in (
    ('./toxic_files/train.csv.zip', './toxic_files/train_file'),
    ('./toxic_files/test.csv.zip', './toxic_files/test_file'),
    ('./toxic_files/test_labels.csv.zip', './toxic_files/test_file'),
):
    extract_zip_files(zip_file_path, extract_to_dir)

In [3]:
# load extracted files
train_df = pd.read_csv('./toxic_files/train_file/train.csv', index_col=0)
test_doc = pd.read_csv('./toxic_files/test_file/test.csv')
test_labels = pd.read_csv('./toxic_files/test_file/test_labels.csv')

train_df.info()

# attach the labels to the test comments by comment id
test_df = pd.merge(test_doc, test_labels, on='id', how='left')
test_df = test_df.set_index('id')
# drop rows holding a -1 label: those comments were not used for scoring
test_df = test_df[~(test_df == -1).any(axis=1)]
# summarise data sets:
print('\nData summary:')
print(f'Training dataset: {train_df.shape[0]} rows, {train_df.shape[1]} columns')
print(f'Test dataset: {test_df.shape[0]} rows, {test_df.shape[1]} columns')

# split into input (comment text) and output (the last six columns are the labels):
train_docs, ytrain = train_df[['comment_text']], train_df.iloc[:, -6:].values.astype(int)
test_docs, ytest = test_df[['comment_text']], test_df.iloc[:, -6:].values.astype(int)

# save files:
train_docs.to_csv('train_docs.csv')
test_docs.to_csv('test_docs.csv')
save_dataset([ytrain, ytest], 'output.pkl')

# free the intermediates; train_df is kept because the word-cloud analysis
# further down still reads it (toxicity_analyser(train_df, 100))
del test_doc, test_labels, ytrain, ytest

<class 'pandas.core.frame.DataFrame'>
Index: 159571 entries, 0000997932d777bf to fff46fc426af1f9a
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   comment_text   159571 non-null  object
 1   toxic          159571 non-null  int64 
 2   severe_toxic   159571 non-null  int64 
 3   obscene        159571 non-null  int64 
 4   threat         159571 non-null  int64 
 5   insult         159571 non-null  int64 
 6   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 9.7+ MB

Data summary:
Training dataset: 159571 rows, 7 columns
Test dataset: 63978 rows, 7 columns
Saved: output.pkl


# Analysing most toxic tokens by toxicity type:

In [4]:
class toxicity_analyser(object):
    """Plot word clouds of the most frequent tokens for one label at a time.

    Args:
        df: DataFrame with a ``comment_text`` column and one column per label,
            where a value of 1 marks a comment as carrying that label.
        most_common: how many of the most frequent tokens to include in a plot.
    """

    def __init__(self, df, most_common):
        self.df = df
        self.most_common = most_common

    def combine_words(self, word_list):
        """Flatten an iterable of token lists into a single list of tokens."""
        return [word for words in word_list for word in words]

    def create_word_cloud(self, col_name):
        """Show a word cloud of the top tokens in comments where *col_name* is 1.

        NOTE: this relies on the single-argument ``clean_doc(doc)`` that
        returns a token list. The notebook later redefines ``clean_doc`` with
        a ``vocab`` parameter, so run this before that cell.

        Args:
            col_name: name of the label column to filter on.
        """
        flagged = self.df[self.df[col_name] == 1]
        tokens = flagged['comment_text'].apply(clean_doc)
        reviewed_tokens = self.combine_words(tokens)
        mostcommon = FreqDist(reviewed_tokens).most_common(self.most_common)
        # Size words by their counted frequency. Passing str(mostcommon) to
        # generate() would re-tokenise the list's repr and lose the counts.
        wordcloud = WordCloud(width=1500, height=800, background_color='white')
        wordcloud.generate_from_frequencies(dict(mostcommon))
        plt.figure(figsize=(30,10), facecolor='white')
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis('off')
        plt.title(f'Top {self.most_common} Most Common Words', fontsize=25)
        plt.show()

In [None]:
# analyser over the training frame, configured to plot the 100 most frequent tokens
toxicity = toxicity_analyser(train_df, 100)

In [None]:
# word cloud for comments whose 'toxic' label is 1
toxicity.create_word_cloud('toxic')

In [None]:
# word cloud for comments whose 'severe_toxic' label is 1
toxicity.create_word_cloud('severe_toxic')

In [None]:
# word cloud for comments whose 'obscene' label is 1
toxicity.create_word_cloud('obscene')

In [None]:
# word cloud for comments whose 'threat' label is 1
toxicity.create_word_cloud('threat')

In [None]:
# word cloud for comments whose 'insult' label is 1
toxicity.create_word_cloud('insult')

In [None]:
# word cloud for comments whose 'identity_hate' label is 1
toxicity.create_word_cloud('identity_hate')

# Create Vocabulary:

In [None]:
# count the cleaned tokens of one file into the vocabulary
def add_doc_to_vocab(filename, vocab):
    """Read *filename*, clean its text and add the token counts to *vocab*.

    Args:
        filename: path of the text file to read.
        vocab: counter updated in place with the file's cleaned tokens.
    """
    vocab.update(clean_doc(load_doc(filename)))
    
# save list to file
def save_list(lines, filename):
    """Write *lines* to *filename*, one entry per line.

    Args:
        lines: iterable of strings; joined with newlines, with no trailing newline.
        filename: destination path; an existing file is overwritten.
    """
    # convert lines into a single blob of text
    data = '\n'.join(lines)
    # UTF-8 matches what load_doc() uses to read the file back; the context
    # manager closes the handle (the old `file.close` never called close)
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(data)

# vocabulary: token -> number of occurrences in the saved training comments
vocab = Counter()
add_doc_to_vocab('train_docs.csv', vocab)

# report the vocabulary size and its 50 most frequent tokens
print(len(vocab))
print(vocab.most_common(50))

# keep only tokens that occur at least min_occurance times
min_occurance = 2
tokens = [token for token, count in vocab.items() if count >= min_occurance]
print(len(tokens))

# persist the retained tokens, one per line
save_list(tokens, 'vocab.text')

# Build model: n-gram multihead CNN Model

In [None]:
def clean_doc(doc, vocab):
    """Reduce *doc* to its in-vocabulary tokens, joined by single spaces.

    The text is split on whitespace; each piece has ASCII punctuation
    removed, is kept only if what remains is purely alphabetic, is
    lower-cased, and is finally kept only if it is a member of *vocab*.

    Args:
        doc: the text to clean.
        vocab: container of allowed (lower-case) tokens.

    Returns:
        A string of the surviving tokens in their original order.
    """
    # translation table that deletes every character in string.punctuation
    strip_punctuation = str.maketrans('', '', string.punctuation)
    kept = []
    for raw in doc.split():
        word = raw.translate(strip_punctuation)
        if not word.isalpha():
            continue
        word = word.lower()
        if word in vocab:
            kept.append(word)
    return ' '.join(kept)

# read a file and reduce its text to in-vocabulary tokens
def load_clean_dataset(filename, vocab):
    """Return the cleaned text of *filename*, filtered against *vocab*."""
    return clean_doc(load_doc(filename), vocab)

   
# build a Tokenizer fitted on the given texts
def create_tokenizer(lines):
    """Return a new ``Tokenizer`` after calling ``fit_on_texts`` with *lines*."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# longest document, measured in whitespace-separated tokens
def max_length(lines):
    """Return the largest token count among the strings in *lines*.

    Raises:
        ValueError: if *lines* is empty.
    """
    return max(len(line.split()) for line in lines)

# turn texts into fixed-length integer sequences
def encode_text(tokenizer, lines, length):
    """Integer-encode *lines* with *tokenizer* and pad them to *length*.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        lines: the texts to encode.
        length: target sequence length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` with padding applied at the end
        (``padding='post'``).
    """
    return pad_sequences(tokenizer.texts_to_sequences(lines), maxlen=length, padding='post')

# Define model
def define_model(vocab_size, length, kernels, embedding_dim=100):
    """Build and compile the multi-channel n-gram CNN.

    One channel is created per entry of *kernels*:
    Input -> Embedding -> Conv1D -> Conv1D -> Dropout -> MaxPool1D -> Flatten.
    The flattened channels are concatenated and fed through a 100-unit ReLU
    layer into six sigmoid outputs, one per toxicity label.

    Besides returning the model, this prints ``model.summary()`` and writes
    the architecture diagram to ``n_gram_multihead_cnn_model.png``.

    Args:
        vocab_size: size of the embedding vocabulary (``input_dim``).
        length: length of the padded input sequences.
        kernels: non-empty sequence of convolution kernel sizes, one per channel.
        embedding_dim: dimensionality of each channel's embedding.

    Returns:
        The compiled Keras ``Model``; it expects one input array per kernel.

    Raises:
        ValueError: if *kernels* is empty.
    """
    # imported here so the block does not depend on an edit to the import cell
    from keras.metrics import AUC

    if not kernels:
        raise ValueError('kernels must contain at least one kernel size')

    inputs = []
    combine = []
    for kernel in kernels:
        in_layer = Input(shape=(length,))
        embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(in_layer)
        conv = Conv1D(filters=32, kernel_size=kernel, activation='relu')(embedding)
        conv = Conv1D(filters=32, kernel_size=kernel, activation='relu')(conv)
        # chain from the layers defined just above (the previous `conv2` /
        # `drop2` names did not exist and raised NameError)
        drop = Dropout(0.5)(conv)
        pool = MaxPool1D(pool_size=2)(drop)
        flat = Flatten()(pool)
        inputs.append(in_layer)
        combine.append(flat)
    # a single channel has nothing to concatenate with
    merged = concatenate(combine) if len(combine) > 1 else combine[0]
    dense1 = Dense(100, activation='relu')(merged)
    outputs = Dense(6, activation='sigmoid')(dense1)
    model = Model(inputs=inputs, outputs=outputs)
    lr_schedule = ExponentialDecay(initial_learning_rate=0.001, decay_steps=1000, decay_rate=0.9)
    optimizer = Adam(learning_rate=lr_schedule)
    # The six sigmoid outputs are independent yes/no labels (multi-label), so
    # each is scored with binary cross-entropy. ROC AUC is supplied as a
    # metric object because 'roc_auc' is not a built-in metric identifier.
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy', AUC(name='roc_auc', multi_label=True)],
    )
    model.summary()
    plot_model(model, to_file='n_gram_multihead_cnn_model.png', show_shapes=True)
    return model

# Evaluate model
def evaluate_model(model, Xtrain, ytrain, Xtest, ytest, kernels, n_repeats=5):
    """Fit *model* ``n_repeats`` times and report its test accuracy.

    The same encoded array is fed to every input channel. Each repeat trains
    for up to 20 epochs with a 20% validation split, early stopping and
    learning-rate reduction on ``val_loss``, then evaluates on the test set.
    Accuracy is printed for the first run and every fifth run, followed by
    the mean over all runs.

    NOTE: the same model instance is fitted on every repeat, so each run
    continues from the weights left by the previous one; the repeats are not
    independent trainings.

    NOTE(review): ReduceLROnPlateau may be unable to change the rate if the
    optimizer was built with a learning-rate schedule -- verify.

    Args:
        model: compiled Keras model with one input per kernel; accuracy is
            assumed to be its first compiled metric.
        Xtrain: encoded training sequences.
        ytrain: training labels.
        Xtest: encoded test sequences.
        ytest: test labels.
        kernels: kernel sizes used to build the model; only the count is used.
        n_repeats: number of fit/evaluate rounds.

    Returns:
        The mean test accuracy across the repeats.
    """
    channels = len(kernels)
    train_pattern = [Xtrain] * channels
    test_pattern = [Xtest] * channels
    scores = []
    for i in range(1, n_repeats + 1):
        rlp = ReduceLROnPlateau(monitor='val_loss', mode='min', patience=5)
        es = EarlyStopping(monitor='val_loss', mode='min', patience=5, restore_best_weights=True)
        model.fit(train_pattern, ytrain, validation_split=0.2, epochs=20, batch_size=32, verbose=1, callbacks=[rlp, es])
        # evaluate() yields the loss followed by every compiled metric; take
        # the first metric and ignore any others (e.g. ROC AUC)
        _, acc, *_ = model.evaluate(test_pattern, ytest, verbose=0)
        if i == 1 or i % 5 == 0:
            print(f'Run {i}: Test Accuracy: {acc * 100:.3f}')
        scores.append(acc)
    mean_score = np.mean(scores)
    print(f'Mean Accuracy: {mean_score * 100:.3f}')
    return mean_score
    
# read the saved vocabulary into a set of tokens
vocab_filename = 'vocab.text'
vocab = set(load_doc(vocab_filename).split())

# restore the label arrays saved to output.pkl
ytrain, ytest = load_output('output.pkl')

# reduce every training and test comment to its in-vocabulary tokens
trainLines = train_docs['comment_text'].apply(clean_doc, args=(vocab,)).astype(str)
testLines = test_docs['comment_text'].apply(clean_doc, args=(vocab,)).astype(str)

# the tokenizer and the sequence length are derived from the training text only
tokenizer = create_tokenizer(trainLines)
vocab_size = 1 + len(tokenizer.word_index)
length = max_length(trainLines)
print(f'Vocabulary size: {vocab_size}')
print(f'Maximum document length: {length}')

# integer-encode and pad both sets (encode_text is given plain lists)
Xtrain = encode_text(tokenizer, trainLines.tolist(), length)
Xtest = encode_text(tokenizer, testLines.tolist(), length)

# build a two-channel model (kernel sizes 10 and 20) and evaluate it
kernels = [10, 20]
model = define_model(vocab_size, length, kernels)
evaluate_model(model, Xtrain, ytrain, Xtest, ytest, kernels)

Vocabulary size: 89283
Maximum document length: 1250
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1250)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1250)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 1250, 100)            8928300   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 1250, 100)            8928300   ['input_2[0][0]']             
                                         

In [None]:
# inspect the dimensions of the test label array loaded from output.pkl
ytest.shape