# Phase 1

## Libraries and Packages

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt     
import seaborn as sns

import string
import re
import nltk
from nltk.corpus import stopwords
import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
# The tweet tokenizer does not split at apostrophes, which is what we want
from nltk.tokenize import TweetTokenizer   
from wordcloud import WordCloud, STOPWORDS

from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM
from keras.layers import Convolution1D, GlobalMaxPooling1D,GlobalAveragePooling1D
from keras.layers import Bidirectional, SpatialDropout1D, GRU
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import model_selection
from keras.utils import to_categorical

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.optimizers import RMSprop, Adam

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

import gc
## warnings
import warnings
warnings.filterwarnings("ignore")

## Train Data

In [None]:
# Load the competition training set from the Kaggle input directory into a DataFrame.
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')

## Exploratory Data Analysis

In [None]:
# Count the missing values in each column of the training data.
train.isnull().sum()

In [None]:
# Print the (rows, columns) shape, then the per-column dtype / non-null summary.
print(train.shape, '\n')
train.info()

In [None]:
# Preview the first rows of the training data.
train.head()

### Visualizing the Non-zero Target Distribution

In [None]:
# Distribution of the toxicity score, restricted to rows whose score is above zero.
plt.figure(figsize=(10, 6))
graph_1 = sns.distplot(
    train.loc[train['target'] > 0, 'target'],
    color='red',
)
plt.xlabel("Toxicity Rate")
plt.ylabel("Distribution")
plt.title('Toxicity (Target) Distribution')
plt.show()

### Non-zero Sub-class Distributions

In [None]:
# Toxicity sub-type score columns to compare on one figure.
comment_adjective = ['severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'sexual_explicit']

plt.figure(figsize=(15,6))

# One density curve per sub-type, using only the rows whose score is above zero.
for col in comment_adjective:
    graph_2 = sns.distplot(train[train[col] > 0][col], label=col, hist=False)

# Axis labels and the legend do not depend on the column being drawn, so they
# are set once after every curve exists instead of on each loop iteration.
plt.xlabel("Rate", fontsize=16)
plt.ylabel("Distribution", fontsize=16)
plt.legend(loc=1, prop={'size': 14})

plt.show()

# Phase 2

## Data Pre-processing

### Embeddings

GloVe pre-trained word vectors: https://nlp.stanford.edu/projects/glove/

6B tokens, 100d vectors

### Creating Vocabulary

In [None]:
""" GLOVE_EMBEDDING_PATH = "./glove.6B.100d.txt"

import pickle

def load_embeddings(path):
    with open(path,'rb') as f:
        emb_arr = pickle.load(f)
    return emb_arr

glove_embeddings = load_embeddings(GLOVE_EMBEDDING_PATH)
print('Found and loaded {} word vectors'.format(len(glove_embeddings)))

#!rm "./glove.840B.300d.pkl"

"check_coverage" goes through a given vocabulary and tries to find word vectors in embedding matrix. "build_vocab" builds a ordered dictionary of words and their frequency in the text corpus.

import operator 

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

def build_vocab(sentences, verbose =  True):
    """ """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """ """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

vocab = build_vocab(list(train['comment_text'].apply(lambda x:x.split())))
oov = check_coverage(vocab,glove_embeddings)
oov[:10]

### Symbols in GloVe

import string
letter_digit_list = string.ascii_letters + string.digits + ' '
letter_digit_list += "'"

Symbols that have embedding vectors in GloVe:

glove_chars = ''.join([c for c in tqdm(glove_embeddings) if len(c) == 1])
glove_symbols = ''.join([c for c in glove_chars if not c in letter_digit_list])
glove_symbols

Symbols that have no embedding vectors in GloVe:

jigsaw_chars = build_vocab(list(train["comment_text"]))

jigsaw_symbols = ''.join([c for c in jigsaw_chars if not c in letter_digit_list])
jigsaw_symbols

symbols_to_delete = ''.join([c for c in jigsaw_symbols if not c in glove_symbols])
symbols_to_delete

symbols_to_isolate = ''.join([c for c in jigsaw_symbols if c in glove_symbols])
symbols_to_isolate

del glove_embeddings
del vocab
del glove_chars
del glove_symbols

gc.collect() """

In [None]:
#from nltk.tokenize.treebank import TreebankWordTokenizer
#tokenizer = TreebankWordTokenizer()


#isolate_dict = {ord(c):f' {c} ' for c in symbols_to_isolate}
#remove_dict = {ord(c):f'' for c in symbols_to_delete}

# Characters that cleaning_text() blanks out: ASCII punctuation, typographic
# quotes, and assorted math / currency / accented symbols.  The backslash is
# written as an explicit "\\" (the previous '\×' was an invalid escape sequence).
_PUNCT_TO_BLANK = (
    "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`"
    + '""“”’'
    + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
)
# Translation table built once: every listed character maps to a single space.
_PUNCT_TABLE = str.maketrans({ch: ' ' for ch in _PUNCT_TO_BLANK})


def cleaning_text(x):
    """Replace every punctuation / special character in *x* with a space.

    :param x: the text to clean (a ``str``)
    :return: a string of the same length in which each character listed in
        ``_PUNCT_TO_BLANK`` has been replaced by ``' '``; all other characters
        are left untouched
    """
    # One translate() pass replaces the former chain of per-character
    # str.replace() calls and yields the same result.
    return x.translate(_PUNCT_TABLE)

def fix_quote(x):
    """Strip one leading apostrophe from each token and join the tokens with spaces.

    :param x: either an iterable of string tokens, or a single string, which
        is split on whitespace into tokens first
    :return: the tokens joined by single spaces, each without its leading ``'``
    """
    # A plain string must be tokenized before the per-token pass: iterating a
    # str directly walks it character by character, which would insert a
    # space between every letter of the text.
    tokens = x.split() if isinstance(x, str) else x
    return ' '.join(tok[1:] if tok.startswith("'") else tok for tok in tokens)

def preprocess(x):
    """Clean one comment: blank out special characters, then apply the quote fix.

    :param x: the raw comment text
    :return: the result of ``fix_quote`` applied to ``cleaning_text(x)``
    """
    return fix_quote(cleaning_text(x))

In [None]:
# Cast every comment_text value to str so the string-based cleaning applied
# later never receives a non-string value.
train['comment_text'] = train['comment_text'].astype(str) 

# Identity columns: binarised together with the target below and used as the
# subgroups for the per-subgroup bias metrics.
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Convert target and identity columns to booleans
def convert_to_bool(df, col_name):
    """Binarise ``df[col_name]`` in place: True where the value is >= 0.5, else False.

    :param df: DataFrame that is modified in place
    :param col_name: name of the numeric column to overwrite
    :return: None
    """
    at_or_above_half = df[col_name] >= 0.5
    df[col_name] = at_or_above_half.values
    
def convert_dataframe_to_bool(df):
    """Return a copy of *df* whose 'target' and identity columns are booleans.

    :param df: source DataFrame; it is copied, not modified
    :return: the copy, with each of those columns thresholded by ``convert_to_bool``
    """
    converted = df.copy()
    columns_to_binarise = ['target', *identity_columns]
    for column in columns_to_binarise:
        convert_to_bool(converted, column)
    return converted

# Rebind train to a copy whose target and identity columns are booleans.
train = convert_dataframe_to_bool(train)

In [None]:
# Clean every comment; progress_apply reports progress through tqdm while it runs.
train['comment_text'] = train['comment_text'].progress_apply(preprocess)

In [None]:
# Hold out 20% of the comments as validate_df for the bias evaluation.
# The split is seeded so the held-out set, and every metric computed on it,
# is the same from run to run.
train_df, validate_df = model_selection.train_test_split(train, test_size=0.2, random_state=42)
print('%d train comments, %d validate comments' % (len(train_df), len(validate_df)))

In [None]:
# Vocabulary cap handed to the Keras Tokenizer as num_words.
MAX_NUM_WORDS = 10000
# Name of the label column.
TARGET_COLUMN = 'target'
# Name of the column holding the comment text.
TEXT_COLUMN = 'comment_text'

### Padding Function

In [None]:
# Create a text tokenizer capped at MAX_NUM_WORDS and fit its vocabulary on
# the training comments.
# NOTE(review): it is fitted on `train`, which still contains the rows held
# out as validate_df — confirm that this is intended.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train[TEXT_COLUMN])

# Every comment is encoded to exactly this many token ids (truncated or padded).
MAX_SEQUENCE_LENGTH = 250
def pad_text(texts, tokenizer):
    """Turn *texts* into integer sequences of length MAX_SEQUENCE_LENGTH.

    :param texts: the texts to encode
    :param tokenizer: fitted tokenizer providing ``texts_to_sequences``
    :return: the output of ``pad_sequences`` for the encoded texts
    """
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# ================ Phase 3 ================

# Pre-training Dataset

In [None]:
"""
!kaggle datasets download -d fizzbuzz/cleaned-toxic-comments

!unzip "./cleaned-toxic-comments.zip"

pre_data = "./train_preprocessed.csv"


## Get the Corpus of all the comments and related Toxicity fields

data = pd.read_csv(pre_data)
data.head()

## Dividing the dataset into features and labels:
Features = "comment"            
Labels = "toxicity"

Features = data['comment_text']
Labels = np.array([0 if y == 0 else 1 for y in data['toxicity']])

### Tokenizing and preprocessing the data

NUM_WORDS = 40000 # Maximum number of unique words which need to be tokenized
MAXLEN = 50 # Maximum length of a sentence/ comment
PADDING = 'post' # The type of padding done for sentences shorter than the Max len

tokenizer = Tokenizer(num_words=NUM_WORDS)

# Fit the tokenizer on the comments 
tokenizer.fit_on_texts(Features)

# Get the word index of the top 20000 words from the dataset
word_idx = tokenizer.word_index

# Convert the string sentence to a sequence of their numerical values
Feature_sequences = tokenizer.texts_to_sequences(Features)

# Pad the sequences to make them of uniform length
padded_sequences = pad_sequences(Feature_sequences, maxlen = MAXLEN, padding = PADDING)

print("The Transformation of sentence::")
print("\n\nThe normal Sentencen:\n")
print(Features[2])
print("\n\nThe tokenized sequence:\n")
print(Feature_sequences[2])
print("\n\nThe padded sequence:\n")
print(padded_sequences[2])

# Convert to array for passing through the model
X = np.array(padded_sequences)
"""

# ================ Phase 4 ==================

# Training

### Hyper-parameters

In [None]:
# Location of the pre-trained GloVe vectors and the length of each vector.
EMBEDDINGS_PATH = '../input/glove-6b-100d/glove.6B.100d.txt'
EMBEDDINGS_DIMENSION = 100
# Optimizer learning rate, number of training epochs, and mini-batch size.
LEARNING_RATE = 0.0005
NUM_EPOCHS = 5
BATCH_SIZE = 128

# Prepare data: encode and pad the training comments.
train_text = pad_text(train_df[TEXT_COLUMN], tokenizer)

In [None]:
# One-hot encode the boolean target to match the 2-unit softmax output layer.
train_label = to_categorical(train_df[TARGET_COLUMN])

In [None]:
# The full frame is no longer needed (train_df / validate_df hold the rows);
# drop the reference and ask the garbage collector to reclaim the memory.
del train
gc.collect()

In [None]:
# Split the encoded training data again: 80% to fit the model, and 20%
# (X_test / y_test) that is passed to model.fit as validation_data.
X_train, X_test, y_train, y_test=train_test_split(train_text, train_label, test_size=0.20, random_state=42)

In [None]:
# Run a garbage-collection pass before the embeddings are loaded.
gc.collect()

### Embedding Matrix

In [None]:
# Load embeddings: map each word in the GloVe file to its float32 vector.
print('loading embeddings')
embeddings_index = {}
# The GloVe text file is UTF-8; the encoding is passed explicitly so parsing
# does not depend on the platform's default locale.
with open(EMBEDDINGS_PATH, encoding='utf-8') as f:
    for line in f:
        # Each line is the word followed by its whitespace-separated coefficients.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i of the matrix holds the vector for the word with tokenizer index i;
# the extra row accounts for index 0, which the tokenizer assigns to no word.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1,
                                 EMBEDDINGS_DIMENSION))
# Number of tokenizer words for which a pre-trained vector was found.
num_words_in_embedding = 0
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        num_words_in_embedding += 1
        embedding_matrix[i] = embedding_vector
    # words not found in embedding index will be all-zeros.
print("Embeddings loaded!")

In [None]:
# Create model layers.
# Frozen GloVe embedding -> spatial dropout -> two stacked bidirectional LSTMs
# -> global max pooling -> two dense layers -> 2-way softmax.
model = Sequential()
# The embedding is declared with the same sequence length that pad_text()
# produces (MAX_SEQUENCE_LENGTH) and the same vector size as embedding_matrix
# (EMBEDDINGS_DIMENSION), so the declared input shape matches the data
# actually fed to fit() and predict().
model.add(Embedding(len(tokenizer.word_index) + 1,
                    EMBEDDINGS_DIMENSION,
                    input_length=MAX_SEQUENCE_LENGTH,
                    weights=[embedding_matrix],
                    trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(GlobalMaxPooling1D())
model.add(Dense(512, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(2, activation='softmax'))

# Compile model.
print('Compiling model...')
# `learning_rate` is the current name of the optimizer argument; `lr` is its
# deprecated alias.  The metric stays 'acc' because the history plots read
# the 'acc' / 'val_acc' keys.
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=LEARNING_RATE),
              metrics=['acc'])
print("Compiled model!")

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

## Pre-training on external data

In [None]:
"""
history = model.fit(
            X, 
            Labels,
            batch_size = 128,
            epochs = 10,
            validation_split = 0.2, # 20 percent data reserved for validation to avoid or monitor overfitting/ underfitting
            verbose = verbose,
        )
"""

## Final Training on competition data

In [None]:
# Save the weights of the epoch with the best validation accuracy.
# The model is compiled with metrics=['acc'], so the validation metric is
# logged under the key 'val_acc'.  Monitoring 'val_accuracy' would never find
# a value and no checkpoint would be written.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="best_lstm_toxic.h5",
    save_weights_only=True,
    monitor='val_acc',
    mode='max',
    save_best_only=True)

In [None]:
import time

In [None]:
# Train model.
print('Training model...')
start = time.time()
history = model.fit(X_train,
              y_train,
              batch_size=BATCH_SIZE,
              epochs= NUM_EPOCHS,
              validation_data=(X_test, y_test),
              verbose=1, callbacks = [model_checkpoint_callback])

end = time.time()
print("Training duration: {} minutes".format(str((end-start)/60)))

In [None]:
# Column name under which this model's scores are stored in validate_df.
MODEL_NAME = 'lstm_model'
# Score the held-out comments; column 1 of the 2-way softmax output (the
# class-1, i.e. target == True, probability) is kept as the score.
validate_df[MODEL_NAME] = model.predict(pad_text(validate_df[TEXT_COLUMN], tokenizer))[:, 1]

In [None]:
# Preview the held-out rows together with the new model-score column.
validate_df.head()

In [None]:
# Column names for the three per-subgroup AUC metrics in the bias report.
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    """Return the ROC AUC of *y_pred* against *y_true*, or NaN if it cannot be computed.

    :param y_true: true labels
    :param y_pred: predicted scores
    :return: ``metrics.roc_auc_score(y_true, y_pred)``, or ``np.nan`` when that
        call raises ``ValueError``
    """
    try:
        auc = metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return auc

def compute_subgroup_auc(df, subgroup, label, model_name):
    """Compute the AUC over only the rows where the boolean *subgroup* column is True.

    :param df: DataFrame holding the subgroup, label and score columns
    :param subgroup: name of the boolean subgroup column
    :param label: name of the true-label column
    :param model_name: name of the model-score column
    :return: the AUC from ``compute_auc`` for those rows
    """
    in_subgroup = df[subgroup]
    rows = df[in_subgroup]
    return compute_auc(rows[label], rows[model_name])

def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples.

    :param df: DataFrame with boolean *subgroup* and *label* columns and a
        *model_name* score column
    :param subgroup: name of the boolean subgroup column
    :param label: name of the boolean true-label column
    :param model_name: name of the model-score column
    :return: the AUC from ``compute_auc`` over the combined rows
    """
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # frames the same way and works on old and new pandas alike.
    examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label], examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples.

    :param df: DataFrame with boolean *subgroup* and *label* columns and a
        *model_name* score column
    :param subgroup: name of the boolean subgroup column
    :param label: name of the boolean true-label column
    :param model_name: name of the model-score column
    :return: the AUC from ``compute_auc`` over the combined rows
    """
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # frames the same way and works on old and new pandas alike.
    examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
    return compute_auc(examples[label], examples[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model.

    :param dataset: DataFrame with boolean subgroup and label columns and the
        model's score column
    :param subgroups: names of the boolean subgroup columns to evaluate
    :param model: name of the column holding the model's scores
    :param label_col: name of the boolean true-label column
    :param include_asegs: not used by this function; kept so existing calls
        that pass it continue to work
    :return: DataFrame with one row per subgroup (its name, size, subgroup AUC,
        BPSN AUC and BNSP AUC), sorted by subgroup AUC in ascending order
    """
    records = []
    for subgroup in subgroups:
        records.append({
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]]),
            SUBGROUP_AUC: compute_subgroup_auc(dataset, subgroup, label_col, model),
            BPSN_AUC: compute_bpsn_auc(dataset, subgroup, label_col, model),
            BNSP_AUC: compute_bnsp_auc(dataset, subgroup, label_col, model),
        })
    # Sort by the same constant used as the column key above, so the two
    # cannot drift apart if the column name is ever changed.
    return pd.DataFrame(records).sort_values(SUBGROUP_AUC, ascending=True)

# Per-identity bias metrics of the model's scores on the held-out comments.
bias_metrics_df = compute_bias_metrics_for_model(validate_df, identity_columns, MODEL_NAME, TARGET_COLUMN)
bias_metrics_df

In [None]:
def calculate_overall_auc(df, model_name):
    """Return the ROC AUC of the *model_name* score column against TARGET_COLUMN.

    :param df: DataFrame holding both columns
    :param model_name: name of the model-score column
    :return: ``metrics.roc_auc_score`` computed over all rows of *df*
    """
    return metrics.roc_auc_score(df[TARGET_COLUMN], df[model_name])

def power_mean(series, p):
    """Return the power mean of *series* with exponent *p*.

    Each value is raised to *p*, the results are averaged over
    ``len(series)``, and the average is raised to ``1 / p``.

    :param series: the values to combine
    :param p: the exponent
    :return: the power mean
    """
    powered = np.power(series, p)
    mean_of_powers = sum(powered) / len(series)
    return np.power(mean_of_powers, 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Blend the overall AUC with the averaged power means of the bias AUCs.

    :param bias_df: DataFrame with the SUBGROUP_AUC, BPSN_AUC and BNSP_AUC columns
    :param overall_auc: AUC over the whole evaluation set
    :param POWER: exponent passed to ``power_mean`` for each bias column
    :param OVERALL_MODEL_WEIGHT: weight of *overall_auc*; the bias score
        receives ``1 - OVERALL_MODEL_WEIGHT``
    :return: the weighted sum of the overall AUC and the bias score
    """
    per_metric_means = [
        power_mean(bias_df[column], POWER)
        for column in (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    ]
    bias_score = np.average(per_metric_means)
    overall_part = OVERALL_MODEL_WEIGHT * overall_auc
    bias_part = (1 - OVERALL_MODEL_WEIGHT) * bias_score
    return overall_part + bias_part
    
# Final score: overall AUC on the held-out comments blended with the bias metrics.
get_final_metric(bias_metrics_df, calculate_overall_auc(validate_df, MODEL_NAME))

In [None]:
# Training is finished: drop the fit/validation arrays and reclaim their memory.
del X_train, y_train, X_test, y_test
gc.collect()

In [None]:
#model.save('../toxicity_classifier.h5')

In [None]:
# Plot training & validation accuracy values
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Load the competition test comments and the sample submission (indexed by id)
# that will be filled in with predictions.
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')
submission = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv', index_col='id')

In [None]:
# Clean the test comments exactly as the training comments were cleaned
# before tokenizing them; feeding raw text would give the model inputs in a
# different form from the ones it was trained on.
test_text = test[TEXT_COLUMN].astype(str).apply(preprocess)
# Column 1 of the softmax output (the class-1 probability) is the prediction.
submission['prediction'] = model.predict(pad_text(test_text, tokenizer))[:, 1]
submission.to_csv('submission.csv')