In [None]:
!pip install parfit

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import wordnet
from nltk import word_tokenize
import pandas as pd
import random
import numpy as np
# import chars2vec
import sklearn.decomposition
import matplotlib.pyplot as plt
import pickle
import string
from sklearn.model_selection import train_test_split #split data into train and test sets
from sklearn.feature_extraction.text import CountVectorizer #convert text comment into a numeric vector
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer #use TF IDF transformer to change text vector created by count vectorizer
from sklearn.svm import SVC, LinearSVC# Support Vector Machine
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
import re
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from tensorflow import keras
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU,Conv1D,MaxPooling1D, Flatten, GlobalMaxPooling1D
from keras.models import Model,Sequential, load_model
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers


In [None]:
# from translate import Translator
# translator= Translator(from_lang="german",to_lang="spanish")
# translation = translator.translate("Guten Morgen")
# print(translation)

# Preprocessing Functions

In [2]:
def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonyms.append(l.name())
    return synonyms
stop = nltk.corpus.stopwords.words('english')

def augment_data(sent):
    words = sent.split()
    words = [w if w not in stop else '@'+w for w in words]
    for i in range(len(words)):
        if not words[i].startswith('@'):
            syn_w = get_synonyms(words[i])
            if syn_w != []:
                w = random.choice(syn_w)
                words[i] = " ".join(w.split('_'))
        else:
            words[i] = words[i][1:]
    return " ".join(words)


def preprocess_text(s):
    s = s.replace('\n',' ')
    s = s.replace('\t',' ')
    s = s.replace(':',' ')
    s = s.replace('#',' ')
    s = s.replace('*','u')
    s = s.replace('@','a')
    s = s.replace('$','s')
    s = s.replace('7','s')
    s = s.replace('2','to')
    s = s.replace('8','ight')
    s = s.replace('&', 'and')
    s = s.translate(str.maketrans('', '', string.punctuation) ) 
    s = s.split()
    s = [i for i in s if i]
    s = [re.sub("[^0-9a-zA-Z]+", "", i) for i in s]
    s = [i for i in s if len(i)>1]    
    return " ".join(s)


def transform_x(df):
    x = df.apply(lambda row : preprocess_text(row['comment_text']), axis=1)
    return pd.DataFrame(x,columns=['comment_text'])

def merge(df1,df2):
    return pd.concat([df1, df2], axis=1)


def drop_faulty_rows(df):
    return df.drop(df[(df['toxic'] == -1.0) & (df['severe_toxic'] == -1.0) & 
                    (df['obscene'] == -1.0) & (df['threat'] == -1.0) & 
                    (df['insult'] == -1.0) & (df['identity_hate'] == -1.0) ].index)
    
def combine_labels(train_df):
    x = np.where(train_df['toxic']+train_df['severe_toxic']+train_df['obscene']
             +train_df['threat']+train_df['insult']+train_df['identity_hate'] > 0, 1, 0)
    return pd.DataFrame(x,columns=['Toxic'])
    

# Training Data Transformation

In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/train.csv')

In [None]:
train_df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
X = transform_x(train_df)
X.head()

Unnamed: 0,comment_text
0,Explanation Why the edits made under my userna...
1,Daww He matches this background colour Im seem...
2,Hey man Im really not trying to edit war Its j...
3,More cant make any real suggestions on improve...
4,You sir are my hero Any chance you remember wh...


In [None]:
Y = combine_labels(train_df)
Y.head()

Unnamed: 0,Toxic
0,0
1,0
2,0
3,0
4,0


## Test Data

In [None]:
test_df = pd.read_csv('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/test.csv')
y_test = pd.read_csv('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/test_labels.csv')
y_test.head(3)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1


In [None]:
x_test = transform_x(test_df)
df_col_merged = merge(x_test,y_test)
df_col_merged.head()

Unnamed: 0,comment_text,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Yo bitch Ja Rule is more succesful then youll ...,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,From RfC The title is fine as it is IMO,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,Sources Zawe Ashton on Lapland,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,If you have look back at the source the inform...,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,dont anonymously edit articles at all,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [None]:
test_df = drop_faulty_rows(df_col_merged)

In [None]:
x_test = test_df['comment_text']
y_test = combine_labels(test_df)

## Make my own Embeddings

In [None]:
import os
import re
import time

from gensim.models import Word2Vec
from tqdm import tqdm

tqdm.pandas()

Fit Train & test

In [None]:
X.comment_text

0         Explanation Why the edits made under my userna...
1         Daww He matches this background colour Im seem...
2         Hey man Im really not trying to edit war Its j...
3         More cant make any real suggestions on improve...
4         You sir are my hero Any chance you remember wh...
                                ...                        
159566    And for the second time of asking when your vi...
159567    You should be ashamed of yourself That is horr...
159568    Spitzer Umm theres no actual article for prost...
159569    And it looks like it was actually you who put ...
159570    And really dont think you understand came here...
Name: comment_text, Length: 159571, dtype: object

In [None]:
sentences = pd.concat([X.comment_text,x_test],axis=0)
train_sent = list(sentences.progress_apply(str.split).values)

100%|██████████| 223549/223549 [00:02<00:00, 96940.54it/s] 


In [None]:
start_time = time.time()

model = Word2Vec(sentences=train_sent, 
                 sg=1, 
                 size=100,  
                 workers=4)

print(f'Time taken : {(time.time() - start_time) / 60:.2f} mins')
model.wv.save_word2vec_format('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/data/custom_glove_100d.txt')

Time taken : 5.95 mins


In [None]:
start_time = time.time()

model = Word2Vec(sentences=train_sent, 
                 sg=1, 
                 size=300,  
                 workers=4)

print(f'Time taken : {(time.time() - start_time) / 60:.2f} mins')
model.wv.save_word2vec_format('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/data/custom_glove_300d.txt')

Time taken : 8.89 mins


In [None]:
start_time = time.time()

model = Word2Vec(sentences=train_sent, 
                 sg=1, 
                 size=768,  
                 workers=4)

print(f'Time taken : {(time.time() - start_time) / 60:.2f} mins')
model.wv.save_word2vec_format('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/data/custom_glove_768d.txt')

Time taken : 13.06 mins


In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
train_sent = list(X.comment_text.progress_apply(str.split).values)
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_sent)]
model = Doc2Vec(documents, vector_size=300, window=8, min_count=5, workers=4, dm = 1, epochs=20)

100%|██████████| 159571/159571 [00:02<00:00, 72162.15it/s]


In [None]:
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/data/doc2vec_model")
model.save(fname)

# With custom embeddings

In [None]:
from gensim.models import KeyedVectors
from collections import defaultdict

w2v = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/data/custom_glove_768d.txt')
n_dim = 768

In [None]:
tf_idf = TfidfVectorizer()
tf_idf.fit(X['comment_text'])
max_idf = max(tf_idf.idf_)
tf_idf_dict = defaultdict(
            lambda: max_idf,
            [(w, tf_idf.idf_[i]) for w, i in tf_idf.vocabulary_.items()])

In [None]:
def get_word_vec(word):
    try:
         return w2v.word_vec(word)
    except:
        return np.zeros(n_dim) 
vect_get_word_vec = np.vectorize(get_word_vec)

def get_sentence_embed(sent):
    words = np.array(sent.split())
    if len(words)==0:
        return np.zeros(n_dim)
    word_vecs = np.array([vect_get_word_vec(x) for x in words])
    return np.average(word_vecs,axis=0)

def get_sentence_embed_tf_idf(sent):
    global tf_idf_dict
    words = np.array(sent.split())
    if len(words) == 0:
        return np.zeros(n_dim)
    word_vecs = np.array([vect_get_word_vec(x) for x in words])
    for i in range(len(words)):
        word_vecs[i] = tf_idf_dict[words[i]]*word_vecs[i]
    return np.average(word_vecs,axis=0)


In [None]:
X_train_sent = X.comment_text.to_numpy()
sent_embed_X_train = np.stack([get_sentence_embed(x) for x in X_train_sent])
sent_embed_tfidf_X_train = np.stack([get_sentence_embed_tf_idf(x)  for x in X_train_sent])

In [None]:
sent_embed_X_train.shape

(159571, 768)

In [None]:
x_test_sent = x_test.to_numpy()
sent_embed_X_test = np.stack([get_sentence_embed(x) for x in x_test_sent])
sent_embed_tfidf_X_test = np.stack([get_sentence_embed_tf_idf(x)  for x in x_test_sent])

Linear svm

Normal Avg

In [None]:
lsvm  = LinearSVC()
lsvm.fit(sent_embed_X_train, Y.Toxic.to_numpy().ravel())

LinearSVC()

In [None]:
pickle.dump(lsvm,open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/lsvm_emb_on_Train_avged.pkl','wb'))

In [None]:
lsvm_avg_emb = pickle.load(open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/lsvm_emb_on_Train_avged.pkl','rb'))

Analysis of embedding size

In [None]:
# lsvm.predict([get_sentence_embed(preprocess_text('fine'))])

array([0])

In [None]:
y_pred = lsvm.predict(sent_embed_X_test)
print(classification_report(y_test, y_pred)) #100d

              precision    recall  f1-score   support

           0       0.95      0.96      0.96     57735
           1       0.63      0.57      0.60      6243

    accuracy                           0.93     63978
   macro avg       0.79      0.77      0.78     63978
weighted avg       0.92      0.93      0.92     63978



In [None]:
y_pred = lsvm.predict(sent_embed_X_test)
print(classification_report(y_test, y_pred)) #300d

              precision    recall  f1-score   support

           0       0.96      0.96      0.96     57735
           1       0.63      0.62      0.63      6243

    accuracy                           0.93     63978
   macro avg       0.80      0.79      0.79     63978
weighted avg       0.93      0.93      0.93     63978



In [None]:
y_pred = lsvm_avg_emb.predict(sent_embed_X_test)
print(classification_report(y_test, y_pred)) #768d

              precision    recall  f1-score   support

           0       0.96      0.96      0.96     57735
           1       0.64      0.66      0.65      6243

    accuracy                           0.93     63978
   macro avg       0.80      0.81      0.80     63978
weighted avg       0.93      0.93      0.93     63978



TFIDF Weighted Avg

In [None]:
lsvm_tfidf_emb = LinearSVC(max_iter=10000)
lsvm_tfidf_emb.fit(sent_embed_tfidf_X_train, Y.Toxic.to_numpy().ravel())
# pickle.dump(lsvm_tfidf_emb, open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/lsvm_emb_on_Train_tfidf_avged','wb'))

In [None]:
lsvm_tfidf_emb = pickle.load(open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/lsvm_emb_on_Train_tfidf_avged','rb'))

In [None]:
lsvm_tfidf_emb

LinearSVC(max_iter=10000)

In [None]:
y_pred = lsvm_tfidf_emb.predict(sent_embed_X_test)
print(classification_report(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.91      1.00      0.95     57735
           1       0.95      0.09      0.17      6243

    accuracy                           0.91     63978
   macro avg       0.93      0.55      0.56     63978
weighted avg       0.91      0.91      0.88     63978



SVM RBF

Normal Avg

In [None]:
emb_svc = SVC(kernel='rbf')
emb_svc.fit(sent_embed_X_train, Y.Toxic.to_numpy().ravel())
pickle.dump(emb_svc, open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/svm_rbf_emb_on_Train_avged.pkl','wb'))

SVC()

In [None]:
svm_rbf_avg_emb = pickle.load(open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/svm_rbf_emb_on_Train_avged.pkl','rb'))

In [None]:
svm_rbf_avg_emb.predict([get_sentence_embed(preprocess_text('fuck off'))])

array([0])

In [None]:
y_pred = svm_rbf_avg_emb.predict(sent_embed_X_test)
print(classification_report(y_test, y_pred)) #768d

              precision    recall  f1-score   support

           0       0.97      0.96      0.96     57735
           1       0.66      0.68      0.67      6243

    accuracy                           0.93     63978
   macro avg       0.81      0.82      0.82     63978
weighted avg       0.94      0.93      0.93     63978



TFIDF Weighted Avg

In [None]:
emb_svc = SVC(kernel='rbf')
# emb_svc.fit(sent_embed_X_train, Y.Toxic.to_numpy().ravel())
# pickle.dump(emb_svc, open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/svm_rbf_emb_on_Train_tfidf_avged','wb'))

In [None]:
svm_rbf_tfidf_emb = pickle.load(open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/svm_rbf_emb_on_Train_tfidf_avged','rb'))

In [None]:
y_pred = svm_rbf_tfidf_emb.predict(sent_embed_X_test)
print(classification_report(y_test, y_pred)) #768d

              precision    recall  f1-score   support

           0       0.97      0.96      0.96     57735
           1       0.66      0.68      0.67      6243

    accuracy                           0.93     63978
   macro avg       0.81      0.82      0.82     63978
weighted avg       0.94      0.93      0.93     63978



SGD

Normal Avg

In [None]:
sgd_clf = SGDClassifier(loss='hinge', penalty='l2', epsilon=4, alpha=1e-6)
# sgd_clf.fit(sent_embed_X_train, Y.Toxic.to_numpy().ravel())
# pickle.dump(sgd_clf, open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_emb_on_Train_avged','wb'))

In [None]:
sgd_clf_avg_emb = pickle.load(open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_emb_on_Train_avged','rb'))

In [None]:
y_pred = sgd_clf_avg_emb.predict(sent_embed_X_test)
print(classification_report(y_test, y_pred)) #768d

              precision    recall  f1-score   support

           0       0.97      0.93      0.95     57735
           1       0.55      0.76      0.64      6243

    accuracy                           0.92     63978
   macro avg       0.76      0.84      0.80     63978
weighted avg       0.93      0.92      0.92     63978



TFIDF Weighted Avg

In [None]:
sgd_clf = SGDClassifier(loss='hinge', penalty='l2', epsilon=4, alpha=1e-6)
# sgd_clf.fit(sent_embed_X_train, Y.Toxic.to_numpy().ravel())
# pickle.dump(sgd_clf, open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_emb_on_Train_tfidf_avged','wb'))

In [None]:
sgd_clf_tfidf_emb = pickle.load(open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_emb_on_Train_tfidf_avged','rb'))

In [None]:
y_pred = sgd_clf_tfidf_emb.predict(sent_embed_X_test)
print(classification_report(y_test, y_pred)) #768d

              precision    recall  f1-score   support

           0       0.97      0.94      0.96     57735
           1       0.58      0.73      0.65      6243

    accuracy                           0.92     63978
   macro avg       0.77      0.84      0.80     63978
weighted avg       0.93      0.92      0.93     63978



## Doc2Vec

In [None]:
from gensim.models.doc2vec import Doc2Vec

doc2vec = Doc2Vec.load("/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/data/doc2vec_model")

In [None]:
test_doc = word_tokenize("yoooooooooooo mothhherfuckerr")
test_doc_vector = doc2vec.infer_vector(test_doc)

In [None]:
X_train_sent = X.comment_text.to_numpy()
sent_embed_X_train = np.stack([doc2vec.infer_vector(word_tokenize(x)) for x in X_train_sent])

In [None]:
sent_embed_X_train.shape

(159571, 300)

In [None]:
x_test_sent = x_test.to_numpy()
sent_embed_X_test = np.stack([doc2vec.infer_vector(word_tokenize(x)) for x in x_test_sent])

SVM RBF

In [None]:
emb_svc = SVC(kernel='rbf')
emb_svc.fit(sent_embed_X_train, Y.Toxic.to_numpy().ravel())
# pickle.dump(emb_svc, open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/svm_rbf_d2v_emb.pkl','wb'))

In [None]:
svm_rbf_d2v_emb = pickle.load(open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/svm_rbf_d2v_emb.pkl','rb'))

In [None]:
y_pred = svm_rbf_d2v_emb.predict(sent_embed_X_test)
print(classification_report(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.94      0.96      0.95     57735
           1       0.58      0.45      0.51      6243

    accuracy                           0.91     63978
   macro avg       0.76      0.71      0.73     63978
weighted avg       0.91      0.91      0.91     63978



SGD

In [None]:
sgd_clf = SGDClassifier(loss='hinge', penalty='l2', epsilon=4, alpha=1e-6)
sgd_clf.fit(sent_embed_X_train, Y.Toxic.to_numpy().ravel())
# pickle.dump(sgd_clf, open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_d2v_emb.pkl','wb'))

In [None]:
sgd_d2v_emb = pickle.load(open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_d2v_emb.pkl','rb'))

In [None]:
y_pred = sgd_d2v_emb.predict(sent_embed_X_test)
print(classification_report(y_test, y_pred)) #768d

              precision    recall  f1-score   support

           0       0.97      0.93      0.95     57735
           1       0.55      0.76      0.64      6243

    accuracy                           0.92     63978
   macro avg       0.76      0.84      0.80     63978
weighted avg       0.93      0.92      0.92     63978

