In [None]:
!pip install chars2vec

In [None]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer


In [None]:
import nltk
from nltk.corpus import wordnet
import pandas as pd
import random
import numpy as np
import chars2vec
import sklearn.decomposition
import matplotlib.pyplot as plt
import pickle
from collections import defaultdict
import string
import re
from gensim.models import KeyedVectors

from sklearn.model_selection import train_test_split #split data into train and test sets
from sklearn.feature_extraction.text import CountVectorizer #convert text comment into a numeric vector
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer #use TF IDF transformer to change text vector created by count vectorizer
from sklearn.svm import SVC# Support Vector Machine
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix

# Fetch the NLTK resources used further down: the English stopword list and
# the WordNet corpus (synonym lookup). 'punkt' is presumably the model that
# nltk.sent_tokenize relies on — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

In [5]:
# !pip install google.colab
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


# Preprocessing Functions

In [6]:
# !pip install chars2vec 
# Load the chars2vec model registered under the name 'eng_300'.
# NOTE(review): c2v_model is not referenced by any other visible cell —
# confirm it is still needed before paying the load cost.
c2v_model = chars2vec.load_model('eng_300')

In [7]:
def get_synonyms(word):
    """Return every WordNet lemma name found for ``word``.

    Names are collected synset by synset in the order WordNet yields them.
    Duplicates are kept, and the list is empty when ``word`` has no synsets.
    """
    return [
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    ]
# English stopword list from NLTK; augment_data() checks words against it.
stop = nltk.corpus.stopwords.words('english')

def augment_data(sent):
    """Return ``sent`` with each non-stopword swapped for a random synonym.

    The sentence is split on whitespace. Words found in ``stop`` are kept
    verbatim. Every other word is replaced by a randomly chosen entry of
    ``get_synonyms(word)`` when that list is non-empty, otherwise it is kept.
    The words are re-joined with single spaces.

    Stopwords are detected with a direct membership test rather than by
    tagging them with a leading '@'. The tag collided with real tokens that
    already start with '@' (e.g. "@user"), which were mistaken for tagged
    stopwords and lost their first character.
    """
    augmented = []
    for word in sent.split():
        if word not in stop:
            candidates = get_synonyms(word)
            if candidates:
                # Lemma names use '_' between the parts of a multi-word entry.
                word = random.choice(candidates).replace('_', ' ')
        augmented.append(word)
    return " ".join(augmented)

def preprocess_text(s):
    """Normalise a raw comment string.

    Newlines and tabs become spaces, the digits 7, 2 and 8 are rewritten as
    's', 'to' and 'ight', and every run of whitespace is collapsed to a single
    space with leading/trailing whitespace removed.
    """
    substitutions = (
        ('\n', ' '),
        ('\t', ' '),
        ('7', 's'),
        ('2', 'to'),
        ('8', 'ight'),
    )
    for old, new in substitutions:
        s = s.replace(old, new)
    # split() with no argument already drops empty pieces; the filter mirrors
    # the original and is a no-op.
    tokens = [tok for tok in s.split() if tok]
    return " ".join(tokens)

def transform_x(df):
    """Return a one-column frame with the cleaned ``comment_text`` of ``df``.

    ``preprocess_text`` is applied to every value of ``df['comment_text']``.
    The result keeps the index of ``df`` and has a single column named
    ``comment_text``.
    """
    # Map over the column directly instead of apply(axis=1), which builds a
    # Series object for every row just to read one field.
    return df['comment_text'].map(preprocess_text).to_frame('comment_text')

def merge(df1,df2):
    """Place ``df1`` and ``df2`` side by side, aligning rows on their index."""
    frames = [df1, df2]
    return pd.concat(frames, axis=1)

def drop_faulty_rows(df):
    """Return ``df`` without the rows whose six label columns all equal -1.0.

    A row is dropped only when toxic, severe_toxic, obscene, threat, insult
    and identity_hate are every one of them -1.0. ``df`` itself is not
    modified.
    """
    label_cols = ['toxic', 'severe_toxic', 'obscene',
                  'threat', 'insult', 'identity_hate']
    all_minus_one = (df[label_cols] == -1.0).all(axis=1)
    return df.drop(df[all_minus_one].index)
    
def combine_labels(train_df):
    """Collapse the six label columns into a single binary ``Toxic`` column.

    A row gets 1 when the sum of toxic, severe_toxic, obscene, threat, insult
    and identity_hate is greater than zero, otherwise 0. The returned frame
    has a fresh 0..n-1 index rather than the index of ``train_df``.
    """
    label_cols = ['toxic', 'severe_toxic', 'obscene',
                  'threat', 'insult', 'identity_hate']
    label_total = train_df[label_cols[0]]
    for col in label_cols[1:]:
        label_total = label_total + train_df[col]
    is_toxic = np.where(label_total > 0, 1, 0)
    return pd.DataFrame(is_toxic, columns=['Toxic'])

# Adjust the path here accordingly.
# Loads word vectors from a word2vec-format text file on the mounted Drive.
w2v_whole_data = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/data/custom_glove_768d.txt')
# Length of the zero vectors used as fallbacks by the embedding helpers below;
# presumably equal to the vector size stored in the file above — confirm.
n_dim = 768

def get_word_vec(word):
    """Return the embedding of ``word``, or zeros if it is out of vocabulary.

    The vector is looked up in ``w2v_whole_data``. When the lookup raises
    ``KeyError`` a zero vector of length ``n_dim`` is returned instead.

    Only ``KeyError`` is caught. The previous bare ``except:`` also swallowed
    unrelated failures (interrupts, attribute/API errors) and silently turned
    them into zero vectors.
    """
    try:
        # NOTE(review): word_vec is reportedly deprecated in gensim 4 in favour
        # of get_vector — confirm against the installed gensim version.
        return w2v_whole_data.word_vec(word)
    except KeyError:
        return np.zeros(n_dim)


# Wrapper kept for the callers below, which pass one word at a time.
vect_get_word_vec = np.vectorize(get_word_vec)


def get_sentence_embed(sent):
    """Return the unweighted mean of the word vectors of ``sent``.

    ``sent`` is split on whitespace and each token is looked up through
    ``vect_get_word_vec``. A sentence with no tokens yields a zero vector of
    length ``n_dim``.
    """
    tokens = np.array(sent.split())
    if tokens.size == 0:
        return np.zeros(n_dim)
    vectors = np.array([vect_get_word_vec(tok) for tok in tokens])
    return np.average(vectors, axis=0)


def get_sentence_embed_tf_idf(sent):
    """Return the mean of the word vectors of ``sent``, each scaled by its weight.

    ``sent`` is split on whitespace. Every token's vector (from
    ``vect_get_word_vec``) is multiplied by ``tf_idf_dict[token]`` and the
    scaled vectors are averaged. A sentence with no tokens yields a zero
    vector of length ``n_dim``.

    NOTE(review): tokens are raw whitespace-split strings; if the keys of
    ``tf_idf_dict`` are normalised differently (case, punctuation), many
    tokens may only receive the dictionary's default weight — confirm.
    """
    tokens = np.array(sent.split())
    if tokens.size == 0:
        return np.zeros(n_dim)
    vectors = np.array([vect_get_word_vec(tok) for tok in tokens])
    for row, tok in enumerate(tokens):
        vectors[row] = tf_idf_dict[tok] * vectors[row]
    return np.average(vectors, axis=0)

# English stopwords as a set; the feature extractors below test membership in it.
stop_words = set(nltk.corpus.stopwords.words('english'))  

def get_sentence_embed_tf_idf_with_features(sent):
  """Embed ``sent`` and append 18 hand-crafted surface features.

  The first ``n_dim`` entries are the mean of the word vectors after each has
  been scaled by its ``tf_idf_dict`` weight. They are followed, in order, by:
  character count, word count, upper-case characters, all-upper-case words,
  '!' count, '?' count, '.,;:' count, '*&$%' count, quoted spans, sentence
  count, unique words, hashtags, mentions, stopwords, characters per word,
  words per sentence, unique-word ratio and stopword ratio.

  Returns a 1-D array of length ``n_dim + 18``. A sentence without any
  whitespace-separated token yields zeros of that same length; the earlier
  ``n_dim``-long result could not be stacked with rows from non-empty
  sentences.
  """
  n_extra_features = 18
  words = np.array(sent.split())
  if len(words) == 0:
    return np.zeros(n_dim + n_extra_features)

  word_vecs = np.array([vect_get_word_vec(x) for x in words])
  for i in range(len(words)):
    word_vecs[i] = tf_idf_dict[words[i]]*word_vecs[i]
  vec = np.average(word_vecs,axis=0).tolist()

  # Quantities that feed more than one feature are computed once.
  n_chars = len(sent)
  n_words = len(words)
  n_sentences = len(nltk.sent_tokenize(sent))
  n_unique = len(set(words))
  n_stopwords = sum(1 for w in words if w in stop_words)

  vec.extend([
    # Total length of the sentence in characters
    n_chars,
    # Total number of words
    n_words,
    # Number of capital characters
    sum(1 for ch in sent if ch.isupper()),
    # Number of all-capital words
    sum(1 for w in words if w.isupper()),
    # Number of exclamation marks
    sent.count('!'),
    # Number of question marks
    sent.count('?'),
    # Number of punctuation marks . , ; :
    sent.count('.')+sent.count(',')+sent.count(';')+sent.count(':'),
    # Number of symbols * & $ %
    sent.count('*')+sent.count('&')+sent.count('$')+sent.count('%'),
    # Number of quoted spans (double or single quotes)
    len(re.findall('"([^"]*)"', sent)) + len(re.findall("'([^']*)'", sent)),
    # Number of sentences
    n_sentences,
    # Number of unique words
    n_unique,
    # Count of hashtags
    len(re.findall(r'(#[A-Za-z0-9]*)', sent)),
    # Count of mentions
    len(re.findall(r'(@[A-Za-z0-9]*)', sent)),
    # Count of stopwords
    n_stopwords,
    # Characters per word (sentence length divided by word count)
    n_chars/n_words,
    # Words per sentence
    n_words/n_sentences,
    # Unique words vs word count
    n_unique/n_words,
    # Stopword count vs word count
    n_stopwords/n_words,
  ])

  return np.array(vec)

def feature_normalise_param(X):
  """Return the per-feature maxima over ``X['comment_text']``.

  The 14 values are, in order, the maxima of: character count, word count,
  upper-case characters, all-upper-case words, '!' count, '?' count, '.,;:'
  count, '*&$%' count, quoted spans, sentence count, unique words, hashtags,
  mentions and stopwords. get_sentence_embed_tf_idf_with_features_norm
  divides its first 14 surface features by these values, index for index.

  Upper-case characters are counted with ``str.isupper`` so the maximum is
  computed for the same quantity the feature extractor measures. The earlier
  ASCII-only ``[A-Z]`` pattern could return a maximum smaller than an actual
  feature value.
  """
  comments = X['comment_text']
  extractors = [
    # Total length of the sentence in characters
    len,
    # Total number of words
    lambda x: len(x.split()),
    # Number of capital characters
    lambda x: sum(1 for ch in x if ch.isupper()),
    # Number of all-capital words
    lambda x: len([1 for y in x.split() if y.isupper()]),
    # Number of exclamation marks
    lambda x: x.count('!'),
    # Number of question marks
    lambda x: x.count('?'),
    # Number of punctuation marks . , ; :
    lambda x: x.count('.') + x.count(',') + x.count(';') + x.count(':'),
    # Number of symbols * & $ %
    lambda x: x.count('*') + x.count('&') + x.count('$') + x.count('%'),
    # Number of quoted spans (double or single quotes)
    lambda x: len(re.findall('"([^"]*)"', x)) + len(re.findall("'([^']*)'", x)),
    # Number of sentences
    lambda x: len(nltk.sent_tokenize(x)),
    # Number of unique words
    lambda x: len(set(x.split())),
    # Count of hashtags
    lambda x: len(re.findall(r'(#[A-Za-z0-9]*)', x)),
    # Count of mentions
    lambda x: len(re.findall(r'(@[A-Za-z0-9]*)', x)),
    # Count of stopwords
    lambda x: len([w for w in x.split() if w in stop_words]),
  ]
  return [comments.apply(extract).max() for extract in extractors]

def get_sentence_embed_tf_idf_with_features_norm(sent,maxs):
  """Embed ``sent`` and append 18 surface features, 14 of them max-scaled.

  The first ``n_dim`` entries are the mean of the word vectors after each has
  been scaled by its ``tf_idf_dict`` weight. Next come 14 counts, each
  divided by ``maxs[k]`` for k = 0..13: character count, word count,
  upper-case characters, all-upper-case words, '!' count, '?' count, '.,;:'
  count, '*&$%' count, quoted spans, sentence count, unique words, hashtags,
  mentions and stopwords. Four unscaled ratios close the vector: characters
  per word, words per sentence, unique-word ratio and stopword ratio.

  ``maxs`` is an indexable with at least 14 entries, in the order produced by
  feature_normalise_param. An entry equal to zero is treated as 1, so the
  feature stays finite instead of becoming NaN/inf.

  Returns a 1-D array of length ``n_dim + 18``. A sentence without any
  whitespace-separated token yields zeros of that same length; the earlier
  ``n_dim``-long result could not be stacked with rows from non-empty
  sentences.
  """
  n_extra_features = 18
  words = np.array(sent.split())
  if len(words) == 0:
    return np.zeros(n_dim + n_extra_features)

  word_vecs = np.array([vect_get_word_vec(x) for x in words])
  for i in range(len(words)):
    word_vecs[i] = tf_idf_dict[words[i]]*word_vecs[i]
  vec = np.average(word_vecs,axis=0).tolist()

  # Quantities that feed more than one feature are computed once.
  n_chars = len(sent)
  n_words = len(words)
  n_sentences = len(nltk.sent_tokenize(sent))
  n_unique = len(set(words))
  n_stopwords = sum(1 for w in words if w in stop_words)

  counts = [
    # Total length of the sentence in characters
    n_chars,
    # Total number of words
    n_words,
    # Number of capital characters
    sum(1 for ch in sent if ch.isupper()),
    # Number of all-capital words
    sum(1 for w in words if w.isupper()),
    # Number of exclamation marks
    sent.count('!'),
    # Number of question marks
    sent.count('?'),
    # Number of punctuation marks . , ; :
    sent.count('.')+sent.count(',')+sent.count(';')+sent.count(':'),
    # Number of symbols * & $ %
    sent.count('*')+sent.count('&')+sent.count('$')+sent.count('%'),
    # Number of quoted spans (double or single quotes)
    len(re.findall('"([^"]*)"', sent)) + len(re.findall("'([^']*)'", sent)),
    # Number of sentences
    n_sentences,
    # Number of unique words
    n_unique,
    # Count of hashtags
    len(re.findall(r'(#[A-Za-z0-9]*)', sent)),
    # Count of mentions
    len(re.findall(r'(@[A-Za-z0-9]*)', sent)),
    # Count of stopwords
    n_stopwords,
  ]
  for k, count in enumerate(counts):
    # Indexing (not zip) keeps the IndexError when maxs is too short.
    peak = maxs[k]
    vec.append(count / (peak if peak else 1))

  # Characters per word (sentence length divided by word count)
  vec.append(n_chars/n_words)
  # Words per sentence
  vec.append(n_words/n_sentences)
  # Unique words vs word count
  vec.append(n_unique/n_words)
  # Stopword count vs word count
  vec.append(n_stopwords/n_words)

  return np.array(vec)


# Training Data Transformation

In [8]:
# Load the training CSV from the mounted Drive and clean the comment text.
train_df = pd.read_csv('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/train.csv')
train_df.head(5)  # not the last expression of the cell, so it is not displayed
# X holds only the preprocessed 'comment_text' column.
X = transform_x(train_df)
X.head()

Unnamed: 0,comment_text
0,Explanation Why the edits made under my userna...
1,D'aww! He matches this background colour I'm s...
2,"Hey man, I'm really not trying to edit war. It..."
3,""" More I can't make any real suggestions on im..."
4,"You, sir, are my hero. Any chance you remember..."


In [10]:
# Fit a TF-IDF vectorizer on the cleaned training comments and expose its IDF
# values as a term -> weight mapping. Terms missing from the vocabulary get
# the largest IDF seen during fitting.
tf_idf = TfidfVectorizer()
tf_idf.fit(X['comment_text'])
idf_values = tf_idf.idf_
max_idf = idf_values.max()
tf_idf_dict = defaultdict(
    lambda: max_idf,
    {term: idf_values[col] for term, col in tf_idf.vocabulary_.items()},
)

In [None]:
# Variant 1: TF-IDF weighted sentence embeddings only.
X_np_comment = X.apply(lambda row: get_sentence_embed_tf_idf(row['comment_text']),axis=1)
X_train_np = np.stack(X_np_comment, axis=0)


# Variant 2: embeddings plus the raw surface features.
X_np_comment_features = X.apply(lambda row: get_sentence_embed_tf_idf_with_features(row['comment_text']),axis=1)
X_train_np_features = np.stack(X_np_comment_features, axis=0)

# Variant 3: embeddings plus surface features scaled by the training maxima.
# norm_list is reused later when the test features are built.
norm_list = feature_normalise_param(X)
X_np_comment_features_norm = X.apply(lambda row: get_sentence_embed_tf_idf_with_features_norm(row['comment_text'],norm_list),axis=1)
X_train_np_features_norm = np.stack(X_np_comment_features_norm, axis=0)

print(X_train_np.shape, X_train_np_features.shape, X_train_np_features_norm.shape)

In [None]:
# Collapse the six label columns into one binary target and take it as an array.
Y = combine_labels(train_df)
Y.head()  # not the last expression of the cell, so it is not displayed
y_train_np = Y['Toxic'].to_numpy()
y_train_np

array([0, 0, 0, ..., 0, 0, 0])

# Model definition and training



In [None]:
# One independent, identically configured RBF-kernel SVM pipeline per feature
# variant (embeddings only / with features / with normalised features).
text_clf_tf_idf, text_clf_tf_idf_features, text_clf_tf_idf_features_norm = (
    Pipeline([('clf', SVC(kernel='rbf', verbose=True))])
    for _ in range(3)
)


In [None]:
# Fit each pipeline on its own feature matrix; all three share the same labels.
text_clf_tf_idf.fit(X_train_np, y_train_np)
text_clf_tf_idf_features.fit(X_train_np_features, y_train_np)
text_clf_tf_idf_features_norm.fit(X_train_np_features_norm, y_train_np)


[LibSVM]

Pipeline(steps=[('clf', SVC(verbose=True))])

### Store as pickle objects to avoid retraining

In [None]:
# pickle.dump(text_clf_tf_idf, open('/content/drive/MyDrive/NLP_Project_IITB/models/SVM_tf_idf.pkl','wb'))
# pickle.dump(text_clf_tf_idf_features, open('/content/drive/MyDrive/NLP_Project_IITB/models/SVM_tf_idf_features.pkl','wb'))
# pickle.dump(text_clf_tf_idf_features_norm, open('/content/drive/MyDrive/NLP_Project_IITB/models/SVM_tf_idf_features_norm.pkl','wb'))


# Testing the Models

In [None]:
# Load the test comments and their labels (kept in separate CSV files).
test_df = pd.read_csv('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/test.csv')
y_test = pd.read_csv('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/test_labels.csv')
y_test.head(3)  # not the last expression of the cell, so it is not displayed
# Clean the text, then attach the label columns side by side (aligned on index).
x_test = transform_x(test_df)
df_col_merged = merge(x_test,y_test)
df_col_merged.head()  # not displayed either
# test_df is rebound here: from now on it is the merged frame without the rows
# whose labels are all -1.0, not the raw CSV contents.
test_df = drop_faulty_rows(df_col_merged)
test_df.head()

Unnamed: 0,comment_text,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,Thank you for understanding. I think very high...,0001ea8717f6de06,0,0,0,0,0,0
7,:Dear god this site is horrible.,000247e83dcc1211,0,0,0,0,0,0
11,"""::: Somebody will invariably try to add Relig...",0002f87b16116a7f,0,0,0,0,0,0
13,""" It says it right there that it IS a type. Th...",0003e1cccfd5a40a,0,0,0,0,0,0
14,""" == Before adding a new product to the list, ...",00059ace3e3e9a53,0,0,0,0,0,0


In [None]:
# Test features: TF-IDF weighted embeddings only
X_test_np_comment = test_df.apply(lambda row: get_sentence_embed_tf_idf(row['comment_text']),axis=1)
X_test_np = np.stack(X_test_np_comment, axis=0)

# Test features: TF-IDF weighted embeddings with surface features
X_test_np_comment_features = test_df.apply(lambda row: get_sentence_embed_tf_idf_with_features(row['comment_text']),axis=1)
X_test_np_features = np.stack(X_test_np_comment_features, axis=0)

# Test features: embeddings with surface features normalised by norm_list,
# which was computed from the training comments
X_test_np_comment_features_norm = test_df.apply(lambda row: get_sentence_embed_tf_idf_with_features_norm(row['comment_text'],norm_list),axis=1)
X_test_np_features_norm = np.stack(X_test_np_comment_features_norm, axis=0)

# NOTE(review): only two of the three shapes are printed here.
print(X_test_np.shape,X_test_np_features.shape)

In [None]:
# Binary test target built the same way as the training target.
y_test_np = combine_labels(test_df)['Toxic'].to_numpy()

## Predictions

In [None]:
# Each model predicts on the test matrix built with the matching feature variant.
predicted_tf_idf = text_clf_tf_idf.predict(X_test_np)
predicted_tf_idf_features = text_clf_tf_idf_features.predict(X_test_np_features)
predicted_tf_idf_features_norm = text_clf_tf_idf_features_norm.predict(X_test_np_features_norm)


In [None]:
# Report for the model trained on TF-IDF weighted embeddings only.
print(classification_report(y_test_np, predicted_tf_idf))


              precision    recall  f1-score   support

           0       0.95      0.97      0.96     57735
           1       0.66      0.54      0.60      6243

    accuracy                           0.93     63978
   macro avg       0.81      0.76      0.78     63978
weighted avg       0.92      0.93      0.93     63978



In [None]:
# Report for the model trained on embeddings plus raw surface features.
print(classification_report(y_test_np, predicted_tf_idf_features))


              precision    recall  f1-score   support

           0       0.93      0.99      0.96     57735
           1       0.68      0.28      0.40      6243

    accuracy                           0.92     63978
   macro avg       0.80      0.63      0.68     63978
weighted avg       0.90      0.92      0.90     63978



In [None]:
# Report for the model trained on embeddings plus max-normalised surface features.
print(classification_report(y_test_np, predicted_tf_idf_features_norm))


              precision    recall  f1-score   support

           0       0.95      0.97      0.96     57735
           1       0.66      0.49      0.57      6243

    accuracy                           0.93     63978
   macro avg       0.80      0.73      0.76     63978
weighted avg       0.92      0.93      0.92     63978

