In [None]:
!pip install scikit-multilearn



In [None]:
import re

import matplotlib.pyplot as plt
import nltk.corpus
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.layers import Dense
from keras.models import Sequential
from nltk import word_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale, MinMaxScaler
from skmultilearn.problem_transform import BinaryRelevance, LabelPowerset, ClassifierChain

nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preprocessing + Word2Vec

In [None]:
# Load the three CSV files that make up the corpus.
test_label = pd.read_csv("test_labels.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

# Merging test and train to form one huge dataset.
# pd.merge joins on the columns the two frames have in common.
test_data = pd.merge(test, test_label)
dataset = (
    pd.concat([test_data, train])
    .drop(columns=['id'])
    .drop_duplicates(ignore_index=True)
)

# Discard rows whose 'toxic' label is -1, then renumber the rows.
# drop=True keeps the old row numbers from being inserted as an extra 'index'
# column, which would otherwise end up among the feature columns later on.
dataset = dataset.loc[dataset['toxic'] != -1].reset_index(drop=True)
# Text cleaning
# Converting to lower case
dataset['comment_text_cleaned'] = dataset['comment_text'].str.lower()

# Removing special characters: every match of the pattern (@mentions, any
# character that is not an ASCII letter/digit/space/tab, scheme://... runs,
# a leading "rt", and "http" plus the following character) becomes one space.
# The patterns are compiled once instead of being looked up for every row.
special_chars_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")
dataset['comment_text_cleaned'] = dataset['comment_text_cleaned'].apply(
    lambda elem: special_chars_pattern.sub(" ", str(elem))
)

# Removing numbers
digits_pattern = re.compile(r"\d+")
dataset['comment_text_cleaned'] = dataset['comment_text_cleaned'].apply(
    lambda elem: digits_pattern.sub("", str(elem))
)

# Removing stop words
stop = stopwords.words('english')
# Membership tests against a set are O(1); testing against the list form
# scans the whole stop-word list for every single token.
stop_set = set(stop)
dataset['comment_text_cleaned'] = dataset['comment_text_cleaned'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_set])
)

# Tokenizing
dataset['comment_text_cleaned'] = dataset['comment_text_cleaned'].apply(word_tokenize)
# Lemmatization
# One shared lemmatizer instance; building a new one for every token is
# needless work.
_LEMMATIZER = WordNetLemmatizer()


def word_lemmatizer(text):
    """Lemmatize every token of a tokenized comment.

    Parameters
    ----------
    text : iterable of str
        Tokens of one comment.

    Returns
    -------
    list of str
        The result of ``WordNetLemmatizer.lemmatize`` (called with its default
        arguments) for each token, in the original order.
    """
    return [_LEMMATIZER.lemmatize(token) for token in text]


dataset['comment_text_cleaned'] = dataset['comment_text_cleaned'].apply(word_lemmatizer)
# Splitting into train / validation / test sets
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
X = dataset.drop(columns=LABEL_COLUMNS)
y = dataset[LABEL_COLUMNS].copy()

# A fixed seed makes both splits (and therefore every downstream score)
# reproducible across kernel restarts.
RANDOM_STATE = 42

# 80% train; the remaining 20% is halved into validation and test.
X_train, X_test_and_val, y_train, y_test_and_val = train_test_split(
    X, y, train_size=0.8, random_state=RANDOM_STATE
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_and_val, y_test_and_val, train_size=0.5, random_state=RANDOM_STATE
)

# Word2Vec is trained on the training tokens only.
train_tokens = X_train['comment_text_cleaned'].values
# NOTE(review): `size` is the keyword of the gensim release this notebook was
# run with; gensim 4 renamed it to `vector_size` - confirm the installed version.
w2v_model = Word2Vec(train_tokens, size=200)
def buildWordVector(tokens, size, model=None):
    """Average the word vectors of ``tokens`` into one ``(1, size)`` array.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one comment.
    size : int
        Dimensionality of the word vectors.
    model : object, optional
        Object exposing a ``wv`` mapping from token to vector. Defaults to the
        notebook-level ``w2v_model``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)`` holding the mean of the vectors of all
        tokens known to the model, or all zeros when no token is known.
    """
    if model is None:
        model = w2v_model
    # Look vectors up through `.wv`; indexing the model object directly is
    # deprecated in gensim 3.x and removed in gensim 4.
    vectors = model.wv

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            vec += vectors[word].reshape((1, size))
        except KeyError:
            # Token is not in the model's vocabulary: leave it out of the mean.
            continue
        count += 1
    if count:
        vec /= count
    return vec

# One averaged 200-dimensional Word2Vec vector per training comment.
train_vecs_raw = np.concatenate([buildWordVector(z, 200) for z in train_tokens])

# Fit the standardization on the training vectors only and reuse the same
# mean/std for the validation vectors. Standardizing each split with its own
# statistics would put the two feature sets on different scales.
w2v_scaler = StandardScaler().fit(train_vecs_raw)
train_vecs_w2v = w2v_scaler.transform(train_vecs_raw)

val_tokens = X_val['comment_text_cleaned'].values
val_vecs_raw = np.concatenate([buildWordVector(z, 200) for z in val_tokens])
val_vecs_w2v = w2v_scaler.transform(val_vecs_raw)

def evaluation_metric(model_name, feature_extraction, y_true, y_pred):
    """Print and return a weighted blend of micro-averaged recall and precision.

    Parameters
    ----------
    model_name : str
        Label printed on the "Model:" line.
    feature_extraction : str
        Label printed on the "Feature extraction method:" line.
    y_true, y_pred
        True and predicted labels, forwarded unchanged to ``recall_score`` and
        ``precision_score`` with ``average='micro'``.

    Returns
    -------
    float
        ``recall * 0.6 + precision * 0.4``.
    """
    print('Model:', model_name)
    print('Feature extraction method:', feature_extraction)

    micro_recall = recall_score(y_true, y_pred, average='micro')
    micro_precision = precision_score(y_true, y_pred, average='micro')
    # Recall carries the larger weight (0.6 versus 0.4) in the combined score.
    combined = micro_recall*0.6 + micro_precision*0.4

    for caption, value in (
        ('Precision: ', micro_precision),
        ('Recall: ', micro_recall),
        ('Final score of the model: ', combined),
    ):
        print(caption, value)
    return combined



# Grid Search - Naive Bayes

In [None]:
# Search space: a ClassifierChain wrapping GaussianNB, with var_smoothing
# taken from 100 log-spaced values between 1e0 and 1e-9.
parameters = [
    {
        'classifier': [GaussianNB()],
        'classifier__var_smoothing': np.logspace(0,-9, num=100),
    }
]

# Not used by the search below; kept in case another cell refers to it.
params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}

clf = GridSearchCV(ClassifierChain(), parameters, scoring="f1_micro", return_train_score=True)
clf.fit(train_vecs_w2v, y_train)

# GridSearchCV refits the best configuration on the full training data by
# default (refit=True), so fitting best_estimator_ again would only repeat
# that work. Display the already-fitted estimator as the cell output.
clf.best_estimator_

ClassifierChain(classifier=GaussianNB(var_smoothing=1.0),
                require_dense=[True, True])

In [None]:
print("Optimal hyperparameter combination:", clf.best_params_)
print()
# best_score_ is the mean score over the cross-validation folds for the metric
# given to GridSearchCV (scoring="f1_micro"); it is not a training accuracy.
print("Mean cross-validated micro-F1 score:", clf.best_score_)
predictions = clf.best_estimator_.predict(val_vecs_w2v)  # validation predictions
predictions_train = clf.best_estimator_.predict(train_vecs_w2v)

# The tuned model is GaussianNB inside a ClassifierChain; the labels say so and
# name the split each score was computed on.
result_test = evaluation_metric('Gaussian Naive Bayes with Classifier Chains (validation set)', 'Word2Vec', y_val, predictions)
result_train = evaluation_metric('Gaussian Naive Bayes with Classifier Chains (training set)', 'Word2Vec', y_train, predictions_train)
# Train-minus-validation score gap, reported under the label "Variance".
print("Variance is: ",result_train - result_test)

Optimal hyperparameter combination: {'classifier': GaussianNB(var_smoothing=1.0), 'classifier__var_smoothing': 1.0}

Mean cross-validated training accuracy score: 0.399933802123833
Model: Support Vector Machine with Classifier Chains
Feature extraction method: Word2Vec
Precision:  0.2766789328426863
Recall:  0.7308626974483596
Final score of the model:  0.5491891916060903
Model: Random Forest with Multi Output classifier
Feature extraction method: Word2Vec
Precision:  0.27607027347712093
Recall:  0.7266339251983377
Final score of the model:  0.546408464509851
Variance is:  -0.0027807270962392217
