In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import NuSVC, SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from sklearn.pipeline import Pipeline, make_union
from sklearn.metrics import roc_auc_score, f1_score
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.calibration import CalibratedClassifierCV

import xgboost as xgb

# !pip install -U imbalanced-learn
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.combine import SMOTEENN, SMOTETomek

from tqdm import tqdm

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

# Setting global parameters for the Plots
rcParams['figure.figsize'] = 18, 9 
rcParams['axes.spines.top'] = False
rcParams['axes.spines.right'] = False
rcParams['lines.linewidth'] = 2.5
rcParams['xtick.labelsize'] = 'smaller'
rcParams['ytick.labelsize'] = 'smaller'
rcParams['axes.labelpad'] = 15

In [2]:
# read preprocessed data

X_train = sp.load_npz("./preprocessed_data/Preprocessed_X_train.npz")
X_test = sp.load_npz("./preprocessed_data/Preprocessed_X_test.npz")
Y_train = np.load(file="./preprocessed_data/Preprocessed_Y_train.npy")

df_test = pd.read_csv("test.csv")

labels = ['harsh', 'extremely_harsh', 'vulgar', 'threatening', 'disrespect', 'targeted_hate']

### Naive Bayes

In [16]:
NB = []
CVR = []
for i in range(len(labels)):
	NB.append(MultinomialNB(alpha=0.04))
	CVR.append(cross_val_score(NB[i], X_train, Y_train[:, i], cv=5, n_jobs=-1, scoring="roc_auc"))
	NB[i] = NB[i].fit(X_train, Y_train[:, i])

print("CV_SCORE - ", np.array([i.mean() for i in CVR]).mean())

CV_SCORE -  0.956268770821925


### Logistic Regression without Sampling

In [17]:
LR = []
CVR = []
for i in range(len(labels)):
	LR.append(LogisticRegression(max_iter=1000, C=1.2))
	CVR.append(cross_val_score(LR[i], X_train, Y_train[:, i], cv=5, n_jobs=-1, scoring="roc_auc"))
	LR[i] = LR[i].fit(X_train, Y_train[:, i])

print("CV_SCORE - ", np.array([i.mean() for i in CVR]).mean())

CV_SCORE -  0.9848657106580413


### Logistic Regression with Sampling -

In [4]:
param_C = [1.4, 1.3, 1.1, 1.3, 0.7, 1.1]	# value for parameter C for each class
x = [X_train for i in range(len(labels))]
y = [Y_train[:, i] for i in range(len(labels))]

# Applying Oversampling and Undersampling for label 1 and 3
x[1], y[1] = RandomOverSampler(random_state=1, sampling_strategy=0.15).fit_resample(X_train, Y_train[:, 1])
x[1], y[1] = RandomUnderSampler(random_state=0, sampling_strategy=1).fit_resample(x[1], y[1])

x[3], y[3] = RandomOverSampler(random_state=0, sampling_strategy=0.18).fit_resample(X_train, Y_train[:, 3])
x[3], y[3] = RandomUnderSampler(random_state=0, sampling_strategy=1).fit_resample(x[3], y[3])

In [19]:
LRS = []
CVR = []
for i in range(len(labels)):
	LRS.append(LogisticRegression(max_iter=1000, C=param_C[i]))
	CVR.append(cross_val_score(LRS[i], x[i], y[i], cv=5, n_jobs=-1, scoring="roc_auc"))
	LRS[i] = LRS[i].fit(x[i], y[i])

print("CV_SCORE - ", np.array([i.mean() for i in CVR]).mean())

CV_SCORE -  0.9877318886860572


In [20]:
# inference
# Y_test_pred= np.ones((X_test.shape[0], Y_train.shape[1]))

# for i in range(len(labels)):
#   Y_test_pred[:, i] = LRS[i].predict_proba(X_test)[:, 1]

### Classifier Chains 

In [21]:
X_split_train, Y_split_train, X_split_test, Y_split_test = iterative_train_test_split(X_train, Y_train, test_size = 0.1)

chains = [ClassifierChain(LogisticRegression(max_iter=1000, C=1.2), order="random", random_state=i) for i in range(10)]

for chain in tqdm(chains):
    chain.fit(X_split_train, Y_split_train)

100%|██████████| 10/10 [17:43<00:00, 106.39s/it]


In [22]:
Y_pred_chains = np.array([chain.predict_proba(X_split_test) for chain in chains])
Y_pred_ensemble = Y_pred_chains.mean(axis=0)

print("CV Score (ensemble) - ", roc_auc_score(y_true=Y_split_test, y_score=Y_pred_ensemble))

# for i in range(10):
#     print("Test Data - ", roc_auc_score(y_true=Y_train, y_score=Y_pred_chains[i]))

CV Score (ensemble) -  0.9856101773449771


### Ridge Classifier

In [23]:
# for Cross Validation (with sampled data)

RCS = []
CVR = []
for i in tqdm(range(len(labels))):
	RCS.append(CalibratedClassifierCV(RidgeClassifier(alpha=15.7), cv=3, ensemble=True))
	CVR.append(cross_val_score(RCS[i], x[i], y[i], cv=5, n_jobs=-1, scoring="roc_auc"))
	RCS[i] = RCS[i].fit(x[i], y[i])
	

print("CV_SCORE - ", np.array([i.mean() for i in CVR]).mean())

100%|██████████| 6/6 [02:17<00:00, 22.90s/it]

CV_SCORE -  0.9881889288818556





In [24]:
OVR_RC = OneVsRestClassifier(CalibratedClassifierCV(RidgeClassifier(alpha=15.7), cv=3, ensemble=True), n_jobs=-1)

OVR_RC.fit(X_train, Y_train)
# Y_test_pred = OVR_RC.predict_proba(X_test)

### Voting Classifier 

In [25]:
VC = []
param_alpha = [9, 14, 11, 15.6, 11.8, 20.1]
param_C = [1.4, 1.3, 1.1, 1.3, 0.7, 1.1]

for i in tqdm(range(len(labels))):
	clf1 = LogisticRegression(max_iter=1000, C=param_C[i])
	clf2 = CalibratedClassifierCV(RidgeClassifier(alpha=param_alpha[i]), cv=3, ensemble=True)
	# clf3 = RandomForestClassifier(max_depth=100, n_estimators=500, max_features="sqrt", min_samples_split=100, bootstrap=False, min_samples_leaf=3)
	VC.append(VotingClassifier(estimators=[('lr', clf1), ('rc', clf2)], voting='soft', n_jobs=-1))
	VC[i] = VC[i].fit(x[i], y[i])

100%|██████████| 6/6 [01:18<00:00, 13.08s/it]


In [26]:
# For predicting test labels

Y_test_pred= np.ones((X_test.shape[0], Y_train.shape[1]))

for i in range(len(labels)):
  Y_test_pred[:, i] = VC[i].predict_proba(X_test)[:, 1]

### Random Forest

In [27]:
X_split_train, Y_split_train, X_split_test, Y_split_test = iterative_train_test_split(X_train, Y_train, test_size = 0.1)

RF = RandomForestClassifier(max_depth=100, n_estimators=500, max_features="sqrt", min_samples_split=100, bootstrap=False, min_samples_leaf=3, n_jobs=-1)
RF.fit(X_split_train, Y_split_train)

In [28]:
Y_train_pred = RF.predict_proba(X_split_train)
Y_test_pred = RF.predict_proba(X_split_test)

temp1 = np.ones((Y_split_train.shape[0], Y_split_train.shape[1]))
temp2 = np.ones((X_split_test.shape[0], Y_split_train.shape[1]))

for i in range(6):
	temp1[:, i] = Y_train_pred[i][:, 1]
	temp2[:, i] = Y_test_pred[i][:, 1]

Y_train_pred = temp1
Y_test_pred = temp2

print("CV Score - ", roc_auc_score(y_true=Y_split_test, y_score=Y_test_pred))

CV Score -  0.9774656780918689


## XGBoost

In [5]:
XGB = []
for i in tqdm(range(len(labels))):
	XGB.append(xgb.XGBClassifier(n_jobs=-1, alpha=0.5, max_depth=3))
	XGB[i] = XGB[i].fit(x[i], y[i])

100%|██████████| 6/6 [09:35<00:00, 95.98s/it] 


In [None]:
# Y_test_pred= np.ones((X_test.shape[0], Y_train.shape[1]))

# for i in range(len(labels)):
#   Y_test_pred[:, i] = XGB[i].predict_proba(X_test)[:, 1]

## Grid Search CV code

In [29]:
# from pprint import pprint
# from time import time
# import logging

# from sklearn.model_selection import GridSearchCV

# parameters = {
#     "tfidf__binary": (True, False),
#     "tfidf__max_df": (0.5, 0.75, 1.0),
#     'tfidf__max_features': (None, 25000, 30000, 35000, 40000),
#     "tfidf__ngram_range": ((1, 1), (1, 2), (2, 2)),  # unigrams or bigrams
#     # 'tfidf__use_idf': (True, False),
#     # 'tfidf__norm': ('l1', 'l2'),
#     # "clf__max_iter": (20,),
#     # "clf__alpha": (0.00001, 0.000001),
#     # "clf__penalty": ("l2", "elasticnet"),
#     # 'clf__max_iter': (10, 50, 80),
# }

# grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=3, scoring='roc_auc')

# print("Performing grid search...")
# print("pipeline:", [name for name, _ in pipeline.steps])
# print("parameters:")
# pprint(parameters)
# t0 = time()
# grid_search.fit(X_train, Y_train)
# print("done in %0.3fs" % (time() - t0))
# print()

# print("Best score: %0.3f" % grid_search.best_score_)
# print("Best parameters set:")
# best_parameters = grid_search.best_estimator_.get_params()
# for param_name in sorted(parameters.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [30]:
# grid_search.cv_results_

In [31]:
# pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')

# Exporting test predictions

In [37]:
#Assigning the columns to the obtained data
df_export = pd.DataFrame(data=Y_test_pred, columns=["harsh", "extremely_harsh","vulgar","threatening","disrespect","targeted_hate"])

In [38]:
#appending id in the start
df_export.insert(loc=0, column='id', value=df_test['id'])

In [39]:
#exporting it in the end
df_export.to_csv('prediction.csv',index=False)