In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import random

from sklearn import metrics
from collections import Counter
import argparse
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import ParameterGrid
import lightgbm as lgb

In [4]:
if os.name=='posix':
    # GOOGLE COLAB SETUP

    # Load the Drive helper and mount
    from google.colab import drive

    # This will prompt for authorization.
    drive.mount('/content/drive')
    
    #2. Get the file
    data_path     = 'drive/My Drive/Colab Notebooks/adaptHAN/AOBDL_code/data'
    codes_path    = 'drive/My Drive/Colab Notebooks/adaptHAN/AOBDL_code/AOBDL_TML'


    #3. Read file as panda dataframe
    train         = pd.read_csv(f'{data_path}/train_cleaned_no_punkt.csv') 
    test_labelled = pd.read_csv(f'{data_path}/test_labelled_cleaned_no_punkt.csv') 
    test_unlabelled = pd.read_csv(f'{data_path}/test_unlabelled_cleaned_no_punkt.csv') 
else:
        #2. Get the file
    data_path     = '../data'
    codes_path    = '../AOBDL_TML'


    #3. Read file as panda dataframe
    train         = pd.read_csv(f'{data_path}/train_cleaned_no_punkt.csv') 
    test_labelled = pd.read_csv(f'{data_path}/test_labelled_cleaned_no_punkt.csv') 
    test_unlabelled = pd.read_csv(f'{data_path}/test_unlabelled_cleaned_no_punkt.csv')

In [5]:
train['mal'] = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) >= 1  
train.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
train.comment_text.fillna("empty", inplace=True)
train = train.drop_duplicates(subset=['comment_text', 'mal'])

test_labelled['mal'] = test_labelled[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) >= 1  
test_labelled.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
test_labelled.comment_text.fillna("empty", inplace=True)
test_labelled = test_labelled.drop_duplicates(subset=['comment_text'])

test_unlabelled.comment_text.fillna("empty", inplace=True)
test_unlabelled = test_unlabelled.drop_duplicates(subset=['comment_text'])

In [6]:
rs=42
def strat_split(strat=False):
  global rs
  if strat:
      from sklearn.model_selection import train_test_split      
      X_train1, X_test1, y_train1, y_test1  = train_test_split(train.drop('mal', axis=1), train.mal, stratify=train.mal, test_size=0.29, random_state=rs )
      X_train2, X_test2, y_train2, y_test2  = train_test_split(test_labelled.drop('mal', axis=1), test_labelled.mal, stratify=test_labelled.mal, test_size=0.29, random_state=rs)
      X = np.concatenate((X_train1.comment_text, X_train2.comment_text))
      y = np.concatenate((y_train1, y_train2))

      X_test = np.concatenate((X_test1.comment_text, X_test2.comment_text))
      y_test = np.concatenate((y_test1, y_test2))
  else:
      X = np.concatenate((train.comment_text, test_labelled.comment_text))
      y = np.concatenate((train.mal, test_labelled.mal))
      from sklearn.utils import shuffle
      from sklearn.model_selection import train_test_split
      X = shuffle(X, random_state=rs)
      y = shuffle(y, random_state=rs)      
      X, X_test, y, y_test  = train_test_split(X, y, stratify=y, test_size=0.3, random_state=rs )
      
  return X, X_test, y, y_test    
X, X_test, y, y_test = strat_split(True)

In [7]:
# Ridge

In [8]:
kf  = StratifiedKFold(n_splits=5, random_state=rs)
auc = []
roc = []
fscore_ = []
c   = 0

word_vectorizer   = TfidfVectorizer(
    sublinear_tf  = True,
    strip_accents = 'unicode',
    analyzer      = 'word',
    token_pattern = r'\w{1,}',
    stop_words    = 'english',
    ngram_range   = (1, 1),
    max_features  = 40000)



In [9]:
#auc_pr = 0.093434375824395
#auc_roc = 0.02343524395
c = 0
C_parameter = np.arange(0.1, 1, 0.1) 

# use best C

C_parameter = [1]

In [10]:
for c_p in C_parameter:  
      for c, (train_index, val_index) in enumerate(kf.split(X, y)):

            X_train, X_val      = X[train_index], X[val_index]
            y_train, y_val      = y[train_index], y[val_index] 
            word_vectorizer.fit(X_train)
            train_word_features = word_vectorizer.transform(X_train)
            val_word_features   = word_vectorizer.transform(X_val)
            y_train             = y_train.astype('int')
            y_val               = y_val.astype('int')
            classifier          = LogisticRegression(C=c_p, solver='sag')
            classifier.fit(train_word_features, y_train)
            probs               = classifier.predict_proba(val_word_features)[:,1]
            auc_roc             = roc_auc_score(y_val, probs)
            auc_pr              = average_precision_score(y_val, probs)
            
            threshold = 0.3
            probs_class = probs.copy()
            probs_class[probs_class >= threshold] = 1 
            probs_class[probs_class < threshold] = 0
            precision = precision_score(y_val, probs_class) 
            recall    = recall_score(y_val, probs_class)
            fscore    = f1_score(y_val, probs_class)
            print(f' {threshold} fold {c} precision {round(precision, 3)} recall {round(recall, 3)} fscore {round(fscore,3)}')

            if len(C_parameter)==1:
                # print performance
                print(f'---------------------------------------------')
                print(f'FOLD {c}: AUC PR-C = {round(auc_pr, 3)}, AUC ROC = {round(auc_roc, 3)}')
                print(f'---------------------------------------------')
                print(f'')

            auc.append(auc_pr)
            roc.append(auc_roc)
            fscore_.append(fscore)

      if len(C_parameter)!=1:
             print(f'PARAMETER C = {c_p}')

      # print performance
      print(f'-----------------------------------------------')
      print(f'CV average: AUC PR-C = {round(np.array(auc).mean(), 3)}, AUC ROC = {round(np.array(roc).mean(), 3)}, FSCORE  = {round(np.array(fscore_).mean(), 3)}')
      print(f'-----------------------------------------------')
      print(f'')

 0.3 fold 0 precision 0.906 recall 0.655 fscore 0.76
---------------------------------------------
FOLD 0: AUC PR-C = 0.869, AUC ROC = 0.969
---------------------------------------------

 0.3 fold 1 precision 0.894 recall 0.66 fscore 0.76
---------------------------------------------
FOLD 1: AUC PR-C = 0.866, AUC ROC = 0.968
---------------------------------------------

 0.3 fold 2 precision 0.896 recall 0.668 fscore 0.766
---------------------------------------------
FOLD 2: AUC PR-C = 0.867, AUC ROC = 0.968
---------------------------------------------

 0.3 fold 3 precision 0.727 recall 0.72 fscore 0.723
---------------------------------------------
FOLD 3: AUC PR-C = 0.814, AUC ROC = 0.965
---------------------------------------------

 0.3 fold 4 precision 0.592 recall 0.819 fscore 0.687
---------------------------------------------
FOLD 4: AUC PR-C = 0.776, AUC ROC = 0.963
---------------------------------------------

-----------------------------------------------
CV average:

In [17]:
# TRAIN ON WHOLE DAATA AND PREDICT ON TEST
word_vectorizer.fit(X)
train_word_features  = word_vectorizer.transform(X)
test_word_features   = word_vectorizer.transform(X_test)
classifier           = LogisticRegression(C=1, solver='sag')
classifier.fit(train_word_features, y)
probs                = classifier.predict_proba(test_word_features)[:,1]
auc_roc              = roc_auc_score(y_test, probs)
auc_pr               = average_precision_score(y_test, probs)

# print performance
print(f'-----------------------------------------')
print(f'TEST: AUC PR-C = {round(auc_pr, 4)}, AUC ROC = {round(auc_roc, 4)}')
print(f'-----------------------------------------')
print(f'')

threshold = 0.3
probs_class = probs.copy()
probs_class[probs_class >= threshold] = 1 
probs_class[probs_class < threshold] = 0
precision = precision_score(y_test, probs_class) 
recall    = recall_score(y_test, probs_class)
fscore    = f1_score(y_test, probs_class)
print(f' {threshold} precision {round(precision, 3)} recall {round(recall, 3)} fscore {round(fscore,3)}')


-----------------------------------------
TEST: AUC PR-C = 0.8328, AUC ROC = 0.967
-----------------------------------------

 0.3 precision 0.787 recall 0.707 fscore 0.745


In [None]:
# RF

In [6]:
kf  = StratifiedKFold(n_splits=5, random_state=rs)
auc = []
roc = []
fscore_ = []
c   = 0

word_vectorizer   = TfidfVectorizer(
    sublinear_tf  = True,
    strip_accents = 'unicode',
    analyzer      = 'word',
    token_pattern = r'\w{1,}',
    stop_words    = 'english',
    ngram_range   = (1, 1),
    max_features  = 40000)

In [20]:
for c, (train_index, val_index) in enumerate(kf.split(X, y)):

            X_train, X_val      = X[train_index], X[val_index]
            y_train, y_val      = y[train_index], y[val_index] 
            word_vectorizer.fit(X_train)
            train_word_features = word_vectorizer.transform(X_train)
            val_word_features   = word_vectorizer.transform(X_val)
            y_train             = y_train.astype('int')
            y_val               = y_val.astype('int')
            classifier          = RandomForestClassifier(n_estimators=600, max_depth=None, max_features='auto', 
                                    min_samples_split=2, verbose = True, n_jobs=20)
            classifier.fit(train_word_features, y_train)
            probs               = classifier.predict_proba(val_word_features)[:,1]
            auc_roc             = roc_auc_score(y_val, probs)
            auc_pr              = average_precision_score(y_val, probs)

            
            # print performance
            print(f'---------------------------------------------')
            print(f'FOLD {c}: AUC PR-C = {round(auc_pr, 3)}, AUC ROC = {round(auc_roc, 3)}')
            print(f'---------------------------------------------')
            print(f'')

            auc.append(auc_pr)
            roc.append(auc_roc)
            
            threshold = 0.3
            probs_class = probs.copy()
            probs_class[probs_class >= threshold] = 1 
            probs_class[probs_class < threshold] = 0
            precision = precision_score(y_val, probs_class) 
            recall    = recall_score(y_val, probs_class)
            fscore    = f1_score(y_val, probs_class)
            print(f' {threshold} fold {c} precision {round(precision, 3)} recall {round(recall, 3)} fscore {round(fscore,3)}')

            fscore_.append(fscore)

# print performance
print(f'-----------------------------------------------')
print(f'CV average: AUC PR-C = {round(np.array(auc).mean(), 3)}, AUC ROC = {round(np.array(roc).mean(), 3)}, FSCORE  = {round(np.array(fscore_).mean(), 3)}')
print(f'-----------------------------------------------')
print(f'')

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    5.9s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   47.9s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:  2.0min
[Parallel(n_jobs=20)]: Done 600 out of 600 | elapsed:  2.9min finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:    2.0s
[Parallel(n_jobs=20)]: Done 600 out of 600 | elapsed:    3.0s finished


---------------------------------------------
FOLD 0: AUC PR-C = 0.845, AUC ROC = 0.96
---------------------------------------------

 0.3 fold 0 precision 0.836 recall 0.707 fscore 0.766


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    4.9s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   47.7s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:  2.0min
[Parallel(n_jobs=20)]: Done 600 out of 600 | elapsed:  2.9min finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:    2.0s
[Parallel(n_jobs=20)]: Done 600 out of 600 | elapsed:    2.9s finished


---------------------------------------------
FOLD 1: AUC PR-C = 0.854, AUC ROC = 0.964
---------------------------------------------

 0.3 fold 1 precision 0.835 recall 0.72 fscore 0.773


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    5.8s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   47.9s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:  2.0min
[Parallel(n_jobs=20)]: Done 600 out of 600 | elapsed:  2.8min finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:    2.0s
[Parallel(n_jobs=20)]: Done 600 out of 600 | elapsed:    2.9s finished


---------------------------------------------
FOLD 2: AUC PR-C = 0.848, AUC ROC = 0.961
---------------------------------------------

 0.3 fold 2 precision 0.833 recall 0.708 fscore 0.766


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    5.2s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   46.2s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:  1.9min
[Parallel(n_jobs=20)]: Done 600 out of 600 | elapsed:  2.8min finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:    2.1s
[Parallel(n_jobs=20)]: Done 600 out of 600 | elapsed:    3.0s finished


---------------------------------------------
FOLD 3: AUC PR-C = 0.788, AUC ROC = 0.959
---------------------------------------------

 0.3 fold 3 precision 0.662 recall 0.789 fscore 0.72


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    5.6s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   44.2s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:  1.8min
[Parallel(n_jobs=20)]: Done 600 out of 600 | elapsed:  2.6min finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:    2.1s


---------------------------------------------
FOLD 4: AUC PR-C = 0.738, AUC ROC = 0.961
---------------------------------------------

 0.3 fold 4 precision 0.511 recall 0.899 fscore 0.652
-----------------------------------------------
CV average: AUC PR-C = 0.83, AUC ROC = 0.963, FSCORE  = 0.737
-----------------------------------------------



[Parallel(n_jobs=20)]: Done 600 out of 600 | elapsed:    3.2s finished


In [21]:
# TRAIN ON WHOLE DAATA AND PREDICT ON TEST
word_vectorizer.fit(X)
train_word_features  = word_vectorizer.transform(X)
test_word_features   = word_vectorizer.transform(X_test)
classifier           = RandomForestClassifier(n_estimators=600, max_depth=None, max_features='auto', 
                                    min_samples_split=2, verbose = True, n_jobs=20)
classifier.fit(train_word_features, y)
probs                = classifier.predict_proba(test_word_features)[:,1]
auc_roc              = roc_auc_score(y_test, probs)
auc_pr               = average_precision_score(y_test, probs)

# print performance
print(f'-----------------------------------------')
print(f'TEST: AUC PR-C = {round(auc_pr, 3)}, AUC ROC = {round(auc_roc, 3)}')
print(f'-----------------------------------------')
print(f'')

threshold = 0.3
probs_class = probs.copy()
probs_class[probs_class >= threshold] = 1 
probs_class[probs_class < threshold] = 0
precision = precision_score(y_test, probs_class) 
recall    = recall_score(y_test, probs_class)
fscore    = f1_score(y_test, probs_class)
print(f' {threshold} precision {round(precision, 3)} recall {round(recall, 3)} fscore {round(fscore,3)}')


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    8.7s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:  1.2min
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:  3.0min
[Parallel(n_jobs=20)]: Done 600 out of 600 | elapsed:  4.4min finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    3.2s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:    7.9s
[Parallel(n_jobs=20)]: Done 600 out of 600 | elapsed:   11.6s finished


-----------------------------------------
TEST: AUC PR-C = 0.814, AUC ROC = 0.962
-----------------------------------------

 0.3 precision 0.716 recall 0.766 fscore 0.741


In [22]:
# SVM

In [23]:
penalty = ['l2', 'l1']
alpha = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1]
max_iter = [1000, 10000, 15000]

# best parameters
penalty = ['l1']
alpha = [0.00001]
max_iter = [1000] 

for p in penalty:
  for a in alpha:
    for i in max_iter:
      auc = []
      roc = []
      fscore_ = []
      c = 0
      # print performance
      print(f'-------------')
      print(f'penalty {p}')
      print(f'alpha {a}')
      print(f'max_iter {i}')
      print(f'-------------')
      
      for c, (train_index, val_index) in enumerate(kf.split(X, y)):

            X_train, X_val      = X[train_index], X[val_index]
            y_train, y_val      = y[train_index], y[val_index] 
            word_vectorizer.fit(X_train)
            train_word_features = word_vectorizer.transform(X_train)
            val_word_features   = word_vectorizer.transform(X_val)
            y_train             = y_train.astype('int')
            y_val               = y_val.astype('int')
            classifier          = SGDClassifier(n_jobs=20, random_state=rs, loss='log', shuffle=False, 
                                    penalty=p, alpha=a, max_iter=i)
            classifier.fit(train_word_features, y_train)
            probs               = classifier.predict_proba(val_word_features)[:,1]
            auc_roc             = roc_auc_score(y_val, probs)
            auc_pr              = average_precision_score(y_val, probs)
            
            
            print(f'---------------------------------------------')
            print(f'FOLD {c}: AUC PR-C = {round(auc_pr, 3)}, AUC ROC = {round(auc_roc, 3)}')
            print(f'---------------------------------------------')
            print(f'')

            auc.append(auc_pr)
            roc.append(auc_roc)
            threshold = 0.3
            probs_class = probs.copy()
            probs_class[probs_class >= threshold] = 1 
            probs_class[probs_class < threshold] = 0
            precision = precision_score(y_val, probs_class) 
            recall    = recall_score(y_val, probs_class)
            fscore    = f1_score(y_val, probs_class)
            print(f' {threshold} fold {c} precision {round(precision, 3)} recall {round(recall, 3)} fscore {round(fscore,3)}')

            fscore_.append(fscore)

# print performance
print(f'-----------------------------------------------')
print(f'CV average: AUC PR-C = {round(np.array(auc).mean(), 3)}, AUC ROC = {round(np.array(roc).mean(), 3)}, FSCORE  = {round(np.array(fscore_).mean(), 3)}')
print(f'-----------------------------------------------')
print(f'')

-------------
penalty l1
alpha 1e-05
max_iter 1000
-------------
---------------------------------------------
FOLD 0: AUC PR-C = 0.862, AUC ROC = 0.964
---------------------------------------------

 0.3 fold 0 precision 0.939 recall 0.584 fscore 0.72
---------------------------------------------
FOLD 1: AUC PR-C = 0.862, AUC ROC = 0.962
---------------------------------------------

 0.3 fold 1 precision 0.931 recall 0.599 fscore 0.729
---------------------------------------------
FOLD 2: AUC PR-C = 0.864, AUC ROC = 0.963
---------------------------------------------

 0.3 fold 2 precision 0.942 recall 0.591 fscore 0.726
---------------------------------------------
FOLD 3: AUC PR-C = 0.812, AUC ROC = 0.963
---------------------------------------------

 0.3 fold 3 precision 0.777 recall 0.662 fscore 0.715
---------------------------------------------
FOLD 4: AUC PR-C = 0.781, AUC ROC = 0.963
---------------------------------------------

 0.3 fold 4 precision 0.623 recall 0.794 fsco

In [24]:
# TRAIN ON WHOLE DAATA AND PREDICT ON TEST
word_vectorizer.fit(X)
train_word_features  = word_vectorizer.transform(X)
test_word_features   = word_vectorizer.transform(X_test)
classifier           = SGDClassifier(n_jobs=20, random_state=rs, loss='log', shuffle=False, 
                                    penalty=penalty[0], alpha=alpha[0], max_iter=max_iter[0])
classifier.fit(train_word_features, y)
probs                = classifier.predict_proba(test_word_features)[:,1]
auc_roc              = roc_auc_score(y_test, probs)
auc_pr               = average_precision_score(y_test, probs)

# print performance
print(f'-----------------------------------------')
print(f'TEST: AUC PR-C = {round(auc_pr, 3)}, AUC ROC = {round(auc_roc, 3)}')
print(f'-----------------------------------------')
print(f'')

threshold = 0.3
probs_class = probs.copy()
probs_class[probs_class >= threshold] = 1 
probs_class[probs_class < threshold] = 0
precision = precision_score(y_test, probs_class) 
recall    = recall_score(y_test, probs_class)
fscore    = f1_score(y_test, probs_class)
print(f' {threshold} precision {round(precision, 3)} recall {round(recall, 3)} fscore {round(fscore,3)}')


-----------------------------------------
TEST: AUC PR-C = 0.827, AUC ROC = 0.962
-----------------------------------------

 0.3 precision 0.833 recall 0.638 fscore 0.722


In [25]:
# lightgbm

In [26]:
# muner of rounds
max_rounds = 600
stopping   = 600
verbose    = 200

# LGB parameters
lgb_params = {
    'boosting_type':     'gbdt',
    'objective':         'binary',
    'metrics':           'binary_logloss',
    'bagging_fraction':  0.9,
    'feature_fraction':  0.8,
    'lambda_l1':         0.1,
    'lambda_l2':         0.1,
    'min_split_gain':    0.01,
    'min_child_weight':  2,
    'min_child_samples': 20,
    'silent':            True,
    'verbosity':         100,
    'learning_rate':     0.1,
    'max_depth':         7,
    'num_leaves':        70,
    'scale_pos_weight':  1,
    'n_estimators':      max_rounds,
    'nthread' :          20,
    'random_state':      rs,
}

In [27]:
auc = []
roc = []
fscore_ = []
for c, (train_index, val_index) in enumerate(kf.split(X, y)):

            X_train, X_val      = X[train_index], X[val_index]
            y_train, y_val      = y[train_index], y[val_index] 
            word_vectorizer.fit(X_train)
            train_word_features = word_vectorizer.transform(X_train)
            val_word_features   = word_vectorizer.transform(X_val)
            y_train             = y_train.astype('int')
            y_val               = y_val.astype('int')
            classifier          = lgb.LGBMClassifier(**lgb_params) 
            classifier.fit(train_word_features, y_train)
            probs               = classifier.predict_proba(val_word_features)[:,1]
            auc_roc             = roc_auc_score(y_val, probs)
            auc_pr              = average_precision_score(y_val, probs)

            
            # print performance
            print(f'---------------------------------------------')
            print(f'FOLD {c}: AUC PR-C = {round(auc_pr, 3)}, AUC ROC = {round(auc_roc, 3)}')
            print(f'---------------------------------------------')
            print(f'')

            auc.append(auc_pr)
            roc.append(auc_roc)
            threshold = 0.3
            probs_class = probs.copy()
            probs_class[probs_class >= threshold] = 1 
            probs_class[probs_class < threshold] = 0
            precision = precision_score(y_val, probs_class) 
            recall    = recall_score(y_val, probs_class)
            fscore    = f1_score(y_val, probs_class)
            print(f' {threshold} fold {c} precision {round(precision, 3)} recall {round(recall, 3)} fscore {round(fscore,3)}')

            fscore_.append(fscore)

# print performance
print(f'-----------------------------------------------')
print(f'CV average: AUC PR-C = {round(np.array(auc).mean(), 3)}, AUC ROC = {round(np.array(roc).mean(), 3)}, FSCORE  = {round(np.array(fscore_).mean(), 3)}')
print(f'-----------------------------------------------')
print(f'')

---------------------------------------------
FOLD 0: AUC PR-C = 0.851, AUC ROC = 0.96
---------------------------------------------

 0.3 fold 0 precision 0.895 recall 0.654 fscore 0.756
---------------------------------------------
FOLD 1: AUC PR-C = 0.854, AUC ROC = 0.959
---------------------------------------------

 0.3 fold 1 precision 0.887 recall 0.664 fscore 0.759
---------------------------------------------
FOLD 2: AUC PR-C = 0.85, AUC ROC = 0.958
---------------------------------------------

 0.3 fold 2 precision 0.894 recall 0.662 fscore 0.761
---------------------------------------------
FOLD 3: AUC PR-C = 0.798, AUC ROC = 0.956
---------------------------------------------

 0.3 fold 3 precision 0.719 recall 0.724 fscore 0.721
---------------------------------------------
FOLD 4: AUC PR-C = 0.764, AUC ROC = 0.958
---------------------------------------------

 0.3 fold 4 precision 0.574 recall 0.834 fscore 0.68
-----------------------------------------------
CV average

In [28]:
# TRAIN ON WHOLE DAATA AND PREDICT ON TEST
word_vectorizer.fit(X)
train_word_features  = word_vectorizer.transform(X)
test_word_features   = word_vectorizer.transform(X_test)
classifier           = lgb.LGBMClassifier(**lgb_params) 
classifier.fit(train_word_features, y)
probs                = classifier.predict_proba(test_word_features)[:,1]
auc_roc              = roc_auc_score(y_test, probs)
auc_pr               = average_precision_score(y_test, probs)

# print performance
print(f'-----------------------------------------')
print(f'TEST: AUC PR-C = {round(auc_pr, 3)}, AUC ROC = {round(auc_roc, 3)}')
print(f'-----------------------------------------')
print(f'')

threshold = 0.3
probs_class = probs.copy()
probs_class[probs_class >= threshold] = 1 
probs_class[probs_class < threshold] = 0
precision = precision_score(y_test, probs_class) 
recall    = recall_score(y_test, probs_class)
fscore    = f1_score(y_test, probs_class)
print(f' {threshold} precision {round(precision, 3)} recall {round(recall, 3)} fscore {round(fscore,3)}')


-----------------------------------------
TEST: AUC PR-C = 0.819, AUC ROC = 0.958
-----------------------------------------

 0.3 precision 0.77 recall 0.705 fscore 0.736
