In [None]:
import multiprocessing
import re
from functools import partial

import numpy as np
import pandas as pd
import regex
from bayes_opt import BayesianOptimization
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RepeatedKFold

In [None]:
# root directory holding the csv labels and the precomputed tf-idf matrices
FILE_DIR = '../../input'

# label matrix: one column per target class, rows follow train.csv
class_list = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train = pd.read_csv(f'{FILE_DIR}/train.csv')
Y = train[class_list].values

# training features: the char, word and pos tf-idf blocks stacked column-wise
X = sparse.hstack([
    sparse.load_npz(path)
    for path in (
        f'{FILE_DIR}/tfidf/train_char_tfidf.npz',
        f'{FILE_DIR}/tfidf/train_word_tfidf.npz',
        f'{FILE_DIR}/tfidf/train_pos_tfidf.npz',
    )
]).tocsr()

# test features: same three blocks in the same order
X_test = sparse.hstack([
    sparse.load_npz(path)
    for path in (
        f'{FILE_DIR}/tfidf/test_char_tfidf.npz',
        f'{FILE_DIR}/tfidf/test_word_tfidf.npz',
        f'{FILE_DIR}/tfidf/test_pos_tfidf.npz',
    )
]).tocsr()

In [None]:
def nbsvm_eval(indexs, j, alpha, beta):
  """Fit a log-count-ratio weighted logistic regression on one fold and score it.

  Reads the module-level feature matrix ``X`` and label matrix ``Y``.

  Parameters
  ----------
  indexs : tuple
    ``(train_index, valid_index)`` pair of row indices into ``X`` and ``Y``.
  j : int
    Column of ``Y`` used as the binary target.
  alpha : float
    log10 of the additive smoothing term used in the log-count ratio.
  beta : float
    log10 of the ``C`` value passed to ``LogisticRegression``.

  Returns
  -------
  float
    ROC AUC of the predicted positive-class probabilities on the
    validation rows.
  """

  # unpack data
  train_index, valid_index = indexs
  X_train, y_train = X[train_index], Y[train_index, j]
  X_valid, y_valid = X[valid_index], Y[valid_index, j]

  # create features: scale every column by its smoothed log-count ratio
  # between the positive (label 1) and negative (label 0) training rows
  smoothing = 10**alpha
  p = smoothing + X_train[y_train == 1].sum(0)
  q = smoothing + X_train[y_train == 0].sum(0)
  w = np.log((p / np.sum(p)) / (q / np.sum(q)))
  train_features = X_train.multiply(w)
  valid_features = X_valid.multiply(w)

  # fit model on training data
  # The dual formulation is only implemented by the liblinear solver, so the
  # solver is requested explicitly rather than relying on the library default
  # (which is no longer liblinear in recent scikit-learn releases).
  f = LogisticRegression(dual=True, C=10**beta, solver='liblinear')
  f.fit(train_features, y_train)

  # make predictions on validation data
  yhat = f.predict_proba(valid_features)[:, 1]
  return roc_auc_score(y_valid, yhat)

def nbsvm_cv(j, alpha=0.0, beta=0.0, n_procs=10, random_state=0):
  """Mean cross-validated ROC AUC of ``nbsvm_eval`` for one target column.

  Parameters
  ----------
  j : int
    Column of ``Y`` used as the binary target.
  alpha : float
    Forwarded to ``nbsvm_eval`` (log10 of the smoothing term).
  beta : float
    Forwarded to ``nbsvm_eval`` (log10 of the logistic regression ``C``).
  n_procs : int
    Number of worker processes used to evaluate the folds.
  random_state : int or None
    Seed for the fold splitter. With a fixed seed every call scores its
    hyperparameters on the same folds, so results from different calls are
    directly comparable. Pass ``None`` for fresh random folds on each call.

  Returns
  -------
  float
    Mean of the per-fold ROC AUC scores (5 splits x 2 repeats).
  """

  # create folds
  fold_iterator = RepeatedKFold(n_splits=5, n_repeats=2, random_state=random_state)
  index_list = list(fold_iterator.split(X))

  # process folds in parallel
  # NOTE(review): nbsvm_eval reads the module-level X and Y, so the worker
  # processes must be able to see them (presumably via fork) -- confirm on
  # platforms that spawn workers instead.
  f_eval = partial(nbsvm_eval, j=j, alpha=alpha, beta=beta)
  # the context manager terminates the pool even when a fold raises
  with multiprocessing.Pool(n_procs) as pool:
    scores = pool.map(f_eval, index_list)

  return np.mean(scores)

In [None]:
# best score and hyperparameters per target, appended in class_list order
results = []
# search ranges for the exponents: nbsvm_eval uses 10**alpha and 10**beta
param_lims = {'alpha': (-2, 0), 'beta': (-2,0)}

for j, class_name in enumerate(class_list):

  print(f'Target: {class_name}')
  # bind the target column so the optimizer only varies alpha and beta
  nbsvm_opt = partial(nbsvm_cv, j)
  BO = BayesianOptimization(nbsvm_opt, param_lims)
  BO.maximize(init_points=5, n_iter=5)

  # Older bayes_opt releases expose the best point as
  # res['max'] = {'max_val', 'max_params'}; newer releases make `res` a list
  # and expose the best point as max = {'target', 'params'}. Support both.
  if isinstance(BO.res, dict):
    best_val = BO.res['max']['max_val']
    best_params = BO.res['max']['max_params']
  else:
    best_val = BO.max['target']
    best_params = BO.max['params']

  results.append({
    'val_auc': best_val,
    'alpha':   best_params['alpha'],
    'beta':    best_params['beta'],
  })

In [None]:
# one row per entry of class_list (same order): best CV AUC and the alpha/beta that produced it
pd.DataFrame(results)