In [None]:
import gc
import time
import multiprocessing
import numpy as np
import pandas as pd
from scipy import sparse
from functools import partial
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

In [None]:
# load data
FILE_DIR = '../../input'


def _load_stacked_features(paths):
    """Load the sparse ``.npz`` matrices at ``paths`` and stack them column-wise.

    Parameters
    ----------
    paths : list of str
        Files readable by ``scipy.sparse.load_npz``; all must have the same
        number of rows for the horizontal stack to succeed.

    Returns
    -------
    A single CSR matrix (CSR so that the row indexing done later is supported).
    """
    return sparse.hstack([sparse.load_npz(path) for path in paths]).tocsr()


# create label matrix: one binary column per entry of class_list
train = pd.read_csv(f'{FILE_DIR}/train.csv')
class_list = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
Y = train[class_list].values

# load features (char, word and POS tf-idf blocks, in that column order)
X = _load_stacked_features([
    f'{FILE_DIR}/tfidf/train_clean0_char_tfidf.npz',
    f'{FILE_DIR}/tfidf/train_clean0_word_tfidf.npz',
    f'{FILE_DIR}/tfidf/train_clean0_pos_tfidf.npz',
])

X_test = _load_stacked_features([
    f'{FILE_DIR}/tfidf/test_clean0_char_tfidf.npz',
    f'{FILE_DIR}/tfidf/test_clean0_word_tfidf.npz',
    f'{FILE_DIR}/tfidf/test_clean0_pos_tfidf.npz',
])

# fail fast on misaligned inputs: rows of X are indexed together with rows of
# Y, and the per-feature weights derived from X are applied to X_test
assert X.shape[0] == Y.shape[0], (
    f'feature rows ({X.shape[0]}) do not match label rows ({Y.shape[0]})')
assert X.shape[1] == X_test.shape[1], (
    f'train has {X.shape[1]} features but test has {X_test.shape[1]}')

# optimal parameters
# (the training cell reads the `class_name`, `alpha` and `beta` columns)
nbsvm_log = pd.read_csv(f'{FILE_DIR}/logs/nbsvm100000_log.csv')

In [None]:
def nbsvm_eval(it, j, train_index, valid_index, alpha, beta):
    """Fit one bootstrap bag of an NB-weighted logistic regression for class ``j``.

    Reads the module-level ``X``, ``Y`` and ``X_test`` rather than taking them
    as arguments.

    Parameters
    ----------
    it : int
        Bag number; also used as the seed for the bootstrap resample.
    j : int
        Column of ``Y`` (the class) to model.
    train_index, valid_index : array-like of int
        Row indices of ``X``/``Y`` for the training and validation folds.
    alpha : float
        log10 of the additive smoothing term used in the NB weights.
    beta : float
        log10 of the logistic-regression ``C`` parameter.

    Returns
    -------
    tuple of two 1-D arrays
        Positive-class probabilities for the validation rows and for every
        row of ``X_test``.
    """
    gc.collect()

    # subset data
    X_train, y_train = X[train_index], Y[train_index, j]
    X_valid = X[valid_index]

    # bootstrap sampling: a local RandomState produces the same draws as
    # np.random.seed(it) followed by np.random.randint, without touching the
    # process-wide RNG state
    rng = np.random.RandomState(it)
    bag_indexs = rng.randint(X_train.shape[0], size=X_train.shape[0])
    X_bag, y_bag = X_train[bag_indexs], y_train[bag_indexs]

    # get NB weights: log ratio of the smoothed, normalised per-feature sums of
    # the positive and negative rows (np.asarray turns the np.matrix returned
    # by the sparse .sum(0) into a plain (1, n_features) ndarray)
    smooth = 10**alpha
    p = smooth + np.asarray(X_bag[y_bag == 1].sum(0))
    q = smooth + np.asarray(X_bag[y_bag == 0].sum(0))
    w = np.log((p / p.sum()) / (q / q.sum()))

    # create features: scale every column by its NB weight
    valid_features = X_valid.multiply(w)
    test_features = X_test.multiply(w)
    bag_features = X_bag.multiply(w)

    # fit model on training data
    # dual=True is only implemented by the liblinear solver, which stopped
    # being scikit-learn's default in 0.22, so it has to be named explicitly
    f = LogisticRegression(dual=True, C=10**beta, solver='liblinear')
    f.fit(bag_features, y_bag)

    # make predictions on validation and test data
    return (f.predict_proba(valid_features)[:, 1],
            f.predict_proba(test_features)[:, 1])

In [None]:
# create storage containers
n_folds, n_bags, seed = 10, 5, 0
oof_preds = np.zeros(Y.shape)
test_preds = np.zeros((X_test.shape[0], len(class_list)))

# The splitter does not shuffle, so the folds are contiguous blocks and are the
# same for every class. `random_state` is deliberately not passed: it has no
# effect when shuffle=False and scikit-learn >= 0.24 raises a ValueError for
# that combination. `seed` is still used in the output file names.
kf = KFold(n_splits=n_folds)

for j, class_name in enumerate(class_list):

    # unpack parameters (first logged row for this class)
    class_params = nbsvm_log[nbsvm_log['class_name']==class_name]
    alpha = class_params['alpha'].values[0]
    beta = class_params['beta'].values[0]

    for k, (train_index, valid_index) in enumerate(kf.split(X)):
        print(f'[{k:02}] {class_name}:  ', end='')

        # bagging: one worker process per bag
        start_time = time.time()
        nbsvm_bag = partial(nbsvm_eval, j=j,
                            train_index=train_index,
                            valid_index=valid_index,
                            alpha=alpha,
                            beta=beta)
        # the context manager terminates the pool even if a worker raises
        with multiprocessing.Pool(n_bags) as pool:
            bag_preds = pool.map(nbsvm_bag, range(n_bags))

        # unpack predictions
        bag_auc = []
        for oof_bag_preds, test_bag_preds in bag_preds:
            # average over bags so out-of-fold values stay probabilities on the
            # same [0, 1] scale as the test predictions below
            oof_preds[valid_index,j] += oof_bag_preds/n_bags
            # test rows are scored by every bag of every fold
            test_preds[:,j] += test_bag_preds/n_bags/n_folds
            bag_auc.append(roc_auc_score(Y[valid_index,j], oof_bag_preds))

        print(f'auc: {np.mean(bag_auc):0.5f}  time: {(time.time()-start_time)/60:02.02f} mins')

In [None]:
# NOTE(review): both to_csv calls below use the pandas default and therefore
# write the row index as an extra unnamed first column -- confirm that whatever
# reads these files expects it.

# out-of-fold predictions: one column per class
oof_frame = pd.DataFrame(oof_preds, columns=class_list)
save_name = f'{FILE_DIR}/out_of_fold_preds/nbsvm_bags{n_bags}_validation_seed{seed}.csv'
oof_frame.to_csv(save_name)

# test set predictions: overwrite the class columns of the sample submission
subm = pd.read_csv(f'{FILE_DIR}/sample_submission.csv')
for j, class_name in enumerate(class_list):
    subm[class_name] = test_preds[:, j]

save_name = f'{FILE_DIR}/submissions/aggregate/nbsvm_bags{n_bags}_seed{seed}.csv'
subm.to_csv(save_name)