In [12]:
import pandas as pd
import numpy as np
from scipy import sparse
import optuna
from matplotlib import pyplot as plt
import re
import string

%matplotlib inline

In [2]:
train_df = pd.read_parquet('./data/train.parquet')
test_df = pd.read_parquet('./data/test.parquet')
submission = pd.read_csv('./data/sample_submission.csv')

In [3]:
labels = train_df.target
train_data = train_df[['Title', 'Body']]
test_data = test_df[['Title', 'Body']]

In [4]:
def split_tags(text):
    return text[1: -1].split('><')

tags = pd.concat([train_df['Tags'], test_df['Tags']])
tags = tags.apply(lambda x: split_tags(x))

In [5]:
unique_tags = []
for _, value in tags.items():
    unique_tags += value
unique_tags, tags_counts = np.unique(unique_tags, return_counts=True)

In [6]:
sorted_idxs = np.argsort(tags_counts)[::-1]
tags_counts = tags_counts[sorted_idxs]
unique_tags = unique_tags[sorted_idxs]

In [7]:
n_tags = 1000
n_train, n_test = train_data.shape[0], test_data.shape[0]
top_tags = unique_tags[:n_tags]
tags_features = np.zeros((n_train + n_test, n_tags), dtype=np.int8)

for i, (_, tags_list) in enumerate(tags.items()):
    for j, tag in enumerate(top_tags):
        if tag in tags_list:
            tags_features[i: j] = 1

tags_train = sparse.csr_matrix(tags_features[:n_train])
tags_test = sparse.csr_matrix(tags_features[n_train:])

In [29]:
from sklearn.model_selection import train_test_split
x_train, x_val, x_train_tags, x_val_tags, y_train, y_val = train_test_split(train_data, tags_train, labels, test_size=.1, stratify=labels, random_state=0)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

def apply_tfidf(train_data, test_data):
    tfidf = TfidfVectorizer()
    train_tfidf = tfidf.fit_transform(train_data)
    test_tfidf = tfidf.transform(test_data)
    return train_tfidf, test_tfidf

In [31]:
x_train_title_tfidf, x_val_title_tfidf = apply_tfidf(x_train.Title, x_val.Title)
x_train_body_tfidf, x_val_body_tfidf = apply_tfidf(x_train.Body, x_val.Body)

In [56]:
from sklearn.ensemble import StackingClassifier

def get_log_reg(trial=None, C=1):
    from sklearn.linear_model import LogisticRegression

    if trial is not None:
        C = trial.suggest_loguniform('C', 1e-5, 1e+3)

    return LogisticRegression(C=C, max_iter=10000, random_state=0)

def get_stacking_1(trial=None,
                   n_neighbors=5,
                   n_estimators=100, min_samples_split=2, min_samples_leaf=1,
                   C=1):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression

    if trial is not None:
        n_neighbors = trial.suggest_int('n_neighbors', 5, 100)
        n_estimators = trial.suggest_int('n_estimators', 50, 500)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        C = trial.suggest_loguniform('C', 1e-5, 1e+3)

    estimators = [
        ('knn', KNeighborsClassifier(n_neighbors=n_neighbors)),
        ('rf', RandomForestClassifier(n_estimators=n_estimators, min_samples_split=min_samples_split,
                                      min_samples_leaf=min_samples_leaf, random_state=0))
    ]
    final = LogisticRegression(C=C, max_iter=10000, random_state=0)

    return StackingClassifier(estimators=estimators, final_estimator=final)

In [43]:
from sklearn.metrics import roc_auc_score, accuracy_score

def objective_function(estimator_func, train_data, train_labels, test_data, test_labels=None, trial=None):
    estimator = estimator_func(trial=trial).fit(train_data, train_labels)
    if test_labels is not None:
        prediction = estimator.predict(test_data)
        return accuracy_score(test_labels, prediction)
        # prob = estimator.predict_proba(test_data)
        # return roc_auc_score(test_labels, prob, multi_class='ovr')
    else:
        return estimator.predict(test_data)

In [46]:
def optimize(estimator_func, train_data, train_labels, test_data, test_labels, n_trials=10):
    study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=123),
                                direction='maximize',
                                pruner=optuna.pruners.HyperbandPruner())
    objective = lambda trial: objective_function(estimator_func, train_data, train_labels, test_data, test_labels, trial)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study


In [61]:
from sklearnex import patch_sklearn, unpatch_sklearn
# patch_sklearn()
# unpatch_sklearn()
patch_sklearn('knn_classifier')

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [35]:
x_train = sparse.hstack([x_train_title_tfidf, x_train_body_tfidf, x_train_tags])
x_val = sparse.hstack([x_val_title_tfidf, x_val_body_tfidf, x_val_tags])

In [36]:
x_train_title_tfidf.shape, x_train_body_tfidf.shape, x_train_tags.shape

((43200, 19964), (43200, 190892), (43200, 1000))

In [37]:
x_val_title_tfidf.shape, x_val_body_tfidf.shape, x_val_tags.shape

((4800, 19964), (4800, 190892), (4800, 1000))

## Roc-auc LogReg

In [38]:
study_log_reg = optimize(get_log_reg, x_train, y_train, x_val, y_val)

[32m[I 2021-11-25 21:35:27,403][0m A new study created in memory with name: no-name-9f8441d0-e565-4efd-8ea5-86b04b976c06[0m
  self._init_valid()
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
 10%|█         | 1/10 [02:07<19:03, 127.08s/it]

[32m[I 2021-11-25 21:37:34,487][0m Trial 0 finished with value: 0.9486876302083332 and parameters: {'C': 3.730383528143731}. Best is trial 0 with value: 0.9486876302083332.[0m


 20%|██        | 2/10 [02:11<07:20, 55.01s/it] 

[32m[I 2021-11-25 21:37:39,054][0m Trial 1 finished with value: 0.8697731119791667 and parameters: {'C': 0.0019458738403480128}. Best is trial 1 with value: 0.8697731119791667.[0m


 30%|███       | 3/10 [02:15<03:41, 31.69s/it]

[32m[I 2021-11-25 21:37:42,984][0m Trial 2 finished with value: 0.8545449218750001 and parameters: {'C': 0.0006528473243309113}. Best is trial 2 with value: 0.8545449218750001.[0m


 40%|████      | 4/10 [02:56<03:32, 35.47s/it]

[32m[I 2021-11-25 21:38:24,260][0m Trial 3 finished with value: 0.9472223958333332 and parameters: {'C': 0.25734643279726915}. Best is trial 2 with value: 0.8545449218750001.[0m


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
 50%|█████     | 5/10 [05:10<05:55, 71.02s/it]

[32m[I 2021-11-25 21:40:38,315][0m Trial 4 finished with value: 0.9470489583333334 and parameters: {'C': 5.698384608345687}. Best is trial 2 with value: 0.8545449218750001.[0m


 60%|██████    | 6/10 [05:28<03:31, 52.91s/it]

[32m[I 2021-11-25 21:40:56,063][0m Trial 5 finished with value: 0.9186776692708332 and parameters: {'C': 0.024257815076676004}. Best is trial 2 with value: 0.8545449218750001.[0m


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
 70%|███████   | 7/10 [07:36<03:51, 77.29s/it]

[32m[I 2021-11-25 21:43:03,561][0m Trial 6 finished with value: 0.9338194010416667 and parameters: {'C': 701.6387837751602}. Best is trial 2 with value: 0.8545449218750001.[0m


 80%|████████  | 8/10 [09:21<02:52, 86.34s/it]

[32m[I 2021-11-25 21:44:49,284][0m Trial 7 finished with value: 0.9494252604166666 and parameters: {'C': 3.0104949891579693}. Best is trial 2 with value: 0.8545449218750001.[0m


 90%|█████████ | 9/10 [09:50<01:08, 68.36s/it]

[32m[I 2021-11-25 21:45:18,097][0m Trial 8 finished with value: 0.9352186197916666 and parameters: {'C': 0.0703809641382708}. Best is trial 2 with value: 0.8545449218750001.[0m


100%|██████████| 10/10 [10:03<00:00, 60.34s/it]

[32m[I 2021-11-25 21:45:30,821][0m Trial 9 finished with value: 0.9074848958333334 and parameters: {'C': 0.013706928443177698}. Best is trial 2 with value: 0.8545449218750001.[0m





## Accuracy LogReg

In [47]:
study_acc_log_reg = optimize(get_log_reg, x_train, y_train, x_val, y_val)

[32m[I 2021-11-25 22:11:11,082][0m A new study created in memory with name: no-name-42d2eed7-2f8d-4d34-b164-f91575674cb4[0m
  self._init_valid()
 10%|█         | 1/10 [02:25<21:50, 145.64s/it]

[32m[I 2021-11-25 22:13:36,727][0m Trial 0 finished with value: 0.8189583333333333 and parameters: {'C': 3.730383528143731}. Best is trial 0 with value: 0.8189583333333333.[0m


 20%|██        | 2/10 [02:30<08:20, 62.56s/it] 

[32m[I 2021-11-25 22:13:41,131][0m Trial 1 finished with value: 0.7108333333333333 and parameters: {'C': 0.0019458738403480128}. Best is trial 0 with value: 0.8189583333333333.[0m


 30%|███       | 3/10 [02:34<04:10, 35.84s/it]

[32m[I 2021-11-25 22:13:45,182][0m Trial 2 finished with value: 0.6908333333333333 and parameters: {'C': 0.0006528473243309113}. Best is trial 0 with value: 0.8189583333333333.[0m


 40%|████      | 4/10 [03:15<03:47, 37.96s/it]

[32m[I 2021-11-25 22:14:26,390][0m Trial 3 finished with value: 0.8177083333333334 and parameters: {'C': 0.25734643279726915}. Best is trial 0 with value: 0.8189583333333333.[0m


 50%|█████     | 5/10 [05:40<06:23, 76.61s/it]

[32m[I 2021-11-25 22:16:51,526][0m Trial 4 finished with value: 0.816875 and parameters: {'C': 5.698384608345687}. Best is trial 0 with value: 0.8189583333333333.[0m


 60%|██████    | 6/10 [05:57<03:45, 56.40s/it]

[32m[I 2021-11-25 22:17:08,690][0m Trial 5 finished with value: 0.7758333333333334 and parameters: {'C': 0.024257815076676004}. Best is trial 0 with value: 0.8189583333333333.[0m


 70%|███████   | 7/10 [12:41<08:30, 170.13s/it]

[32m[I 2021-11-25 22:23:52,962][0m Trial 6 finished with value: 0.793125 and parameters: {'C': 701.6387837751602}. Best is trial 0 with value: 0.8189583333333333.[0m


 80%|████████  | 8/10 [14:30<05:00, 150.40s/it]

[32m[I 2021-11-25 22:25:41,134][0m Trial 7 finished with value: 0.8197916666666667 and parameters: {'C': 3.0104949891579693}. Best is trial 7 with value: 0.8197916666666667.[0m


 90%|█████████ | 9/10 [14:56<01:51, 111.63s/it]

[32m[I 2021-11-25 22:26:07,497][0m Trial 8 finished with value: 0.798125 and parameters: {'C': 0.0703809641382708}. Best is trial 7 with value: 0.8197916666666667.[0m


100%|██████████| 10/10 [15:08<00:00, 90.83s/it]

[32m[I 2021-11-25 22:26:19,384][0m Trial 9 finished with value: 0.7608333333333334 and parameters: {'C': 0.013706928443177698}. Best is trial 7 with value: 0.8197916666666667.[0m





In [48]:
x_full_train_title_tfidf, x_test_title_tfidf = apply_tfidf(train_data.Title, test_data.Title)
x_full_train_body_tfidf, x_test_body_tfidf = apply_tfidf(train_data.Body, test_data.Body)

In [51]:
prediction = objective_function(lambda trial: get_log_reg(C=study_acc_log_reg.best_params['C']),
                                sparse.hstack([x_full_train_title_tfidf, x_full_train_body_tfidf, tags_train]),
                                labels,
                                sparse.hstack([x_test_title_tfidf, x_test_body_tfidf, tags_test]))

In [52]:
submission = pd.DataFrame({'Id': test_df.index, 'Predicted': prediction})
submission.to_csv('submissions/tfidf_log_reg.csv', index=False)

## Stacking 1

In [64]:
study_acc_log_reg = optimize(get_stacking_1, x_train, y_train, x_val, y_val)

[32m[I 2021-11-29 10:50:25,467][0m A new study created in memory with name: no-name-4a80f369-bc3c-4593-b8b9-f04e8172de17[0m
  self._init_valid()
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)


[32m[I 2021-11-29 11:03:32,270][0m Trial 0 finished with value: 0.8045833333333333 and parameters: {'n_neighbors': 71, 'n_estimators': 179, 'min_samples_split': 4, 'min_samples_leaf': 6, 'C': 5.698384608345687}. Best is trial 0 with value: 0.8045833333333333.[0m


  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)


[32m[I 2021-11-29 11:32:37,693][0m Trial 1 finished with value: 0.7835416666666667 and parameters: {'n_neighbors': 45, 'n_estimators': 492, 'min_samples_split': 8, 'min_samples_leaf': 5, 'C': 0.013706928443177698}. Best is trial 0 with value: 0.8045833333333333.[0m


  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
 30%|███       | 3/10 [1:59:31<5:32:46, 2852.39s/it]

[32m[I 2021-11-29 12:49:57,109][0m Trial 2 finished with value: 0.8047916666666667 and parameters: {'n_neighbors': 37, 'n_estimators': 378, 'min_samples_split': 5, 'min_samples_leaf': 1, 'C': 0.015288118581527262}. Best is trial 2 with value: 0.8047916666666667.[0m


  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
 40%|████      | 4/10 [2:08:23<3:13:38, 1936.45s/it]

[32m[I 2021-11-29 12:58:49,402][0m Trial 3 finished with value: 0.79875 and parameters: {'n_neighbors': 75, 'n_estimators': 132, 'min_samples_split': 3, 'min_samples_leaf': 6, 'C': 0.17973005068132514}. Best is trial 2 with value: 0.8047916666666667.[0m


  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
 50%|█████     | 5/10 [2:31:04<2:24:03, 1728.74s/it]

[32m[I 2021-11-29 13:21:29,870][0m Trial 4 finished with value: 0.80125 and parameters: {'n_neighbors': 65, 'n_estimators': 433, 'min_samples_split': 8, 'min_samples_leaf': 7, 'C': 6.019314852321072}. Best is trial 2 with value: 0.8047916666666667.[0m


  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
 60%|██████    | 6/10 [2:47:59<1:39:04, 1486.21s/it]

[32m[I 2021-11-29 13:38:25,283][0m Trial 5 finished with value: 0.8054166666666667 and parameters: {'n_neighbors': 36, 'n_estimators': 213, 'min_samples_split': 4, 'min_samples_leaf': 3, 'C': 1.1163721427400002}. Best is trial 5 with value: 0.8054166666666667.[0m


  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
 70%|███████   | 7/10 [3:03:45<1:05:28, 1309.57s/it]

[32m[I 2021-11-29 13:54:11,196][0m Trial 6 finished with value: 0.786875 and parameters: {'n_neighbors': 13, 'n_estimators': 245, 'min_samples_split': 5, 'min_samples_leaf': 5, 'C': 0.025505998062850126}. Best is trial 5 with value: 0.8054166666666667.[0m


  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)


[32m[I 2021-11-29 14:06:40,443][0m Trial 7 finished with value: 0.79125 and parameters: {'n_neighbors': 34, 'n_estimators': 242, 'min_samples_split': 10, 'min_samples_leaf': 10, 'C': 0.1034411659429407}. Best is trial 5 with value: 0.8054166666666667.[0m


  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)


[32m[I 2021-11-29 14:13:40,378][0m Trial 8 finished with value: 0.7997916666666667 and parameters: {'n_neighbors': 64, 'n_estimators': 102, 'min_samples_split': 4, 'min_samples_leaf': 5, 'C': 85.20660531158539}. Best is trial 5 with value: 0.8054166666666667.[0m


  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
100%|██████████| 10/10 [3:38:16<00:00, 1309.62s/it]

[32m[I 2021-11-29 14:28:41,648][0m Trial 9 finished with value: 0.8033333333333333 and parameters: {'n_neighbors': 29, 'n_estimators': 267, 'min_samples_split': 10, 'min_samples_leaf': 6, 'C': 0.8001219894116979}. Best is trial 5 with value: 0.8054166666666667.[0m





In [66]:
prediction = objective_function(lambda trial: get_stacking_1(n_neighbors=study_acc_log_reg.best_params['n_neighbors'],
                                                             n_estimators=study_acc_log_reg.best_params['n_estimators'],
                                                             min_samples_split=study_acc_log_reg.best_params['min_samples_split'],
                                                             min_samples_leaf=study_acc_log_reg.best_params['min_samples_leaf'],
                                                             C=study_acc_log_reg.best_params['C']),
                                sparse.hstack([x_full_train_title_tfidf, x_full_train_body_tfidf, tags_train]),
                                labels,
                                sparse.hstack([x_test_title_tfidf, x_test_body_tfidf, tags_test]))

  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)
  result = super(NeighborsBase, self)._fit(X, y)


In [67]:
submission = pd.DataFrame({'Id': test_df.index, 'Predicted': prediction})
submission.to_csv('submissions/tfidf_stacking_1.csv', index=False)