In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import optuna
from matplotlib import pyplot as plt
import re
import string

%matplotlib inline

In [2]:
train_df = pd.read_parquet('./data/train.parquet')
test_df = pd.read_parquet('./data/test.parquet')
submission = pd.read_csv('./data/sample_submission.csv')

In [3]:
labels = train_df.target
train_data = train_df[['Title', 'Body']]
test_data = test_df[['Title', 'Body']]

In [4]:
def split_tags(text):
    return text[1: -1].split('><')

tags = pd.concat([train_df['Tags'], test_df['Tags']])
tags = tags.apply(lambda x: split_tags(x))

In [5]:
unique_tags = []
for _, value in tags.items():
    unique_tags += value
unique_tags, tags_counts = np.unique(unique_tags, return_counts=True)

In [6]:
sorted_idxs = np.argsort(tags_counts)[::-1]
tags_counts = tags_counts[sorted_idxs]
unique_tags = unique_tags[sorted_idxs]

In [7]:
n_tags = 1000
n_train, n_test = train_data.shape[0], test_data.shape[0]
top_tags = unique_tags[:n_tags]
tags_features = np.zeros((n_train + n_test, n_tags), dtype=np.int8)

for i, (_, tags_list) in enumerate(tags.items()):
    for j, tag in enumerate(top_tags):
        if tag in tags_list:
            tags_features[i: j] = 1

tags_train = tags_features[:n_train]
tags_test = tags_features[n_train:]

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_val, x_train_tags, x_val_tags, y_train, y_val = train_test_split(train_data, tags_train, labels, test_size=.1, stratify=labels, random_state=0)

In [9]:
from gensim import downloader
list(downloader.info()['models'].keys())

['fasttext-wiki-news-subwords-300',
 'conceptnet-numberbatch-17-06-300',
 'word2vec-ruscorpora-300',
 'word2vec-google-news-300',
 'glove-wiki-gigaword-50',
 'glove-wiki-gigaword-100',
 'glove-wiki-gigaword-200',
 'glove-wiki-gigaword-300',
 'glove-twitter-25',
 'glove-twitter-50',
 'glove-twitter-100',
 'glove-twitter-200',
 '__testing_word2vec-matrix-synopsis']

In [10]:
w2v_model = downloader.load('word2vec-google-news-300')

In [11]:
def sentence2vec(sentence):
    vecs = []
    for word in sentence.split(' '):
        try:
            vecs.append(w2v_model.get_vector(word.lower()))
        except:
            pass
    if len(vecs) == 0:
        vecs = [np.zeros(300)]
    return sum(vecs) / len(vecs)

def apply_word2vec(data):
    title = np.array([sentence2vec(sentence) for sentence in data['Title']])
    body = np.array([sentence2vec(sentence) for sentence in data['Body']])
    return np.hstack([title, body])

In [12]:
x_train_w2v = apply_word2vec(x_train)

In [13]:
x_val_w2v = apply_word2vec(x_val)

In [14]:
from sklearn.ensemble import StackingClassifier

def get_log_reg(trial=None, C=1):
    from sklearn.linear_model import LogisticRegression

    if trial is not None:
        C = trial.suggest_loguniform('C', 1e-5, 1e+3)

    return LogisticRegression(C=C, max_iter=10000, random_state=0)

def get_stacking_1(trial=None,
                   n_neighbors=5,
                   n_estimators=100, min_samples_split=2, min_samples_leaf=1,
                   C=1):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression

    if trial is not None:
        n_neighbors = trial.suggest_int('n_neighbors', 5, 100)
        n_estimators = trial.suggest_int('n_estimators', 50, 500)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        C = trial.suggest_loguniform('C', 1e-5, 1e+3)

    estimators = [
        ('knn', KNeighborsClassifier(n_neighbors=n_neighbors)),
        ('rf', RandomForestClassifier(n_estimators=n_estimators, min_samples_split=min_samples_split,
                                      min_samples_leaf=min_samples_leaf, random_state=0))
    ]
    final = LogisticRegression(C=C, max_iter=10000, random_state=0)

    return StackingClassifier(estimators=estimators, final_estimator=final)

In [15]:
from sklearn.metrics import accuracy_score

def objective_function(estimator_func, train_data, train_labels, test_data, test_labels=None, trial=None):
    estimator = estimator_func(trial=trial).fit(train_data, train_labels)
    if test_labels is not None:
        prediction = estimator.predict(test_data)
        return accuracy_score(test_labels, prediction)
    else:
        return estimator.predict(test_data)

In [16]:
def optimize(estimator_func, train_data, train_labels, test_data, test_labels, n_trials=10):
    study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=123),
                                direction='maximize',
                                pruner=optuna.pruners.HyperbandPruner())
    objective = lambda trial: objective_function(estimator_func, train_data, train_labels, test_data, test_labels, trial)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study

In [17]:
x_train_w2v.shape, x_train_tags.shape

((43200, 600), (43200, 1000))

In [18]:
x_val_w2v.shape, x_val_tags.shape

((4800, 600), (4800, 1000))

In [19]:
x_train = np.hstack([x_train_w2v, x_train_tags])
x_val = np.hstack([x_val_w2v, x_val_tags])

In [26]:
from sklearnex import patch_sklearn, unpatch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [20]:
study_acc_log_reg = optimize(get_log_reg, x_train, y_train, x_val, y_val)

[32m[I 2021-12-01 21:53:38,424][0m A new study created in memory with name: no-name-bb33e9e4-c733-40f8-b772-279b35b66cff[0m
  self._init_valid()
 10%|█         | 1/10 [01:31<13:45, 91.72s/it]

[32m[I 2021-12-01 21:55:10,155][0m Trial 0 finished with value: 0.630625 and parameters: {'C': 3.730383528143731}. Best is trial 0 with value: 0.630625.[0m


 20%|██        | 2/10 [01:39<05:38, 42.34s/it]

[32m[I 2021-12-01 21:55:17,932][0m Trial 1 finished with value: 0.5475 and parameters: {'C': 0.0019458738403480128}. Best is trial 0 with value: 0.630625.[0m


 30%|███       | 3/10 [01:43<02:53, 24.77s/it]

[32m[I 2021-12-01 21:55:21,797][0m Trial 2 finished with value: 0.5289583333333333 and parameters: {'C': 0.0006528473243309113}. Best is trial 0 with value: 0.630625.[0m


 40%|████      | 4/10 [02:31<03:23, 33.96s/it]

[32m[I 2021-12-01 21:56:09,835][0m Trial 3 finished with value: 0.6285416666666667 and parameters: {'C': 0.25734643279726915}. Best is trial 0 with value: 0.630625.[0m


 50%|█████     | 5/10 [04:23<05:10, 62.16s/it]

[32m[I 2021-12-01 21:58:01,997][0m Trial 4 finished with value: 0.63 and parameters: {'C': 5.698384608345687}. Best is trial 0 with value: 0.630625.[0m


 60%|██████    | 6/10 [04:44<03:12, 48.11s/it]

[32m[I 2021-12-01 21:58:22,826][0m Trial 5 finished with value: 0.5995833333333334 and parameters: {'C': 0.024257815076676004}. Best is trial 0 with value: 0.630625.[0m


 70%|███████   | 7/10 [06:45<03:36, 72.04s/it]

[32m[I 2021-12-01 22:00:24,138][0m Trial 6 finished with value: 0.6308333333333334 and parameters: {'C': 701.6387837751602}. Best is trial 6 with value: 0.6308333333333334.[0m


 80%|████████  | 8/10 [08:13<02:34, 77.12s/it]

[32m[I 2021-12-01 22:01:52,129][0m Trial 7 finished with value: 0.6308333333333334 and parameters: {'C': 3.0104949891579693}. Best is trial 6 with value: 0.6308333333333334.[0m


 90%|█████████ | 9/10 [08:40<01:01, 61.52s/it]

[32m[I 2021-12-01 22:02:19,365][0m Trial 8 finished with value: 0.6114583333333333 and parameters: {'C': 0.0703809641382708}. Best is trial 6 with value: 0.6308333333333334.[0m


100%|██████████| 10/10 [08:55<00:00, 53.58s/it]

[32m[I 2021-12-01 22:02:34,195][0m Trial 9 finished with value: 0.58375 and parameters: {'C': 0.013706928443177698}. Best is trial 6 with value: 0.6308333333333334.[0m





In [21]:
x_full_train_w2v = apply_word2vec(train_data)
x_full_test_w2v = apply_word2vec(test_data)

In [23]:
prediction = objective_function(lambda trial: get_log_reg(C=study_acc_log_reg.best_params['C']),
                                np.hstack([x_full_train_w2v, tags_train]),
                                labels,
                                np.hstack([x_full_test_w2v, tags_test]))

In [24]:
submission = pd.DataFrame({'Id': test_df.index, 'Predicted': prediction})
submission.to_csv('submissions/word2vec_log_reg.csv', index=False)

## Stacking 1

In [27]:
study_acc_stacking_1 = optimize(get_stacking_1, x_train, y_train, x_val, y_val)

[32m[I 2021-12-01 22:30:54,229][0m A new study created in memory with name: no-name-dddd45dd-cebf-4754-bc1a-5df1318291bc[0m
  self._init_valid()
 10%|█         | 1/10 [17:45<2:39:46, 1065.14s/it]

[32m[I 2021-12-01 22:48:39,375][0m Trial 0 finished with value: 0.5935416666666666 and parameters: {'n_neighbors': 71, 'n_estimators': 179, 'min_samples_split': 4, 'min_samples_leaf': 6, 'C': 5.698384608345687}. Best is trial 0 with value: 0.5935416666666666.[0m


In [None]:
prediction = objective_function(lambda trial: get_stacking_1(n_neighbors=study_acc_stacking_1.best_params['n_neighbors'],
                                                             n_estimators=study_acc_stacking_1.best_params['n_estimators'],
                                                             min_samples_split=study_acc_stacking_1.best_params['min_samples_split'],
                                                             min_samples_leaf=study_acc_stacking_1.best_params['min_samples_leaf'],
                                                             C=study_acc_stacking_1.best_params['C']),
                                np.hstack([x_full_train_w2v, tags_train]),
                                labels,
                                np.hstack([x_full_test_w2v, tags_test]))

In [None]:
submission = pd.DataFrame({'Id': test_df.index, 'Predicted': prediction})
submission.to_csv('submissions/word2vec_stacking_1.csv', index=False)