In [1]:
import gc

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import sparse
# import xgboost as xgb

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, MaxAbsScaler

# from skopt import BayesSearchCV

def read_file(file_name, max_iters=float('Inf')):
    """Read a text file line by line and return its rows as a list of strings.

    Parameters
    ----------
    file_name : str
        Path of the file to read (opened in text mode).
    max_iters : int or float, optional
        Maximum number of rows to return. Defaults to infinity, i.e. the
        whole file is read.

    Returns
    -------
    list of str
        At most ``max_iters`` rows, in file order. Each row keeps its
        trailing newline exactly as it appears in the file.
    """
    data = []

    with open(file_name, 'r') as f:
        for row in f:
            # Check the limit BEFORE appending so that exactly `max_iters`
            # rows are returned (the previous post-append `>` check let one
            # extra row through, and returned one row for max_iters=0).
            if len(data) >= max_iters:
                break
            data.append(row)
    return data

# BaseLine: модели

### Загрузка данных

In [4]:
# Training feature matrix, stored on disk as a SciPy sparse .npz file
X_train = sparse.load_npz('..\\sparse\\sp_train_dataset.npz')  # .tocsr()

# Ground-truth training labels: every line of the labels file is parsed as an int
y_train = np.array(list(map(int, read_file('..\\sparse\\dataset_s_labels.csv'))))

In [5]:
# Sanity check: display the training matrix repr together with the label vector shape
X_train, y_train.shape

(<427994x1786632 sparse matrix of type '<class 'numpy.float64'>'
 	with 292560261 stored elements in Compressed Sparse Row format>, (427994,))

In [None]:
# Load the test-set feature matrix from its SciPy sparse .npz file
X_test = sparse.load_npz('..\\sparse\\sp_test_dataset.npz') #.tocsr()

# Display the matrix repr (shape, dtype, number of stored elements, format)
X_test

### Scale

In [None]:
# scaler = StandardScaler(with_mean=False)
# X_train_scale = scaler.fit_transform(X_train)

# Logistic Regression

### Обучение и предсказания модели

In [None]:
%%time

# Verbose : For the liblinear and lbfgs solvers set verbose to any positive number for verbosity
lr = LogisticRegression(C=10, tol=0.0005, class_weight='balanced', solver='sag', max_iter=400, n_jobs=-1, verbose=True)

lr.fit(X_train, y_train)

In [7]:
# Display the estimator's repr.
# NOTE(review): the saved output below shows C=1.0, max_iter=100, tol=0.0001,
# which does not match the fit cell (C=10, max_iter=400, tol=0.0005) — the
# output looks stale; re-run after fitting.
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
# Predictions: class probabilities for every test row
y_pred = lr.predict_proba(X_test)

# predict_proba returns a NumPy array, which has no .to_csv(); wrap it in a
# DataFrame (one column per class label from lr.classes_) before writing.
pd.DataFrame(y_pred, columns=lr.classes_).to_csv('..\\output\\pred_sparse_sag.csv')

### Результаты

In [None]:
# Test-set cuids: header-less file, single column named 'cuid'
cuids_test = pd.read_csv(
    '..\\sparse\\dataset_s_test_cuid.csv',
    header=None,
    names=['cuid'],
    index_col=None,
)

# Original test table, indexed by its first column.
# NOTE(review): a .tsv file is read here with the default ',' separator —
# confirm whether sep='\t' is needed.
cuids_out = pd.read_csv(
    '..\\input\\mlboot_test.tsv',
    index_col=[0],
)

In [None]:
# Input locations (file_cuids is defined here but not read in this cell)
file_cuids = '..\\vw\\dataset_cuid_test.vw'
file_test = '..\\input\\mlboot_test.tsv'

# First tab-separated column of the header-less cuid file, named 'cuid'
cuid_df = pd.read_csv(
    '..\\sparse\\dataset_s_test_cuid.csv',
    sep='\t',
    header=None,
    names=['cuid'],
    usecols=[0],
    index_col=None,
)

# Original test table, indexed by its first column.
# NOTE(review): the .tsv file is read with the default ',' separator —
# confirm whether sep='\t' is needed here as well.
cuid_test = pd.read_csv(
    file_test,
    index_col=[0],
)

# SGDClassifier

In [None]:
# Training feature matrix, stored on disk as a SciPy sparse .npz file
X_train = sparse.load_npz('..\\sparse\\sp_train_dataset.npz')  # .tocsr()

# Ground-truth training labels: read the raw lines, then parse each one as an int
label_rows = read_file('..\\sparse\\dataset_s_labels.csv')
y_train = np.array(list(map(int, label_rows)))
del label_rows  # temporary only; keep the notebook namespace unchanged

# Display the matrix repr and the label vector shape
(X_train, y_train.shape)

# Ridge

In [None]:
# Ridge regression with default hyperparameters
model = Ridge()

# Train on the full training set. The original cell referenced `y` and
# `X_test_new`, neither of which is defined at this point in the notebook;
# use the labels and test matrix loaded by the cells above instead.
model.fit(X_train, y_train)
test_preds = model.predict(X_test)

# Оптимизация

In [None]:
# Full training matrix in CSR format (CSR supports the row indexing used by the
# cross-validation split in the next cell). The original cell stored this train
# matrix under the name `X_test`, while the next cell reads `X`.
X = sparse.load_npz('..\\sparse\\sp_train_dataset.npz').tocsr()

# Ground-truth training labels as an integer array. The original parsed the
# not-yet-defined `y` instead of the rows it had just read.
y = np.array([int(a) for a in read_file('..\\sparse\\dataset_s_labels.csv')])

In [None]:
lrn = LogisticRegression()

# Single stratified hold-out: use only the first of 5 shuffled folds.
# random_state makes the shuffled split reproducible between runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_index, test_index = next(skf.split(X, y))

X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

lrn.fit(X_train, y_train)

# ROC AUC is a ranking metric: score it with the probability of the positive
# class (second column of predict_proba), not with hard 0/1 predictions.
y_pred = lrn.predict_proba(X_test)[:, 1]

roc_auc_score(y_test, y_pred)

# Оптимизация параметров XGBoost

## Загрузка данных

In [4]:
# Feature matrix, stored on disk as a SciPy sparse .npz file
X = sparse.load_npz('..\\sparse\\dataset.npz')

# Ground-truth training labels: every line of the labels file parsed as an int
y = list(map(int, read_file('..\\sparse\\dataset_s_labels.csv')))

# Display the number of labels
len(y)

NameError: name 'read_file' is not defined

### Оптимизация

In [9]:
# SETTINGS - CHANGE THESE TO GET SOMETHING MEANINGFUL
# ITERATIONS is passed as n_iter to the BayesSearchCV tuner defined below;
# 10 is a quick-run value, the trailing comment records the larger setting.
ITERATIONS = 10 # 1000

In [10]:
# Bayesian hyperparameter search over an XGBoost classifier, scored by ROC AUC
# with 3-fold stratified cross-validation.
# NOTE: needs `xgb` (xgboost) and `BayesSearchCV` (skopt); both imports are
# commented out in the notebook's import cell and must be enabled first.
bayes_cv_tuner = BayesSearchCV(
    estimator = xgb.XGBClassifier(
        n_jobs = 1,
        objective = 'binary:logistic',
        eval_metric = 'auc',
        silent=1,
        tree_method='approx'
    ),
    search_spaces = {
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        # 'min_child_weight' was listed twice, as (0, 10) and (0, 5). A dict
        # literal keeps only the last duplicate, so (0, 5) is the range that
        # was actually in effect and is the one retained here.
        'min_child_weight': (0, 5),
        'max_depth': (0, 50),
        'max_delta_step': (0, 20),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'gamma': (1e-9, 0.5, 'log-uniform'),
        'n_estimators': (50, 100),
        'scale_pos_weight': (1e-6, 500, 'log-uniform')
    },
    scoring = 'roc_auc',
    cv = StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=42
    ),
    n_jobs = 3,
    n_iter = ITERATIONS,
    verbose = 0,
    refit = True,
    random_state = 42
)

def status_print(optim_result):
    """Status callback during Bayesian hyperparameter search.

    Prints how many models have been evaluated so far together with the best
    ROC-AUC score and the best parameters, then writes all CV results to
    ``<EstimatorClassName>_cv_results.csv`` in the current working directory.

    Reads the module-level ``bayes_cv_tuner``; the ``optim_result`` argument
    supplied by the search is accepted but not used.
    """
    # All models tested so far, in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

    # Report progress and the best result found so far
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

    # Save all model results, named after the tuned estimator's class
    clf_name = bayes_cv_tuner.estimator.__class__.__name__
    all_models.to_csv(clf_name+"_cv_results.csv")

In [None]:
# Fit the model: run the hyperparameter search on X with the labels converted
# to a NumPy array; status_print is passed as the search callback.
result = bayes_cv_tuner.fit(X, np.array(y), callback=status_print)