In [1]:
import gc

import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.linear_model import LogisticRegression


def read_file(file_name, max_iters=float('Inf')):
    """Read a text file line by line and return the lines as a list.

    Parameters
    ----------
    file_name : str
        Path of the file to read; it is opened in text mode.
    max_iters : int or float, optional
        Maximum number of lines to return. The default (infinity) reads
        the whole file.

    Returns
    -------
    list of str
        The lines in file order, exactly as yielded by the file iterator
        (line terminators are not stripped).
    """
    data = []

    with open(file_name, 'r') as f:
        for n_read, row in enumerate(f):
            # Test the limit before storing the row, so that at most
            # `max_iters` rows are returned (0 yields an empty list).
            if n_read >= max_iters:
                break
            data.append(row)
    return data

# Загрузка данных

In [2]:
# Training features: sparse matrix stored in SciPy .npz format.
# Forward slashes keep the relative paths valid on Windows and POSIX alike.
X_train = sparse.load_npz('../sparse/sp_train_dataset.npz')
# Alternative feature set:
# X_train = sparse.load_npz('../sparse/sp_train_dataset_sgd.npz')

# Ground-truth labels for the training rows: one integer per line of the file
y_train_lines = read_file('../sparse/dataset_s_labels.csv')
y_train = np.array([int(a) for a in y_train_lines])

# Fail fast if the feature matrix and the labels do not line up row for row
assert X_train.shape[0] == y_train.shape[0], (
    'X_train has %d rows but %d labels were read' % (X_train.shape[0], y_train.shape[0])
)

X_train, y_train.shape

(<427994x793765 sparse matrix of type '<class 'numpy.float64'>'
 	with 304396787 stored elements in COOrdinate format>, (427994,))

# Logistic Regression baseline

### Обучение

In [7]:
# Baseline model. A positive `verbose` makes the solver report its progress
# while fitting; `random_state` is fixed so that repeated runs are comparable.
lr = LogisticRegression(
    C=0.03,
    tol=0.001,
    class_weight='balanced',
    solver='sag',
    max_iter=700,
    n_jobs=-1,
    verbose=10,
    random_state=30,
)
lr

LogisticRegression(C=0.03, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=700,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=30,
          solver='sag', tol=0.001, verbose=10, warm_start=False)

In [8]:
%%time

lr.fit(X_train, y_train)

convergence after 124 epochs took 1258 seconds


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 21.0min
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 21.0min finished


Wall time: 21min 22s


LogisticRegression(C=0.03, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=700,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=30,
          solver='sag', tol=0.001, verbose=10, warm_start=False)

### Загрузка тестовой выборки и предсказания

In [6]:
# Test features: sparse matrix stored in SciPy .npz format.
# Forward slashes keep the relative path valid on Windows and POSIX alike.
X_test = sparse.load_npz('../sparse/sp_test_dataset.npz')
# Alternative feature set:
# X_test = sparse.load_npz('../sparse/sp_test_dataset_sgd.npz')
X_test

<181024x793765 sparse matrix of type '<class 'numpy.float64'>'
	with 197721673 stored elements in COOrdinate format>

In [11]:
# Predictions: per-class probabilities for every row of the test matrix.
# Later cells only read column 1 (y_pred[:, 1]) — presumably the positive
# class; verify against lr.classes_.
y_pred = lr.predict_proba(X_test)

In [12]:
# Save the column-1 probabilities, one value per line.
# `header=False` is explicit because newer pandas versions write a header row
# for a Series by default; forward slashes keep the path portable.
pd.Series(y_pred[:, 1]).to_csv('../output/pred_sparse_sag_C003_sgd.csv', index=False, header=False)

In [None]:
# NOTE(review): this file name is tagged C0002, while the model configured in
# this notebook uses C=0.03. Re-fit with the intended C before running this
# cell, otherwise the saved predictions are mislabeled.
# `header=False` is explicit because newer pandas versions write a header row
# for a Series by default; forward slashes keep the path portable.
pd.Series(y_pred[:, 1]).to_csv('../output/pred_sparse_sag_C0002_sgd.csv', index=False, header=False)

### Результаты

In [13]:
# Test-set identifiers; their index is used below to order the submission rows.
# Forward slashes keep the relative path valid on Windows and POSIX alike.
# NOTE(review): the file is a .tsv but is parsed with the default comma
# separator — fine if it holds a single column; confirm against the file.
cuids_out = pd.read_csv('../input/mlboot_test.tsv', index_col=[0])

In [14]:
# Identifiers of the test rows, read as the index of an otherwise empty frame
predictions_by_cuid = pd.read_csv('../sparse/dataset_s_test_cuid.csv', index_col=[0], names=['cuid'], header=None)
# Attach the column-1 probability to each identifier (row order of y_pred)
predictions_by_cuid['pred'] = y_pred[:, 1]

# Re-order the rows to follow the index of `cuids_out`
submission = predictions_by_cuid.reindex(cuids_out.index)

# `reindex` fills identifiers it cannot find with NaN; refuse to write such a file
assert not submission['pred'].isnull().any(), (
    'some identifiers from cuids_out have no prediction after reindexing'
)

# Only the prediction column is written: no header row and no index
submission.to_csv('../output/sp_baseline_sag_C003_sgd.csv', header=False, index=False)