In [1]:
import numpy as np
import pandas as pd
import gc
import math
from scipy.sparse import hstack
from os import cpu_count

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from sklearn.metrics import roc_auc_score

In [None]:
n_jobs = max(cpu_count()-1, 1)

In [2]:
train = pd.read_csv('train_cleansed.csv', sep=',')
test = pd.read_csv('test_cleansed.csv', sep=',')

In [3]:
(test.shape[0]*100)/(train.shape[0]+test.shape[0])

12.017538798823054

In [5]:
X = train.drop(['0'], axis=1)
y = train['0']

In [64]:
classifier = LogisticRegression()

In [65]:
cv_score = np.mean(cross_val_score(classifier, X, y, scoring='roc_auc',n_jobs=n_jobs))

In [66]:
print("Pure LogReg:", cv_score)

Pure LogReg: 0.724696238949512


# Holdout validation

In [7]:
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.12, random_state=42)

In [8]:
classifier = LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=3, penalty='l2', random_state=42,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)

In [9]:
model_holdout= classifier.fit(X_train,y_train)

In [10]:
y_holdout_pred = model_holdout.predict_proba(X_dev)

In [11]:
y_holdout_pred[:, 1]

array([0.44664043, 0.2177816 , 0.31704201, ..., 0.25167283, 0.36815249,
       0.77203047])

In [12]:
roc_auc_score(y_dev,y_holdout_pred[:, 1])

0.7363309812161912

# Cross-validation

In [13]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
params = {
    'C': [0.01,0.1,0.9]
    }
log_reg_classifier = LogisticRegression(solver='newton-cg', class_weight='balanced', random_state=42,max_iter = 1000,n_jobs=n_jobs)

lgs_split_5 = GridSearchCV(log_reg_classifier, params, scoring='roc_auc', cv=skf,verbose=1)
lgs_split_5.fit(X_train, y_train)
print ('gs.best_score_:', lgs_split_5.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  4.7min finished


gs.best_score_: 0.7253662973547849


In [16]:
lgs_split_5.best_estimator_

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=3, penalty='l2', random_state=42,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)

In [17]:
lgs_best = lgs_split.best_estimator_

In [18]:
cv_score = np.mean(cross_val_score(lgs_best, X_train, y_train, scoring='roc_auc',n_jobs=n_jobs))

print("LogReg with train split:", cv_score) #0.7193249282374398 0.7231766475833975

LogReg with train split: 0.7231766475833975


In [19]:
y_crossval_pred = lgs_best.predict_proba(X_dev)

In [20]:
roc_auc_score(y_dev,y_holdout_pred[:, 1]) #0.7363309812161912 

0.7363309812161912

In [21]:
roc_auc_score(y_dev, y_crossval_pred[:, 1]) # 0.7314016840801161 0.7363309812161912

0.7363309812161912

# Prediction and submit

In [22]:
X_test = test.drop(['0'], axis=1)
y_test = test['0']

In [23]:
prediction =lgs_best.predict_proba(X_test)

In [29]:
test_index = pd.read_csv('test.csv', sep='\t')
test_index = test_index['Unnamed: 0']

In [27]:
submition = pd.DataFrame({'_ID_': test_index, '_VAL_': np.zeros_like(test_index)})
submition['_VAL_'] = prediction[:,1]
submition.to_csv('submission_log_reg.csv', index=False)

# LB 0.73412059