In [1]:
import numpy as np
import pandas as pd
import gc
import math
from os import cpu_count

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
train = pd.read_csv('train_cleansed.csv', sep=',')
test = pd.read_csv('test_cleansed.csv', sep=',')

In [3]:
n_jobs = max(cpu_count()-1, 1)

# Hold out validation

In [4]:
X = train.drop(['0'], axis=1)
y = train['0'].astype(int)

In [5]:
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.12, random_state=42)

In [6]:
classifier = LogisticRegression()

In [7]:
bg = BaggingClassifier(classifier, random_state=42, n_estimators=25,oob_score=True)
model_holdout = bg.fit(X_train, y_train)


In [8]:
y_holdout_pred = model_holdout.predict_proba(X_dev)

In [9]:
y_holdout_pred[:, 1]

array([0.14304478, 0.05416226, 0.09647951, ..., 0.05685172, 0.11545878,
       0.45606946])

In [10]:
roc_auc_score(y_dev,y_holdout_pred[:, 1])

0.7308390269502246

#  Cross-validation

In [13]:
skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=42)

Logistic regression from HW_5_Paul_Liabakh_LR_1 

In [14]:
params = {
    'C': [0.01,0.1,0.9]
    }
log_reg = LogisticRegression(solver='newton-cg', class_weight='balanced', random_state=42,max_iter = 100,n_jobs=n_jobs)

lgs = GridSearchCV(log_reg, params, scoring='roc_auc', cv=skf,verbose=1)
lgs.fit(X_train, y_train)
print ('gs.best_score_:', lgs.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  4.7min finished


gs.best_score_: 0.7253662973547849


In [None]:
0.7232106138872144

In [15]:
lg = lgs.best_estimator_

In [16]:
lg

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=3, penalty='l2', random_state=42,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)

lg = LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=3, penalty='l2', random_state=42,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)


In [17]:
parameters = {'max_features': [0.5, 0.7, 0.9, 1.], 'max_samples': [0.5, 0.7, 0.9, 1.], "base_estimator__C": [0.005, 0.001, 0.01]}
bg = BaggingClassifier(lg, random_state=42, n_estimators=25,oob_score=True, n_jobs=n_jobs)
bg_r_grid_search = RandomizedSearchCV(bg, parameters, scoring ='roc_auc', n_iter=4, cv=skf, n_jobs=-1, random_state=1,verbose=2)
bg_r_grid_search.fit(X_train, y_train)
print(bg_r_grid_search.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.4min finished


0.7160151927768805


In [18]:
bg_r_grid_search.best_params_

{'base_estimator__C': 0.01, 'max_features': 0.7, 'max_samples': 0.9}

In [19]:
best_bg = bg_r_grid_search.best_estimator_

In [20]:
cv_score = np.mean(cross_val_score(best_bg, X_train, y_train, scoring='roc_auc',n_jobs=n_jobs))

print("LogReg with train split:", cv_score) #0.7193249282374398 0.7231766475833975

LogReg with train split: 0.7147920148710671


In [21]:
y_crossval_pred = best_bg.predict_proba(X_dev)

In [22]:
roc_auc_score(y_dev,y_holdout_pred[:, 1]) #0.7363309812161912 

0.7308390269502246

In [23]:
roc_auc_score(y_dev, y_crossval_pred[:, 1]) # 0.7314016840801161 0.7363309812161912

0.7320679332236416

# Prediction and submit

In [24]:
X_test = test.drop(['0'], axis=1)
y_test = test['0']

In [26]:
prediction =lgs.predict_proba(X_test)

In [27]:
test_index = pd.read_csv('test.csv', sep='\t')
test_index = test_index['Unnamed: 0']

In [29]:
submition = pd.DataFrame({'_ID_': test_index, '_VAL_': np.zeros_like(test_index)})
submition['_VAL_'] = prediction[:,1]
submition.to_csv('submission_beg.csv', index=False)

# LB 0.73412059