In [9]:
#numpy and pandas for data manipulation
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold

from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_predict

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn import metrics

from itertools import product


samples = pd.read_csv('features-250.csv')

# # Create a label encoder object
# le = LabelEncoder()
# le_count = 0

# # Iterate through the columns
# for col in samples:
#     if samples[col].dtype == 'bool':
#         # If 2 or fewer unique categories
#         if len(list(samples[col].unique())) <= 2:
#             # Train on the training data
#             le.fit(samples[col])
#             # Transform 
#             samples[col] = le.transform(samples[col])
            
#             # Keep track of how many columns were label encoded
#             le_count += 1
# print('%d columns were label encoded.' % le_count)

train_label = samples['TARGET']
samples = samples.drop(columns=['TARGET'])

# one-hot encoding of categorical variables
samples = pd.get_dummies(samples)

# Median imputation of missing values
imputer = SimpleImputer(strategy = 'median')

# Scale each feature to -1 and 1
scaler = StandardScaler()

# Fit on the training data
imputer.fit(samples)

# Transform both training and testing data
samples = imputer.transform(samples)

# Repeat with the scaler
scaler.fit(samples)
samples = scaler.transform(samples)

print('Training data shape: ', samples.shape)


Training data shape:  (307507, 249)


In [10]:
x_train,x_test, y_train, y_test = train_test_split(samples, train_label, test_size=0.2)

In [11]:
def classify(arr, threshold):
    return [1 if row[1] >= threshold else 0 for row in arr]

In [None]:
# C, max iter, weight balancing
hyperparameters = product(tuple((1,0.1,0.01,0.001,0.0001)), tuple((100,200, 400)), 
                               tuple(( 'balanced',)))
CLASS_THRESHOLDS = [0.3,0.4,0.5,0.6,0.7]
for hp in hyperparameters:
    # 80-20 train-test case
    kf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
    log_reg = LogisticRegression(C = hp[0], max_iter=hp[1], solver='sag', fit_intercept=True,
                                 class_weight=hp[2], n_jobs=-1)
    tmp_results = cross_val_predict(estimator=log_reg , X=x_train, y=y_train, cv=kf, method='predict_proba')
    print("Hyperparameters - {}".format(hp))
    for thres in CLASS_THRESHOLDS:
        print("Threshold - {}  roc - {}".format(thres,metrics.roc_auc_score(y_train, classify(tmp_results, thres))))
    
    
    

Hyperparameters - (1, 100, 'balanced')
Threshold - 0.3  roc - 0.6475751626697502
Threshold - 0.4  roc - 0.687483791838034
Threshold - 0.5  roc - 0.6974093465043035
Threshold - 0.6  roc - 0.6825226550504049
Threshold - 0.7  roc - 0.6431770228906303


In [19]:
from itertools import product 
perm = product([1, 2, 3], ["asdf","jpt"], [0.1,0.2]) 
print(len(list(product(tuple((0.1,0.01,0.001,0.0001)), tuple((100,200,400)), tuple((0.3, 0.4, 0.5)),                                
                 tuple((None, {0:0.1238,1:0.8762}, 'balanced'))))))
# Print the obtained permutations 
for i in product(tuple((1,0.1,0.01,0.001,0.0001)), tuple((100,200,400)), tuple((0.3, 0.4, 0.5)),                                
                 tuple((None, {0:0.1238,1:0.8762}, 'balanced'))): 
    print(i) 

108
(1, 100, 0.3, None)
(1, 100, 0.3, {0: 0.1238, 1: 0.8762})
(1, 100, 0.3, 'balanced')
(1, 100, 0.4, None)
(1, 100, 0.4, {0: 0.1238, 1: 0.8762})
(1, 100, 0.4, 'balanced')
(1, 100, 0.5, None)
(1, 100, 0.5, {0: 0.1238, 1: 0.8762})
(1, 100, 0.5, 'balanced')
(1, 200, 0.3, None)
(1, 200, 0.3, {0: 0.1238, 1: 0.8762})
(1, 200, 0.3, 'balanced')
(1, 200, 0.4, None)
(1, 200, 0.4, {0: 0.1238, 1: 0.8762})
(1, 200, 0.4, 'balanced')
(1, 200, 0.5, None)
(1, 200, 0.5, {0: 0.1238, 1: 0.8762})
(1, 200, 0.5, 'balanced')
(1, 400, 0.3, None)
(1, 400, 0.3, {0: 0.1238, 1: 0.8762})
(1, 400, 0.3, 'balanced')
(1, 400, 0.4, None)
(1, 400, 0.4, {0: 0.1238, 1: 0.8762})
(1, 400, 0.4, 'balanced')
(1, 400, 0.5, None)
(1, 400, 0.5, {0: 0.1238, 1: 0.8762})
(1, 400, 0.5, 'balanced')
(0.1, 100, 0.3, None)
(0.1, 100, 0.3, {0: 0.1238, 1: 0.8762})
(0.1, 100, 0.3, 'balanced')
(0.1, 100, 0.4, None)
(0.1, 100, 0.4, {0: 0.1238, 1: 0.8762})
(0.1, 100, 0.4, 'balanced')
(0.1, 100, 0.5, None)
(0.1, 100, 0.5, {0: 0.1238, 1: 0.8762}