## Preliminaries

In [41]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn import datasets
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score


from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import svm


## Load Preprocessed data and target

In [74]:
# Load the preprocessed feature matrices and the training labels from
# comma-separated text files (paths are relative to the working directory).
X_train = np.loadtxt('data_preproc_train.txt', delimiter=',')
y_train = np.loadtxt('target_train.txt', delimiter=',')
X_test = np.loadtxt('data_preproc_test.txt', delimiter=',')

# Fail fast on misaligned inputs instead of deep inside the estimator.
assert len(X_train) == len(y_train), "X_train and y_train must have the same number of rows"
assert X_train.shape[1:] == X_test.shape[1:], "train and test sets must have the same feature columns"

## Fit A Logistic Regression

Logistic regression is a binary classifier (i.e. the target vector can take only two values). In a logistic regression, a linear model (e.g. $z = \beta_0 + \beta_1 x$) is passed through the logistic function, also called the sigmoid function,
\begin{equation*}
\sigma(z) = \frac{1}{1+e^{-z}}
\end{equation*}

such that: 

\begin{equation*}
P(y_i = 1 \mid X) = \frac{1}{1+e^{-(\beta_0+\beta_1 x)}}
\end{equation*}

where $P(y_i = 1 \mid X)$ is the probability of the $i$th observation's target value, $y_i$, being class 1, $X$ is the training data, $\beta_0$ and $\beta_1$ are the parameters to be learned, and $e$ is Euler's number.

### Create logistic regression object

In [75]:
# Logistic regression with library-default hyperparameters; only the random seed is set explicitly
clf = LogisticRegression(random_state=0)

In [76]:
# Display the estimator's hyperparameters (it has not been fitted yet)
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Train Logistic Regression

In [77]:
# Train model
# NOTE(review): scikit-learn's fit() returns the estimator itself, so `model`
# is presumably the same object as `clf` rather than a copy — confirm if both names are used later.
model = clf.fit(X_train, y_train)

In [78]:
# Display the fitted estimator's hyperparameters
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Create new observation

In [79]:
# Alias the test feature matrix under a second name (plain assignment, no copy is made).
# NOTE(review): the prediction cells visible in this notebook pass X_test directly
# and never read new_observation.
new_observation = X_test

### Predict Class Of Observation

In [80]:
# Dimensions of the test feature matrix as (rows, columns)
X_test.shape

(332, 6)

In [81]:
# Predict the class of every test observation once and reuse the result,
# instead of running inference separately for each print.
y_pred = model.predict(X_test)
print(y_pred.shape)
print(y_pred)

(332,)
[0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0.
 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0.
 0. 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 0. 1.
 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 1. 1.
 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1.
 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1.
 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1.
 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0.
 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0.
 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0.
 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0

### View predicted probabilities

In [82]:
# Compute the class probabilities once and reuse them, instead of running
# inference separately for each print. Each row holds one probability per
# class and sums to 1.
y_proba = model.predict_proba(X_test)
print(y_proba.shape)
print(y_proba)

(332, 2)
[[0.92128582 0.07871418]
 [0.72523947 0.27476053]
 [0.91817982 0.08182018]
 [0.90937016 0.09062984]
 [0.48249062 0.51750938]
 [0.85108487 0.14891513]
 [0.4257214  0.5742786 ]
 [0.80758667 0.19241333]
 [0.27086974 0.72913026]
 [0.94115267 0.05884733]
 [0.65976568 0.34023432]
 [0.0734321  0.9265679 ]
 [0.95234721 0.04765279]
 [0.18303666 0.81696334]
 [0.16801547 0.83198453]
 [0.77706988 0.22293012]
 [0.8460659  0.1539341 ]
 [0.52619927 0.47380073]
 [0.54462669 0.45537331]
 [0.745142   0.254858  ]
 [0.82752343 0.17247657]
 [0.32717959 0.67282041]
 [0.15844044 0.84155956]
 [0.97503233 0.02496767]
 [0.0376246  0.9623754 ]
 [0.854341   0.145659  ]
 [0.60964566 0.39035434]
 [0.91924666 0.08075334]
 [0.84149634 0.15850366]
 [0.61012088 0.38987912]
 [0.49768308 0.50231692]
 [0.83148179 0.16851821]
 [0.37325048 0.62674952]
 [0.90198881 0.09801119]
 [0.92586228 0.07413772]
 [0.94845273 0.05154727]
 [0.20406251 0.79593749]
 [0.17046207 0.82953793]
 [0.90198881 0.09801119]
 [0.56875696 0.4

### Create Pipeline

In [83]:
# Build a two-stage pipeline: the features are standardized first and the
# standardized features are then fed to a logistic regression.
logit = LogisticRegression()     # classification stage, library defaults
standardizer = StandardScaler()  # scaling stage
pipeline = make_pipeline(standardizer, logit)

### Create k-Fold Cross-Validation

In [84]:
# Create k-Fold cross-validation
# 10 folds; rows are shuffled before splitting, with a fixed seed so the
# fold assignment is the same on every run
kf = KFold(n_splits=10, shuffle=True, random_state=1)

### Conduct k-Fold Cross-Validation

In [85]:
# Do k-fold cross-validation; the result holds one accuracy score per fold
cv_results = cross_val_score(pipeline, # Pipeline
                             X_train, # Feature matrix
                             y_train, # Target vector
                             cv=kf, # Cross-validation technique
                             scoring="accuracy", # Scoring metric (higher is better), not a loss
                             n_jobs=-1) # Use all CPU cores

In [86]:
# Show the per-fold accuracy scores
cv_results

array([0.77777778, 0.83333333, 0.75      , 0.79166667, 0.74647887,
       0.8028169 , 0.81690141, 0.73239437, 0.88732394, 0.78873239])

### Calculate Mean Performance Score

In [88]:
# Average the per-fold accuracies and report the result as a percentage
mean_accuracy_pct = cv_results.mean() * 100
print("Mean performance score: %.2f%%" % mean_accuracy_pct)

Mean performance score: 79.27%
