## Preliminaries

In [41]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn import datasets
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score


from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20, so this import raises ImportError on current releases. No
# visible cell uses it (KFold / cross_val_score come from model_selection) —
# confirm and drop.
from sklearn import cross_validation
from sklearn import svm


## Load Preprocessed data and target

In [64]:
# Read the preprocessed training features, the training labels and the
# preprocessed test features (in that order) from text files in the
# working directory.
X_train, y_train, X_test = map(
    np.loadtxt,
    ('data_preproc_train.txt', 'target_train.txt', 'data_preproc_test.txt'),
)

## Fit A Logistic Regression

Logistic regression is a binary classifier (i.e. the target vector can only take two values). In a logistic regression, a linear model (e.g. $z = \beta_0 + \beta_1 x$) is passed through the logistic function, also called the sigmoid function,
\begin{equation*}
\sigma(z) = \frac{1}{1+e^{-z}}
\end{equation*}

such that: 

\begin{equation*}
P(y_i = 1 \mid X) = \frac{1}{1+e^{-(\beta_0+\beta_1 x)}}
\end{equation*}

where $P(y_i = 1 \mid X)$ is the probability that the $i$-th observation's target value, $y_i$, is class 1, $X$ is the training data, $\beta_0$ and $\beta_1$ are the parameters to be learned, and $e$ is Euler's number.

### Create logistic regression object

In [59]:
# Instantiate an unfitted logistic-regression classifier. Only random_state is
# set explicitly; every other hyperparameter keeps the library default.
clf = LogisticRegression(random_state=0)

In [60]:
# Echo the estimator so its full hyperparameter set is shown as the cell output
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Train Logistic Regression

In [62]:
# Train model
# fit() learns the coefficients from the training features and labels. In
# scikit-learn it returns the estimator itself, so `model` and `clf` refer to
# the same fitted object.
model = clf.fit(X_train, y_train)

In [63]:
# Echo the fitted estimator; the repr lists its hyperparameters
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Create new observation

In [65]:
# Alias for the full test matrix (plain assignment, no copy is made).
# NOTE(review): the cells below read X_test directly, so this name looks
# unused in the visible notebook — confirm before relying on it.
new_observation = X_test

### Predict Class Of Observation

In [66]:
# Dimensions of the test matrix as a (rows, columns) tuple
X_test.shape

(332, 6)

In [67]:
# Run inference a single time and reuse the array for both printouts, so the
# shape and the labels are guaranteed to come from the same prediction pass.
test_predictions = model.predict(X_test)
print(np.shape(test_predictions))  # one predicted label per test row
print(test_predictions)

(332,)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0.
 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1.
 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0.
 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1.
 1. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0.
 1. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0.
 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0

### View predicted probabilities

In [68]:
# Compute the class probabilities once and reuse the array for both printouts.
# Each row corresponds to one test observation and holds one probability per
# class (columns follow the order of model.classes_).
test_probabilities = model.predict_proba(X_test)
print(np.shape(test_probabilities))
print(test_probabilities)

(332, 2)
[[9.67788477e-01 3.22115227e-02]
 [8.51099576e-01 1.48900424e-01]
 [9.67241653e-01 3.27583467e-02]
 [9.67788477e-01 3.22115227e-02]
 [7.07063272e-01 2.92936728e-01]
 [9.26939571e-01 7.30604293e-02]
 [7.07063272e-01 2.92936728e-01]
 [8.40387609e-01 1.59612391e-01]
 [6.71962897e-01 3.28037103e-01]
 [9.83150960e-01 1.68490398e-02]
 [6.86030951e-01 3.13969049e-01]
 [6.90107592e-02 9.30989241e-01]
 [9.67241653e-01 3.27583467e-02]
 [1.49325959e-01 8.50674041e-01]
 [2.64152171e-01 7.35847829e-01]
 [8.40387609e-01 1.59612391e-01]
 [9.62261382e-01 3.77386180e-02]
 [7.07063272e-01 2.92936728e-01]
 [8.29086121e-01 1.70913879e-01]
 [6.49659132e-01 3.50340868e-01]
 [9.26939571e-01 7.30604293e-02]
 [4.39166216e-01 5.60833784e-01]
 [2.46823436e-01 7.53176564e-01]
 [9.86139806e-01 1.38601940e-02]
 [5.91853331e-02 9.40814667e-01]
 [9.62261382e-01 3.77386180e-02]
 [4.79895970e-01 5.20104030e-01]
 [9.25752495e-01 7.42475051e-02]
 [9.10917920e-01 8.90820799e-02]
 [7.58404045e-01 2.41595955e-01]
 

### Create Pipeline

In [69]:
# The two stages of the pipeline: a feature standardizer and a
# logistic-regression classifier with default hyperparameters.
standardizer, logit = StandardScaler(), LogisticRegression()

# Chain the stages so the scaler is always fitted together with the classifier.
# NOTE(review): the standalone `clf` fitted earlier has no scaling step, so
# scores obtained for this pipeline describe a different model than the one
# used for the predictions above — confirm which one is intended.
pipeline = make_pipeline(standardizer, logit)

### Create k-Fold Cross-Validation

In [70]:
# Create k-Fold cross-validation
# 10 folds; rows are shuffled before splitting, and random_state is fixed so
# the fold assignment is reproducible across runs.
kf = KFold(n_splits=10, shuffle=True, random_state=1)

### Conduct k-Fold Cross-Validation

In [71]:
# Evaluate the pipeline with k-fold cross-validation.
# - accuracy is the evaluation metric (a score where higher is better, not a loss)
# - kf supplies the train/validation splits
# - n_jobs=-1 runs the folds in parallel on all available CPU cores
cv_results = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring="accuracy",
    cv=kf,
    n_jobs=-1,
)

In [72]:
# Show the per-fold accuracy scores (one value per cross-validation fold)
cv_results

array([0.77777778, 0.77777778, 0.79166667, 0.79166667, 0.73239437,
       0.8028169 , 0.78873239, 0.73239437, 0.85915493, 0.78873239])

### Calculate Mean Performance Score

In [73]:
# Average the per-fold scores into a single cross-validated accuracy estimate
np.mean(cv_results)

0.7843114241001565