## Preliminaries

In [41]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn import datasets
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score


from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import svm


## Load Preprocessed data and target

In [107]:
X_train = np.loadtxt('data_preproc_train.txt', delimiter=',')
y_train = np.loadtxt('target_train.txt',delimiter=',')
X_test = np.loadtxt('data_preproc_test.txt',delimiter=',')

## Fit A Logistic Regression

A logistic regression is a binary classifier (i.e. the target vector can only take two values). In a logistic regression, a linear model (e.g. β0+β1) is included in a logistic also called sigmoid function, 
\begin{equation*}
\frac{1}{(1+e-z)}
\end{equation*}

such that: 

\begin{equation*}
P(yi=1∣X) = \frac{1}{1+e^−(β0+β1x)} 
\end{equation*}

where P(yi=1∣X) is the probability of the ith observation’s target value, yi, being class 1, X is the training data, β0 and β1 are the parameters to be learned, and e is Euler’s number.

### Create logistic regression object

In [108]:
clf = LogisticRegression(random_state=0)

In [109]:
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Train Logistic Regression

In [110]:
# Train model
model = clf.fit(X_train, y_train)

In [111]:
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Create new observation

### Predict Class Of Observation

In [112]:
np.shape(X_test)

(418, 6)

In [113]:
print(np.shape(model.predict(X_test)))
print(model.predict(X_test))

(418,)
[0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1.
 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0.
 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0.
 1. 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0.
 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1.
 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.
 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0.
 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1.
 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1.
 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1.
 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.
 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1

### View predicted probabilities

In [114]:
print(np.shape(model.predict_proba(X_test)))
print(model.predict_proba(X_test))

(418, 2)
[[0.9108687  0.0891313 ]
 [0.70423945 0.29576055]
 [0.91843456 0.08156544]
 [0.89402612 0.10597388]
 [0.42229396 0.57770604]
 [0.81772967 0.18227033]
 [0.38105661 0.61894339]
 [0.77425237 0.22574763]
 [0.2244431  0.7755569 ]
 [0.92744916 0.07255084]
 [0.90815706 0.09184294]
 [0.64044545 0.35955455]
 [0.05860539 0.94139461]
 [0.95238696 0.04761304]
 [0.16653254 0.83346746]
 [0.13776304 0.86223696]
 [0.75209676 0.24790324]
 [0.81695052 0.18304948]
 [0.47397148 0.52602852]
 [0.51786225 0.48213775]
 [0.73573164 0.26426836]
 [0.78446619 0.21553381]
 [0.05857375 0.94142625]
 [0.27930706 0.72069294]
 [0.14135235 0.85864765]
 [0.97348958 0.02651042]
 [0.02959982 0.97040018]
 [0.82760989 0.17239011]
 [0.58285564 0.41714436]
 [0.93425122 0.06574878]
 [0.91406915 0.08593085]
 [0.81077965 0.18922035]
 [0.56353362 0.43646638]
 [0.53073228 0.46926772]
 [0.45232522 0.54767478]
 [0.79809439 0.20190561]
 [0.42576844 0.57423156]
 [0.32087106 0.67912894]
 [0.8844291  0.1155709 ]
 [0.90815706 0.0

### Create Pipeline

In [115]:
# Create standardizer
standardizer = StandardScaler()

# Create logistic regression
logit = LogisticRegression()

# Create a pipeline that standardizes, then runs logistic regression
pipeline = make_pipeline(standardizer, logit)

### Create k-Fold Cross-Validation

In [116]:
# Create k-Fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=1)

### Conduct k-Fold Cross-Validation

In [117]:
# Do k-fold cross-validation
cv_results = cross_val_score(pipeline, # Pipeline
                             X_train, # Feature matrix
                             y_train, # Target vector
                             cv=kf, # Cross-validation technique
                             scoring="accuracy", # Loss function
                             n_jobs=-1) # Use all CPU scores

In [118]:
cv_results

array([0.77777778, 0.83333333, 0.75      , 0.79166667, 0.74647887,
       0.8028169 , 0.81690141, 0.73239437, 0.88732394, 0.78873239])

### Calculate Mean Performance Score

In [119]:
# Calculate mean
print("Mean performance score: %.2f%%" % (cv_results.mean()*100))

Mean performance score: 79.27%


### Export results to CSV

In [126]:
prediction = model.predict(X_test)
np.shape(prediction)

(418,)

In [162]:
X_test_df = pd.read_csv('data_preproc_test.csv')

In [163]:
X_test_df = X_test_df.drop(columns=['Unnamed: 0', 'Pclass','Sex_cat','Age','SibSp','Parch','Embarked_cat'])

In [164]:
X_test_df['Survived']= pd.DataFrame(prediction, columns=['Survived'])

In [167]:
X_test_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0.0
1,893,0.0
2,894,0.0
3,895,0.0
4,896,1.0


In [168]:
X_test_df.to_csv('logisticregression_submission.csv', index=False)