# Logistic Regression and Bootstrapping

In [1]:
import warnings
warnings.filterwarnings("ignore")
import sys
import numpy as np
import pandas as pd
# Widen pandas' display limits so tables are shown without truncation,
# and keep the rich HTML table rendering in the notebook.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.options.display.width = 1000
pd.options.display.notebook_repr_html = True
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import statsmodels as sm
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score

matplotlib.rcParams['figure.figsize'] = (13,6)
%matplotlib inline
sns.set_style('ticks')

In [3]:
# Load the Titanic passenger table through seaborn's dataset loader
# and preview the first rows.
titanic = sns.load_dataset(name='titanic')
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [11]:
# Keep only the three predictors (sex, age, passenger class) and the outcome.
titanic = titanic.loc[:, ['sex', 'age', 'class', 'survived']]

In [13]:
# Rename 'class' to 'pclass' with an explicit mapping instead of overwriting
# all labels positionally: a positional assignment silently mislabels columns
# if their order ever changes, while this form touches only the named column
# and is safe to re-run (a missing key is simply ignored).
titanic = titanic.rename(columns={'class': 'pclass'})

In [15]:
# Discard every passenger row that has a missing value in any retained
# column, then preview the cleaned table.
titanic = titanic.dropna(axis='index', how='any')
titanic.head(n=5)

Unnamed: 0,sex,age,pclass,survived
0,male,22.0,Third,0
1,female,38.0,First,1
2,female,26.0,Third,1
3,female,35.0,First,1
4,male,35.0,Third,0


In [17]:
# One-hot encode the categorical predictors, keeping one fewer indicator than
# there are levels: 'male' and 'Third' are the dropped reference levels.
# dtype=int pins the indicators to 0/1 integers; newer pandas releases return
# booleans by default, which would turn the combined feature table into a
# mixed bool/float frame.
is_female = pd.get_dummies(titanic.sex, dtype=int)['female']
pclass_coded = pd.get_dummies(titanic.pclass, dtype=int)[['First', 'Second']]

In [19]:
# Assemble the modelling table column-wise: the sex indicator, the two class
# indicators, then age and the survival outcome. Preview the first rows.
titanic_c = pd.concat(
    objs=[is_female, pclass_coded, titanic.loc[:, ['age', 'survived']]],
    axis='columns',
)
titanic_c.head(n=5)

Unnamed: 0,female,First,Second,age,survived
0,0,0,0,22.0,0
1,1,1,0,38.0,1
2,1,0,0,26.0,1
3,1,1,0,35.0,1
4,0,0,0,35.0,0


In [24]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
train, test = train_test_split(
    titanic_c,
    test_size=0.2,
    random_state=123,
)

In [42]:
# NOTE(review): Logit is not used in this cell; presumably it is needed
# further down the notebook, so the import is kept.
from statsmodels.discrete.discrete_model import Logit

# C is the inverse regularization strength, so a very large value makes the
# L2 penalty negligible (an effectively unregularized logistic regression).
# The solver is pinned explicitly: scikit-learn's default changed from
# 'liblinear' to 'lbfgs' between releases, so leaving it unset makes the fit
# depend on the installed version.
model = LogisticRegression(C=100000, solver='liblinear')
model.fit(train[['female', 'First', 'Second', 'age']], train['survived'])

LogisticRegression(C=100000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [48]:
# Predicted probability of survival for every training row: column 1 of
# predict_proba corresponds to class 1 (survived). The DataFrame is passed
# as-is, matching how the model was fit, instead of a bare array that drops
# the feature names.
model.predict_proba(train[['female', 'First', 'Second', 'age']])[:, 1]

array([0.10487945, 0.08628725, 0.78531484, 0.05587416, 0.11916565,
       0.58521527, 0.46782452, 0.49618527, 0.86900204, 0.19196176,
       0.51415538, 0.11062469, 0.19196176, 0.31644325, 0.0951779 ,
       0.15124152, 0.0479288 , 0.18644744, 0.90304788, 0.09673711,
       0.03967171, 0.18105604, 0.47822501, 0.41996249, 0.27890384,
       0.08628725, 0.39016127, 0.18829363, 0.30110076, 0.95748257,
       0.25092272, 0.07313004, 0.80856362, 0.62672273, 0.88452079,
       0.93383019, 0.12692122, 0.76228454, 0.03967171, 0.07313004,
       0.88452079, 0.87697162, 0.91080785, 0.60255346, 0.81944616,
       0.8376068 , 0.09831908, 0.22765049, 0.46926306, 0.04875568,
       0.14224196, 0.6350934 , 0.2186695 , 0.72102626, 0.11421131,
       0.09413881, 0.60975604, 0.9191893 , 0.48575781, 0.76008041,
       0.15751653, 0.86249049, 0.06114352, 0.76008041, 0.12692122,
       0.05220038, 0.89166395, 0.4030645 , 0.96544647, 0.24053844,
       0.16729401, 0.08078489, 0.10487945, 0.18829363, 0.08078