### Cross Validation

### Data Preprocessing

In [1]:
import pandas as pd

In [58]:
df = pd.read_csv('titanic.csv')
# df_copy = df.copy()

In [59]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [60]:
df.Embarked = df.Embarked.fillna(str(df.Embarked.mode()))
df.Age = df.Age.fillna(df.Age.mean())

In [61]:
df = df.drop(columns = ['PassengerId', 'Cabin', 'Ticket'], axis = 1)

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 52.2+ KB


In [63]:
dummies = pd.get_dummies(df[['Sex', 'Embarked']], drop_first = True)

In [64]:
df = pd.concat([df, dummies], axis = 1)

In [65]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1,0,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,0,1,0,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,0,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,0,0,0,1
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1,0,0,1


In [66]:
df_copy = df.copy()

In [74]:
X = df.drop(columns = ['Name', 'Survived','Embarked','Sex'])
Y = df.Survived

In [75]:
import warnings
warnings.filterwarnings('ignore')

### Data Split 

In [93]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3, random_state = 111)

### Grid Search CV with pipeline

In [77]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

np.random.seed(0)

In [88]:
# Create a pipeline
pipe = Pipeline([('classifier', RandomForestClassifier())])

# Create space of candidate learning algorithms and their hyperparameters
search_space = [{'classifier': [LogisticRegression()],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(0, 4, 10)},
                {'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 100, 1000],
                 'classifier__max_features': [1, 2, 3]}]

In [111]:
clf_1 = GridSearchCV(pipe, search_space, cv=5, verbose=0)

In [112]:
best_model = clf_1.fit(X_train, Y_train)

In [113]:
best_model.best_estimator_.get_params()['classifier']

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [141]:
best_model.score(X_train, Y_train)

0.9823434991974318

In [122]:
best_model.score(X_test,Y_test)

0.832089552238806

### Fast tuning C parameter

In [95]:
from sklearn import linear_model

In [117]:
clf_2 = linear_model.LogisticRegressionCV(Cs = 100)

In [118]:
model_best = clf_2.fit(X_train,Y_train)

In [119]:
model_best.score(X_train, Y_train)

0.812199036918138

In [120]:
model_best.score(X_test, Y_test)

0.7873134328358209

### Parameter Tuning Using Gread Search

### Estimator SVM

In [124]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [103]:
# aday parametreler

parameter_candidates = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]

In [125]:
# estimator support vector machine

clf_3 = GridSearchCV(estimator=svm.SVC(), param_grid=parameter_candidates, n_jobs=-1)

model_3 = clf_3.fit(X_train, Y_train)

In [127]:
model_3.best_score_

0.7945425361155698

In [134]:
model_3.score(X_test, Y_test)

0.7761194029850746

In [138]:
print('Best C:',clf_3.best_estimator_.C) 
print('Best Kernel:',clf_3.best_estimator_.kernel)
print('Best Gamma:',clf_3.best_estimator_.gamma)

Best C: 1000
Best Kernel: rbf
Best Gamma: 0.0001


In [140]:
svm.SVC(C=1000, kernel='rbf', gamma=0.0001).fit(X_train, Y_train).score(X_test, Y_test)

0.7761194029850746

### Estimator logistic regression

In [128]:
parametreler = {"C": [10 ** x for x in range (-5, 5, 1)],
                "penalty": ['l1', 'l2']
                }

In [131]:
# estimator logistic regression

from sklearn.model_selection import GridSearchCV
grid_cv = GridSearchCV(estimator=LogisticRegression(),
                       param_grid = parametreler,
                       cv = 10
                      )
grid_cv.fit(X_train, Y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [132]:
grid_cv.best_score_

0.797752808988764

### K Fold Cross Validation

In [143]:
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [144]:
standardizer = StandardScaler()

In [145]:
logit = LogisticRegression()

In [146]:
pipeline = make_pipeline(standardizer, logit)

In [147]:
kf = KFold(n_splits=10, shuffle=True, random_state=1)

In [148]:
# Do k-fold cross-validation
cv_results = cross_val_score(pipeline, # Pipeline
                             X, # Feature matrix
                             Y, # Target vector
                             cv=kf, # Cross-validation technique
                             scoring="accuracy", # Loss function
                             n_jobs=-1) # Use all CPU scores

In [149]:
cv_results.mean()

0.7991260923845193