In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [23]:
df = pd.read_csv('ship_data.csv')

In [24]:
df.head()

Unnamed: 0,Passenger ID,Class,Name,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived
0,1,3,Alexander Harris,male,22.0,1,0,7250.0,New York,0
1,2,1,Frank Parsons,female,38.0,1,0,71283.3,Los Angeles,1
2,3,3,Anthony Churchill,female,26.0,0,0,7925.0,New York,1
3,4,1,Alexandra Hughes,female,35.0,1,0,53100.0,New York,1
4,5,3,Joan Fraser,male,35.0,0,0,8050.0,New York,0


In [25]:
df.tail()

Unnamed: 0,Passenger ID,Class,Name,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived
886,887,2,Bella Tucker,male,27.0,0,0,13000.0,New York,0
887,888,1,Boris Howard,female,19.0,0,0,30000.0,New York,1
888,889,3,Cameron Lambert,female,,1,2,23450.0,New York,0
889,890,1,Theresa Hill,male,26.0,0,0,30000.0,Los Angeles,1
890,891,3,Caroline Fraser,male,32.0,0,0,7750.0,Chicago,0


In [26]:
df.isnull().sum()

Passenger ID        0
Class               0
Name                0
Gender              0
Age               177
Siblings Count      0
Parents Count       0
Fare                0
Embarked            2
Survived            0
dtype: int64

Replacing the missing values from the Age column with the mean value in the column.

In [27]:
df['Age'].fillna(df['Age'].mean(), inplace=True)

In [28]:
df.isnull().sum()

Passenger ID      0
Class             0
Name              0
Gender            0
Age               0
Siblings Count    0
Parents Count     0
Fare              0
Embarked          2
Survived          0
dtype: int64

Now filling the missing values in the Embarked column with the modal value in the Embarked column

In [29]:
# the modal value in the Embarked column
print df['Embarked'].mode()
print '\n'
print df['Embarked'].mode()[0]

0    New York
dtype: object


New York


In [30]:
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

In [31]:
df.isnull().sum()

Passenger ID      0
Class             0
Name              0
Gender            0
Age               0
Siblings Count    0
Parents Count     0
Fare              0
Embarked          0
Survived          0
dtype: int64

Now we can see that all the missing values in the dataframe have been removed.

In [32]:
df.head()

Unnamed: 0,Passenger ID,Class,Name,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived
0,1,3,Alexander Harris,male,22.0,1,0,7250.0,New York,0
1,2,1,Frank Parsons,female,38.0,1,0,71283.3,Los Angeles,1
2,3,3,Anthony Churchill,female,26.0,0,0,7925.0,New York,1
3,4,1,Alexandra Hughes,female,35.0,1,0,53100.0,New York,1
4,5,3,Joan Fraser,male,35.0,0,0,8050.0,New York,0


For training the model we must decide on the features on which we should train our model, Passenger ID and Name don't have anything to do with the survival hence dropping those features for our train and test set.

In [38]:
df = df.drop(['Passenger ID', 'Name'],axis=1)

In [39]:
df.head()

Unnamed: 0,Class,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived
0,3,male,22.0,1,0,7250.0,New York,0
1,1,female,38.0,1,0,71283.3,Los Angeles,1
2,3,female,26.0,0,0,7925.0,New York,1
3,1,female,35.0,1,0,53100.0,New York,1
4,3,male,35.0,0,0,8050.0,New York,0


Having eliminated the irrelavent features, we can proceed further in processing the data,  
now let's convert the categorical data in the Gender column to numerical form. For this we can use LabelEncoder available in the scikit-learn module


In [40]:
from sklearn.preprocessing import LabelEncoder
le_gender = LabelEncoder()
df['Gender'] = le_gender.fit_transform(df['Gender'])
df.head()

Unnamed: 0,Class,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived
0,3,1,22.0,1,0,7250.0,New York,0
1,1,0,38.0,1,0,71283.3,Los Angeles,1
2,3,0,26.0,0,0,7925.0,New York,1
3,1,0,35.0,1,0,53100.0,New York,1
4,3,1,35.0,0,0,8050.0,New York,0


Now let's see how many distict values are there in the Embarked column:

In [41]:
df['Embarked'].nunique()

3

Since there are just three unique values in the Embarked column, here also we can use a LabelEncoder to transform the categorical data to numerical form.

In [42]:
le_embarked = LabelEncoder()
df['Embarked'] = le_embarked.fit_transform(df['Embarked'])
df.head()

Unnamed: 0,Class,Gender,Age,Siblings Count,Parents Count,Fare,Embarked,Survived
0,3,1,22.0,1,0,7250.0,2,0
1,1,0,38.0,1,0,71283.3,1,1
2,3,0,26.0,0,0,7925.0,2,1
3,1,0,35.0,1,0,53100.0,2,1
4,3,1,35.0,0,0,8050.0,2,0


Now seperating the features and label column.

In [54]:
# feature table
X = df.iloc[:,:7].values
# labels
y = df.iloc[:,[7]].values

In [74]:
y[:5]

array([[0],
       [1],
       [1],
       [1],
       [0]])

In [67]:
len(y)

891

In [77]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [94]:
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline

In [95]:
clf_lr = LogisticRegression(penalty='l2', C=0.002, random_state=0)
clf_knn = KNeighborsClassifier(n_neighbors = 5, p=2, metric='minkowski')
clf_svm = SVC(kernel='linear', C=0.01, random_state=0)
clf_rf = RandomForestClassifier(n_estimators = 10, criterion='entropy', n_jobs=2, random_state=0)

In [96]:
pipe_lr = Pipeline([['sc', StandardScaler()],['clf', clf_lr]])
pipe_knn = Pipeline([['sc', StandardScaler()],['clf', clf_knn]])
pipe_svm = Pipeline([['sc', StandardScaler()],['clf', clf_svm]])
# no pipeline required for random forest

In [97]:
c, r = y_train.shape
y_train = y_train.reshape(c,)

In [98]:
clf_labels = ['Logistic Regression', 'KNN', 'SVM', 'Random Forest']
print '10-fold cross validatioin: \n'
for clf, label in zip([pipe_lr, pipe_knn, pipe_svm, clf_rf], clf_labels):
    scores = cross_val_score(estimator = clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    print 'ROC AUC: %0.2f (+/- %0.2f) [%s]' %(scores.mean(), scores.std(), label)

10-fold cross validatioin: 

ROC AUC: 0.84 (+/- 0.05) [Logistic Regression]
ROC AUC: 0.84 (+/- 0.04) [KNN]
ROC AUC: 0.82 (+/- 0.06) [SVM]
ROC AUC: 0.84 (+/- 0.05) [Random Forest]


In [100]:
from sklearn.metrics import roc_auc_score

In [101]:
pipe_svm.fit(X_train,y_train)
y_train_pred = pipe_svm.predict(X_train)
y_test_pred = pipe_svm.predict(X_test)
print 'Training ROC AUC:  %0.3f' %roc_auc_score(y_train, y_train_pred)
print 'Test ROC AUC:      %0.3f' %roc_auc_score(y_test, y_test_pred)

Training ROC AUC:  0.765
Test ROC AUC:      0.773


In [102]:
pipe_svm.get_params()

{'clf': SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=0, shrinking=True,
   tol=0.001, verbose=False),
 'clf__C': 0.01,
 'clf__cache_size': 200,
 'clf__class_weight': None,
 'clf__coef0': 0.0,
 'clf__decision_function_shape': 'ovr',
 'clf__degree': 3,
 'clf__gamma': 'auto',
 'clf__kernel': 'linear',
 'clf__max_iter': -1,
 'clf__probability': False,
 'clf__random_state': 0,
 'clf__shrinking': True,
 'clf__tol': 0.001,
 'clf__verbose': False,
 'memory': None,
 'sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'sc__copy': True,
 'sc__with_mean': True,
 'sc__with_std': True,
 'steps': [('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ['clf', SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
     max_iter=-1, probability=False, random_state=

In [104]:
from sklearn.grid_search import GridSearchCV
param_range = [ 0.001, 0.01, 0.10, 1.0, 10.0, 100.0, 1000]
params = {'clf__C': param_range}
grid_svm = GridSearchCV(estimator = pipe_svm,
                       param_grid = params,
                       cv=10,
                       scoring='roc_auc'
                       )
grid_svm.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)), ['clf', SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)]]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'clf__C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [105]:
for params, mean_score, scores in grid_svm.grid_scores_:
    print('% 0.3f +/-%0.2f %r'%(mean_score, scores.std() / 2, params))


 0.842 +/-0.03 {'clf__C': 0.001}
 0.818 +/-0.03 {'clf__C': 0.01}
 0.794 +/-0.03 {'clf__C': 0.1}
 0.798 +/-0.02 {'clf__C': 1.0}
 0.791 +/-0.03 {'clf__C': 10.0}
 0.787 +/-0.03 {'clf__C': 100.0}
 0.826 +/-0.02 {'clf__C': 1000}


In [106]:
print('Best parameters: %s'% grid_svm.best_params_)

Best parameters: {'clf__C': 0.001}


In [107]:
print('Accuracy: %0.2f'%grid_svm.best_score_)

Accuracy: 0.84


With C=0.001 we get 84% accuracy in our svm model.

In [109]:
c, r = y.shape
y = y.reshape(c,)

In [1]:
print 'a'

a


In [2]:
param_range = [ 0.0001, 0.001, 0.01, 0.10, 1.0, 10.0, 100.0, 1000]
params = {'clf__C': param_range}
grid_svm = GridSearchCV(estimator = pipe_svm,
                       param_grid = params,
                       cv=10,
                       scoring='roc_auc',
                        n_jobs=-1
                       )
grid_svm.fit(X_train, y_train)

NameError: name 'GridSearchCV' is not defined