In [74]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

In [92]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
dataset = [train, test]
print('train data: ',train.shape)
print('test data: ',test.shape)

train data:  (891, 12)
test data:  (418, 11)


In [171]:
train['Ticket'].nunique()

681

In [94]:
for data in dataset:
    logfare = np.log(data['Fare']+1)
    data['logfare'] = logfare
    age_mean = data['Age'].mean()
    age_std = data['Age'].std()
    nmissing_age = data['Age'].isnull().sum()
    age_random = list(map(int, np.random.normal(age_mean, age_std, nmissing_age)))
    data['Age'][data['Age'].isnull()] = age_random
    data['FamilySize'] = data['SibSp']+data['Parch']+1
    data['IsAlone'] = 0
    data.loc[data['FamilySize']==1,'IsAlone'] = 1
    data['CabinType'] = data['Cabin'].apply(lambda x: x[0] if type(x) == str else np.nan )
    data['CabinType'].fillna('No Cabin', inplace=True)
    data['Cabin'].fillna('No Cabin', inplace=True)
    data.loc[data['CabinType']=='G','CabinType'] = 'GT'
    data.loc[data['CabinType']=='T','CabinType'] = 'GT'
    data['Sex'] = data['Sex'].map({'male':1,'female':0})
    
train['Embarked'].fillna('S', inplace=True)
test['Fare'].fillna(test['Fare'].mean(), inplace=True)
test['logfare'].fillna(test['logfare'].mean(), inplace=True)
train_age_mean = train['Age'].mean()
train_age_std = train['Age'].std()
train['Age'] = (train['Age']-train_age_mean)/train_age_std
test['Age'] = (test['Age']-train_age_mean)/train_age_std

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [95]:
X_train = train[['Pclass','Sex','Age','SibSp','Parch','Embarked','logfare','FamilySize','IsAlone','CabinType']]
X_test = test[['Pclass','Sex','Age','SibSp','Parch','Embarked','logfare','FamilySize','IsAlone','CabinType']]
y_train = train['Survived']

In [96]:
X_train = pd.get_dummies(X_train)
X_test= pd.get_dummies(X_test)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

In [97]:
X_train.drop(['SibSp','Parch','CabinType_A','CabinType_B','CabinType_C','CabinType_D','CabinType_E','CabinType_F','CabinType_GT'],axis=1,inplace=True)
X_test.drop(['SibSp','Parch','CabinType_A','CabinType_B','CabinType_C','CabinType_D','CabinType_E','CabinType_F','CabinType_GT'],axis=1,inplace=True)

In [98]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,logfare,FamilySize,IsAlone,Embarked_C,Embarked_Q,Embarked_S,CabinType_No Cabin
0,3,1,-0.511823,2.110213,2,0,0,0,1,1
1,1,0,0.555365,4.280593,2,0,1,0,0,0
2,3,0,-0.245026,2.188856,1,1,0,0,1,1
3,1,0,0.355268,3.990834,2,0,0,0,1,0
4,3,1,0.355268,2.202765,1,1,0,0,1,1


In [99]:
X_test.head()

Unnamed: 0,Pclass,Sex,Age,logfare,FamilySize,IsAlone,Embarked_C,Embarked_Q,Embarked_S,CabinType_No Cabin
0,3,1,0.321918,2.178064,1,1,0,1,0,1
1,3,0,1.155659,2.079442,2,0,0,0,1,1
2,2,1,2.156147,2.369075,1,1,0,1,0,1
3,3,1,-0.178327,2.268252,1,1,0,0,1,1
4,3,0,-0.511823,2.586824,3,0,0,0,1,1


In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

clf1 = LogisticRegression(solver='lbfgs')
clf2 = SVC(gamma="auto",C=10)
clf3 = RandomForestClassifier(n_estimators=1000,max_depth=10)
clf4 = GradientBoostingClassifier(n_estimators = 100)
clf5 = GaussianNB()

ecf = VotingClassifier(estimators = [
    ('svc',clf2),('rf',clf3),('gb',clf4)], voting='hard')

scores = cross_val_score(ecf, X_train, y_train, cv=4, scoring='accuracy')
print('Scores:', scores)
print('Mean score:', np.mean(scores))

Scores: [0.77232143 0.85650224 0.80630631 0.83333333]
Mean score: 0.8171158275908837


In [72]:
for C in [1.0, 10.0, 100.0]:
    clf = SVC(gamma="auto",C=C)
    scores = cross_val_score(clf, X_train, y_train, cv=4, scoring='accuracy')
    print('Mean score:', np.mean(scores))

Mean score: 0.8215550441937773
Mean score: 0.8125913489562593
Mean score: 0.7968959660733764


In [168]:
from sklearn.ensemble import GradientBoostingClassifier

clf = RandomForestClassifier(n_estimators = 100, max_depth=5, random_state=0)
scores = cross_val_score(clf, X_train, y_train, cv=4, scoring='accuracy')
print('Mean score:', np.mean(scores))

Mean score: 0.8193329109458595


In [140]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [135]:
yp_test = clf.predict(X_test)

In [136]:
Pid = test['PassengerId']
submission_df = pd.DataFrame({'PassengerId':Pid,'Survived':yp_test})
submission_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0


In [137]:
submission_df.to_csv('submission.csv', index=False)