In [88]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [17]:
dtypes = {
    "PassengerId" : "int64",
    "Survived": "int64",
    "Pclass" : "category",
    "Sex" : "category",
    "Age" : "float64",
    "SibSp" : "int64",
    "Parch" : "int64",
    "Fare" : "float64",
    "Cabin" : "category",
    "Embarked" : "category"
}

In [119]:
train_df = pd.read_csv('train.csv',sep=',',header=0,usecols=["PassengerId","Survived",
            "Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked"],dtype=dtypes,na_values=[' '])

train_df.set_index("PassengerId",inplace=True)
train_df.dropna(subset=['Embarked'],inplace=True)
train_df.Age.fillna(X.Age.mean(),inplace=True)

train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 1 to 891
Data columns (total 9 columns):
Survived    889 non-null int64
Pclass      889 non-null category
Sex         889 non-null category
Age         889 non-null float64
SibSp       889 non-null int64
Parch       889 non-null int64
Fare        889 non-null float64
Cabin       202 non-null object
Embarked    889 non-null category
dtypes: category(3), float64(2), int64(3), object(1)
memory usage: 51.5+ KB


In [120]:
def parse_train_df(train_df):
    X = train_df[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]]
    y = train_df[["Survived"]]
    
    num_feat = X.select_dtypes('number').columns.values
    cat_feat = X.select_dtypes('category').columns.values
    X_num = X[num_feat]
    X_cat = X[cat_feat]
    
    X_num = (X_num - X_num.mean()) / X_num.std()
    X_num = X_num.fillna(X_num.mean())
    X_num.head()
    
    X_cat = pd.get_dummies(X_cat)
    
    X = pd.concat([X_num,X_cat],axis=1)
    return X,y

In [121]:
X,y = parse_train_df(train_df)

In [122]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.2,random_state=42)
model = GradientBoostingClassifier(learning_rate=0.05).fit(X_train,y_train)
model.score(X_test,y_test)

0.7893258426966292

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
  {'learning_rate': np.linspace(0.1,0.001,20), 'loss': ['deviance','exponential'],
  'n_estimators': np.arange(50,200,20) }
 ]

search = GridSearchCV(model,param_grid).fit(X_test,y_test)
best_model = search.best_estimator_
best_model.score(X_test,y_test)

In [112]:
best_model.score(X_test,y_test)

0.8918539325842697

In [136]:
test_df = pd.read_csv('test.csv',sep=',',header=0,usecols=["PassengerId",
            "Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked"],dtype=dtypes,na_values=[' '])

test_df.set_index('PassengerId',inplace=True)
test_df.Age.fillna(test_df.Age.mean(),inplace=True)
test_df.Fare.fillna(test_df.Fare.mean(),inplace=True)

test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 8 columns):
Pclass      418 non-null category
Sex         418 non-null category
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
Cabin       91 non-null object
Embarked    418 non-null category
dtypes: category(3), float64(2), int64(2), object(1)
memory usage: 21.1+ KB


In [137]:
def parse_test_df(test_df):
    X = test_df[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]]
    
    num_feat = X.select_dtypes('number').columns.values
    cat_feat = X.select_dtypes('category').columns.values
    X_num = X[num_feat]
    X_cat = X[cat_feat]
    
    X_num = (X_num - X_num.mean()) / X_num.std()
    X_num = X_num.fillna(X_num.mean())
    X_num.head()
    
    X_cat = pd.get_dummies(X_cat)
    
    X = pd.concat([X_num,X_cat],axis=1)
    return X

In [138]:
X = parse_test_df(test_df)

In [146]:
y_predict = best_model.predict(X)
result_df = pd.DataFrame(columns=['PassengerId','Survived'])
result_df['PassengerId'] = X.index
result_df['Survived'] = y_predict
result_df.to_csv('result.csv',index=False)
result_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,0
9,901,0
