In [103]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt

df_train_orig = pd.read_csv('train.csv', keep_default_na=False)
df_test = pd.read_csv('test.csv', keep_default_na=False)

# 1. Extract y_test
df_train_result = df_train_orig[['PassengerId','Survived']]
df_train = df_train_orig.drop('Survived', axis=1)

In [104]:
# 2. Clean Data
def clean_data(df):
    df['Cabin'] = df['Cabin'].replace(to_replace='',value='Z')
    df['Age'] = df['Age'].replace(to_replace='', value=0)
    return df

In [105]:
# 3. Convert features values to numeric
def encode_categorical_data(df):
    #3.1 Convert female = 1; male = 0
    df = df.replace(to_replace={'Sex':{'female':1, 'male':0}})
    #3.2 Convert Embarked  S=0;C=1;Q=2
    df = df.replace(to_replace={'Embarked':{'S':0, 'C':1, 'Q':2, '':0}})
    return df

In [106]:
# 4. Add more features
def add_features(df):
    df = df[['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']]
    df = df.applymap(lambda x: float(x))
    df['is_child'] = df['Age'].apply(lambda x: float(x)<=15)
    df['family_size'] = df['SibSp'] + df['Parch'] + 1
    df['is_alone'] = df['family_size'] == 0
    return df

In [107]:
def interpolate_age(df):
    age_mean = df['Age'].mean()
    age_std = df['Age'].std()
    null_num = df.where(df['Age'] == 0)['Age'].dropna().count()
    print(len(np.random.randint(age_mean - age_std, age_mean + age_std, null_num)))
    df.loc[df['Age'] == 0, 'Age'] = np.random.randint(age_mean - age_std, age_mean + age_std, null_num)
    return df

In [108]:
# 5. Scatter Plot. I am currently planning to use only Pclass, Sex, Age, SibSp, Parch and Embarked columns
#pd.plotting.scatter_matrix(df_selected_train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']], c=df_train_result['Survived'], alpha=0.2, figsize=(30, 30))


In [109]:
df_train = clean_data(df_train)
df_train = encode_categorical_data(df_train)
df_train = add_features(df_train)
df_train = interpolate_age(df_train)

X_train, X_test, y_train, y_test = train_test_split(df_train[['Pclass', 'Sex', 'Age',  'Embarked', 'is_child','family_size','is_alone']], df_train_result['Survived'], random_state=0) 

177


In [110]:
data_test = clean_data(df_test)
data_test = encode_categorical_data(data_test)
data_test = add_features(data_test)
data_test = interpolate_age(data_test)

selected_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'is_child','family_size','is_alone']

86


In [111]:
#LogisticRegression
X_train_Poly = PolynomialFeatures(degree = 2).fit_transform(X_train)
X_test_Poly = PolynomialFeatures(degree = 2).fit_transform(X_test)
clf1 = LogisticRegression().fit(X_train_Poly, y_train)
print('Logistic Regression with Polynomial Features score = {}'.format(clf1.score(X_test_Poly, y_test)))

dfl_test = data_test.copy()
poly_features = PolynomialFeatures(degree = 2).fit_transform(data_test[selected_features])
dfl_test['Survived'] = clf1.predict(poly_features)
dfw_test = pd.DataFrame({'PassengerId':df_test['PassengerId'].values, 'Survived' : dfl_test['Survived'].values})
(dfw_test[['PassengerId', 'Survived']].applymap(lambda x : int(x)).set_index('PassengerId')).to_csv('titanic_pred_logistic.csv')
dfw_test = pd.read_csv('titanic_pred_logistic.csv')

Logistic Regression with Polynomial Features score = 0.8071748878923767


In [112]:
#GridSearchCV
clf2 = LogisticRegression()
g = GridSearchCV(clf2, param_grid={'C' : [0.01, 0.1, 10, 100]}, scoring='precision')
g.fit(X_train, y_train)
print('Grid Search CV Score = {}'.format(g.score(X_test, y_test)))

dfg_test['Survived'] = g.predict(data_test[selected_features])
(dfg_test[['PassengerId', 'Survived']].applymap(lambda x : int(x)).set_index('PassengerId')).to_csv('titanic_pred.csv')
dfg_test = pd.read_csv('titanic_pred.csv')

Grid Search CV Score = 1.0


In [113]:
#Gradien Boosting decision Tree Classifier
gb = GradientBoostingClassifier(random_state=0)
gb.fit(X_train, y_train)
print('Gradient Boosting Classifier Score = {}'.format(gb.score(X_test, y_test)))

dfgb_test = data_test[selected_features]
dfgb_test['Survived'] = gb.predict(dfgb_test)
dfgw_test = pd.DataFrame({'PassengerId':df_test['PassengerId'].values, 'Survived' : dfgb_test['Survived'].values})
(dfgw_test[['PassengerId', 'Survived']].applymap(lambda x : int(x)).set_index('PassengerId')).to_csv('titanic_pred_gb.csv')
dfgw_test = pd.read_csv('titanic_pred_gb.csv')

Gradient Boosting Classifier Score = 0.8430493273542601


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [114]:
#Support Vector Machines
svc = SVC()
svc.fit(X_train, y_train)
print('SVC score = {}'.format(svc.score(X_test, y_test)))

dfsvc_test = data_test[selected_features]
dfsvc_test['Survived'] = svc.predict(dfsvc_test)
dfsvc_test = pd.DataFrame({'PassengerId':df_test['PassengerId'].values, 'Survived' : dfsvc_test['Survived'].values})
(dfsvc_test[['PassengerId', 'Survived']].applymap(lambda x : int(x)).set_index('PassengerId')).to_csv('titanic_pred_svc.csv')
dfsvc_test = pd.read_csv('titanic_pred_svc.csv')

SVC score = 0.8295964125560538


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
