In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline

train_data = pd.read_csv("train.csv")


In [2]:
def split_train_test(data, test_ratio):
    np.random.seed(42)
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
    
    

In [3]:
train_data.Age = train_data.Age.fillna(train_data.Age.mean())
train_data.Embarked = train_data.Embarked.fillna(2.0)
train_data = train_data.replace('male', 0).replace('female', 1).replace('C', 0).replace('Q', 1).replace('S', 2)
train_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
train_data.dropna()

train_set, test_set = split_train_test(train_data, 0.2)



In [4]:
train_set.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0
mean,0.377279,2.330996,0.345021,29.502407,0.553997,0.380084,32.563995,1.565217
std,0.485046,0.824387,0.475709,13.020593,1.175697,0.791455,51.936429,0.772797
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.925,1.0
50%,0.0,3.0,0.0,29.699118,0.0,0.0,14.4542,2.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,30.5,2.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0


In [5]:
test_set.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,0.410112,2.219101,0.382022,30.487064,0.398876,0.38764,30.763038,1.421348
std,0.493241,0.878028,0.487253,12.933776,0.731378,0.864402,39.544957,0.855025
min,0.0,1.0,0.0,0.83,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,22.0,0.0,0.0,7.8958,1.0
50%,0.0,3.0,0.0,29.699118,0.0,0.0,14.47915,2.0
75%,1.0,3.0,1.0,36.0,1.0,0.0,32.4552,2.0
max,1.0,3.0,1.0,71.0,4.0,5.0,262.375,2.0


In [6]:
test_set



Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
709,1,3,0,29.699118,1,1,15.2458,0.0
439,0,2,0,31.000000,0,0,10.5000,2.0
840,0,3,0,20.000000,0,0,7.9250,2.0
720,1,2,1,6.000000,0,1,33.0000,2.0
39,1,3,1,14.000000,1,0,11.2417,0.0
290,1,1,1,26.000000,0,0,78.8500,2.0
300,1,3,1,29.699118,0,0,7.7500,1.0
333,0,3,0,16.000000,2,0,18.0000,2.0
208,1,3,1,16.000000,0,0,7.7500,1.0
136,1,1,1,19.000000,0,2,26.2833,2.0


In [7]:

sample_x = train_set.drop(['Survived'], axis=1)
sample_y = train_set.drop(["Pclass", "Age", "SibSp", "Parch", "Fare", "Embarked", "Sex"], axis=1)

In [8]:
rf = RandomForestClassifier(min_samples_leaf=1, random_state=0)
rf.fit(sample_x, sample_y.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [10]:
test_sample_x = test_set.drop(['Survived'], axis=1)
test_sample_y = test_set.drop(["Pclass", "Age", "SibSp", "Parch", "Fare", "Embarked", "Sex"], axis=1)

rf_result = rf.predict(test_sample_x)

print("Result from Random Forrest")
print(np.sum(rf_result==test_sample_y['Survived']))
print(np.sum(rf_result == test_sample_y['Survived']) / len(rf_result))

Result from Random Forrest
142
0.797752808988764


In [11]:
dt = DecisionTreeRegressor()
dt.fit(sample_x, sample_y.values.ravel())

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [12]:
test_sample_x = test_set.drop(['Survived'], axis=1)
test_sample_y = test_set.drop(["Pclass", "Age", "SibSp", "Parch", "Fare", "Embarked", "Sex"], axis=1)

dt_result = dt.predict(test_sample_x)

print("Result from Decision Tree")
print(np.sum(dt_result==test_sample_y['Survived']))
print(np.sum(dt_result == test_sample_y['Survived']) / len(dt_result))

Result from Decision Tree
130
0.7303370786516854


In [14]:
sv = svm.SVC()
sv.fit(sample_x, sample_y.values.ravel())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
test_sample_x = test_set.drop(['Survived'], axis=1)
test_sample_y = test_set.drop(["Pclass", "Age", "SibSp", "Parch", "Fare", "Embarked", "Sex"], axis=1)

sv_result = sv.predict(test_sample_x)

print("Result from svm")
print(np.sum(sv_result==test_sample_y['Survived']))
print(np.sum(sv_result == test_sample_y['Survived']) / len(sv_result))

Result from svm
122
0.6853932584269663


In [16]:
rf_mse = mean_squared_error(test_sample_y, rf_result)
rf_rmse = np.sqrt(rf_mse)
rf_rmse

0.44971901339751685

In [19]:
dt_mse = mean_squared_error(test_sample_y, dt_result)
dt_rmse = np.sqrt(dt_mse)
dt_rmse

0.4576518597352152

In [20]:
sv_mse = mean_squared_error(test_sample_y, sv_result)
sv_rmse = np.sqrt(sv_mse)
sv_rmse

0.5608981561505026

In [21]:
scores = cross_val_score(dt, sample_x, sample_y, scoring="neg_mean_squared_error", cv = 10)
tree_rmse_scores = np.sqrt(-scores)


In [22]:
scores = cross_val_score(rf, sample_x, sample_y.values.ravel(), scoring="neg_mean_squared_error", cv = 10)
rf_rmse_scores = np.sqrt(-scores)

In [23]:
scores = cross_val_score(sv, sample_x, sample_y.values.ravel(), scoring="neg_mean_squared_error", cv = 10)
sv_rmse_scores = np.sqrt(-scores)

In [24]:
def display_scores(scores):
    print("Scores", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [25]:
display_scores(tree_rmse_scores)

Scores [0.46428571 0.50717639 0.54645321 0.46225899 0.44046142 0.48989795
 0.53733825 0.47958315 0.46909162 0.48563077]
Mean: 0.4882177459801625
Standard deviation: 0.031844575864372074


In [26]:
display_scores(rf_rmse_scores)

Scores [0.42491829 0.48591266 0.5        0.40824829 0.41111323 0.44405304
 0.53074489 0.47471266 0.42790021 0.39641248]
Mean: 0.45040157542881154
Standard deviation: 0.0426596145759079


In [27]:
display_scores(sv_rmse_scores)

Scores [0.52704628 0.60092521 0.56519417 0.52704628 0.50350881 0.54385168
 0.61666984 0.51730613 0.63910148 0.53452248]
Mean: 0.5575172360638014
Standard deviation: 0.04382295581095799


In [28]:
# random forest
param_grid = [
        {
            'n_estimators': [3, 10, 15, 20, 25, 30], 
            'max_features': [2, 3, 4, 5, 6, 7],
            'max_depth': [3, 5, 10, 15, 20],
    },
    {
        'bootstrap': [False], 
        'n_estimators': [3, 10, 15, 20, 25, 30], 
        'max_features': [2, 3, 4, 5, 6, 7],
        'max_depth': [3, 5, 10, 15, 20],
    },
]

rf_grid_search = GridSearchCV(rf, param_grid, cv = 10,scoring = 'neg_mean_squared_error')

rf_grid_search.fit(sample_x, sample_y.values.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 15, 20, 25, 30], 'max_features': [2, 3, 4, 5, 6, 7], 'max_depth': [3, 5, 10, 15, 20]}, {'bootstrap': [False], 'n_estimators': [3, 10, 15, 20, 25, 30], 'max_features': [2, 3, 4, 5, 6, 7], 'max_depth': [3, 5, 10, 15, 20]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [29]:
rf_grid_search.best_params_

{'max_depth': 5, 'max_features': 3, 'n_estimators': 15}

In [32]:
rf_grid_result = rf_grid_search.predict(test_sample_x)

print("Result from grid seasrched random forest")
print(np.sum(rf_grid_result==test_sample_y['Survived']))
print(np.sum(rf_grid_result == test_sample_y['Survived']) / len(rf_grid_result))

Result from grid seasrched random forest
142
0.797752808988764


In [33]:
# Decision Tree
param_grid = [
        {
            'max_depth': [3, 5, 10, 15, 20],
            'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 1000],
            'min_samples_leaf': [1, 10, 100],
            'max_features': [2, 3, 4, 5, 6, 7],
    },
]

dt_grid_search = GridSearchCV(dt, param_grid, cv = 10)

dt_grid_search.fit(sample_x, sample_y.values.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_depth': [3, 5, 10, 15, 20], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 1000], 'min_samples_leaf': [1, 10, 100], 'max_features': [2, 3, 4, 5, 6, 7]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [34]:
dt_grid_search.best_params_

{'max_depth': 5,
 'max_features': 6,
 'min_samples_leaf': 1,
 'min_samples_split': 9}

In [36]:
dt_grid_result = dt_grid_search.predict(test_sample_x)

print("Result from grid seasrched decision tree")
print(np.sum(dt_grid_result == test_sample_y['Survived']))
print(np.sum(dt_grid_result == test_sample_y['Survived']) / len(dt_grid_result))

Result from grid seasrched decision tree
7
0.03932584269662921


In [37]:
# SVM
param_grid = [
        {
            'degree': [1, 2, 3, 4, 5, 6, 7],
            'gamma': [0.1, 1, 10]
        },
]

sv_grid_search = GridSearchCV(sv, param_grid, cv = 10,scoring = 'neg_mean_squared_error')

sv_grid_search.fit(sample_x, sample_y.values.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'degree': [1, 2, 3, 4, 5, 6, 7], 'gamma': [0.1, 1, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [38]:
sv_grid_search.best_params_

{'degree': 1, 'gamma': 0.1}

In [40]:
sv_grid_result = sv_grid_search.predict(test_sample_x)

print("Result from grid seasrched random forest")
print(np.sum(sv_grid_result==test_sample_y['Survived']))
print(np.sum(sv_grid_result == test_sample_y['Survived']) / len(sv_grid_result))

Result from grid seasrched random forest
125
0.702247191011236


In [41]:
test_data = pd.read_csv("test.csv")
test_data.Age = test_data.Age.fillna(test_data.Age.mean())
test_data = test_data.replace('male', 0).replace('female', 1).replace('C', 0).replace('Q', 1).replace('S', 2)
test_data.Embarked = test_data.Embarked.fillna(2.0)
test_data.Fare = test_data.Fare.fillna(0) # to be dead

ids = test_data['PassengerId']
test_data.drop(['PassengerId',  'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
test_data.head()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,34.5,0,0,7.8292,1
1,3,1,47.0,1,0,7.0,2
2,2,0,62.0,0,0,9.6875,1
3,3,0,27.0,0,0,8.6625,2
4,3,1,22.0,1,1,12.2875,2


In [42]:
output = pd.DataFrame(columns=["PassengerId","Survived"])


for i in range(len(ids)) :
    row = test_data[i:i+1]
    predicted = rf_grid_search.predict(row)
    insert = pd.Series([ids[i], predicted[0]], index=output.columns)
    output = output.append(insert, ignore_index=True)

#    df = pd.DataFrame([[ids[i], predicted[0]]], columns=["PassengerId","Survived"])
#    output = output.append(df, ignore_index=True)
#print(output)

In [43]:
output.describe()

Unnamed: 0,PassengerId,Survived
count,418,418
unique,418,2
top,1023,0
freq,1,284


In [44]:
output.to_csv('4th_submission_by_grid_search_RandomForrest.csv', index=False)