In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline




In [2]:
def split_train_test(data, test_ratio):
    np.random.seed(42)
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
    
    

In [3]:
def zscore(df, name):
    xmean = df[name].mean()
    xstd = np.std(df[name])
    result = (df[name]-xmean)/xstd
    df[name] = result


In [4]:
def update_cabin(df):
    cabin = df["Cabin"]
    tmp = []
    for i in range(len(cabin)):
        if type(cabin[i]) is str:
            tmp.append(1)
        else :
            tmp.append(0)
    df["Cabin"] = tmp

In [5]:
train_data = pd.read_csv("train.csv")
train_data.Age = train_data.Age.fillna(train_data.Age.mean())
train_data.Embarked = train_data.Embarked.fillna(2.0)
train_data = train_data.replace('male', 0).replace('female', 1).replace('C', 0).replace('Q', 1).replace('S', 2)

train_data.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
train_data.dropna()

train_data.Cabin = train_data.Cabin.fillna(0)
update_cabin(train_data)
zscore(train_data, "Age")
zscore(train_data, "Fare")

train_data


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,0,-0.592481,1,0,-0.502445,0,2.0
1,1,1,1,0.638789,1,0,0.786845,1,0.0
2,1,3,1,-0.284663,0,0,-0.488854,0,2.0
3,1,1,1,0.407926,1,0,0.420730,1,2.0
4,0,3,0,0.407926,0,0,-0.486337,0,2.0
5,0,3,0,0.000000,0,0,-0.478116,0,1.0
6,0,1,0,1.870059,0,0,0.395814,1,2.0
7,0,3,0,-2.131568,3,1,-0.224083,0,2.0
8,1,3,1,-0.207709,0,2,-0.424256,0,2.0
9,1,2,1,-1.208115,1,0,-0.042956,0,0.0


In [6]:
train_set, test_set = split_train_test(train_data, 0.2)


In [7]:
train_set.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
count,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0
mean,0.377279,2.330996,0.345021,-0.015138,0.553997,0.380084,0.007244,0.224404,1.565217
std,0.485046,0.824387,0.475709,1.001991,1.175697,0.791455,1.045724,0.417482,0.772797
min,0.0,1.0,0.0,-2.253155,0.0,0.0,-0.648422,0.0,0.0
25%,0.0,2.0,0.0,-0.592481,0.0,0.0,-0.488854,0.0,1.0
50%,0.0,3.0,0.0,0.0,0.0,0.0,-0.357391,0.0,2.0
75%,1.0,3.0,1.0,0.407926,1.0,0.0,-0.034314,0.0,2.0
max,1.0,3.0,1.0,3.870872,8.0,6.0,9.667167,1.0,2.0


In [8]:
test_set.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,0.410112,2.219101,0.382022,0.060636,0.398876,0.38764,-0.029018,0.247191,1.421348
std,0.493241,0.878028,0.487253,0.99531,0.731378,0.864402,0.796225,0.432596,0.855025
min,0.0,1.0,0.0,-2.221604,0.0,0.0,-0.648422,0.0,0.0
25%,0.0,1.0,0.0,-0.592481,0.0,0.0,-0.489442,0.0,1.0
50%,0.0,3.0,0.0,0.0,0.0,0.0,-0.356888,0.0,2.0
75%,1.0,3.0,1.0,0.48488,1.0,0.0,0.005054,0.0,2.0
max,1.0,3.0,1.0,3.178283,4.0,5.0,4.634417,1.0,2.0


In [9]:
test_set

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
709,1,3,0,0.000000,1,1,-0.341452,0,0.0
439,0,2,0,0.100109,0,0,-0.437007,0,2.0
840,0,3,0,-0.746389,0,0,-0.488854,0,2.0
720,1,2,1,-1.823750,0,1,0.016023,0,2.0
39,1,3,1,-1.208115,1,0,-0.422074,0,0.0
290,1,1,1,-0.284663,0,0,0.939198,0,2.0
300,1,3,1,0.000000,0,0,-0.492378,0,1.0
333,0,3,0,-1.054207,2,0,-0.285997,0,2.0
208,1,3,1,-1.054207,0,0,-0.492378,0,1.0
136,1,1,1,-0.823344,0,2,-0.119216,1,2.0


In [10]:
sample_x = train_set.drop(['Survived'], axis=1)
sample_y = train_set.drop(["Pclass", "Age", "SibSp", "Parch", "Fare", "Cabin", "Embarked", "Sex"], axis=1)

In [11]:
rf = RandomForestClassifier(min_samples_leaf=1, random_state=0)
rf.fit(sample_x, sample_y.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [12]:
test_sample_x = test_set.drop(['Survived'], axis=1)
test_sample_y = test_set.drop(["Pclass", "Age", "SibSp", "Parch", "Fare", "Embarked", "Sex", "Cabin"], axis=1)

rf_result = rf.predict(test_sample_x)

print("Result from Random Forrest")
print(np.sum(rf_result == test_sample_y['Survived']))
print(np.sum(rf_result == test_sample_y['Survived']) / len(rf_result))

Result from Random Forrest
146
0.8202247191011236


In [13]:
rf_mse = mean_squared_error(test_sample_y, rf_result)
rf_rmse = np.sqrt(rf_mse)
rf_rmse

0.423999152002544

In [14]:
scores = cross_val_score(rf, sample_x, sample_y.values.ravel(), scoring="neg_mean_squared_error", cv = 10)
rf_rmse_scores = np.sqrt(-scores)

In [15]:
def display_scores(scores):
    print("Scores", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [16]:
display_scores(rf_rmse_scores)

Scores [0.48591266 0.5        0.51370117 0.40824829 0.45963856 0.47471266
 0.48932261 0.44405304 0.44405304 0.29277002]
Mean: 0.45124120449224076
Standard deviation: 0.06051218164645683


In [17]:
# random forest
param_grid = [
        {
            'n_estimators': [3, 10, 15, 20, 25, 30], 
            'max_features': [2, 3, 4, 5, 6, 7, 8],
            'max_depth': [3, 5, 10, 15, 20],
    },
    {
        'bootstrap': [False], 
        'n_estimators': [3, 10, 15, 20, 25, 30], 
        'max_features': [2, 3, 4, 5, 6, 7, 8],
        'max_depth': [3, 5, 10, 15, 20],
    },
]

rf_grid_search = GridSearchCV(rf, param_grid, cv = 10,scoring = 'neg_mean_squared_error')

rf_grid_search.fit(sample_x, sample_y.values.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 15, 20, 25, 30], 'max_features': [2, 3, 4, 5, 6, 7, 8], 'max_depth': [3, 5, 10, 15, 20]}, {'bootstrap': [False], 'n_estimators': [3, 10, 15, 20, 25, 30], 'max_features': [2, 3, 4, 5, 6, 7, 8], 'max_depth': [3, 5, 10, 15, 20]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [18]:
rf_grid_search.best_params_

{'bootstrap': False, 'max_depth': 5, 'max_features': 3, 'n_estimators': 30}

In [19]:
rf_grid_result = rf_grid_search.predict(test_sample_x)

print("Result from grid seasrched random forest")
print(np.sum(rf_grid_result==test_sample_y['Survived']))
print(np.sum(rf_grid_result == test_sample_y['Survived']) / len(rf_grid_result))

Result from grid seasrched random forest
145
0.8146067415730337


In [20]:
test_data = pd.read_csv("test.csv")
test_data.Age = test_data.Age.fillna(test_data.Age.mean())
test_data = test_data.replace('male', 0).replace('female', 1).replace('C', 0).replace('Q', 1).replace('S', 2)
test_data.Embarked = test_data.Embarked.fillna(2.0)
test_data.Fare = test_data.Fare.fillna(0) # to be dead

test_data.Cabin = test_data.Cabin.fillna(0)
update_cabin(test_data)
zscore(test_data, "Age")
zscore(test_data, "Fare")

ids = test_data['PassengerId']
test_data.drop(['PassengerId',  'Name', 'Ticket'], axis=1, inplace=True)
test_data.head()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,0,0.334993,0,0,-0.496637,0,1
1,3,1,1.32553,1,0,-0.511497,0,2
2,2,0,2.514175,0,0,-0.463335,0,1
3,3,0,-0.25933,0,0,-0.481704,0,2
4,3,1,-0.655545,1,1,-0.41674,0,2


In [21]:
output = pd.DataFrame(columns=["PassengerId","Survived"])


for i in range(len(ids)) :
    row = test_data[i:i+1]
    predicted = rf_grid_search.predict(row)
    insert = pd.Series([ids[i], predicted[0]], index=output.columns)
    output = output.append(insert, ignore_index=True)

#    df = pd.DataFrame([[ids[i], predicted[0]]], columns=["PassengerId","Survived"])
#    output = output.append(df, ignore_index=True)
#print(output)

In [22]:
output.describe()

Unnamed: 0,PassengerId,Survived
count,418,418
unique,418,2
top,1023,0
freq,1,286


In [23]:
output.to_csv('5th_submission_by_grid_search_RandomForrest.csv', index=False)