In [1]:
import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

In [2]:
path = "../data/train.csv"
df = pd.read_csv(path)

features = ['Age', 'Sex', 'Pclass']
label = 'Survived'

X = df[features]
y = df[label]

In [3]:
X

Unnamed: 0,Age,Sex,Pclass
0,22.0,male,3
1,38.0,female,1
2,26.0,female,3
3,35.0,female,1
4,35.0,male,3
...,...,...,...
886,27.0,male,2
887,19.0,female,1
888,,female,3
889,26.0,male,1


# Simple train-test split

In [4]:
from sklearn.model_selection import train_test_split

# SPLIT DATA INTO TRAIN AND TEST SUBSETS
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

mean_age = X_train['Age'].mean()

def impute_age(df, value):
    df['Age'] = df["Age"].fillna(value)
    return df

def convert_sex(df):
    df['is_male'] = 0
    df.loc[df['Sex'] == 'male', 'is_male'] = 1
    df = df.drop(columns=['Sex'])
    return df

X_train = impute_age(X_train, mean_age)
X_train = convert_sex(X_train)

X_test = impute_age(X_test, mean_age)
X_test = convert_sex(X_test)

clf = RandomForestClassifier(n_estimators=100, bootstrap=True, criterion='entropy',
                               min_samples_leaf=5, min_samples_split=4, random_state=42)

clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)

print(round(accuracy_score(y_test, y_predict), 3))

0.757


# KFold

In [5]:
from sklearn.model_selection import KFold


kf = KFold(
    n_splits=5, 
    shuffle=True, 
    random_state=46
)

scores = []
# SPLIT DATA INTO TRAIN AND TEST SUBSETS IN A LOOP
i = 0
for train_index, test_index in kf.split(X):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    
    
    mean_age = X_train['Age'].mean()
    
    X_train = impute_age(X_train, mean_age)
    X_train = convert_sex(X_train)

    X_test = impute_age(X_test, mean_age)
    X_test = convert_sex(X_test)
    
    clf = RandomForestClassifier(n_estimators=60, bootstrap=True, criterion='entropy',
                               min_samples_leaf=2, min_samples_split=5, random_state=42)
    
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    
    acc_score = round(accuracy_score(y_test, y_predict),3)
    
    print("Fold", i, ":", acc_score)
    i = i+1
    
    scores.append(acc_score)
    
print()
print("Average:", round(100*np.mean(scores), 1), "%")
print("Std:", round(100*np.std(scores), 1), "%")

Fold 0 : 0.838
Fold 1 : 0.803
Fold 2 : 0.798
Fold 3 : 0.803
Fold 4 : 0.82

Average: 81.2 %
Std: 1.5 %


# Variant 1) Train final model on all of data

In [6]:
mean_age = X['Age'].mean()
    
X_train = impute_age(X, mean_age)
X_train = convert_sex(X)

y_train = y

clf = RandomForestClassifier(n_estimators=50, bootstrap=True, criterion='entropy',
                               min_samples_leaf=2, min_samples_split=5, random_state=42)

clf.fit(X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age'] = df["Age"].fillna(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_male'] = 0


RandomForestClassifier(criterion='entropy', min_samples_leaf=2,
                       min_samples_split=5, n_estimators=50, random_state=42)

In [7]:
# Let's use our model to predict survival of this person:
age = 70
pclass = 1
is_male = 0


completely_different_data = np.array([[age, pclass, is_male]])

y_predict = clf.predict(completely_different_data)

print(y_predict)

[1]




In [8]:
# Let's use our model to predict survival of this person:
age = 5
pclass = 3
is_male = 1


another_completely_different_data = np.array([[age, pclass, is_male]])

y_predict = clf.predict(another_completely_different_data)

print(y_predict)



[0]
