In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

PATH = "../data/train.csv"
features = ["Age", "Group", "NumInGroup"]

def splitting_id(df):
    '''
    Originally Id is in format gggg:pp where gggg is group and pp is person in group

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
    Returns:
        pandas.DataFrame
    '''

    df[['Group', 'NumInGroup']] = df['PassengerId'].str.split('_', 1, expand=True)
    return df

def encode_to_float(df):
    '''
    encode categorical data to float since group and num in group are objects

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
    Returns:
        pandas.DataFrame
    '''

    df_objects = (df[features].dtypes == 'object')
    object_cols = list(df_objects[df_objects].index)
    ordinal_encoder = OrdinalEncoder()
    df[object_cols] = ordinal_encoder.fit_transform(df[object_cols])
    return df

def impute_age(df, value):
    '''
    Replaces Nulls in column "Age" of a dataframe with the passed value

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
        value (float): Value used for imputation
    Returns:
        pandas.DataFrame
    '''

    df['Age'] = df['Age'].fillna(value)
    return df

def transform_data(df, mean_age_value):
    '''
    Applying data cleaning functions to data sets

    Paramters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
        mean_age (float): Mean age of training data set
    Retruns:
        pandas.DataFrame
    '''

    df = splitting_id(df)
    df = encode_to_float(df)
    df = impute_age(df, mean_age_value)
    return df

In [2]:
df = pd.read_csv(PATH)
mean_age = df['Age'].mean()

In [3]:
df = transform_data(df, mean_age)

In [4]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,NumInGroup
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0.0,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,1.0,0.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,2.0,0.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,2.0,1.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,3.0,0.0


In [5]:
X = df[features]

In [6]:
label = 'Transported'

In [7]:
y = df[label]

In [8]:
from sklearn.model_selection import KFold

In [9]:
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=46
)

In [10]:
scores = []

In [14]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import accuracy_score

In [20]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    
    mean_age = X_train['Age'].mean()

    clf = RandomForestClassifier(n_estimators=300, random_state=0, max_depth = 12)

    clf.fit(X_train, y_train)

    y_predict = clf.predict(X_test)

    acc_score = round(accuracy_score(y_test, y_predict),3)

    print(acc_score)

    scores.append(acc_score)

print()
print("Average:", round(100*np.mean(scores), 1), "%")
print("Std:", round(100*np.std(scores), 1), "%")

0.532
0.53
0.538
0.531
0.532

Average: 52.2 %
Std: 1.9 %
