In [1]:
'''
main module for app
'''


import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

RANDOM_STATE = 42
MIN_SAMPLE_SPLIT=4
MIN_SAMPLES_LEAF=5
N_ESTIMATORS=100
N_SPLITS = 5
PATH = '../data/train.csv'
df_raw = pd.read_csv(PATH)


def impute_age(df, value):
    '''
    Replaces Nulls in column "Age" of a dataframe with the passed value

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
        value (float): Value used for imputation
    Returns:
        pandas.DataFrame
    '''

    df['Age'] = df["Age"].fillna(value)
    return df


def convert_sex(df):
    '''
    Replacing sex in column "Sex" of a dataframe to 1 if it's male and 0 if it's female

    Parameters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''

    df['is_male'] = 0
    df.loc[df['Sex'] == 'male', 'is_male'] = 1
    df = df.drop(columns=['Sex'])
    return df

def count_relatives_on_board(df):
    df["RelativesOnboard"] = df["SibSp"] + df["Parch"]
    df["RelativesOnboard"] = df["SibSp"] + df["Parch"]
    return df

def set_title(df):
    mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
      'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
    
    df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.', expand=True)
    df.replace({'Title': mapping}, inplace=True)
    return df

def transform_data(df, mean_age_value):
    '''
    Applying data cleaning functions to data sets

    Paramters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
        mean_age (float): Mean age of training data set
    Retruns:
        pandas.DataFrame
    '''
    df = set_title(df)
    df = count_relatives_on_board(df)
    df = impute_age(df, mean_age_value)
    df = convert_sex(df)
    return df


'''
Main Function
'''
features = ['Age', 'Sex', 'Pclass', 'Title', 'RelativesOnboard', 'Name']
LABEL = 'Survived'

X = df_raw[:-1]
y = df_raw[LABEL]

k_fold = KFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE
)

scores = []

for train_index, test_index in k_fold.split(X):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    mean_age = X_train['Age'].mean()

    X_train = transform_data(X_train, mean_age)

    X_test = transform_data(X_test, mean_age)

    X_train.drop(["PassengerId", "Name", "Ticket", "Cabin", "Parch", "SibSp", "Survived", 'Embarked', 'Fare'], inplace=True,axis=1)
    X_test.drop(["PassengerId", "Name", "Ticket", "Cabin", "Parch", "SibSp", "Survived", 'Embarked', 'Fare'], inplace=True,axis=1)
    #print(X_train.columns)
    label_encoder = LabelEncoder()
    X_train['Title'] = label_encoder.fit_transform(X_train['Title'])
    X_test['Title'] = label_encoder.fit_transform(X_test['Title'])

    clf = RandomForestClassifier(n_estimators=N_ESTIMATORS, bootstrap=True, criterion='entropy',
                                min_samples_leaf=MIN_SAMPLES_LEAF,
                                min_samples_split=MIN_SAMPLE_SPLIT, random_state=RANDOM_STATE)

    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)

    acc_score = round(accuracy_score(y_test, y_predict),3)

    print(acc_score)

    scores.append(acc_score)

print()
print("Average:", round(100*np.mean(scores), 1), "%")
print("Std:", round(100*np.std(scores), 1), "%")


0.82
0.792
0.854
0.792
0.831

Average: 81.8 %
Std: 2.4 %


In [2]:
X_test = pd.read_csv('../data/test.csv')
test_ids = X_test["PassengerId"]
mean_age = X_test['Age'].mean()
mean_age

30.272590361445783

In [3]:
X_test = transform_data(X_test, mean_age)
X_test

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,RelativesOnboard,is_male
0,892,3,"Kelly, Mr. James",34.50000,0,0,330911,7.8292,,Q,Mr,0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.00000,1,0,363272,7.0000,,S,Mrs,1,0
2,894,2,"Myles, Mr. Thomas Francis",62.00000,0,0,240276,9.6875,,Q,Mr,0,1
3,895,3,"Wirz, Mr. Albert",27.00000,0,0,315154,8.6625,,S,Mr,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.00000,1,1,3101298,12.2875,,S,Mrs,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",30.27259,0,0,A.5. 3236,8.0500,,S,Mr,0,1
414,1306,1,"Oliva y Ocana, Dona. Fermina",39.00000,0,0,PC 17758,108.9000,C105,C,Mrs,0,0
415,1307,3,"Saether, Mr. Simon Sivertsen",38.50000,0,0,SOTON/O.Q. 3101262,7.2500,,S,Mr,0,1
416,1308,3,"Ware, Mr. Frederick",30.27259,0,0,359309,8.0500,,S,Mr,0,1


In [5]:
X_test.drop(["PassengerId", "Name", "Ticket", "Cabin", "Parch", "SibSp", 'Embarked', 'Fare'], inplace=True,axis=1)
X_test

Unnamed: 0,Pclass,Age,Title,RelativesOnboard,is_male
0,3,34.50000,Mr,0,1
1,3,47.00000,Mrs,1,0
2,2,62.00000,Mr,0,1
3,3,27.00000,Mr,0,1
4,3,22.00000,Mrs,2,0
...,...,...,...,...,...
413,3,30.27259,Mr,0,1
414,1,39.00000,Mrs,0,0
415,3,38.50000,Mr,0,1
416,3,30.27259,Mr,0,1


In [6]:
label_encoder = LabelEncoder()
X_test['Title'] = label_encoder.fit_transform(X_test['Title'])

In [7]:
y_test = clf.predict(X_test)

In [8]:
output = pd.DataFrame({'PassengerId': test_ids, 'Survived': y_test})
output.to_csv("submission.csv", index=False)
print("Done")

Done
