In [14]:
'''
main module for app
'''


import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

RANDOM_STATE = 42
MIN_SAMPLE_SPLIT=4
MIN_SAMPLES_LEAF=5
N_ESTIMATORS=100
N_SPLITS = 5
USELESS_FEATURES = ["PassengerId", "Name", "Ticket", "Cabin",
                    "Parch", "SibSp", 'Embarked']
PATH = '../data/train.csv'
df_raw = pd.read_csv(PATH)


def impute_age(df, value):
    '''
    Replaces Nulls in column "Age" of a dataframe with the passed value

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
        value (float): Value used for imputation
    Returns:
        pandas.DataFrame
    '''

    df['Age'] = df["Age"].fillna(value)
    return df


def convert_sex(df):
    '''
    Replacing sex in column "Sex" of a dataframe to 1 if it's male and 0 if it's female

    Parameters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''

    df['is_male'] = 0
    df.loc[df['Sex'] == 'male', 'is_male'] = 1
    df = df.drop(columns=['Sex'])
    return df

def count_relatives_on_board(df):
    '''
    Counting Relatives on board based of sibsp and parch columns

    Paramters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
    Retruns:
        pandas.DataFrame
    '''

    df["RelativesOnboard"] = df["SibSp"] + df["Parch"]
    return df

def set_title(df):
    '''
    Changing name titles to cryptonims

    Paramters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
    Retruns:
        pandas.DataFrame
    '''

    mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
      'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}

    df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.', expand=True)
    df.replace({'Title': mapping}, inplace=True)
    return df

def title_encode(df):
    label_encoder = LabelEncoder()
    df['Title'] = label_encoder.fit_transform(df['Title'])
    return df

def transform_data(df, mean_age_value):
    '''
    Applying data cleaning functions to data sets

    Paramters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
        mean_age (float): Mean age of training data set
    Retruns:
        pandas.DataFrame
    '''
    df = set_title(df)
    df = count_relatives_on_board(df)
    df = impute_age(df, mean_age_value)
    df = convert_sex(df)
    df = title_encode(df)
    df.drop(USELESS_FEATURES, inplace=True,axis=1)
    return df


'''
Main Function
'''

LABEL = 'Survived'
mean_age = df_raw['Age'].mean()
df = transform_data(df_raw, mean_age)
X = df
X = X.drop('Survived', axis=1)
y = df['Survived']

k_fold = KFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE
)

scores = []

for train_index, test_index in k_fold.split(X):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    clf = RandomForestClassifier(n_estimators=N_ESTIMATORS, bootstrap=True, criterion='entropy',
                                min_samples_leaf=MIN_SAMPLES_LEAF,
                                min_samples_split=MIN_SAMPLE_SPLIT, random_state=RANDOM_STATE)

    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)

    acc_score = round(accuracy_score(y_test, y_predict),3)

    print(acc_score)

    scores.append(acc_score)

print()
print("Average:", round(100*np.mean(scores), 1), "%")
print("Std:", round(100*np.std(scores), 1), "%")



0.793
0.831
0.854
0.803
0.837

Average: 82.4 %
Std: 2.2 %


In [18]:
X_test = pd.read_csv('../data/test.csv')
test_ids = X_test["PassengerId"]
mean_age = X_test['Age'].mean()
mean_age

30.272590361445783

In [23]:
X_test = transform_data(X_test, mean_age)

KeyError: 'Name'

In [31]:
y_test = clf.predict(X_test)

In [32]:
output = pd.DataFrame({'PassengerId': test_ids, 'Survived': y_test})
output.to_csv("submission.csv", index=False)
print("Done")

Done


In [25]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Pclass            418 non-null    int64  
 1   Age               418 non-null    float64
 2   Fare              417 non-null    float64
 3   Title             418 non-null    int64  
 4   RelativesOnboard  418 non-null    int64  
 5   is_male           418 non-null    int64  
dtypes: float64(2), int64(4)
memory usage: 19.7 KB


In [29]:
X_test.describe()

Unnamed: 0,Pclass,Age,Fare,Title,RelativesOnboard,is_male
count,418.0,418.0,417.0,418.0,418.0,418.0
mean,2.26555,30.27259,35.627188,2.88756,0.839713,0.636364
std,0.841838,12.634534,55.907576,0.770741,1.519072,0.481622
min,1.0,0.17,0.0,0.0,0.0,0.0
25%,1.0,23.0,7.8958,3.0,0.0,0.0
50%,3.0,30.27259,14.4542,3.0,0.0,1.0
75%,3.0,35.75,31.5,3.0,1.0,1.0
max,3.0,76.0,512.3292,5.0,10.0,1.0


In [30]:
X_test['Fare'] = X_test['Fare'].fillna(35.627188)