In [2]:
# Import Fastai and Pandas
from fastai.tabular.all import *
import pandas as pd

In [9]:
# Read training data from csv file into Dataframe
df = pd.read_csv("../data/raw/train.csv")
print(len(df))
print(df.columns)
df

891
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [18]:
categorical_features = ["Pclass", "Name", "Sex", "Ticket", "Cabin", "Embarked"]
continuous_features = ["Age", "SibSp", "Parch", "Fare"]
tabular_processors = [Categorify, FillMissing, Normalize]
y_variable = "Survived"
prediction_columns = {"PassengerId", "Survived"}

In [11]:
# Write exploratory code to get insights in the data
# Get histogram for all columns

def display_histogram(df: pd.DataFrame, column_name: str):
    df.hist(column_name)

In [58]:
# Create transformation methods to be applied to dataframe
splits = RandomSplitter(valid_pct=0.2, seed=None)(range_of(df))
processed_df = df

In [76]:
# Create TabularDataLoader for the loaded DataFrame
tabular_dl = TabularPandas(processed_df, 
                           cat_names=categorical_features, 
                           cont_names=continuous_features,
                           y_names=y_variable, 
                           splits=splits, procs=tabular_processors)

In [78]:
# Create baseline predictions based on sex

def is_female(sex: str):
    if sex == "female":
        return 1
    else:
        return 0

def get_prediction_by_sex(test_df):
    test_df["Survived"] = test_df["Sex"].apply(is_female)
    pred_df = test_df[["PassengerId", "Survived"]]
    return pred_df

valid_pred_df = get_prediction_by_sex(tabular_dl.valid)

In [79]:
# Write diagnostic code to check model performance
def get_accuracy_from_predictions(valid_df, pred_df):
    return (valid_df["Survived"] == pred_df["Survived"]).mean()

get_accuracy_from_predictions(tabular_dl.valid, valid_pred_df)

0.601123595505618

In [81]:
test_df = pd.read_csv("../data/raw/test.csv")
test_pred_df = get_prediction_by_sex(test_df)

test_pred_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [84]:
#  Export baseline predictions to csv
test_pred_df.to_csv("../data/predictions/gender_baseline.csv", index=False)