In [209]:
import os
import tarfile
# import urllib
import pandas as pd
from zipfile import ZipFile as zp

# Repository root: the "machine-learning" folder that sits beside the
# grandparent of the current working directory.
ROOT = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(""))),
    "machine-learning",
)
# Folder expected to hold the Titanic archive and its extracted CSV files.
RAW = os.path.join(ROOT, "datasets", "titanic")
# DATASET = os.path.join(RAW, "titanic.csv")

print(RAW)
def unpackTar(path: str = RAW):
    """Extract the archive ``titanic.zip`` found in ``path`` using ``tarfile``.

    The resolved archive path is printed first. If the file does not exist a
    notice is printed and the function returns without extracting anything.
    Archive members are extracted into ``path``.

    Args:
        path: Directory that contains the archive and receives its contents.

    Raises:
        Exception: Any error raised while opening or extracting the archive is
            re-raised after "unable to extract data" has been printed.
    """
    # NOTE(review): the target carries a .zip name; tarfile.open raises
    # ReadError unless the file really is a tar archive -- confirm the
    # intended file name.
    target_p = os.path.join(path, "titanic.zip")
    print(target_p)
    if not os.path.exists(target_p):
        print("raw data does not exist")
        return
    try:
        # The context manager closes the archive even when extraction fails.
        with tarfile.open(target_p) as archive:
            # NOTE(review): "fully_trusted" disables tarfile's extraction
            # safety checks; prefer filter="data" unless the archive source
            # is known to be safe.
            archive.extractall(path=path, filter="fully_trusted")
    except Exception:
        print("unable to extract data")
        raise

def unpackZip(path: str = RAW):
    """Unzip ``titanic.zip`` located in ``path`` into that same directory.

    Args:
        path: Directory that contains the archive and receives its contents.
    """
    archive_location = os.path.join(path, "titanic.zip")
    with zp(archive_location) as archive:
        archive.extractall(path=path)

# Extract the CSV files from the downloaded archive into RAW.
unpackZip(RAW)
train = None
test = None
gender = None

# Logical dataset name -> CSV file stem inside RAW.
csv_stems = {"train": "train", "test": "test", "gender": "gender_submission"}
# Loaded frames keyed by logical name; an entry stays None when its file is
# missing or cannot be parsed.
dataframes = {"train": None, "test": None, "gender": None}

for f_name, stem in csv_stems.items():
    p = os.path.join(RAW, stem + ".csv")
    if not os.path.exists(p):
        # Report the gap instead of silently leaving None behind.
        print(f"missing file: {p}")
        continue
    try:
        dataframes[f_name] = pd.read_csv(p)
    except Exception as e:
        # Best-effort load: keep going, but surface what went wrong.
        print(f"error in reading: {f_name} ({e})")

x:\E\Documents\Code\Repo\machine-learning\datasets\titanic


In [210]:
# Short aliases for the loaded training and test frames.
df_tr, df_tst = (dataframes[key] for key in ("train", "test"))

In [211]:
# Preview the first rows of the raw training frame.
df_tr.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [212]:
def preprocess(df: pd.DataFrame):
    """Return a new frame without the Name, Ticket, Cabin and Embarked columns.

    The input frame is left unmodified.

    Args:
        df: Frame containing at least the four columns listed above.

    Returns:
        A new DataFrame with those columns removed.
    """
    unused_columns = ["Name", "Ticket", "Cabin", "Embarked"]
    trimmed = df.drop(columns=unused_columns)

    # def normalize_name(x: str):
    #     return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    # def ticket_number(x: str):
    #     return x.split(" ")[-1]

    # def ticket_item(x: str):
    #     items = x.split(" ")
    #     if len(items) <= 1:
    #         return "None"
    #     return "_".join(items[0:-1])
    
    # df["Name"] = df["Name"].apply(normalize_name)
    # df["Ticket_number"] =  df["Ticket"].apply(ticket_number)
    # df["Ticket_item"] = df["Ticket"].apply(ticket_item)

    return trimmed

# Apply the same column pruning to both splits.
prepared_df_tr, prepared_df_tst = (preprocess(frame) for frame in (df_tr, df_tst))
# Separate the label column from the training features.
target_tr = prepared_df_tr.pop("Survived")

In [213]:
# Preview the training features after the column drop and label removal.
prepared_df_tr.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,3,male,22.0,1,0,7.25
1,2,1,female,38.0,1,0,71.2833
2,3,3,female,26.0,0,0,7.925
3,4,1,female,35.0,1,0,53.1
4,5,3,male,35.0,0,0,8.05


In [214]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Treat every object-dtype column of the training frame as categorical.
cat_col = [
    col for col in prepared_df_tr.columns
    if prepared_df_tr[col].dtype == "object"
]

# One-hot encode the categorical columns; every other column passes through.
process_cat = ColumnTransformer(
    [
        ("Encoder", OneHotEncoder(), cat_col)
    ],
    remainder="passthrough"
)

# Fit the encoder on the training data only, then reuse that fitted encoder
# for the test data. Re-fitting on the test set would learn its categories
# independently, so the two matrices could end up with different layouts.
prepared_df_tr = process_cat.fit_transform(prepared_df_tr)
prepared_df_tst = process_cat.transform(prepared_df_tst)

In [215]:
# Dimensions of the encoded test matrix.
prepared_df_tst.shape

(418, 8)

In [216]:
# Dimensions of the encoded training matrix.
prepared_df_tr.shape

(891, 8)

In [217]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Fixed seed so the forest, and therefore the search result, is reproducible.
RANDOM_STATE = 42

# Hyper-parameter grid explored by the search.
# NOTE(review): scikit-learn documents "entropy" and "log_loss" as the same
# split criterion (Shannon information gain), so the third value only repeats
# the second; consider dropping it to save a third of the fits.
grid_params = {
    "n_estimators": [90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
    "criterion": ["gini", "entropy", "log_loss"]
}

random_forest_cls = RandomForestClassifier(random_state=RANDOM_STATE)
# Exhaustive search over grid_params using 3-fold cross-validation.
grid_random_forest = GridSearchCV(estimator=random_forest_cls, param_grid=grid_params, cv=3)

In [218]:
# Run the grid search on the encoded training features and the Survived labels.
grid_random_forest.fit(prepared_df_tr, target_tr)

In [219]:
# Show the estimator selected by the grid search.
grid_random_forest.best_estimator_

In [220]:
# Show the parameter combination selected by the grid search.
grid_random_forest.best_params_

{'criterion': 'entropy', 'n_estimators': 92}

In [221]:
# Predict labels for the encoded test matrix with the fitted grid search.
predictions = grid_random_forest.predict(prepared_df_tst)

In [222]:
# Attach the predictions to a copy of the raw test frame so df_tst itself
# stays unchanged.
output = df_tst.copy()
output["Survived"] = predictions

In [223]:
# Display the test frame together with its predicted Survived column.
output

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0


In [224]:
# Rows of the test set predicted as survivors (Survived == 1).
predicted_alive = output["Survived"] == 1
survived = output.loc[predicted_alive]
survived

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,1
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,1
12,904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23.0,1,0,21228,82.2667,B45,S,1
14,906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance...",female,47.0,1,0,W.E.P. 5734,61.1750,E31,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...
408,1300,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q,1
409,1301,3,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.7750,,S,1
410,1302,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.7500,,Q,1
411,1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,19928,90.0000,C78,Q,1


In [225]:
# Rows of the test set predicted as non-survivors (Survived == 0).
predicted_dead = output["Survived"] == 0
not_survived = output.loc[predicted_dead]
not_survived

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.2250,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
412,1304,3,"Henriksson, Miss. Jenny Lovisa",female,28.0,0,0,347086,7.7750,,S,0
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0
