In [128]:
import os
import tarfile
# import urllib
import pandas as pd
from zipfile import ZipFile as zp

ROOT = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(""))), "machine-learning")
RAW = os.path.join(os.path.join(ROOT, "datasets"), "titanic")
# DATASET = os.path.join(RAW, "titanic.csv")

print(RAW)
def unpackTar(path: str = RAW):
    target_p = os.path.join(path, "titanic.zip")
    print(target_p)
    if not os.path.exists(target_p):
        print("raw data does not exist")
        return
    titanic_path = tarfile.open(target_p)
    try:
        titanic_path.extractall(path=RAW, filter="fully_trusted")
    except Exception as e:
        print("unable to extract data")
        raise e
    titanic_path.close()

def unpackZip(path: str = RAW):
    target_p = os.path.join(path, "titanic.zip")
    with zp(target_p) as zf:
        zf.extractall(path=path)

unpackZip(RAW)
train = None
test = None
gender = None
dataframes = {"train": None, "test": None, "gender": None}

for f_name in dataframes.keys():
    file: str = None
    if f_name == "gender":
        file = "gender_submission"
    else:
        file = f_name
    p = os.path.join(RAW, file+".csv")
    try:
        if os.path.exists(p):
            dataframes[f_name] = pd.read_csv(p)
    except Exception as e:
        print(f"error in reading: {f_name}")

x:\E\Documents\Code\Repo\machine-learning\datasets\titanic


In [129]:
df_tr = dataframes["train"]
df_tst = dataframes["test"]

In [130]:
df_tr.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [131]:
def preprocess(df: pd.DataFrame):
    df = df.copy()

    for f in ["Name", "Ticket", "Cabin", "Embarked"]:
        df.drop(f, axis=1, inplace=True)

    # def normalize_name(x: str):
    #     return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    # def ticket_number(x: str):
    #     return x.split(" ")[-1]

    # def ticket_item(x: str):
    #     items = x.split(" ")
    #     if len(items) <= 1:
    #         return "None"
    #     return "_".join(items[0:-1])
    
    # df["Name"] = df["Name"].apply(normalize_name)
    # df["Ticket_number"] =  df["Ticket"].apply(ticket_number)
    # df["Ticket_item"] = df["Ticket"].apply(ticket_item)

    return df

prepared_df_tr = preprocess(df_tr)
prepared_df_tst = preprocess(df_tst)
target_tr = prepared_df_tr.pop("Survived")

In [132]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

cat_col = []
for col in prepared_df_tr.columns:
    if prepared_df_tr[col].dtype == "object":
        cat_col.append(col)

process_cat = ColumnTransformer(
    [
        ("Encoder", OneHotEncoder(), cat_col)
    ]
)

prepared_df_tr = process_cat.fit_transform(prepared_df_tr)
prepared_df_tst = process_cat.fit_transform(prepared_df_tst)

In [133]:
prepared_df_tst.shape

(418, 2)

In [134]:
prepared_df_tr.shape

(891, 2)

In [135]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

grid_params = {
    "n_estimators": [90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
    "criterion": ["gini", "entropy", "log_loss"]
}

random_forest_cls = RandomForestClassifier()
grid_random_forest = GridSearchCV(estimator=random_forest_cls, param_grid=grid_params, cv=3)

In [136]:
grid_random_forest.fit(prepared_df_tr, target_tr)

In [137]:
grid_random_forest.best_estimator_

In [138]:
grid_random_forest.best_params_

{'criterion': 'gini', 'n_estimators': 90}

In [139]:
grid_random_forest.predict(prepared_df_tst)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,