In [1]:
import pandas as pd

import sys
import os
from pathlib import Path 
# os.path.abspath('') resolves to the current working directory, so dir_path
# ends up being the *parent* of the directory the notebook is run from.
dir_path = os.path.dirname(os.path.abspath(''))
# Make that parent directory importable. Presumably this is where the local
# dc_get_assets module lives -- TODO confirm against the repository layout.
sys.path.append(dir_path)
import dc_get_assets
import deepchecks.tabular as dct


In [2]:
label_column_name = "Survived"
features = ["Pclass", "Sex_male", "SibSp", "Parch"]


def make_titanic_simple_features_df(df):
    """Build a reduced Titanic frame: four simple features plus the label.

    Selects the raw columns ``Pclass``, ``Sex``, ``SibSp`` and ``Parch`` from
    ``df`` and passes them through ``pd.get_dummies(..., drop_first=True)``,
    then attaches the ``label_column_name`` column taken from ``df``.

    For data where ``Sex`` takes the values ``"female"``/``"male"`` the
    encoding yields a single ``Sex_male`` column, which is the name listed in
    the module-level ``features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the four raw feature columns and the
        ``label_column_name`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the encoded features
        followed by the label column.
    """
    raw_feature_columns = ["Pclass", "Sex", "SibSp", "Parch"]
    # get_dummies returns a fresh frame, so attaching the label below does not
    # touch the caller's data.
    encoded = pd.get_dummies(df[raw_feature_columns], drop_first=True)
    encoded[label_column_name] = df[label_column_name]
    return encoded

In [4]:
from sklearn.model_selection import train_test_split

# Kaggle's train.csv is the only file that ships with labels, so it is the
# sole input here and is split into our own train/test partitions.
# Download: https://www.kaggle.com/competitions/titanic/data
titanic = pd.read_csv("kaggle_titanic_data/train.csv")
train_data, test_data = train_test_split(
    titanic,
    test_size=0.3,      # 30% of the rows are held out for testing
    random_state=17,    # fixed seed so the split is reproducible
)

# Reduce each partition to the simple feature set plus the label.
df_train, df_test = (
    make_titanic_simple_features_df(partition)
    for partition in (train_data, test_data)
)

In [5]:
from deepchecks.tabular import Dataset

# Wrap both partitions as deepchecks Datasets with identical settings: every
# name in `features` is declared categorical and `label_column_name` is the label.
train_ds, test_ds = (
    Dataset(frame, cat_features=features, label=label_column_name)
    for frame in (df_train, df_test)
)

In [6]:
# Build and train a simple baseline model.
# Preprocessing and model are similar to:
# https://www.kaggle.com/code/alexisbcook/titanic-tutorial
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and training are chained;
# the assignment also keeps the cell from echoing the estimator repr.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
).fit(train_ds.features_columns, train_ds.label_col)

In [7]:
from deepchecks.tabular.suites import model_evaluation

# Run the model-evaluation suite on the untouched train/test datasets.
eval_results = model_evaluation().run(
    train_dataset=train_ds,
    test_dataset=test_ds,
    model=rf,
)
# Write the suite's report to disk so it can be inspected in a browser.
eval_results.save_as_html('correct.html');

In [8]:
# Simulate jumbling up labels/samples by mistake "because of numpy", then
# re-run the evaluation suite on the corrupted test set.

import numpy as np

# Work on a copy: `test_ds.data` appears to hand back the Dataset's own frame
# (verify against the deepchecks version in use), so assigning into it directly
# would also overwrite the labels of the "correct" test_ds and make this cell
# non-idempotent.
optimized_df_test = test_ds.data.copy()

# sample(frac=1) returns the labels in shuffled order but keeps their index;
# wrapping the result in np.array drops that index, so the assignment is
# positional and the labels really do end up on the wrong rows.
# random_state pins the shuffle so numpy.html is reproducible across runs.
optimized_df_test[test_ds.label_name] = np.array(
    optimized_df_test[test_ds.label_name].sample(frac=1, random_state=17)
)

# New Dataset with the same settings as test_ds, built over the jumbled frame.
optimized_test_ds = test_ds.copy(optimized_df_test)
model_eval_results = model_evaluation().run(train_ds, optimized_test_ds, rf)
# Write the suite's report to disk so it can be compared with correct.html.
model_eval_results.save_as_html('numpy.html');

In [9]:
## uncomment this cell to run from the assets uploaded to the s3 bucket

# train_ds = dc_get_assets.get_train_ds()
# test_ds = dc_get_assets.get_test_ds()

# ###### The following is relevant only if we have the original_titanic train and test datasets #######

# # def get_only_simple_features_df(df):
# #     # make df only with simple features:
# #     org_features = ["Pclass", "Sex", "SibSp", "Parch"]
# #     # get only desired features
# #     df = pd.get_dummies(df[org_features], drop_first = True)
# #     # add label
# #     return df


# # def make_new_ds(df, org_ds):
# #     return dct.Dataset(df, cat_features=df.columns, features=df.columns, label=org_ds.label_col)

# # train_data = get_only_simple_features_df(train_ds.data)
# # test_data = get_only_simple_features_df(test_ds.data)
# # new_train_ds = make_new_ds(train_data, train_ds)
# # new_test_ds = make_new_ds(test_data, test_ds)

# ####### otherwise just do the following: #####
# new_train_ds = train_ds
# new_test_ds = test_ds

# ###########################################

# rf.fit(new_train_ds.features_columns, new_train_ds.label_col)

# from deepchecks.tabular.suites import model_evaluation
# model_eval_results = model_evaluation().run(new_train_ds, new_test_ds, rf)


# # jumble labels/samples by mistake because of numpy
# import numpy as np
# optimized_test_df = new_test_ds.data
# optimized_test_df[new_test_ds.label_name] = np.array(new_test_ds.data[new_test_ds.label_name].sample(frac=1))
# optimized_test_ds = new_test_ds.copy(optimized_test_df) 
# model_eval_results = model_evaluation().run(new_train_ds, optimized_test_ds, rf)
# model_eval_results.save_as_html()

# # save all
# import joblib
# joblib.dump(rf, filename='titanic_rf.model')
# new_train_ds.data.to_csv('titanic_train.csv', index=False)
# new_test_ds.data.to_csv('titanic_test.csv', index=False)