In [1]:
import os
import numpy as np
import pandas as pd
import pandas_profiling
from datetime import datetime
# from sklearn.impute import KNNImputer
from tqdm.notebook import tqdm
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

**Ordinal Categoricals:**
 - _Severity_
 - 

**Nominal Categoricals:**
 - _Medical_Tent_
 - _Title_
 - 

In [2]:
#------------ CONFIG ------------#
eda     = False
OHE_all = True
gridsearch = False
nan_bfill = False
#--------------------------------#

In [3]:
input_folder = "./data"
output_folder = "./output"
profiles_folder = "./profiles"
seed = 2020

train = pd.read_csv(f"{input_folder}/train.csv")
test = pd.read_csv(f"{input_folder}/test.csv")

In [4]:
if eda:
    trainprofile = train.profile_report(title='Train Profiling Report', plot={'histogram': {'bins': 8}}, progress_bar=False)
    trainprofile.to_file(output_file=f"{profiles_folder}/train_profiling.html")
    testprofile = test.profile_report(title='Test Profiling Report', plot={'histogram': {'bins': 8}}, progress_bar=False)
    testprofile.to_file(output_file=f"{profiles_folder}/test_profiling.html")

In [5]:
# new features

train["Title"] = pd.DataFrame(train["Name"].str.split().tolist(),
                              index=train.index,
                              columns=["Title", "First_name", "Last_name"]
                             )["Title"]
train.drop(["Name"], axis=1, inplace=True)

test["Title"] = pd.DataFrame(test["Name"].str.split().tolist(),
                             index=test.index,
                             columns=["Title", "First_name", "Last_name"]
                            )["Title"]
test.drop(["Name"], axis=1, inplace=True)

In [6]:
target = "Deceased"
id_col = "Patient_ID"
train_ids = train[id_col]
test_ids = test[id_col]

In [7]:
# defining categoricals

ordinal_cat_cols = ["Severity"]
nominal_cat_cols = ["Title", "City", "Medical_Tent"]

if OHE_all:
    nominal_cat_cols.extend(ordinal_cat_cols)

TODO:

 - Relacionar Family_Case_ID do treino com a do teste
 - 

In [8]:
# ordinal categorical


In [9]:
# nominal categorical

train["In_Tent"] = train["Medical_Tent"].isna()
test["In_Tent"] = test["Medical_Tent"].isna()

train["is_train"] = 1
test["is_train"] = 0

dfcombined = pd.concat([train, test], axis=0)

for col in nominal_cat_cols:
    dfcombined = pd.concat([
        dfcombined.drop(col, axis=1),
        pd.get_dummies(dfcombined[col], prefix=col)
    ], axis=1)

train = dfcombined[dfcombined["is_train"]==1].drop("is_train", axis=1)
test = dfcombined[dfcombined["is_train"]==0].drop(["Deceased", "is_train"], axis=1)

In [10]:
X, y = train.drop([target], axis=1), train[target]

In [11]:
X.drop(["Patient_ID"], axis=1, inplace=True)

In [12]:
# imputer = KNNImputer()
# train = pd.DataFrame(imputer.fit_transform(train.drop([target, "Patient_ID", "Family_Case_ID"], axis=1)),
#                      columns=list(set(Xcols)-set([target, "Patient_ID", "Family_Case_ID"])))
# test = pd.DataFrame(imputer.transform(test.drop(["Patient_ID", "Family_Case_ID"], axis=1)),
#                     columns=list(set(Xcols)-set(["Patient_ID", "Family_Case_ID"])))

In [13]:
# nan imputing

if nan_bfill == True:
    X = X.fillna(method="bfill")
else:
    X = X.fillna(X.median())

#### Model Train

In [14]:
%%time

if gridsearch:
    rfc = RandomForestClassifier(n_estimators = 200,
                                 class_weight = "balanced",
                                 n_jobs = -1,
                                 random_state=seed)
    parameters = {
        'max_depth': range(2, 8),
        'min_samples_leaf': range(1, 3),
        'min_samples_split': range(2, 3)
    }
    sss = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.25, random_state=seed)
    grid_obj = GridSearchCV(rfc,
                            parameters,
                            scoring=make_scorer(accuracy_score),
                            cv=sss,
                            verbose=0,
                            n_jobs=-1,
                            return_train_score=True)
    grid_obj.fit(X, y)
    rfc = grid_obj.best_estimator_
else:
    rfc = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                                 criterion='gini', max_depth=7, max_features='auto',
                                 max_leaf_nodes=None, max_samples=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0, n_estimators=5000,
                                 n_jobs=-1, oob_score=False, random_state=seed, verbose=0,
                                 warm_start=False)
    rfc.fit(X, y)

Wall time: 7.58 s


In [15]:
%%time

import xgboost as xgb

# param = {
#     'eta': 0.3, 
#     'max_depth': 7,  
#     'objective': 'multi:softprob',  
#     'num_class': 3} 

# steps = 1000

xgb_train = xgb.DMatrix(X.values, label=y.values)
xgb_test = xgb.DMatrix(test.fillna(test.median()).drop(id_col, axis=1).values)
    
xgb_model = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=seed, silent=False, subsample=1)


xgb_model.fit(X.values, y.values)

# xgb_preds = xgb_model.predict(test.fillna(test.median()).drop(id_col, axis=1).values)


# xgb_model.eval()#y.values)

# train = pd.read_csv("train.csv")
# target = train['target']
# train = train.drop(['ID','target'],axis=1)
# test = pd.read_csv("test.csv")
# test = test.drop(['ID'],axis=1)

# xgtrain = xgb.DMatrix(train.values, target.values)
# xgtest = xgb.DMatrix(test.values)

# xgb_clf = xgb.XGBClassifier(
#     learning_rate
# )

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Wall time: 863 ms


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, monotone_constraints='()',
              n_estimators=1000, n_jobs=-1, nthread=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=2020, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=2020, silent=False,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

#### Model Evaluation

In [16]:
def run_kfold(clf, random_state=2020, nfolds=100):      
    sss = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.25, random_state=random_state)
    mean_outcome = 0

    for train_indices, test_indices in tqdm(sss.split(X, y), total=sss.get_n_splits(), desc="fold"):
        X_train, X_test = X.values[train_indices], X.values[test_indices]
        y_train, y_test = y.values[train_indices], y.values[test_indices]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, predictions)
        mean_outcome += (acc/sss.get_n_splits())
    return mean_outcome

In [17]:
mean_accuracy = run_kfold(rfc, random_state=seed, nfolds=200)
print(mean_accuracy)

HBox(children=(FloatProgress(value=0.0, description='fold', max=200.0, style=ProgressStyle(description_width='…


0.8145333333333326


In [18]:
mean_accuracy_xgb = run_kfold(xgb_model, random_state=seed, nfolds=200)
print(mean_accuracy_xgb)

HBox(children=(FloatProgress(value=0.0, description='fold', max=200.0, style=ProgressStyle(description_width='…

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



0.811333333333333


In [19]:
halt

NameError: name 'halt' is not defined

In [None]:
rfc.fit(X, y)

In [None]:
feat_importances = pd.DataFrame(
    zip(X.columns, rfc.feature_importances_),
    columns=["column", "feature_importance"]
).set_index("column").sort_values(by="feature_importance", ascending=False)

feat_importances

#### Model Predict

In [None]:
# nan imputing
if nan_bfill == True:
    preds = rfc.predict(test.drop(id_col, axis=1).fillna(method="bfill")).astype(np.int8)
else:
    preds = rfc.predict(test.fillna(test.median()).drop(id_col, axis=1)).astype(np.int8)

In [None]:
def find_version():
    """
    Finds automatically the version of the submission. :)
    """
    version = 1
    if len(os.listdir(output_folder))==0:
        return version
    else:
        for file in os.listdir(output_folder):
            if file.split("-")[0].startswith("version"):
                if int(file.split("-")[0].split("n")[1].split("-")[0]) > version:
                    version = int(file.split("-")[0].split("n")[1].split("-")[0])
        return version + 1

In [None]:
predictions = pd.DataFrame({id_col: test_ids, target: preds})
version = find_version()
now = datetime.today().strftime("%Y%m%d_%H%M%S")
filename = f"version{version}-{now}-{round(mean_accuracy, 4)}.csv"
predictions.to_csv(f'{output_folder}/{filename}', index = False)
print(f"Written predictions to 'file {output_folder}/{filename}'...")
predictions.head()

In [None]:
predictions["Deceased"].value_counts()

In [None]:
sum = 0
for row, index in enumerate(test.index):
    if test.loc[index,"Family_Case_ID"] in train["Family_Case_ID"].unique():
        sum +=1
        
sum

In [None]:
print(datetime.now())