# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
# from sklearn.neural_network import MLPClassifier, MLPRegressor
# from sklearn.svm import SVC
# from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

In [3]:
from imblearn.under_sampling import RandomUnderSampler

In [4]:
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline

In [5]:
from collections import Counter

In [6]:
import wandb

In [7]:
from sklearn import preprocessing

In [89]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import onnxruntime as rt

In [None]:
# 用來保存print的階段(長時間)
# %%capture stored_output

In [93]:
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.feature_selection import ExhaustiveFeatureSelector

# Function(Utility)

In [8]:
# check Duplicate file
def checkDuplicateFile(file_path):
    import os
    if os.path.isfile(file_path):
        print("Caution: File existed!")
        ans = input("Do you want to cover it?(Y/others)")
        if ans == "Y":
            return False
        else:
            print("Canceled....")
            return True
    else:
        return False

In [69]:
def gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test", scoring="f1",cv=5):
    # clf: model (classifier)
    # p_grid: search space
    # X_train: training data
    # y_train: training data(target)
    # X_test: testing data
    # y_test: testing data(target)
    
    #########
    ##wandb##
    #########
    wandb.init(project="DataMining_Project2", entity="oscarchencs10")
    wandb.run.name = run_name
    wandb.run.save()
    
    pipe = Pipeline(steps=[
        ('model', clf)]
    )

    #採用F1-Score最高為標準
    grid_search = GridSearchCV(
        estimator=pipe, param_grid=p_grid, cv=cv, n_jobs=12, scoring=scoring, refit=True)
    grid_search.fit(X_train, y_train)

    print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
    print(grid_search.best_params_)
    print('\nBest estimator')
    print(grid_search.best_estimator_)
    # print(grid_search.cv_results_)

    #############
    ### train ###
    #############
    print('\n\ntrain')
    y_pred = grid_search.predict(X_train)
    report = classification_report(y_train, y_pred, labels=[
                                   1, 0], output_dict=True)  # , target_names=['0', '1']
    acc = report.pop('accuracy')
    target_f1_score = report['1']['f1-score']
    target_recall = report['1']['recall']
    target_precision = report['1']['precision']
    y_pred_prob = grid_search.predict_proba(X_train)
    auc = roc_auc_score(y_train, y_pred_prob[:, 1])

    print(confusion_matrix(y_train, y_pred, labels=[1, 0]))
    display(pd.DataFrame(report).T)
    print(
        f'Accuracy: {acc:.3f}, AUC: {auc:.3f}, f1-score: {target_f1_score:.3f} \n\n')
    
    print(">>>> Wandb(Train)....")
    
    wandb.log({"Train-Acc": acc, "Train-AUC":auc, "Train-f1(pos)":target_f1_score, 
               "Train-pre":target_precision, "Train-rec":target_recall})
    
    #<wandb> train#
#     wandb.sklearn.plot_classifier(grid_search, X_train, X_test, y_train, y_test, y_pred, 
#                                   y_pred_prob, labels=["infected","non-infected"], 
#                                   model_name='RandomForest', 
#                                   feature_names=None) ###train
#     wandb.sklearn.plot_learning_curve(grid_search, X_train, y_train)

    print(">>>> Wandb(Train)(End)....")
    ############
    ### test ###
    ############
    print('test')
    y_pred = grid_search.predict(X_test)
    report = classification_report(y_test, y_pred, labels=[
                                   1, 0], output_dict=True)  # , target_names=['0', '1']
    acc = report.pop('accuracy')
    target_f1_score = report['1']['f1-score']
    target_recall = report['1']['recall']
    target_precision = report['1']['precision']
    y_pred_prob = grid_search.predict_proba(X_test)
    auc = roc_auc_score(y_test, y_pred_prob[:, 1])

    print(confusion_matrix(y_test, y_pred, labels=[1, 0]))
    display(pd.DataFrame(report).T)
    print(
        f'Accuracy: {acc:.3f}, AUC: {auc:.3f}, f1-score: {target_f1_score:.3f} \n\n')
    
    print(">>>> Wandb(Test)....")
    
#     #<wandb> test#
#     wandb.sklearn.plot_classifier(grid_search, X_train, X_test, y_train, y_test, y_pred, 
#                                   y_pred_prob, labels=["infected","non-infected"], 
#                                   model_name='RandomForest', feature_names=None) ###test
    wandb.sklearn.plot_confusion_matrix(y_test, y_pred, labels=["Non-infected","infected"])
    wandb.sklearn.plot_roc(y_test, y_pred_prob, labels=["Non-infected","infected"])
    wandb.sklearn.plot_precision_recall(y_test, y_pred_prob, labels=["Non-infected","infected"])
    wandb.log({"Test-Acc":acc, "Test-AUC":auc, "Test-f1(pos)":target_f1_score,
               "Test-pre":target_precision, "Test-rec":target_recall})
    print(">>>> Wandb(Test)(End)....")


    if grid_search.best_estimator_.steps[0][1].__class__.__name__ == 'RandomForestClassifier':
        name = 'RandomForest (dep=' + str(grid_search.best_params_['model__max_depth']) + \
            ' feature=' + \
            str(grid_search.best_params_['model__max_features']) + ')'
    elif grid_search.best_estimator_.steps[0][1].__class__.__name__ == 'KNeighborsClassifier':
        name = 'KNeighbors (k=' + \
            str(grid_search.best_params_['model__n_neighbors']) + ')'
    elif grid_search.best_estimator_.steps[0][1].__class__.__name__ == 'LogisticRegression':
        name = 'LogisticRegression (penalty=' + \
            grid_search.best_params_['model__penalty']
        if len(grid_search.best_params_) == 2:
            name = name + ' solver=' + \
                grid_search.best_params_['model__solver']
        name = name + ')'
    else:
        name = ''
    
    wandb.finish()
    return grid_search, name, acc, target_f1_score, auc

# Function(Use)

In [10]:
# Test for checking ICU_id missing in Lab_1103_csv
def getMissingIDinLab(Lab_file, show=True):
    test = sorted(Lab_file.ICU_id.unique())
    s = 0
    error_list = list()
    for i in test:
        #         print(i)
        s += 1
        if s != i:
            if show:
                print(f"error! : {s}")
            error_list.append(s)
            s += 1
    if show:
        print(f"Missing ID Result: {error_list}")
    if show:
        print(f"Missing Length:{len(error_list)}")
    return error_list

In [11]:
# store Dataframe to CSV
def store2CSV(data, target_name, target_loc_prefix='./'):
    file_path = target_loc_prefix+target_name+".csv"
    if checkDuplicateFile(file_path):
        print("store2CSV failed")
        return
    data.to_csv(file_path)
    print("store2CSV Successful!")

In [12]:
# store Datastruc. to pickle
def store2Pickle(data, target_name, target_loc_prefix='./'):
    import pickle
    file_path = target_loc_prefix+target_name+'.pickle'
    if checkDuplicateFile(file_path):
        print("store2Pickle failed")
        return
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)
    print("store2Pickle Successful!")

In [13]:
def readFPickle(target_name, target_loc_prefix='./'):
    import pickle
    file_name = target_loc_prefix+target_name+'.pickle'
    with open(file_name, 'rb') as f:
        temp = pickle.load(f)
    return temp

# Function(Data preprocessing)

In [14]:
#Function: 補值
# 將針對輸入的df_data直接進行inplace插補
# 須確保df_data的缺失值位置有放np.nan
def handleMissing(df_data, df_feature, outFeature=["outcome"], cate_astype = "int"):
    for featureName in df_data.columns:
        if featureName not in outFeature:
            if df_data[featureName].isna().sum() == 0:
                print(f"{featureName}: Not need to fill.")
                continue
            else:
                # 先去看是連續與否 (1代表連續,0代表離散)
                kindValue = df_feature.loc[df_feature["features name"]
                                           == featureName, "kind"].values[0]
                if kindValue == 1:
                    # continuous
                    # mean filling
                    targetMean = df_data[featureName].mean()
                    df_data[featureName].fillna(value=targetMean, inplace=True)
                    print(f"{featureName}: Fill, Continuous.")

                else:
                    # categorical
                    # mode filling
                    targetMode = df_data[featureName].mode()[0]
                    df_data[featureName].fillna(value=targetMode, inplace=True)
                    df_data[featureName] = df_data[featureName].astype(cate_astype)
                    print(f"{featureName}: Fill, Categorical. (astype to {cate_astype})")
            
    print("---handleMissing Finish---")

In [15]:
# plot hist
# filtered_data need to check not have nan
def plotHist(df_data, target, outcome="outcome", bins=20):
    filtered_data = pd.concat([df_data["outcome"], df_data[target]], axis=1)
    filtered_data = filtered_data.dropna()
    print(filtered_data.isna().sum())
    plt.hist(filtered_data.loc[filtered_data[outcome]==0, target], bins=bins, alpha=0.5, label='0')
    plt.hist(filtered_data.loc[filtered_data[outcome]==1, target], bins=bins, alpha=0.5, label='1')
    plt.xlabel(target)
    plt.ylabel('count')
    plt.legend(title=outcome)

In [16]:
# plot countplot
# filtered_data need to check not have nan
def plotCountplot(df_data, target, outcome="outcome"):
    filtered_data = pd.concat([df_data["outcome"], df_data[target]], axis=1)
    filtered_data = filtered_data.dropna()
    print(filtered_data.isna().sum())
    sns.countplot(x=target, hue=outcome, data=filtered_data)

In [17]:
# plot boxplot
# filtered_data need to check not have nan
def plotBoxplot(df_data, target, outcome="outcome"):
    filtered_data = pd.concat([df_data["outcome"], df_data[target]], axis=1)
    filtered_data = filtered_data.dropna()
    print(filtered_data.isna().sum())
    sns.boxplot(x=filtered_data[target], data=filtered_data)

# Function(Model)

In [None]:
def serializeModel(model, modelName, featureNum):
    initial_type = [('float_input', FloatTensorType([None, featureNum]))]
    onx = convert_sklearn(model, initial_types=initial_type)
    with open(modelName + ".onnx", "wb") as f:
        f.write(onx.SerializeToString())

In [None]:
def modelPredict(modelName, testData):
    sess = rt.InferenceSession(modelName + '.onnx')#load the onnx
    input_name = sess.get_inputs()[0].name
    label_name = sess.get_outputs()[0].name
    pred_onx = sess.run([label_name], {input_name: testData.astype(np.float32)})[0]#predict testData
    print(pred_onx)

# Load Data(From Project1)

In [18]:
raw_p1_training_csv = pd.read_csv("./data/p1_training.csv")

In [19]:
raw_p1_validation_csv = pd.read_csv("./data/p1_validation.csv")

In [20]:
raw_p1_training_csv

Unnamed: 0.1,Unnamed: 0,outcome,AGE,SEX,LOS,Joint,Drain,Commercial_ALBC,Non_commercial_ALBC,cci_index,...,Weight Loss,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
0,12660,1,60,1,5,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15988,0,75,0,7,0,1,0,1,2,...,0,0,0,0,0,0,0,0,0,0
2,31224,1,50,0,5,0,1,0,0,3,...,0,0,0,0,0,0,0,0,0,0
3,10398,0,39,1,5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,41231,1,55,1,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43412,6265,0,80,1,4,0,1,0,1,3,...,0,0,0,0,0,0,0,0,0,0
43413,11284,0,73,0,4,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
43414,38158,1,78,1,4,0,1,0,1,2,...,0,0,0,0,0,0,0,0,0,0
43415,860,0,64,0,5,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
raw_p1_validation_csv

Unnamed: 0.1,Unnamed: 0,outcome,AGE,SEX,LOS,Joint,Drain,Commercial_ALBC,Non_commercial_ALBC,cci_index,...,Weight Loss,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
0,24690,0,77.0,0,7,1,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
1,18416,0,63.0,0,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,23808,0,55.0,1,4,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15647,0,63.0,1,8,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,24418,0,69.0,0,5,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7631,33892,0,81.0,0,7,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7632,22026,0,69.0,0,4,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7633,50989,0,82.0,0,4,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7634,13424,0,70.0,0,4,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
raw_p1_training_csv.drop("Unnamed: 0", axis=1, inplace=True)

In [23]:
raw_p1_validation_csv.drop("Unnamed: 0", axis=1, inplace=True)

In [24]:
raw_p1_X_train = raw_p1_training_csv.copy()
raw_p1_y_train = raw_p1_X_train.pop("outcome")

In [25]:
raw_p1_X_val = raw_p1_validation_csv.copy()
raw_p1_y_val = raw_p1_X_val.pop("outcome")

# Test case (Baseline)
    - 使用Project1最後的training dataset, val
            - training有經過SMOTE
                - 正樣本變成負樣本的0.45倍 （29943*0.45 = 13474）
                - 正：29943 (69%), 負:13474 (31%), total: 43417
                - 無缺失值
            - val:
                - +:7486(98%),-:150(2%),total:7636
                - 無缺失值

## Data anlysis

In [24]:
raw_p1_X_train

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,Blood_trans,...,Weight Loss,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
0,60,1,5,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,75,0,7,0,1,0,1,2,3,1,...,0,0,0,0,0,0,0,0,0,0
2,50,0,5,0,1,0,0,3,2,0,...,0,0,0,0,0,0,0,0,0,0
3,39,1,5,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,55,1,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43412,80,1,4,0,1,0,1,3,4,0,...,0,0,0,0,0,0,0,0,0,0
43413,73,0,4,0,1,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
43414,78,1,4,0,1,0,1,2,1,0,...,0,0,0,0,0,0,0,0,0,0
43415,64,0,5,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
raw_p1_y_train

0        1
1        0
2        1
3        0
4        1
        ..
43412    0
43413    0
43414    1
43415    0
43416    0
Name: outcome, Length: 43417, dtype: int64

In [26]:
raw_p1_X_val 

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,Blood_trans,...,Weight Loss,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
0,77.0,0,7,1,0,0,0,2,2,0,...,0,0,0,0,0,0,0,0,0,0
1,63.0,0,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,55.0,1,4,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,63.0,1,8,0,1,0,1,1,2,0,...,0,0,0,0,0,0,0,0,0,0
4,69.0,0,5,0,1,0,1,1,3,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7631,81.0,0,7,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7632,69.0,0,4,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
7633,82.0,0,4,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
7634,70.0,0,4,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
raw_p1_y_val

0       0
1       0
2       0
3       0
4       0
       ..
7631    0
7632    0
7633    0
7634    0
7635    0
Name: outcome, Length: 7636, dtype: int64

In [98]:
raw_p1_X_train.isna().sum()

AGE                                        0
SEX                                        0
LOS                                        0
Joint                                      0
Drain                                      0
Commercial_ALBC                            0
Non_commercial_ALBC                        0
cci_index                                  0
elx_index                                  0
Blood_trans                                0
OP_time_minute                             0
ASA                                        0
Diagnosis                                  0
Congestive Heart Failure                   0
Valvular Disease                           0
Heart disease                              0
Peripheral Vascular Disorders              0
Hypertension Uncomplicated                 0
Paralysis                                  0
Lung disease                               0
Diabetes                                   0
Hypothyroidism                             0
Renal Fail

In [99]:
raw_p1_y_train.value_counts()

0    29943
1    13474
Name: outcome, dtype: int64

In [101]:
raw_p1_X_val.isna().sum()

AGE                                        0
SEX                                        0
LOS                                        0
Joint                                      0
Drain                                      0
Commercial_ALBC                            0
Non_commercial_ALBC                        0
cci_index                                  0
elx_index                                  0
Blood_trans                                0
OP_time_minute                             0
ASA                                        0
Diagnosis                                  0
Congestive Heart Failure                   0
Valvular Disease                           0
Heart disease                              0
Peripheral Vascular Disorders              0
Hypertension Uncomplicated                 0
Paralysis                                  0
Lung disease                               0
Diabetes                                   0
Hypothyroidism                             0
Renal Fail

In [103]:
raw_p1_y_val.value_counts()

0    7486
1     150
Name: outcome, dtype: int64

## Exp 1
- f1

In [110]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF1 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.2"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.3"),
]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…



Best parameter (CV score=0.814):
{'model__max_depth': 9, 'model__max_features': 29}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=29,
                                        random_state=42))])


train
[[ 9759  3715]
 [  442 29501]]


Unnamed: 0,precision,recall,f1-score,support
1,0.956671,0.724284,0.824414,13474.0
0,0.888156,0.985239,0.934182,29943.0
macro avg,0.922414,0.854761,0.879298,43417.0
weighted avg,0.909419,0.904254,0.900117,43417.0


Accuracy: 0.904, AUC: 0.972, f1-score: 0.824 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  58   92]
 [ 128 7358]]


Unnamed: 0,precision,recall,f1-score,support
1,0.311828,0.386667,0.345238,150.0
0,0.987651,0.982901,0.98527,7486.0
macro avg,0.649739,0.684784,0.665254,7636.0
weighted avg,0.974375,0.971189,0.972698,7636.0


Accuracy: 0.971, AUC: 0.764, f1-score: 0.345 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.12MB of 0.12MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99030444817…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.76369
Test-Acc,0.97119
Test-f1(pos),0.34524
Train-AUC,0.9723
Train-Acc,0.90425
Train-f1(pos),0.82441




Best parameter (CV score=0.950):
{'model__max_depth': 19, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=14,
                                        random_state=42))])


train
[[13310   164]
 [  256 29687]]


Unnamed: 0,precision,recall,f1-score,support
1,0.981129,0.987828,0.984467,13474.0
0,0.994506,0.99145,0.992976,29943.0
macro avg,0.987818,0.989639,0.988722,43417.0
weighted avg,0.990355,0.990326,0.990335,43417.0


Accuracy: 0.990, AUC: 0.999, f1-score: 0.984 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  53   97]
 [ 130 7356]]


Unnamed: 0,precision,recall,f1-score,support
1,0.289617,0.353333,0.318318,150.0
0,0.986985,0.982634,0.984805,7486.0
macro avg,0.638301,0.667984,0.651562,7636.0
weighted avg,0.973286,0.970272,0.971713,7636.0


Accuracy: 0.970, AUC: 0.746, f1-score: 0.318 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98650625559…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.74557
Test-Acc,0.97027
Test-f1(pos),0.31832
Train-AUC,0.99928
Train-Acc,0.99033
Train-f1(pos),0.98447




Best parameter (CV score=0.958):
{'model__max_depth': 24, 'model__max_features': 18}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=18,
                                        random_state=42))])


train
[[13448    26]
 [  148 29795]]


Unnamed: 0,precision,recall,f1-score,support
1,0.989114,0.99807,0.993572,13474.0
0,0.999128,0.995057,0.997089,29943.0
macro avg,0.994121,0.996564,0.99533,43417.0
weighted avg,0.99602,0.995992,0.995997,43417.0


Accuracy: 0.996, AUC: 1.000, f1-score: 0.994 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  52   98]
 [ 138 7348]]


Unnamed: 0,precision,recall,f1-score,support
1,0.273684,0.346667,0.305882,150.0
0,0.986839,0.981566,0.984195,7486.0
macro avg,0.630261,0.664116,0.645039,7636.0
weighted avg,0.97283,0.969094,0.97087,7636.0


Accuracy: 0.969, AUC: 0.735, f1-score: 0.306 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.04MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.97417258544…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.7355
Test-Acc,0.96909
Test-f1(pos),0.30588
Train-AUC,0.9998
Train-Acc,0.99599
Train-f1(pos),0.99357


## Exp 2
- f1, balanced

In [111]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF2 = [
    gridsearch(RandomForestClassifier(random_state=42, class_weight="balanced"), p_grid_rf, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.1(b)"),
    gridsearch(RandomForestClassifier(random_state=42, class_weight="balanced"), p_grid_rf_2, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.2(b)"),
    gridsearch(RandomForestClassifier(random_state=42, class_weight="balanced"), p_grid_rf_3, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.3(b)"),
]



Best parameter (CV score=0.842):
{'model__max_depth': 9, 'model__max_features': 29}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(class_weight='balanced', max_depth=9,
                                        max_features=29, random_state=42))])


train
[[12339  1135]
 [ 3282 26661]]


Unnamed: 0,precision,recall,f1-score,support
1,0.789898,0.915764,0.848187,13474.0
0,0.959167,0.890392,0.923501,29943.0
macro avg,0.874533,0.903078,0.885844,43417.0
weighted avg,0.906636,0.898266,0.900128,43417.0


Accuracy: 0.898, AUC: 0.972, f1-score: 0.848 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  72   78]
 [ 823 6663]]


Unnamed: 0,precision,recall,f1-score,support
1,0.080447,0.48,0.137799,150.0
0,0.988429,0.890061,0.93667,7486.0
macro avg,0.534438,0.685031,0.537234,7636.0
weighted avg,0.970593,0.882006,0.920977,7636.0


Accuracy: 0.882, AUC: 0.760, f1-score: 0.138 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.13MB of 0.13MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99070844358…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.75961
Test-Acc,0.88201
Test-f1(pos),0.1378
Train-AUC,0.97156
Train-Acc,0.89827
Train-f1(pos),0.84819




Best parameter (CV score=0.944):
{'model__max_depth': 19, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(class_weight='balanced', max_depth=19,
                                        max_features=14, random_state=42))])


train
[[13433    41]
 [  694 29249]]


Unnamed: 0,precision,recall,f1-score,support
1,0.950874,0.996957,0.973371,13474.0
0,0.9986,0.976823,0.987591,29943.0
macro avg,0.974737,0.98689,0.980481,43417.0
weighted avg,0.983789,0.983071,0.983178,43417.0


Accuracy: 0.983, AUC: 0.999, f1-score: 0.973 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  54   96]
 [ 266 7220]]


Unnamed: 0,precision,recall,f1-score,support
1,0.16875,0.36,0.229787,150.0
0,0.986878,0.964467,0.975544,7486.0
macro avg,0.577814,0.662234,0.602666,7636.0
weighted avg,0.970807,0.952593,0.960894,7636.0


Accuracy: 0.953, AUC: 0.742, f1-score: 0.230 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.08MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98617444563…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.74195
Test-Acc,0.95259
Test-f1(pos),0.22979
Train-AUC,0.99899
Train-Acc,0.98307
Train-f1(pos),0.97337




Best parameter (CV score=0.954):
{'model__max_depth': 24, 'model__max_features': 18}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(class_weight='balanced', max_depth=24,
                                        max_features=18, random_state=42))])


train
[[13462    12]
 [  280 29663]]


Unnamed: 0,precision,recall,f1-score,support
1,0.979625,0.999109,0.989271,13474.0
0,0.999596,0.990649,0.995102,29943.0
macro avg,0.98961,0.994879,0.992187,43417.0
weighted avg,0.993398,0.993275,0.993293,43417.0


Accuracy: 0.993, AUC: 1.000, f1-score: 0.989 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  51   99]
 [ 178 7308]]


Unnamed: 0,precision,recall,f1-score,support
1,0.222707,0.34,0.269129,150.0
0,0.986634,0.976222,0.981401,7486.0
macro avg,0.604671,0.658111,0.625265,7636.0
weighted avg,0.971628,0.963724,0.967409,7636.0


Accuracy: 0.964, AUC: 0.743, f1-score: 0.269 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.04MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.97381380112…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.7426
Test-Acc,0.96372
Test-f1(pos),0.26913
Train-AUC,0.99968
Train-Acc,0.99327
Train-f1(pos),0.98927


## Exp3
- macro

In [113]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test", scoring="f1")
p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF3 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.1", scoring="f1_macro"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.2", scoring="f1_macro"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.3", scoring="f1_macro"),
]



Best parameter (CV score=0.872):
{'model__max_depth': 9, 'model__max_features': 29}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=29,
                                        random_state=42))])


train
[[ 9759  3715]
 [  442 29501]]


Unnamed: 0,precision,recall,f1-score,support
1,0.956671,0.724284,0.824414,13474.0
0,0.888156,0.985239,0.934182,29943.0
macro avg,0.922414,0.854761,0.879298,43417.0
weighted avg,0.909419,0.904254,0.900117,43417.0


Accuracy: 0.904, AUC: 0.972, f1-score: 0.824 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  58   92]
 [ 128 7358]]


Unnamed: 0,precision,recall,f1-score,support
1,0.311828,0.386667,0.345238,150.0
0,0.987651,0.982901,0.98527,7486.0
macro avg,0.649739,0.684784,0.665254,7636.0
weighted avg,0.974375,0.971189,0.972698,7636.0


Accuracy: 0.971, AUC: 0.764, f1-score: 0.345 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.12MB of 0.12MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99030444817…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.76369
Test-Acc,0.97119
Test-f1(pos),0.34524
Train-AUC,0.9723
Train-Acc,0.90425
Train-f1(pos),0.82441




Best parameter (CV score=0.964):
{'model__max_depth': 19, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=14,
                                        random_state=42))])


train
[[13310   164]
 [  256 29687]]


Unnamed: 0,precision,recall,f1-score,support
1,0.981129,0.987828,0.984467,13474.0
0,0.994506,0.99145,0.992976,29943.0
macro avg,0.987818,0.989639,0.988722,43417.0
weighted avg,0.990355,0.990326,0.990335,43417.0


Accuracy: 0.990, AUC: 0.999, f1-score: 0.984 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  53   97]
 [ 130 7356]]


Unnamed: 0,precision,recall,f1-score,support
1,0.289617,0.353333,0.318318,150.0
0,0.986985,0.982634,0.984805,7486.0
macro avg,0.638301,0.667984,0.651562,7636.0
weighted avg,0.973286,0.970272,0.971713,7636.0


Accuracy: 0.970, AUC: 0.746, f1-score: 0.318 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98650625559…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.74557
Test-Acc,0.97027
Test-f1(pos),0.31832
Train-AUC,0.99928
Train-Acc,0.99033
Train-f1(pos),0.98447




Best parameter (CV score=0.969):
{'model__max_depth': 24, 'model__max_features': 18}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=18,
                                        random_state=42))])


train
[[13448    26]
 [  148 29795]]


Unnamed: 0,precision,recall,f1-score,support
1,0.989114,0.99807,0.993572,13474.0
0,0.999128,0.995057,0.997089,29943.0
macro avg,0.994121,0.996564,0.99533,43417.0
weighted avg,0.99602,0.995992,0.995997,43417.0


Accuracy: 0.996, AUC: 1.000, f1-score: 0.994 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  52   98]
 [ 138 7348]]


Unnamed: 0,precision,recall,f1-score,support
1,0.273684,0.346667,0.305882,150.0
0,0.986839,0.981566,0.984195,7486.0
macro avg,0.630261,0.664116,0.645039,7636.0
weighted avg,0.97283,0.969094,0.97087,7636.0


Accuracy: 0.969, AUC: 0.735, f1-score: 0.306 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.04MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.97417258544…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.7355
Test-Acc,0.96909
Test-f1(pos),0.30588
Train-AUC,0.9998
Train-Acc,0.99599
Train-f1(pos),0.99357


## Exp4
- macro, balanced

In [114]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF4 = [
    gridsearch(RandomForestClassifier(random_state=42, class_weight="balanced"), p_grid_rf, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.1(b)", scoring="f1_macro"),
    gridsearch(RandomForestClassifier(random_state=42, class_weight="balanced"), p_grid_rf_2, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.2(b)", scoring="f1_macro"),
    gridsearch(RandomForestClassifier(random_state=42, class_weight="balanced"), p_grid_rf_3, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.3(b)", scoring="f1_macro"),
]



Best parameter (CV score=0.882):
{'model__max_depth': 9, 'model__max_features': 29}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(class_weight='balanced', max_depth=9,
                                        max_features=29, random_state=42))])


train
[[12339  1135]
 [ 3282 26661]]


Unnamed: 0,precision,recall,f1-score,support
1,0.789898,0.915764,0.848187,13474.0
0,0.959167,0.890392,0.923501,29943.0
macro avg,0.874533,0.903078,0.885844,43417.0
weighted avg,0.906636,0.898266,0.900128,43417.0


Accuracy: 0.898, AUC: 0.972, f1-score: 0.848 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  72   78]
 [ 823 6663]]


Unnamed: 0,precision,recall,f1-score,support
1,0.080447,0.48,0.137799,150.0
0,0.988429,0.890061,0.93667,7486.0
macro avg,0.534438,0.685031,0.537234,7636.0
weighted avg,0.970593,0.882006,0.920977,7636.0


Accuracy: 0.882, AUC: 0.760, f1-score: 0.138 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.13MB of 0.13MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99070844358…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.75961
Test-Acc,0.88201
Test-f1(pos),0.1378
Train-AUC,0.97156
Train-Acc,0.89827
Train-f1(pos),0.84819




Best parameter (CV score=0.959):
{'model__max_depth': 19, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(class_weight='balanced', max_depth=19,
                                        max_features=14, random_state=42))])


train
[[13433    41]
 [  694 29249]]


Unnamed: 0,precision,recall,f1-score,support
1,0.950874,0.996957,0.973371,13474.0
0,0.9986,0.976823,0.987591,29943.0
macro avg,0.974737,0.98689,0.980481,43417.0
weighted avg,0.983789,0.983071,0.983178,43417.0


Accuracy: 0.983, AUC: 0.999, f1-score: 0.973 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  54   96]
 [ 266 7220]]


Unnamed: 0,precision,recall,f1-score,support
1,0.16875,0.36,0.229787,150.0
0,0.986878,0.964467,0.975544,7486.0
macro avg,0.577814,0.662234,0.602666,7636.0
weighted avg,0.970807,0.952593,0.960894,7636.0


Accuracy: 0.953, AUC: 0.742, f1-score: 0.230 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.08MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98617444563…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.74195
Test-Acc,0.95259
Test-f1(pos),0.22979
Train-AUC,0.99899
Train-Acc,0.98307
Train-f1(pos),0.97337




Best parameter (CV score=0.967):
{'model__max_depth': 24, 'model__max_features': 18}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(class_weight='balanced', max_depth=24,
                                        max_features=18, random_state=42))])


train
[[13462    12]
 [  280 29663]]


Unnamed: 0,precision,recall,f1-score,support
1,0.979625,0.999109,0.989271,13474.0
0,0.999596,0.990649,0.995102,29943.0
macro avg,0.98961,0.994879,0.992187,43417.0
weighted avg,0.993398,0.993275,0.993293,43417.0


Accuracy: 0.993, AUC: 1.000, f1-score: 0.989 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  51   99]
 [ 178 7308]]


Unnamed: 0,precision,recall,f1-score,support
1,0.222707,0.34,0.269129,150.0
0,0.986634,0.976222,0.981401,7486.0
macro avg,0.604671,0.658111,0.625265,7636.0
weighted avg,0.971628,0.963724,0.967409,7636.0


Accuracy: 0.964, AUC: 0.743, f1-score: 0.269 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.04MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.97381380112…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.7426
Test-Acc,0.96372
Test-f1(pos),0.26913
Train-AUC,0.99968
Train-Acc,0.99327
Train-f1(pos),0.98927


In [50]:
# p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
#              'model__max_features': [i for i in range(20, 30)]}

# p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
#                'model__max_features': [i for i in range(5, 15)]}

# p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
#                'model__max_features': [i for i in range(10, 20)]}

# p_grid_knn = {'model__n_neighbors': [i for i in range(1, 15)]}

# p_grid_lr_1 = {'model__penalty':['l1', 'l2']}

# p_grid_lr_2 = {'model__penalty':['l1'], 'model__solver':['liblinear', 'saga']}

# p_grid_lr_3 = {'model__penalty':['l2'], 'model__solver':['newton-cg', 'lbfgs', 'sag', 'saga']}

# base_lines = [
#     gridsearch(KNeighborsClassifier(), p_grid_knn),
    
#     gridsearch(RandomForestClassifier(random_state=42), p_grid_rf),
#     gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2),
#     gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3),
    
#     gridsearch(LogisticRegression(random_state=42), p_grid_lr_1),
#     gridsearch(LogisticRegression(random_state=42), p_grid_lr_2),
#     gridsearch(LogisticRegression(random_state=42), p_grid_lr_3),

# ]

## Exp5
- recall

In [115]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF5 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.1(E5)", scoring="recall"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.2(E5)", scoring="recall"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.3(E5)", scoring="recall"),
]



Best parameter (CV score=0.711):
{'model__max_depth': 9, 'model__max_features': 29}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=29,
                                        random_state=42))])


train
[[ 9759  3715]
 [  442 29501]]


Unnamed: 0,precision,recall,f1-score,support
1,0.956671,0.724284,0.824414,13474.0
0,0.888156,0.985239,0.934182,29943.0
macro avg,0.922414,0.854761,0.879298,43417.0
weighted avg,0.909419,0.904254,0.900117,43417.0


Accuracy: 0.904, AUC: 0.972, f1-score: 0.824 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  58   92]
 [ 128 7358]]


Unnamed: 0,precision,recall,f1-score,support
1,0.311828,0.386667,0.345238,150.0
0,0.987651,0.982901,0.98527,7486.0
macro avg,0.649739,0.684784,0.665254,7636.0
weighted avg,0.974375,0.971189,0.972698,7636.0


Accuracy: 0.971, AUC: 0.764, f1-score: 0.345 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.12MB of 0.12MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.76369
Test-Acc,0.97119
Test-f1(pos),0.34524
Train-AUC,0.9723
Train-Acc,0.90425
Train-f1(pos),0.82441




Best parameter (CV score=0.946):
{'model__max_depth': 19, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=14,
                                        random_state=42))])


train
[[13310   164]
 [  256 29687]]


Unnamed: 0,precision,recall,f1-score,support
1,0.981129,0.987828,0.984467,13474.0
0,0.994506,0.99145,0.992976,29943.0
macro avg,0.987818,0.989639,0.988722,43417.0
weighted avg,0.990355,0.990326,0.990335,43417.0


Accuracy: 0.990, AUC: 0.999, f1-score: 0.984 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  53   97]
 [ 130 7356]]


Unnamed: 0,precision,recall,f1-score,support
1,0.289617,0.353333,0.318318,150.0
0,0.986985,0.982634,0.984805,7486.0
macro avg,0.638301,0.667984,0.651562,7636.0
weighted avg,0.973286,0.970272,0.971713,7636.0


Accuracy: 0.970, AUC: 0.746, f1-score: 0.318 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98650625559…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.74557
Test-Acc,0.97027
Test-f1(pos),0.31832
Train-AUC,0.99928
Train-Acc,0.99033
Train-f1(pos),0.98447




Best parameter (CV score=0.961):
{'model__max_depth': 24, 'model__max_features': 18}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=18,
                                        random_state=42))])


train
[[13448    26]
 [  148 29795]]


Unnamed: 0,precision,recall,f1-score,support
1,0.989114,0.99807,0.993572,13474.0
0,0.999128,0.995057,0.997089,29943.0
macro avg,0.994121,0.996564,0.99533,43417.0
weighted avg,0.99602,0.995992,0.995997,43417.0


Accuracy: 0.996, AUC: 1.000, f1-score: 0.994 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  52   98]
 [ 138 7348]]


Unnamed: 0,precision,recall,f1-score,support
1,0.273684,0.346667,0.305882,150.0
0,0.986839,0.981566,0.984195,7486.0
macro avg,0.630261,0.664116,0.645039,7636.0
weighted avg,0.97283,0.969094,0.97087,7636.0


Accuracy: 0.969, AUC: 0.735, f1-score: 0.306 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.04MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.97417258544…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.7355
Test-Acc,0.96909
Test-f1(pos),0.30588
Train-AUC,0.9998
Train-Acc,0.99599
Train-f1(pos),0.99357


In [116]:
pd.DataFrame(base_lines_RF5,
             columns=['Grid', 'Classifier', 'Accuracy', 'f1-score', 'AUC']).sort_values('f1-score', ascending=False)

Unnamed: 0,Grid,Classifier,Accuracy,f1-score,AUC
0,"GridSearchCV(cv=5,\n estimator=Pip...",RandomForest (dep=9 feature=29),0.971189,0.345238,0.763693
1,"GridSearchCV(cv=5,\n estimator=Pip...",RandomForest (dep=19 feature=14),0.970272,0.318318,0.745571
2,"GridSearchCV(cv=5,\n estimator=Pip...",RandomForest (dep=24 feature=18),0.969094,0.305882,0.735496


# Case 1. 
 - 承襲project1後(raw_p1_X_train)，做undersampling
 - pos:13474, neg:13474
 - val:沒有改raw_p1_X_val

## undersampling

In [151]:
p1_X_train_undersampler = RandomUnderSampler(random_state=42, sampling_strategy='majority')

In [152]:
under_p1_X_train, under_p1_y_train= p1_X_train_undersampler.fit_resample(raw_p1_X_train, raw_p1_y_train)

In [153]:
Counter(under_p1_y_train)

Counter({0: 13474, 1: 13474})

In [154]:
under_p1_y_train

0        0
1        0
2        0
3        0
4        0
        ..
26943    1
26944    1
26945    1
26946    1
26947    1
Name: outcome, Length: 26948, dtype: int64

## Exp6

In [130]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF6 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, under_p1_X_train, under_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.1(E6)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, under_p1_X_train, under_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.2(E6)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, under_p1_X_train, under_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.3(E6)", scoring="f1"),
]



Best parameter (CV score=0.901):
{'model__max_depth': 9, 'model__max_features': 24}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=24,
                                        random_state=42))])


train
[[12404  1070]
 [ 1264 12210]]


Unnamed: 0,precision,recall,f1-score,support
1,0.907521,0.920588,0.914008,13474.0
0,0.919428,0.90619,0.912761,13474.0
macro avg,0.913474,0.913389,0.913384,26948.0
weighted avg,0.913474,0.913389,0.913384,26948.0


Accuracy: 0.913, AUC: 0.975, f1-score: 0.914 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  70   80]
 [ 763 6723]]


Unnamed: 0,precision,recall,f1-score,support
1,0.084034,0.466667,0.142421,150.0
0,0.98824,0.898076,0.941004,7486.0
macro avg,0.536137,0.682372,0.541712,7636.0
weighted avg,0.970478,0.889602,0.925316,7636.0


Accuracy: 0.890, AUC: 0.758, f1-score: 0.142 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.11MB of 0.11MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.75766
Test-Acc,0.8896
Test-f1(pos),0.14242
Train-AUC,0.97516
Train-Acc,0.91339
Train-f1(pos),0.91401




Best parameter (CV score=0.963):
{'model__max_depth': 19, 'model__max_features': 13}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=13,
                                        random_state=42))])


train
[[13434    40]
 [  272 13202]]


Unnamed: 0,precision,recall,f1-score,support
1,0.980155,0.997031,0.988521,13474.0
0,0.996979,0.979813,0.988322,13474.0
macro avg,0.988567,0.988422,0.988421,26948.0
weighted avg,0.988567,0.988422,0.988421,26948.0


Accuracy: 0.988, AUC: 0.999, f1-score: 0.989 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  59   91]
 [ 344 7142]]


Unnamed: 0,precision,recall,f1-score,support
1,0.146402,0.393333,0.213382,150.0
0,0.987419,0.954048,0.970446,7486.0
macro avg,0.56691,0.67369,0.591914,7636.0
weighted avg,0.970898,0.943033,0.955575,7636.0


Accuracy: 0.943, AUC: 0.729, f1-score: 0.213 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98637318880…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.72907
Test-Acc,0.94303
Test-f1(pos),0.21338
Train-AUC,0.99934
Train-Acc,0.98842
Train-f1(pos),0.98852




Best parameter (CV score=0.966):
{'model__max_depth': 24, 'model__max_features': 15}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=15,
                                        random_state=42))])


train
[[13459    15]
 [   98 13376]]


Unnamed: 0,precision,recall,f1-score,support
1,0.992771,0.998887,0.99582,13474.0
0,0.99888,0.992727,0.995794,13474.0
macro avg,0.995826,0.995807,0.995807,26948.0
weighted avg,0.995826,0.995807,0.995807,26948.0


Accuracy: 0.996, AUC: 1.000, f1-score: 0.996 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  57   93]
 [ 288 7198]]


Unnamed: 0,precision,recall,f1-score,support
1,0.165217,0.38,0.230303,150.0
0,0.987245,0.961528,0.974217,7486.0
macro avg,0.576231,0.670764,0.60226,7636.0
weighted avg,0.971097,0.950105,0.959603,7636.0


Accuracy: 0.950, AUC: 0.732, f1-score: 0.230 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.97800060663…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.73225
Test-Acc,0.9501
Test-f1(pos),0.2303
Train-AUC,0.99984
Train-Acc,0.99581
Train-f1(pos),0.99582


In [155]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF6_v2 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, under_p1_X_train, under_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.1(E6) v2", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, under_p1_X_train, under_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.2(E6) v2", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, under_p1_X_train, under_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.3(E6) v2", scoring="f1"),
]



Best parameter (CV score=0.897):
{'model__max_depth': 9, 'model__max_features': 24}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=24,
                                        random_state=42))])


train
[[12290  1184]
 [ 1281 12193]]


Unnamed: 0,precision,recall,f1-score,support
1,0.905608,0.912127,0.908856,13474.0
0,0.91149,0.904928,0.908197,13474.0
macro avg,0.908549,0.908528,0.908526,26948.0
weighted avg,0.908549,0.908528,0.908526,26948.0


Accuracy: 0.909, AUC: 0.973, f1-score: 0.909 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  71   79]
 [ 773 6713]]


Unnamed: 0,precision,recall,f1-score,support
1,0.084123,0.473333,0.142857,150.0
0,0.988369,0.896741,0.940328,7486.0
macro avg,0.536246,0.685037,0.541592,7636.0
weighted avg,0.970606,0.888423,0.924662,7636.0


Accuracy: 0.888, AUC: 0.761, f1-score: 0.143 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.11MB of 0.11MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98968420352…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,█▁
Train-rec,█▁

0,1
Test-AUC,0.76139
Test-Acc,0.88842
Test-f1(pos),0.14286
Train-AUC,0.97318
Train-Acc,0.90853
Train-f1(pos),0.90886
Train-pre,0.08412
Train-rec,0.47333




Best parameter (CV score=0.961):
{'model__max_depth': 19, 'model__max_features': 13}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=13,
                                        random_state=42))])


train
[[13441    33]
 [  295 13179]]


Unnamed: 0,precision,recall,f1-score,support
1,0.978524,0.997551,0.987946,13474.0
0,0.997502,0.978106,0.987709,13474.0
macro avg,0.988013,0.987828,0.987827,26948.0
weighted avg,0.988013,0.987828,0.987827,26948.0


Accuracy: 0.988, AUC: 0.999, f1-score: 0.988 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  58   92]
 [ 352 7134]]


Unnamed: 0,precision,recall,f1-score,support
1,0.141463,0.386667,0.207143,150.0
0,0.987268,0.952979,0.969821,7486.0
macro avg,0.564366,0.669823,0.588482,7636.0
weighted avg,0.970653,0.941854,0.954839,7636.0


Accuracy: 0.942, AUC: 0.741, f1-score: 0.207 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98710036316…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,█▁
Train-rec,█▁

0,1
Test-AUC,0.74065
Test-Acc,0.94185
Test-f1(pos),0.20714
Train-AUC,0.99934
Train-Acc,0.98783
Train-f1(pos),0.98795
Train-pre,0.14146
Train-rec,0.38667




Best parameter (CV score=0.965):
{'model__max_depth': 24, 'model__max_features': 13}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=13,
                                        random_state=42))])


train
[[13462    12]
 [  124 13350]]


Unnamed: 0,precision,recall,f1-score,support
1,0.990873,0.999109,0.994974,13474.0
0,0.999102,0.990797,0.994932,13474.0
macro avg,0.994987,0.994953,0.994953,26948.0
weighted avg,0.994987,0.994953,0.994953,26948.0


Accuracy: 0.995, AUC: 1.000, f1-score: 0.995 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  55   95]
 [ 283 7203]]


Unnamed: 0,precision,recall,f1-score,support
1,0.162722,0.366667,0.22541,150.0
0,0.986983,0.962196,0.974432,7486.0
macro avg,0.574852,0.664431,0.599921,7636.0
weighted avg,0.970791,0.950498,0.959718,7636.0


Accuracy: 0.950, AUC: 0.735, f1-score: 0.225 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.06MB of 0.06MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.97907815224…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,█▁
Train-rec,█▁

0,1
Test-AUC,0.73539
Test-Acc,0.9505
Test-f1(pos),0.22541
Train-AUC,0.99982
Train-Acc,0.99495
Train-f1(pos),0.99497
Train-pre,0.16272
Train-rec,0.36667


# Case 2.
 - SMOTE+做downsampling 但不要太多

## Undersampling

In [131]:
raw_p1_y_train

0        1
1        0
2        1
3        0
4        1
        ..
43412    0
43413    0
43414    1
43415    0
43416    0
Name: outcome, Length: 43417, dtype: int64

In [141]:
29943 * 0.7

20960.1

In [145]:
Counter(raw_p1_y_train)

Counter({1: 13474, 0: 29943})

In [142]:
p1_X_train_undersampler_2 = RandomUnderSampler(random_state=42, sampling_strategy={0:20960})

In [143]:
under2_p1_X_train, under2_p1_y_train= p1_X_train_undersampler_2.fit_resample(raw_p1_X_train, raw_p1_y_train)

In [144]:
Counter(under2_p1_y_train)

Counter({0: 20960, 1: 13474})

In [148]:
under2_p1_X_train

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,Blood_trans,...,Weight Loss,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
0,61,0,4,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,81,0,5,1,0,0,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
2,60,1,2,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,75,0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,70,1,2,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34429,69,0,6,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
34430,55,1,6,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34431,65,1,5,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34432,69,1,5,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Exp 7

In [149]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF7 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, under2_p1_X_train, under2_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.1(E7)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, under2_p1_X_train, under2_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.2(E7)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, under2_p1_X_train, under2_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="RF Conf.3(E7)", scoring="f1"),
]



Best parameter (CV score=0.847):
{'model__max_depth': 9, 'model__max_features': 29}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=29,
                                        random_state=42))])


train
[[10398  3076]
 [  448 20512]]


Unnamed: 0,precision,recall,f1-score,support
1,0.958694,0.771708,0.855099,13474.0
0,0.869595,0.978626,0.920894,20960.0
macro avg,0.914145,0.875167,0.887997,34434.0
weighted avg,0.904459,0.897659,0.895149,34434.0


Accuracy: 0.898, AUC: 0.973, f1-score: 0.855 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  59   91]
 [ 177 7309]]


Unnamed: 0,precision,recall,f1-score,support
1,0.25,0.393333,0.305699,150.0
0,0.987703,0.976356,0.981997,7486.0
macro avg,0.618851,0.684845,0.643848,7636.0
weighted avg,0.973211,0.964903,0.968711,7636.0


Accuracy: 0.965, AUC: 0.760, f1-score: 0.306 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.12MB of 0.12MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99013761102…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.76035
Test-Acc,0.9649
Test-f1(pos),0.3057
Train-AUC,0.97329
Train-Acc,0.89766
Train-f1(pos),0.8551




Best parameter (CV score=0.958):
{'model__max_depth': 19, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=14,
                                        random_state=42))])


train
[[13416    58]
 [  303 20657]]


Unnamed: 0,precision,recall,f1-score,support
1,0.977914,0.995695,0.986725,13474.0
0,0.9972,0.985544,0.991338,20960.0
macro avg,0.987557,0.99062,0.989031,34434.0
weighted avg,0.989653,0.989516,0.989533,34434.0


Accuracy: 0.990, AUC: 0.999, f1-score: 0.987 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  54   96]
 [ 234 7252]]


Unnamed: 0,precision,recall,f1-score,support
1,0.1875,0.36,0.246575,150.0
0,0.986935,0.968742,0.977754,7486.0
macro avg,0.587218,0.664371,0.612165,7636.0
weighted avg,0.971231,0.956784,0.963391,7636.0


Accuracy: 0.957, AUC: 0.749, f1-score: 0.247 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98683817569…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.74931
Test-Acc,0.95678
Test-f1(pos),0.24658
Train-AUC,0.99934
Train-Acc,0.98952
Train-f1(pos),0.98672




Best parameter (CV score=0.962):
{'model__max_depth': 23, 'model__max_features': 17}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=23, max_features=17,
                                        random_state=42))])


train
[[13454    20]
 [  148 20812]]


Unnamed: 0,precision,recall,f1-score,support
1,0.989119,0.998516,0.993795,13474.0
0,0.99904,0.992939,0.99598,20960.0
macro avg,0.99408,0.995727,0.994888,34434.0
weighted avg,0.995158,0.995121,0.995125,34434.0


Accuracy: 0.995, AUC: 1.000, f1-score: 0.994 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  54   96]
 [ 213 7273]]


Unnamed: 0,precision,recall,f1-score,support
1,0.202247,0.36,0.258993,150.0
0,0.986972,0.971547,0.979199,7486.0
macro avg,0.59461,0.665773,0.619096,7636.0
weighted avg,0.971557,0.959534,0.965051,7636.0


Accuracy: 0.960, AUC: 0.730, f1-score: 0.259 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.97784281687…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.73019
Test-Acc,0.95953
Test-f1(pos),0.25899
Train-AUC,0.9998
Train-Acc,0.99512
Train-f1(pos),0.9938


# Case 3.
 - 最簡單處理 case1_after_data

In [26]:
case1_after_data_csv = pd.read_csv("case1_after_data.csv")

In [27]:
case1_after_data_csv.drop("Unnamed: 0", axis=1, inplace=True)

In [28]:
case1_after_data_csv

Unnamed: 0,outcome,AGE,SEX,LOS,Joint,Drain,Cemented,Commercial_ALBC,Non_commercial_ALBC,cci_index,...,Weight Loss,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
0,1,59.0,0,8.0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,69.0,1,5.0,0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,80.0,1,8.0,0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,66.0,1,14.0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,52.0,0,4.0,0,1,0,0,0,1,...,0,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49136,0,78.0,0,3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49137,0,78.0,1,4.0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
49138,0,73.0,0,5.0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
49139,0,74.0,0,5.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [163]:
case3_train_split, case3_val_split = train_test_split(
    case1_after_data_csv, test_size=0.2, random_state=42, stratify=case1_after_data_csv["outcome"])

In [164]:
case3_train_split

Unnamed: 0,outcome,AGE,SEX,LOS,Joint,Drain,Cemented,Commercial_ALBC,Non_commercial_ALBC,cci_index,...,Weight Loss,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
41218,0,68.0,0,7.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22985,0,63.0,0,3.0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
13783,0,40.0,1,6.0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
6145,0,65.0,0,5.0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
14712,0,71.0,1,7.0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35557,0,46.0,1,4.0,0,1,1,0,1,5,...,0,1,0,0,0,0,0,0,0,0
24374,0,62.0,0,8.0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
33083,0,74.0,1,7.0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8650,0,80.0,0,7.0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [165]:
case3_val_split

Unnamed: 0,outcome,AGE,SEX,LOS,Joint,Drain,Cemented,Commercial_ALBC,Non_commercial_ALBC,cci_index,...,Weight Loss,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
12183,0,55.0,0,5.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23981,0,78.0,1,6.0,0,1,1,1,0,2,...,0,0,0,0,0,0,0,0,0,0
38576,0,79.0,1,5.0,0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
28283,0,61.0,1,5.0,0,1,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
28351,0,81.0,1,3.0,1,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38572,0,86.0,0,6.0,0,1,1,0,1,0,...,0,0,0,1,1,0,0,0,1,1
8727,0,50.0,1,3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38694,0,64.0,0,5.0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
33279,0,78.0,1,3.0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [166]:
case3_X_train = case3_train_split.copy()
case3_X_val = case3_val_split.copy()

In [167]:
case3_y_train = case3_X_train.pop("outcome")

In [168]:
case3_y_val = case3_X_val.pop("outcome")

In [173]:
case3_X_train.shape

(39312, 51)

In [174]:
case3_y_train.value_counts()

0    38615
1      697
Name: outcome, dtype: int64

In [175]:
case3_y_val.value_counts()

0    9655
1     174
Name: outcome, dtype: int64

## Exp 8.
- 沒有class weight, 原生非常不平衡(pos:1.7%)進行訓練
- 極差無比

In [176]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF8 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, case3_X_train, case3_y_train, 
               case3_X_val, case3_y_val, run_name="RF Conf.1(E8)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, case3_X_train, case3_y_train, 
               case3_X_val, case3_y_val, run_name="RF Conf.2(E8)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, case3_X_train, case3_y_train, 
               case3_X_val, case3_y_val, run_name="RF Conf.3(E8)", scoring="f1"),
]



Best parameter (CV score=0.003):
{'model__max_depth': 7, 'model__max_features': 27}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=7, max_features=27,
                                        random_state=42))])


train
[[   15   682]
 [    0 38615]]


Unnamed: 0,precision,recall,f1-score,support
1,1.0,0.021521,0.042135,697.0
0,0.982645,1.0,0.991247,38615.0
macro avg,0.991322,0.51076,0.516691,39312.0
weighted avg,0.982953,0.982652,0.974419,39312.0


Accuracy: 0.983, AUC: 0.826, f1-score: 0.042 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[   0  174]
 [   0 9655]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
1,0.0,0.0,0.0,174.0
0,0.982297,1.0,0.99107,9655.0
macro avg,0.491149,0.5,0.495535,9829.0
weighted avg,0.964908,0.982297,0.973525,9829.0


Accuracy: 0.982, AUC: 0.727, f1-score: 0.000 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.16MB of 0.16MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99271625628…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.7266
Test-Acc,0.9823
Test-f1(pos),0.0
Test-pre,0.0
Test-rec,0.0
Train-AUC,0.82553
Train-Acc,0.98265
Train-f1(pos),0.04213
Train-pre,1.0
Train-rec,0.02152




KeyboardInterrupt: 

## Exp 9.
- class weight

In [178]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
X_train = case3_X_train
y_train = case3_y_train
X_val = case3_X_val
y_val = case3_y_val

p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF9 = [
    gridsearch(RandomForestClassifier(random_state=42, class_weight = "balanced"), p_grid_rf, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.1(E9)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42, class_weight = "balanced"), p_grid_rf_2, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.2(E9)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42, class_weight = "balanced"), p_grid_rf_3, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.3(E9)", scoring="f1"),
]



Best parameter (CV score=0.074):
{'model__max_depth': 9, 'model__max_features': 29}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(class_weight='balanced', max_depth=9,
                                        max_features=29, random_state=42))])


train
[[  564   133]
 [ 8960 29655]]


Unnamed: 0,precision,recall,f1-score,support
1,0.059219,0.809182,0.110361,697.0
0,0.995535,0.767966,0.867067,38615.0
macro avg,0.527377,0.788574,0.488714,39312.0
weighted avg,0.978934,0.768697,0.853651,39312.0


Accuracy: 0.769, AUC: 0.875, f1-score: 0.110 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[ 101   73]
 [2277 7378]]


Unnamed: 0,precision,recall,f1-score,support
1,0.042473,0.58046,0.079154,174.0
0,0.990203,0.764164,0.862621,9655.0
macro avg,0.516338,0.672312,0.470887,9829.0
weighted avg,0.973425,0.760912,0.848752,9829.0


Accuracy: 0.761, AUC: 0.735, f1-score: 0.079 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


AttributeError: 'ZMQDisplayPublisher' object has no attribute '_orig_publish'

# Case 4.

In [29]:
case1_after_data_csv.columns

Index(['outcome', 'AGE', 'SEX', 'LOS', 'Joint', 'Drain', 'Cemented',
       'Commercial_ALBC', 'Non_commercial_ALBC', 'cci_index', 'elx_index',
       'Blood_trans', 'OP_time_minute', 'OP_time_hour', 'ASA', 'CBC_HG',
       'CBC_HT', 'Diagnosis', 'Congestive Heart Failure', 'Cardiac Arrhythmia',
       'Valvular Disease', 'Heart disease', 'Pulmonary Circulation Disorders',
       'Peripheral Vascular Disorders', 'Hypertension Uncomplicated',
       'Paralysis', 'Other Neurological Disorders',
       'Chronic Pulmonary Disease', 'Lung disease', 'Diabetes',
       'Hypothyroidism', 'Renal Failure', 'Liver Disease',
       'Peptic Ulcer Disease excluding bleeding', 'AIDS/HIV', 'Lymphoma',
       'Metastatic Cancer', 'Solid Tumor without Metastasis', 'Cancer history',
       'Rheumatoid Arthritis/collagen', 'Coagulopathy', 'Obesity',
       'Weight Loss', 'Fluid and Electrolyte Disorders', 'Blood Loss Anemia',
       'Deficiency Anemia', 'Anemia', 'Alcohol Abuse', 'Drug Abuse',
       'Psy

In [30]:
case1_after_data_csv.shape

(49141, 52)

In [31]:
case4_data_csv = case1_after_data_csv.drop(["CBC_HG","CBC_HT"], axis=1)

In [32]:
case4_data_csv.shape

(49141, 50)

In [33]:
case4_data_csv.outcome.value_counts()

0    48270
1      871
Name: outcome, dtype: int64

In [34]:
case4_train_split, case4_val_split = train_test_split(
    case4_data_csv, test_size=0.2, random_state=42, stratify=case4_data_csv["outcome"])

In [35]:
case4_train_split.outcome.value_counts()

0    38615
1      697
Name: outcome, dtype: int64

In [36]:
case4_X_train = case4_train_split.copy()

In [37]:
case4_X_val = case4_val_split.copy()

In [38]:
case4_y_train = case4_X_train.pop("outcome")

In [39]:
case4_y_val = case4_X_val.pop("outcome")

In [45]:
case4_X_train_undersampler = RandomUnderSampler(random_state=42, sampling_strategy={0:19307})

In [46]:
case4_under_X_train, case4_under_y_train= case4_X_train_undersampler.fit_resample(case4_X_train, case4_y_train)

In [47]:
case4_under_y_train.value_counts()

0    19307
1      697
Name: outcome, dtype: int64

In [50]:
case4_under_smo_X_trainer = SMOTE(sampling_strategy={1:19307},random_state=42)

In [52]:
case4_under_smo_X_train,  case4_under_smo_y_train = case4_under_smo_X_trainer.fit_resample(case4_under_X_train, case4_under_y_train)

In [53]:
case4_under_smo_y_train.value_counts()

0    19307
1    19307
Name: outcome, dtype: int64

## Exp 10.

In [54]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
X_train = case4_under_smo_X_train
y_train = case4_under_smo_y_train
X_val = case4_X_val
y_val = case4_y_val

p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF10 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.1(E10)(smo)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.2(E10)(smo)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.3(E10)(smo)", scoring="f1"),
]



Best parameter (CV score=0.871):
{'model__max_depth': 9, 'model__max_features': 28}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=28,
                                        random_state=42))])


train
[[16988  2319]
 [ 2502 16805]]


Unnamed: 0,precision,recall,f1-score,support
1,0.871626,0.879888,0.875738,19307.0
0,0.878739,0.87041,0.874554,19307.0
macro avg,0.875183,0.875149,0.875146,38614.0
weighted avg,0.875183,0.875149,0.875146,38614.0


Accuracy: 0.875, AUC: 0.957, f1-score: 0.876 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  35  139]
 [1302 8353]]


Unnamed: 0,precision,recall,f1-score,support
1,0.026178,0.201149,0.046327,174.0
0,0.983632,0.865148,0.920593,9655.0
macro avg,0.504905,0.533149,0.48346,9829.0
weighted avg,0.966682,0.853393,0.905116,9829.0


Accuracy: 0.853, AUC: 0.584, f1-score: 0.046 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.15MB of 0.15MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99198814012…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.58359
Test-Acc,0.85339
Test-f1(pos),0.04633
Test-pre,0.02618
Test-rec,0.20115
Train-AUC,0.95657
Train-Acc,0.87515
Train-f1(pos),0.87574
Train-pre,0.87163
Train-rec,0.87989




Best parameter (CV score=0.948):
{'model__max_depth': 19, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=14,
                                        random_state=42))])


train
[[19130   177]
 [ 1027 18280]]


Unnamed: 0,precision,recall,f1-score,support
1,0.94905,0.990832,0.969491,19307.0
0,0.99041,0.946807,0.968118,19307.0
macro avg,0.96973,0.96882,0.968804,38614.0
weighted avg,0.96973,0.96882,0.968804,38614.0


Accuracy: 0.969, AUC: 0.998, f1-score: 0.969 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  16  158]
 [ 651 9004]]


Unnamed: 0,precision,recall,f1-score,support
1,0.023988,0.091954,0.03805,174.0
0,0.982755,0.932574,0.957007,9655.0
macro avg,0.503371,0.512264,0.497528,9829.0
weighted avg,0.965782,0.917693,0.940739,9829.0


Accuracy: 0.918, AUC: 0.580, f1-score: 0.038 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.14MB of 0.14MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99166067480…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.57987
Test-Acc,0.91769
Test-f1(pos),0.03805
Test-pre,0.02399
Test-rec,0.09195
Train-AUC,0.99765
Train-Acc,0.96882
Train-f1(pos),0.96949
Train-pre,0.94905
Train-rec,0.99083




Best parameter (CV score=0.966):
{'model__max_depth': 24, 'model__max_features': 19}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=19,
                                        random_state=42))])


train
[[19199   108]
 [  221 19086]]


Unnamed: 0,precision,recall,f1-score,support
1,0.98862,0.994406,0.991505,19307.0
0,0.994373,0.988553,0.991455,19307.0
macro avg,0.991497,0.99148,0.99148,38614.0
weighted avg,0.991497,0.99148,0.99148,38614.0


Accuracy: 0.991, AUC: 1.000, f1-score: 0.992 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[   7  167]
 [ 294 9361]]


Unnamed: 0,precision,recall,f1-score,support
1,0.023256,0.04023,0.029474,174.0
0,0.982473,0.969549,0.975968,9655.0
macro avg,0.502864,0.50489,0.502721,9829.0
weighted avg,0.965492,0.953098,0.959213,9829.0


Accuracy: 0.953, AUC: 0.592, f1-score: 0.029 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.11MB of 0.11MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98906040444…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.59157
Test-Acc,0.9531
Test-f1(pos),0.02947
Test-pre,0.02326
Test-rec,0.04023
Train-AUC,0.99962
Train-Acc,0.99148
Train-f1(pos),0.9915
Train-pre,0.98862
Train-rec,0.99441


# Case 5.
- 開始用另外notebook處理資料
- store in data/exp11
- case3(ori), 不做cci,elx,LOS的outlier改值,sample方式用SMOTE(pos = neg*0.45)

In [56]:
exp11_X_train = readFPickle("/data/exp11/exp11_X_train")
exp11_X_train

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Cemented,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,...,Coagulopathy,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
6217,66,0,5,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15771,75,1,4,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8393,60,0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15381,84,1,4,0,1,1,1,0,2,4,...,0,0,0,0,0,0,0,0,0,0
19907,74,0,2,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,66,0,4,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
11284,73,1,4,0,1,1,0,1,2,1,...,0,0,0,0,0,0,0,0,0,0
38158,66,0,7,0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
860,47,1,7,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
exp11_y_train = readFPickle("/data/exp11/exp11_y_train")
exp11_y_train

6217     0
15771    0
8393     0
15381    0
19907    0
        ..
6265     0
11284    0
38158    1
860      0
15795    0
Name: outcome, Length: 44290, dtype: int64

In [58]:
exp11_X_val = readFPickle("/data/exp11/exp11_X_val")
exp11_X_val

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Cemented,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,...,Coagulopathy,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
19319,70.0,0,5,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
17906,82.0,0,5,0,1,1,0,1,0,2,...,0,0,0,0,0,0,0,0,0,0
28885,61.0,0,5,0,1,1,0,1,2,3,...,0,0,0,0,0,0,0,0,0,0
10228,69.0,0,5,0,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
28738,70.0,0,6,0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16487,65.0,0,4,0,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
7672,78.0,1,8,0,0,1,0,1,3,6,...,0,0,0,0,0,0,0,0,0,0
47059,78.0,1,2,1,0,0,0,0,3,3,...,0,0,0,0,0,0,0,0,0,0
22344,76.0,0,3,0,1,0,0,0,2,3,...,0,0,0,0,0,0,0,0,0,0


In [59]:
exp11_y_val = readFPickle("/data/exp11/exp11_y_val")
exp11_y_val

19319    0
17906    0
28885    0
10228    0
28738    0
        ..
16487    0
7672     0
47059    0
22344    0
17350    0
Name: outcome, Length: 7801, dtype: int64

## Exp. 11

In [60]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
X_train = exp11_X_train
y_train = exp11_y_train
X_val = exp11_X_val
y_val = exp11_y_val

p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF11 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.1(E11)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.2(E11)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.3(E11)", scoring="f1"),
]



Best parameter (CV score=0.799):
{'model__max_depth': 9, 'model__max_features': 29}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=29,
                                        random_state=42))])


train
[[ 9572  4173]
 [  466 30079]]


Unnamed: 0,precision,recall,f1-score,support
1,0.953576,0.696399,0.804945,13745.0
0,0.878168,0.984744,0.928407,30545.0
macro avg,0.915872,0.840571,0.866676,44290.0
weighted avg,0.90157,0.895259,0.890092,44290.0


Accuracy: 0.895, AUC: 0.964, f1-score: 0.805 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  75   90]
 [ 134 7502]]


Unnamed: 0,precision,recall,f1-score,support
1,0.358852,0.454545,0.40107,165.0
0,0.988145,0.982452,0.98529,7636.0
macro avg,0.673499,0.718498,0.69318,7801.0
weighted avg,0.974835,0.971286,0.972933,7801.0


Accuracy: 0.971, AUC: 0.786, f1-score: 0.401 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.14MB of 0.14MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99153322289…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.7857
Test-Acc,0.97129
Test-f1(pos),0.40107
Test-pre,0.35885
Test-rec,0.45455
Train-AUC,0.9643
Train-Acc,0.89526
Train-f1(pos),0.80494
Train-pre,0.95358
Train-rec,0.6964




Best parameter (CV score=0.936):
{'model__max_depth': 19, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=14,
                                        random_state=42))])


train
[[13316   429]
 [  320 30225]]


Unnamed: 0,precision,recall,f1-score,support
1,0.976533,0.968789,0.972645,13745.0
0,0.986005,0.989524,0.987761,30545.0
macro avg,0.981269,0.979156,0.980203,44290.0
weighted avg,0.983065,0.983089,0.98307,44290.0


Accuracy: 0.983, AUC: 0.999, f1-score: 0.973 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  63  102]
 [ 172 7464]]


Unnamed: 0,precision,recall,f1-score,support
1,0.268085,0.381818,0.315,165.0
0,0.986519,0.977475,0.981976,7636.0
macro avg,0.627302,0.679647,0.648488,7801.0
weighted avg,0.971323,0.964876,0.967869,7801.0


Accuracy: 0.965, AUC: 0.769, f1-score: 0.315 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.10MB of 0.10MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98799747927…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.76929
Test-Acc,0.96488
Test-f1(pos),0.315
Test-pre,0.26809
Test-rec,0.38182
Train-AUC,0.99859
Train-Acc,0.98309
Train-f1(pos),0.97265
Train-pre,0.97653
Train-rec,0.96879




Best parameter (CV score=0.952):
{'model__max_depth': 24, 'model__max_features': 13}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=13,
                                        random_state=42))])


train
[[13680    65]
 [  191 30354]]


Unnamed: 0,precision,recall,f1-score,support
1,0.98623,0.995271,0.99073,13745.0
0,0.997863,0.993747,0.995801,30545.0
macro avg,0.992047,0.994509,0.993265,44290.0
weighted avg,0.994253,0.99422,0.994227,44290.0


Accuracy: 0.994, AUC: 1.000, f1-score: 0.991 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  60  105]
 [ 178 7458]]


Unnamed: 0,precision,recall,f1-score,support
1,0.252101,0.363636,0.297767,165.0
0,0.986117,0.976689,0.98138,7636.0
macro avg,0.619109,0.670163,0.639574,7801.0
weighted avg,0.970591,0.963723,0.966921,7801.0


Accuracy: 0.964, AUC: 0.766, f1-score: 0.298 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.07MB of 0.07MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98221459718…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.76572
Test-Acc,0.96372
Test-f1(pos),0.29777
Test-pre,0.2521
Test-rec,0.36364
Train-AUC,0.99969
Train-Acc,0.99422
Train-f1(pos),0.99073
Train-pre,0.98623
Train-rec,0.99527


In [68]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
X_train = exp11_X_train
y_train = exp11_y_train
X_val = exp11_X_val
y_val = exp11_y_val

p_grid_rf = {'model__max_depth': [i for i in range(5, 15)],
             'model__max_features': [i for i in range(28, 35)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 25)],
               'model__max_features': [i for i in range(5, 25)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 30)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF11_v2 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.1(E11)v2", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.2(E11)v2", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.3(E11)v2", scoring="f1"),
]



Best parameter (CV score=0.900):
{'model__max_depth': 14, 'model__max_features': 34}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=14, max_features=34,
                                        random_state=42))])


train
[[12484  1261]
 [  598 29947]]


Unnamed: 0,precision,recall,f1-score,support
1,0.954288,0.908258,0.930704,13745.0
0,0.959594,0.980422,0.969896,30545.0
macro avg,0.956941,0.94434,0.9503,44290.0
weighted avg,0.957947,0.958027,0.957733,44290.0


Accuracy: 0.958, AUC: 0.994, f1-score: 0.931 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  70   95]
 [ 242 7394]]


Unnamed: 0,precision,recall,f1-score,support
1,0.224359,0.424242,0.293501,165.0
0,0.987315,0.968308,0.977719,7636.0
macro avg,0.605837,0.696275,0.63561,7801.0
weighted avg,0.971177,0.9568,0.963247,7801.0


Accuracy: 0.957, AUC: 0.778, f1-score: 0.294 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.10MB of 0.10MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98792531402…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.77797
Test-Acc,0.9568
Test-f1(pos),0.2935
Test-pre,0.22436
Test-rec,0.42424
Train-AUC,0.99411
Train-Acc,0.95803
Train-f1(pos),0.9307
Train-pre,0.95429
Train-rec,0.90826




Best parameter (CV score=0.952):
{'model__max_depth': 24, 'model__max_features': 13}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=13,
                                        random_state=42))])


train
[[13680    65]
 [  191 30354]]


Unnamed: 0,precision,recall,f1-score,support
1,0.98623,0.995271,0.99073,13745.0
0,0.997863,0.993747,0.995801,30545.0
macro avg,0.992047,0.994509,0.993265,44290.0
weighted avg,0.994253,0.99422,0.994227,44290.0


Accuracy: 0.994, AUC: 1.000, f1-score: 0.991 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  60  105]
 [ 178 7458]]


Unnamed: 0,precision,recall,f1-score,support
1,0.252101,0.363636,0.297767,165.0
0,0.986117,0.976689,0.98138,7636.0
macro avg,0.619109,0.670163,0.639574,7801.0
weighted avg,0.970591,0.963723,0.966921,7801.0


Accuracy: 0.964, AUC: 0.766, f1-score: 0.298 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.07MB of 0.07MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98221459718…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.76572
Test-Acc,0.96372
Test-f1(pos),0.29777
Test-pre,0.2521
Test-rec,0.36364
Train-AUC,0.99969
Train-Acc,0.99422
Train-f1(pos),0.99073
Train-pre,0.98623
Train-rec,0.99527




Best parameter (CV score=0.953):
{'model__max_depth': 27, 'model__max_features': 12}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=27, max_features=12,
                                        random_state=42))])


train
[[13702    43]
 [  147 30398]]


Unnamed: 0,precision,recall,f1-score,support
1,0.989386,0.996872,0.993114,13745.0
0,0.998587,0.995187,0.996885,30545.0
macro avg,0.993986,0.99603,0.994999,44290.0
weighted avg,0.995732,0.99571,0.995715,44290.0


Accuracy: 0.996, AUC: 1.000, f1-score: 0.993 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  59  106]
 [ 170 7466]]


Unnamed: 0,precision,recall,f1-score,support
1,0.257642,0.357576,0.299492,165.0
0,0.986001,0.977737,0.981852,7636.0
macro avg,0.621821,0.667656,0.640672,7801.0
weighted avg,0.970595,0.96462,0.967419,7801.0


Accuracy: 0.965, AUC: 0.759, f1-score: 0.299 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.75926
Test-Acc,0.96462
Test-f1(pos),0.29949
Test-pre,0.25764
Test-rec,0.35758
Train-AUC,0.99979
Train-Acc,0.99571
Train-f1(pos),0.99311
Train-pre,0.98939
Train-rec,0.99687


In [70]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
X_train = exp11_X_train
y_train = exp11_y_train
X_val = exp11_X_val
y_val = exp11_y_val

p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF11_v3 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.1(E11)v3", scoring="f1",cv=10),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.2(E11)v3", scoring="f1",cv=10),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.3(E11)v3", scoring="f1",cv=10),
]



Best parameter (CV score=0.796):
{'model__max_depth': 9, 'model__max_features': 29}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=29,
                                        random_state=42))])


train
[[ 9572  4173]
 [  466 30079]]


Unnamed: 0,precision,recall,f1-score,support
1,0.953576,0.696399,0.804945,13745.0
0,0.878168,0.984744,0.928407,30545.0
macro avg,0.915872,0.840571,0.866676,44290.0
weighted avg,0.90157,0.895259,0.890092,44290.0


Accuracy: 0.895, AUC: 0.964, f1-score: 0.805 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  75   90]
 [ 134 7502]]


Unnamed: 0,precision,recall,f1-score,support
1,0.358852,0.454545,0.40107,165.0
0,0.988145,0.982452,0.98529,7636.0
macro avg,0.673499,0.718498,0.69318,7801.0
weighted avg,0.974835,0.971286,0.972933,7801.0


Accuracy: 0.971, AUC: 0.786, f1-score: 0.401 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.14MB of 0.14MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99153322289…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.7857
Test-Acc,0.97129
Test-f1(pos),0.40107
Test-pre,0.35885
Test-rec,0.45455
Train-AUC,0.9643
Train-Acc,0.89526
Train-f1(pos),0.80494
Train-pre,0.95358
Train-rec,0.6964




Best parameter (CV score=0.938):
{'model__max_depth': 19, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=14,
                                        random_state=42))])


train
[[13316   429]
 [  320 30225]]


Unnamed: 0,precision,recall,f1-score,support
1,0.976533,0.968789,0.972645,13745.0
0,0.986005,0.989524,0.987761,30545.0
macro avg,0.981269,0.979156,0.980203,44290.0
weighted avg,0.983065,0.983089,0.98307,44290.0


Accuracy: 0.983, AUC: 0.999, f1-score: 0.973 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  63  102]
 [ 172 7464]]


Unnamed: 0,precision,recall,f1-score,support
1,0.268085,0.381818,0.315,165.0
0,0.986519,0.977475,0.981976,7636.0
macro avg,0.627302,0.679647,0.648488,7801.0
weighted avg,0.971323,0.964876,0.967869,7801.0


Accuracy: 0.965, AUC: 0.769, f1-score: 0.315 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.10MB of 0.10MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98799747927…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.76929
Test-Acc,0.96488
Test-f1(pos),0.315
Test-pre,0.26809
Test-rec,0.38182
Train-AUC,0.99859
Train-Acc,0.98309
Train-f1(pos),0.97265
Train-pre,0.97653
Train-rec,0.96879




Best parameter (CV score=0.954):
{'model__max_depth': 24, 'model__max_features': 16}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=16,
                                        random_state=42))])


train
[[13685    60]
 [  173 30372]]


Unnamed: 0,precision,recall,f1-score,support
1,0.987516,0.995635,0.991559,13745.0
0,0.998028,0.994336,0.996179,30545.0
macro avg,0.992772,0.994986,0.993869,44290.0
weighted avg,0.994766,0.994739,0.994745,44290.0


Accuracy: 0.995, AUC: 1.000, f1-score: 0.992 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  63  102]
 [ 189 7447]]


Unnamed: 0,precision,recall,f1-score,support
1,0.25,0.381818,0.302158,165.0
0,0.986488,0.975249,0.980836,7636.0
macro avg,0.618244,0.678534,0.641497,7801.0
weighted avg,0.970911,0.962697,0.966482,7801.0


Accuracy: 0.963, AUC: 0.774, f1-score: 0.302 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.06MB of 0.06MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.97921225382…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.7736
Test-Acc,0.9627
Test-f1(pos),0.30216
Test-pre,0.25
Test-rec,0.38182
Train-AUC,0.99973
Train-Acc,0.99474
Train-f1(pos),0.99156
Train-pre,0.98752
Train-rec,0.99563


## Exp12.
- 延續exp11,做undersampling

In [61]:
exp12_X_train_undersampler = RandomUnderSampler(random_state=42, sampling_strategy="majority")

In [64]:
under_exp12_X_train, under_exp12_y_train= exp12_X_train_undersampler.fit_resample(exp11_X_train, exp11_y_train)

In [66]:
Counter(under_exp12_y_train)

Counter({0: 13745, 1: 13745})

In [67]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
X_train = under_exp12_X_train
y_train = under_exp12_y_train
X_val = exp11_X_val
y_val = exp11_y_val

p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF12 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.1(E12)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.2(E12)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.3(E12)", scoring="f1"),
]



Best parameter (CV score=0.883):
{'model__max_depth': 9, 'model__max_features': 24}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=24,
                                        random_state=42))])


train
[[12279  1466]
 [ 1503 12242]]


Unnamed: 0,precision,recall,f1-score,support
1,0.890945,0.893343,0.892142,13745.0
0,0.893055,0.890651,0.891852,13745.0
macro avg,0.892,0.891997,0.891997,27490.0
weighted avg,0.892,0.891997,0.891997,27490.0


Accuracy: 0.892, AUC: 0.965, f1-score: 0.892 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  84   81]
 [ 926 6710]]


Unnamed: 0,precision,recall,f1-score,support
1,0.083168,0.509091,0.142979,165.0
0,0.988072,0.878732,0.9302,7636.0
macro avg,0.53562,0.693912,0.53659,7801.0
weighted avg,0.968933,0.870914,0.91355,7801.0


Accuracy: 0.871, AUC: 0.780, f1-score: 0.143 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.12MB of 0.12MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99029627254…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.77952
Test-Acc,0.87091
Test-f1(pos),0.14298
Test-pre,0.08317
Test-rec,0.50909
Train-AUC,0.9653
Train-Acc,0.892
Train-f1(pos),0.89214
Train-pre,0.89094
Train-rec,0.89334




Best parameter (CV score=0.954):
{'model__max_depth': 19, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=14,
                                        random_state=42))])


train
[[13670    75]
 [  394 13351]]


Unnamed: 0,precision,recall,f1-score,support
1,0.971985,0.994543,0.983135,13745.0
0,0.994414,0.971335,0.982739,13745.0
macro avg,0.9832,0.982939,0.982937,27490.0
weighted avg,0.9832,0.982939,0.982937,27490.0


Accuracy: 0.983, AUC: 0.999, f1-score: 0.983 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  71   94]
 [ 432 7204]]


Unnamed: 0,precision,recall,f1-score,support
1,0.141153,0.430303,0.212575,165.0
0,0.98712,0.943426,0.964778,7636.0
macro avg,0.564136,0.686864,0.588677,7801.0
weighted avg,0.969227,0.932573,0.948868,7801.0


Accuracy: 0.933, AUC: 0.762, f1-score: 0.213 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98721217062…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.76178
Test-Acc,0.93257
Test-f1(pos),0.21257
Test-pre,0.14115
Test-rec,0.4303
Train-AUC,0.99884
Train-Acc,0.98294
Train-f1(pos),0.98313
Train-pre,0.97199
Train-rec,0.99454




Best parameter (CV score=0.960):
{'model__max_depth': 24, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=14,
                                        random_state=42))])


train
[[13727    18]
 [  158 13587]]


Unnamed: 0,precision,recall,f1-score,support
1,0.988621,0.99869,0.99363,13745.0
0,0.998677,0.988505,0.993565,13745.0
macro avg,0.993649,0.993598,0.993598,27490.0
weighted avg,0.993649,0.993598,0.993598,27490.0


Accuracy: 0.994, AUC: 1.000, f1-score: 0.994 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  70   95]
 [ 364 7272]]


Unnamed: 0,precision,recall,f1-score,support
1,0.16129,0.424242,0.233723,165.0
0,0.987105,0.952331,0.969406,7636.0
macro avg,0.574197,0.688287,0.601564,7801.0
weighted avg,0.969638,0.941161,0.953846,7801.0


Accuracy: 0.941, AUC: 0.766, f1-score: 0.234 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.07MB of 0.07MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.76579
Test-Acc,0.94116
Test-f1(pos),0.23372
Test-pre,0.16129
Test-rec,0.42424
Train-AUC,0.99977
Train-Acc,0.9936
Train-f1(pos),0.99363
Train-pre,0.98862
Train-rec,0.99869


# Case 6.
- 同Exp 11但多加了讓Age不要去掉outlier, op_time_min outcome=1的改成填平均137

In [71]:
exp13_X_train = readFPickle("/data/exp13/exp13_X_train")
exp13_X_train

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Cemented,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,...,Coagulopathy,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
25753,66,0,6,0,1,1,1,0,4,3,...,0,0,0,0,0,0,0,0,1,1
628,61,0,10,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
26083,80,1,2,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
37780,67,0,6,0,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
28632,62,0,4,1,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,70,0,7,0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
44732,72,0,3,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
38158,68,0,6,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
860,31,1,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
exp13_y_train = readFPickle("/data/exp13/exp13_y_train")
exp13_y_train

25753    0
628      0
26083    0
37780    1
28632    0
        ..
11284    0
44732    1
38158    1
860      0
15795    0
Name: outcome, Length: 45627, dtype: int64

In [75]:
Counter(exp13_y_train)

Counter({0: 31467, 1: 14160})

In [73]:
exp13_X_val = readFPickle("/data/exp13/exp13_X_val")
exp13_X_val

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Cemented,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,...,Coagulopathy,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
286,72.0,0,13,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
40555,78.0,1,3,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
28125,69.0,0,8,1,1,0,0,0,2,2,...,0,0,0,0,0,0,0,0,0,0
44616,72.0,1,7,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49606,80.0,0,3,1,0,0,0,0,2,3,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23499,81.0,1,5,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
47449,71.0,0,7,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10480,55.0,0,3,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
35724,73.0,0,7,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
exp13_y_val = readFPickle("/data/exp13/exp13_y_val")
exp13_y_val

286      1
40555    0
28125    0
44616    0
49606    0
        ..
23499    0
47449    0
10480    0
35724    0
5180     0
Name: outcome, Length: 8037, dtype: int64

## Exp 13

In [76]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
X_train = exp13_X_train
y_train = exp13_y_train
X_val = exp13_X_val
y_val = exp13_y_val

p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF13 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.1(E13)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.2(E13)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.3(E13)", scoring="f1"),
]



Best parameter (CV score=0.784):
{'model__max_depth': 9, 'model__max_features': 28}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=28,
                                        random_state=42))])


train
[[ 9642  4518]
 [  477 30990]]


Unnamed: 0,precision,recall,f1-score,support
1,0.952861,0.680932,0.794267,14160.0
0,0.872761,0.984841,0.92542,31467.0
macro avg,0.912811,0.832887,0.859843,45627.0
weighted avg,0.897619,0.890525,0.884717,45627.0


Accuracy: 0.891, AUC: 0.961, f1-score: 0.794 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  79   91]
 [ 127 7740]]


Unnamed: 0,precision,recall,f1-score,support
1,0.383495,0.464706,0.420213,170.0
0,0.98838,0.983857,0.986113,7867.0
macro avg,0.685937,0.724281,0.703163,8037.0
weighted avg,0.975585,0.972875,0.974143,8037.0


Accuracy: 0.973, AUC: 0.784, f1-score: 0.420 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.15MB of 0.15MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99192726753…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.78443
Test-Acc,0.97288
Test-f1(pos),0.42021
Test-pre,0.3835
Test-rec,0.46471
Train-AUC,0.96093
Train-Acc,0.89053
Train-f1(pos),0.79427
Train-pre,0.95286
Train-rec,0.68093




Best parameter (CV score=0.931):
{'model__max_depth': 19, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=14,
                                        random_state=42))])


train
[[13664   496]
 [  389 31078]]


Unnamed: 0,precision,recall,f1-score,support
1,0.972319,0.964972,0.968631,14160.0
0,0.984291,0.987638,0.985962,31467.0
macro avg,0.978305,0.976305,0.977296,45627.0
weighted avg,0.980576,0.980604,0.980583,45627.0


Accuracy: 0.981, AUC: 0.998, f1-score: 0.969 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  77   93]
 [ 171 7696]]


Unnamed: 0,precision,recall,f1-score,support
1,0.310484,0.452941,0.368421,170.0
0,0.98806,0.978264,0.983137,7867.0
macro avg,0.649272,0.715602,0.675779,8037.0
weighted avg,0.973728,0.967152,0.970135,8037.0


Accuracy: 0.967, AUC: 0.797, f1-score: 0.368 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.10MB of 0.10MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98815642671…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.79669
Test-Acc,0.96715
Test-f1(pos),0.36842
Test-pre,0.31048
Test-rec,0.45294
Train-AUC,0.99802
Train-Acc,0.9806
Train-f1(pos),0.96863
Train-pre,0.97232
Train-rec,0.96497




Best parameter (CV score=0.947):
{'model__max_depth': 24, 'model__max_features': 19}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=19,
                                        random_state=42))])


train
[[14102    58]
 [  229 31238]]


Unnamed: 0,precision,recall,f1-score,support
1,0.984021,0.995904,0.989927,14160.0
0,0.998147,0.992723,0.995427,31467.0
macro avg,0.991084,0.994313,0.992677,45627.0
weighted avg,0.993763,0.99371,0.99372,45627.0


Accuracy: 0.994, AUC: 1.000, f1-score: 0.990 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  77   93]
 [ 183 7684]]


Unnamed: 0,precision,recall,f1-score,support
1,0.296154,0.452941,0.35814,170.0
0,0.988042,0.976738,0.982357,7867.0
macro avg,0.642098,0.71484,0.670248,8037.0
weighted avg,0.973407,0.965659,0.969154,8037.0


Accuracy: 0.966, AUC: 0.794, f1-score: 0.358 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.97789810047…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.7944
Test-Acc,0.96566
Test-f1(pos),0.35814
Test-pre,0.29615
Test-rec,0.45294
Train-AUC,0.9997
Train-Acc,0.99371
Train-f1(pos),0.98993
Train-pre,0.98402
Train-rec,0.9959


# Case 7.

In [83]:
exp14_X_train = readFPickle("/data/exp14/exp14_X_train")
exp14_X_train

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Cemented,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,...,Coagulopathy,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
1641,66,0,5,0,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
11114,44,0,6,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
59217,69,0,7,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
19270,70,0,3,0,0,1,0,1,3,4,...,0,0,0,0,0,0,0,0,0,0
6821,59,0,8,0,1,1,0,1,4,4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54343,41,1,39,0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
38158,66,0,7,0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
860,47,1,7,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15795,80,0,5,0,1,1,1,0,2,3,...,0,0,0,0,0,0,0,0,0,0


In [84]:
exp14_y_train = readFPickle("/data/exp14/exp14_y_train")
exp14_y_train

1641     0
11114    0
59217    1
19270    0
6821     0
        ..
54343    1
38158    1
860      0
15795    0
56422    1
Name: outcome, Length: 61090, dtype: int64

In [85]:
Counter(exp13_y_train)

Counter({0: 31467, 1: 14160})

In [86]:
exp14_X_val = readFPickle("/data/exp14/exp14_X_val")
exp14_X_val

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Cemented,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,...,Coagulopathy,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
19319,70.0,0,5,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
17906,82.0,0,5,0,1,1,0,1,0,2,...,0,0,0,0,0,0,0,0,0,0
28885,61.0,0,5,0,1,1,0,1,2,3,...,0,0,0,0,0,0,0,0,0,0
10228,69.0,0,5,0,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
28738,70.0,0,6,0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16487,65.0,0,4,0,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
7672,78.0,1,8,0,0,1,0,1,3,6,...,0,0,0,0,0,0,0,0,0,0
47059,78.0,1,2,1,0,0,0,0,3,3,...,0,0,0,0,0,0,0,0,0,0
22344,76.0,0,3,0,1,0,0,0,2,3,...,0,0,0,0,0,0,0,0,0,0


In [80]:
exp14_y_val = readFPickle("/data/exp14/exp14_y_val")
exp14_y_val

19319    0
17906    0
28885    0
10228    0
28738    0
        ..
16487    0
7672     0
47059    0
22344    0
17350    0
Name: outcome, Length: 7801, dtype: int64

## Exp 14

In [88]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
X_train = exp14_X_train
y_train = exp14_y_train
X_val = exp14_X_val
y_val = exp14_y_val

p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF14 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.1(E14)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.2(E14)", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.3(E14)", scoring="f1"),
]



Best parameter (CV score=0.887):
{'model__max_depth': 9, 'model__max_features': 24}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=24,
                                        random_state=42))])


train
[[27408  3137]
 [ 3487 27058]]


Unnamed: 0,precision,recall,f1-score,support
1,0.887134,0.897299,0.892188,30545.0
0,0.896109,0.885841,0.890945,30545.0
macro avg,0.891621,0.89157,0.891566,61090.0
weighted avg,0.891621,0.89157,0.891566,61090.0


Accuracy: 0.892, AUC: 0.963, f1-score: 0.892 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  87   78]
 [ 925 6711]]


Unnamed: 0,precision,recall,f1-score,support
1,0.085968,0.527273,0.147833,165.0
0,0.988511,0.878863,0.930468,7636.0
macro avg,0.53724,0.703068,0.539151,7801.0
weighted avg,0.969421,0.871427,0.913914,7801.0


Accuracy: 0.871, AUC: 0.780, f1-score: 0.148 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.13MB of 0.13MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99073363103…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.77953
Test-Acc,0.87143
Test-f1(pos),0.14783
Test-pre,0.08597
Test-rec,0.52727
Train-AUC,0.96341
Train-Acc,0.89157
Train-f1(pos),0.89219
Train-pre,0.88713
Train-rec,0.8973




Best parameter (CV score=0.966):
{'model__max_depth': 19, 'model__max_features': 13}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=13,
                                        random_state=42))])


train
[[30329   216]
 [ 1077 29468]]


Unnamed: 0,precision,recall,f1-score,support
1,0.965707,0.992928,0.979129,30545.0
0,0.992723,0.964741,0.978532,30545.0
macro avg,0.979215,0.978835,0.97883,61090.0
weighted avg,0.979215,0.978835,0.97883,61090.0


Accuracy: 0.979, AUC: 0.998, f1-score: 0.979 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  65  100]
 [ 385 7251]]


Unnamed: 0,precision,recall,f1-score,support
1,0.144444,0.393939,0.211382,165.0
0,0.986396,0.949581,0.967639,7636.0
macro avg,0.56542,0.67176,0.58951,7801.0
weighted avg,0.968588,0.937828,0.951643,7801.0


Accuracy: 0.938, AUC: 0.761, f1-score: 0.211 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.09MB of 0.10MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98758224260…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.761
Test-Acc,0.93783
Test-f1(pos),0.21138
Test-pre,0.14444
Test-rec,0.39394
Train-AUC,0.99785
Train-Acc,0.97883
Train-f1(pos),0.97913
Train-pre,0.96571
Train-rec,0.99293




Best parameter (CV score=0.975):
{'model__max_depth': 24, 'model__max_features': 18}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=18,
                                        random_state=42))])


train
[[30507    38]
 [  481 30064]]


Unnamed: 0,precision,recall,f1-score,support
1,0.984478,0.998756,0.991566,30545.0
0,0.998738,0.984253,0.991442,30545.0
macro avg,0.991608,0.991504,0.991504,61090.0
weighted avg,0.991608,0.991504,0.991504,61090.0


Accuracy: 0.992, AUC: 1.000, f1-score: 0.992 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  62  103]
 [ 286 7350]]


Unnamed: 0,precision,recall,f1-score,support
1,0.178161,0.375758,0.241715,165.0
0,0.98618,0.962546,0.97422,7636.0
macro avg,0.58217,0.669152,0.607968,7801.0
weighted avg,0.96909,0.950135,0.958726,7801.0


Accuracy: 0.950, AUC: 0.768, f1-score: 0.242 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.76814
Test-Acc,0.95013
Test-f1(pos),0.24172
Test-pre,0.17816
Test-rec,0.37576
Train-AUC,0.99963
Train-Acc,0.9915
Train-f1(pos),0.99157
Train-pre,0.98448
Train-rec,0.99876


# Wrapper Approach
- use exp13 config.1 as base
- Best parameter (CV score=0.784):{'model__max_depth': 9, 'model__max_features': 28}
- 原本有40個feature

In [119]:
sfs = SequentialFeatureSelector(RandomForestClassifier(random_state=42), 
                                k_features=30, forward=True,
                                scoring='f1',cv=5,n_jobs=12,)

In [120]:
exp13_X_train.columns.shape

(40,)

In [121]:
sfs.fit(exp13_X_train, exp13_y_train, custom_feature_names=exp13_X_train.columns)

SequentialFeatureSelector(estimator=RandomForestClassifier(random_state=42),
                          k_features=30, n_jobs=12, scoring='f1')

In [122]:
feature_selection_metric = pd.DataFrame(sfs.get_metric_dict())
feature_selection_metric

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
feature_idx,"(11,)","(0, 11)","(0, 2, 11)","(0, 2, 9, 11)","(0, 2, 9, 11, 12)","(0, 1, 2, 9, 11, 12)","(0, 1, 2, 7, 9, 11, 12)","(0, 1, 2, 7, 9, 11, 12, 13)","(0, 1, 2, 4, 7, 9, 11, 12, 13)","(0, 1, 2, 3, 4, 7, 9, 11, 12, 13)",...,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16,..."
cv_scores,"[0.5753357753357753, 0.5843023255813953, 0.583...","[0.7448448986910525, 0.7477365524587253, 0.751...","[0.8378237441334957, 0.8278617335417752, 0.837...","[0.8836802518803568, 0.8805710306406687, 0.883...","[0.9122070654074851, 0.9041527948133871, 0.901...","[0.9182456140350878, 0.9170336445305619, 0.916...","[0.9255039439088518, 0.9241597747668485, 0.921...","[0.933639251676668, 0.9272246696035241, 0.9295...","[0.9400986610288937, 0.9311683727497353, 0.939...","[0.9423788546255507, 0.9362150764364787, 0.942...",...,"[0.9480588548129765, 0.9458790237000353, 0.951...","[0.9486951890644416, 0.9454545454545454, 0.950...","[0.9472937000887311, 0.9459936463113309, 0.950...","[0.9503369989357928, 0.9448373408769449, 0.951...","[0.9483002832861189, 0.9470151889791594, 0.952...","[0.9487861066808436, 0.9467915856461021, 0.953...","[0.9493805309734512, 0.946661956905687, 0.9518...","[0.948749778329491, 0.9476661951909476, 0.9500...","[0.9481718139865104, 0.945898161244696, 0.9518...","[0.9484682132105543, 0.9463621879978756, 0.953..."
avg_score,0.577579,0.744262,0.832789,0.881197,0.903899,0.915883,0.923784,0.928551,0.937172,0.94025,...,0.947577,0.947707,0.947162,0.947738,0.948866,0.948942,0.948978,0.948814,0.948812,0.948344
feature_names,"(OP_time_minute,)","(AGE, OP_time_minute)","(AGE, LOS, OP_time_minute)","(AGE, LOS, elx_index, OP_time_minute)","(AGE, LOS, elx_index, OP_time_minute, ASA)","(AGE, SEX, LOS, elx_index, OP_time_minute, ASA)","(AGE, SEX, LOS, Non_commercial_ALBC, elx_index...","(AGE, SEX, LOS, Non_commercial_ALBC, elx_index...","(AGE, SEX, LOS, Drain, Non_commercial_ALBC, el...","(AGE, SEX, LOS, Joint, Drain, Non_commercial_A...",...,"(AGE, SEX, LOS, Joint, Drain, Cemented, Commer...","(AGE, SEX, LOS, Joint, Drain, Cemented, Commer...","(AGE, SEX, LOS, Joint, Drain, Cemented, Commer...","(AGE, SEX, LOS, Joint, Drain, Cemented, Commer...","(AGE, SEX, LOS, Joint, Drain, Cemented, Commer...","(AGE, SEX, LOS, Joint, Drain, Cemented, Commer...","(AGE, SEX, LOS, Joint, Drain, Cemented, Commer...","(AGE, SEX, LOS, Joint, Drain, Cemented, Commer...","(AGE, SEX, LOS, Joint, Drain, Cemented, Commer...","(AGE, SEX, LOS, Joint, Drain, Cemented, Commer..."
ci_bound,0.007254,0.006561,0.005682,0.002524,0.005626,0.002895,0.001588,0.004206,0.004198,0.004326,...,0.003116,0.002968,0.003645,0.003676,0.003005,0.004174,0.003081,0.001807,0.004047,0.003949
std_dev,0.005644,0.005105,0.004421,0.001964,0.004377,0.002252,0.001236,0.003272,0.003266,0.003366,...,0.002425,0.002309,0.002836,0.00286,0.002338,0.003248,0.002397,0.001406,0.003149,0.003072
std_err,0.002822,0.002552,0.00221,0.000982,0.002188,0.001126,0.000618,0.001636,0.001633,0.001683,...,0.001212,0.001155,0.001418,0.00143,0.001169,0.001624,0.001199,0.000703,0.001574,0.001536


In [123]:
feature_selection_metric[30].feature_names

('AGE',
 'SEX',
 'LOS',
 'Joint',
 'Drain',
 'Cemented',
 'Commercial_ALBC',
 'Non_commercial_ALBC',
 'cci_index',
 'elx_index',
 'OP_time_minute',
 'ASA',
 'Diagnosis',
 'Heart disease',
 'Chronic Pulmonary Disease',
 'Diabetes',
 'Hypothyroidism',
 'Renal Failure',
 'Liver Disease',
 'Peptic Ulcer Disease excluding bleeding',
 'Lymphoma',
 'Cancer history',
 'Rheumatoid Arthritis/collagen',
 'Blood Loss Anemia',
 'Deficiency Anemia',
 'Alcohol Abuse',
 'Drug Abuse',
 'Psychoses',
 'Depression',
 'Psyciatric disorder')

In [124]:
exp13_X_train_selec = exp13_X_train.loc[:,feature_selection_metric[30].feature_names]
exp13_X_train_selec 

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Cemented,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,...,Lymphoma,Cancer history,Rheumatoid Arthritis/collagen,Blood Loss Anemia,Deficiency Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
25753,66,0,6,0,1,1,1,0,4,3,...,0,0,0,0,0,0,0,0,1,1
628,61,0,10,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
26083,80,1,2,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
37780,67,0,6,0,1,1,0,1,1,1,...,0,0,1,0,0,0,0,0,0,0
28632,62,0,4,1,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,70,0,7,0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
44732,72,0,3,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
38158,68,0,6,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
860,31,1,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [125]:
exp13_X_val_selec = exp13_X_val.loc[:,feature_selection_metric[30].feature_names]
exp13_X_val_selec

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Cemented,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,...,Lymphoma,Cancer history,Rheumatoid Arthritis/collagen,Blood Loss Anemia,Deficiency Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
286,72.0,0,13,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
40555,78.0,1,3,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
28125,69.0,0,8,1,1,0,0,0,2,2,...,0,0,0,0,0,0,0,0,0,0
44616,72.0,1,7,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49606,80.0,0,3,1,0,0,0,0,2,3,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23499,81.0,1,5,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
47449,71.0,0,7,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10480,55.0,0,3,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
35724,73.0,0,7,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Exp 15

In [126]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test")
X_train = exp13_X_train_selec
y_train = exp13_y_train
X_val = exp13_X_val_selec
y_val = exp13_y_val

p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

base_lines_RF15 = [
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.1(E15)v2", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.2(E15)v2", scoring="f1"),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3, X_train, y_train, 
               X_val, y_val, run_name="RF Conf.3(E15)v2", scoring="f1"),
]



Best parameter (CV score=0.790):
{'model__max_depth': 9, 'model__max_features': 25}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=9, max_features=25,
                                        random_state=42))])


train
[[ 9809  4351]
 [  483 30984]]


Unnamed: 0,precision,recall,f1-score,support
1,0.95307,0.692726,0.802307,14160.0
0,0.876864,0.984651,0.927637,31467.0
macro avg,0.914967,0.838688,0.864972,45627.0
weighted avg,0.900514,0.894054,0.888742,45627.0


Accuracy: 0.894, AUC: 0.962, f1-score: 0.802 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  78   92]
 [ 131 7736]]


Unnamed: 0,precision,recall,f1-score,support
1,0.373206,0.458824,0.411609,170.0
0,0.988247,0.983348,0.985792,7867.0
macro avg,0.680727,0.721086,0.698701,8037.0
weighted avg,0.975238,0.972253,0.973646,8037.0


Accuracy: 0.972, AUC: 0.791, f1-score: 0.412 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.15MB of 0.15MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.99220381718…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.79133
Test-Acc,0.97225
Test-f1(pos),0.41161
Test-pre,0.37321
Test-rec,0.45882
Train-AUC,0.96164
Train-Acc,0.89405
Train-f1(pos),0.80231
Train-pre,0.95307
Train-rec,0.69273




Best parameter (CV score=0.938):
{'model__max_depth': 19, 'model__max_features': 14}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=19, max_features=14,
                                        random_state=42))])


train
[[13857   303]
 [  349 31118]]


Unnamed: 0,precision,recall,f1-score,support
1,0.975433,0.978602,0.977015,14160.0
0,0.990357,0.988909,0.989632,31467.0
macro avg,0.982895,0.983755,0.983324,45627.0
weighted avg,0.985725,0.98571,0.985717,45627.0


Accuracy: 0.986, AUC: 0.999, f1-score: 0.977 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  74   96]
 [ 190 7677]]


Unnamed: 0,precision,recall,f1-score,support
1,0.280303,0.435294,0.341014,170.0
0,0.98765,0.975848,0.981714,7867.0
macro avg,0.633976,0.705571,0.661364,8037.0
weighted avg,0.972688,0.964415,0.968161,8037.0


Accuracy: 0.964, AUC: 0.796, f1-score: 0.341 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.79592
Test-Acc,0.96441
Test-f1(pos),0.34101
Test-pre,0.2803
Test-rec,0.43529
Train-AUC,0.99877
Train-Acc,0.98571
Train-f1(pos),0.97701
Train-pre,0.97543
Train-rec,0.9786




Best parameter (CV score=0.948):
{'model__max_depth': 24, 'model__max_features': 13}

Best estimator
Pipeline(steps=[('model',
                 RandomForestClassifier(max_depth=24, max_features=13,
                                        random_state=42))])


train
[[14102    58]
 [  210 31257]]


Unnamed: 0,precision,recall,f1-score,support
1,0.985327,0.995904,0.990587,14160.0
0,0.998148,0.993326,0.995731,31467.0
macro avg,0.991737,0.994615,0.993159,45627.0
weighted avg,0.994169,0.994126,0.994135,45627.0


Accuracy: 0.994, AUC: 1.000, f1-score: 0.991 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[  72   98]
 [ 175 7692]]


Unnamed: 0,precision,recall,f1-score,support
1,0.291498,0.423529,0.345324,170.0
0,0.98742,0.977755,0.982564,7867.0
macro avg,0.639459,0.700642,0.663944,8037.0
weighted avg,0.9727,0.966032,0.969085,8037.0


Accuracy: 0.966, AUC: 0.796, f1-score: 0.345 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.06MB of 0.06MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.97999355150…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Test-pre,▁
Test-rec,▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁
Train-pre,▁
Train-rec,▁

0,1
Test-AUC,0.79619
Test-Acc,0.96603
Test-f1(pos),0.34532
Test-pre,0.2915
Test-rec,0.42353
Train-AUC,0.99969
Train-Acc,0.99413
Train-f1(pos),0.99059
Train-pre,0.98533
Train-rec,0.9959
