# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.naive_bayes import BernoulliNB, GaussianNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.neural_network import MLPClassifier, MLPRegressor
# from sklearn.svm import SVC
# from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

In [3]:
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline

In [4]:
from collections import Counter

In [15]:
import wandb

# Functions (Utility)

In [5]:
# check Duplicate file
def checkDuplicateFile(file_path):
    """Ask the user before an existing file gets overwritten.

    Parameters
    ----------
    file_path : path of the file that is about to be written.

    Returns
    -------
    bool
        True when the caller should abort the write (the file exists and the
        user answered anything other than exactly "Y"); False when writing may
        proceed (no file at that path, or the user confirmed the overwrite).
    """
    import os

    # No file at that path: nothing to confirm.
    if not os.path.isfile(file_path):
        return False

    print("Caution: File existed!")
    overwrite_confirmed = input("Do you want to cover it?(Y/others)") == "Y"
    if overwrite_confirmed:
        return False

    print("Canceled....")
    return True

In [19]:
def _evaluate_split(model, X, y, header):
    """Print and return the evaluation of a fitted model on one data split.

    Prints `header`, the confusion matrix (label order [1, 0]), the
    classification report and a one-line summary, then returns
    (y_pred, y_pred_prob, accuracy, f1 of class 1, AUC).
    """
    print(header)
    y_pred = model.predict(X)
    report = classification_report(
        y, y_pred, labels=[1, 0], output_dict=True)
    # 'accuracy' is a scalar entry; remove it so the rest forms a clean table.
    acc = report.pop('accuracy')
    target_f1_score = report['1']['f1-score']
    y_pred_prob = model.predict_proba(X)
    # Column 1 of predict_proba is used as the score of class 1.
    auc = roc_auc_score(y, y_pred_prob[:, 1])

    print(confusion_matrix(y, y_pred, labels=[1, 0]))
    display(pd.DataFrame(report).T)
    print(
        f'Accuracy: {acc:.3f}, AUC: {auc:.3f}, f1-score: {target_f1_score:.3f} \n\n')
    return y_pred, y_pred_prob, acc, target_f1_score, auc


def gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test"):
    """Grid-search `clf`, report train/test metrics and log the run to wandb.

    Parameters
    ----------
    clf : classifier placed in the pipeline under the step name 'model'.
    p_grid : search space; keys must use the 'model__' prefix.
    X_train, y_train : training data and target.
    X_test, y_test : testing data and target.
    run_name : name given to the wandb run.

    Returns
    -------
    tuple
        (fitted GridSearchCV, display name of the best model, test accuracy,
        test f1-score of class 1, test AUC). The display name is '' for
        classifiers other than RandomForest, KNeighbors and LogisticRegression.
    """
    #########
    ##wandb##
    #########
    wandb.init(project="DataMining_Project2", entity="oscarchencs10")
    wandb.run.name = run_name
    wandb.run.save()

    pipe = Pipeline(steps=[
        ('model', clf)]
    )

    # The configuration with the highest F1-score is selected (and refit).
    grid_search = GridSearchCV(
        estimator=pipe, param_grid=p_grid, cv=5, n_jobs=12, scoring='f1', refit=True)
    grid_search.fit(X_train, y_train)

    print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
    print(grid_search.best_params_)
    print('\nBest estimator')
    print(grid_search.best_estimator_)

    # The pipeline has a single step, so the fitted classifier is looked up by
    # its step name rather than by a positional index.
    best_model = grid_search.best_estimator_.named_steps['model']
    model_class_name = best_model.__class__.__name__
    best_params = grid_search.best_params_

    #############
    ### train ###
    #############
    _evaluate_split(grid_search, X_train, y_train, '\n\ntrain')

    ############
    ### test ###
    ############
    y_pred, y_pred_prob, acc, target_f1_score, auc = _evaluate_split(
        grid_search, X_test, y_test, 'test')

    #<wandb>#
    # Logged after the test evaluation so that y_pred / y_pred_prob belong to
    # the same split as y_test.
    # NOTE(review): wandb maps `labels` by class index; confirm that class 0 is
    # really "infected" and class 1 "non-infected" for this dataset.
    wandb.sklearn.plot_classifier(
        grid_search, X_train, X_test, y_train, y_test, y_pred, y_pred_prob,
        labels=["infected", "non-infected"], model_name=model_class_name,
        feature_names=None)

    if model_class_name == 'RandomForestClassifier':
        name = 'RandomForest (dep=' + str(best_params['model__max_depth']) + \
            ' feature=' + str(best_params['model__max_features']) + ')'
    elif model_class_name == 'KNeighborsClassifier':
        name = 'KNeighbors (k=' + str(best_params['model__n_neighbors']) + ')'
    elif model_class_name == 'LogisticRegression':
        name = 'LogisticRegression (penalty=' + best_params['model__penalty']
        # The solver is only part of the name when it was part of the search.
        if 'model__solver' in best_params:
            name = name + ' solver=' + best_params['model__solver']
        name = name + ')'
    else:
        name = ''

    return grid_search, name, acc, target_f1_score, auc

# Functions (Use)

In [7]:
# Test for checking ICU_id missing in Lab_1103_csv
def getMissingIDinLab(Lab_file, show=True):
    """List the ICU ids that are absent from `Lab_file.ICU_id`.

    Ids are expected to be the consecutive integers 1, 2, ..., max(ICU_id);
    every integer in that range that does not occur is reported, including
    runs of several consecutive missing ids.

    Parameters
    ----------
    Lab_file : object exposing an `ICU_id` column with a `.unique()` method
        (e.g. a pandas DataFrame).
    show : when True, print each missing id plus a summary.

    Returns
    -------
    list of int
        Missing ids in ascending order (empty when there are no ids at all).
    """
    present = set(Lab_file.ICU_id.unique())
    # No ids at all: nothing can be missing.
    if not present:
        error_list = []
    else:
        highest = int(max(present))
        error_list = [
            candidate for candidate in range(1, highest + 1)
            if candidate not in present
        ]

    if show:
        for missing_id in error_list:
            print(f"error! : {missing_id}")
        print(f"Missing ID Result: {error_list}")
        print(f"Missing Length:{len(error_list)}")
    return error_list

In [8]:
# store Dataframe to CSV
def store2CSV(data, target_name, target_loc_prefix='./'):
    """Write `data` to `<target_loc_prefix><target_name>.csv`.

    If a file already exists at that path the user is asked (through
    checkDuplicateFile) whether to overwrite it; when they decline, nothing
    is written. Returns None in both cases.
    """
    csv_path = "".join([target_loc_prefix, target_name, ".csv"])
    should_abort = checkDuplicateFile(csv_path)
    if not should_abort:
        data.to_csv(csv_path)
        print("store2CSV Successful!")
    else:
        print("store2CSV failed")

In [9]:
# store Datastruc. to pickle
def store2Pickle(data, target_name, target_loc_prefix='./'):
    """Pickle `data` into `<target_loc_prefix><target_name>.pickle`.

    If a file already exists at that path the user is asked (through
    checkDuplicateFile) whether to overwrite it; when they decline, nothing
    is written. Returns None in both cases.
    """
    import pickle

    pickle_path = "".join([target_loc_prefix, target_name, '.pickle'])
    should_abort = checkDuplicateFile(pickle_path)
    if not should_abort:
        with open(pickle_path, 'wb') as handle:
            pickle.dump(data, handle)
        print("store2Pickle Successful!")
    else:
        print("store2Pickle failed")

In [10]:
def readFPickle(target_name, target_loc_prefix='./'):
    """Load and return the object pickled at `<target_loc_prefix><target_name>.pickle`.

    NOTE(review): unpickling can execute arbitrary code; only use this on
    files produced by a trusted source.
    """
    import pickle

    pickle_path = "".join([target_loc_prefix, target_name, '.pickle'])
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)

# Functions (Data preprocessing)

In [11]:
# Function: missing-value imputation
# Fills the missing values of the given df_data directly (in place).
# Missing positions in df_data must hold np.nan.
def handleMissing(df_data, df_feature, outFeature=["outcome"], cate_astype = "int"):
    """Impute the missing values of `df_data` in place, column by column.

    Continuous columns are filled with the column mean, all other columns with
    the column mode and then cast to `cate_astype`.

    Parameters
    ----------
    df_data : DataFrame to impute; it is modified in place.
    df_feature : DataFrame describing the columns, with a "features name"
        column and a "kind" column (1 = continuous, otherwise categorical).
    outFeature : column names that must be left untouched.
    cate_astype : dtype the filled categorical columns are cast to.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If a column that needs filling has no row in `df_feature`.
    """
    for featureName in df_data.columns:
        if featureName in outFeature:
            continue

        column = df_data[featureName]
        if column.isna().sum() == 0:
            print(f"{featureName}: Not need to fill.")
            continue

        # Look up whether the feature is continuous (1) or discrete (0).
        kindValue = df_feature.loc[df_feature["features name"]
                                   == featureName, "kind"].values[0]
        if kindValue == 1:
            # continuous: mean filling
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on df_data[featureName]; the chained
            # in-place form is deprecated and does not update df_data when
            # pandas Copy-on-Write is active.
            df_data[featureName] = column.fillna(value=column.mean())
            print(f"{featureName}: Fill, Continuous.")
        else:
            # categorical: mode filling, then cast to the requested dtype
            targetMode = column.mode()[0]
            df_data[featureName] = column.fillna(
                value=targetMode).astype(cate_astype)
            print(f"{featureName}: Fill, Categorical. (astype to {cate_astype})")

    print("---handleMissing Finish---")

In [12]:
# plot hist
# filtered_data need to check not have nan
def plotHist(df_data, target, outcome="outcome", bins=20):
    """Overlay histograms of `target` for outcome == 0 and outcome == 1.

    Rows where either `target` or `outcome` is missing are dropped first.

    Parameters
    ----------
    df_data : DataFrame holding both columns.
    target : name of the column to plot.
    outcome : name of the 0/1 column used to split the data.
    bins : number of histogram bins.
    """
    # Use the `outcome` argument for the column selection too, so a
    # non-default outcome column works.
    filtered_data = df_data[[outcome, target]].dropna()
    # Sanity output: every count should be 0 after dropna().
    print(filtered_data.isna().sum())
    for outcome_value in (0, 1):
        plt.hist(filtered_data.loc[filtered_data[outcome] == outcome_value, target],
                 bins=bins, alpha=0.5, label=str(outcome_value))
    plt.xlabel(target)
    plt.ylabel('count')
    plt.legend(title=outcome)

In [13]:
# plot countplot
# filtered_data need to check not have nan
def plotCountplot(df_data, target, outcome="outcome"):
    """Draw a count plot of `target`, with bars split by `outcome`.

    Rows where either `target` or `outcome` is missing are dropped first.

    Parameters
    ----------
    df_data : DataFrame holding both columns.
    target : name of the column whose values are counted.
    outcome : name of the column used as hue.
    """
    # NOTE(review): `sns` is not imported in the cells visible here; an
    # `import seaborn as sns` is needed before this function is called.
    # Use the `outcome` argument for the column selection too, so a
    # non-default outcome column works.
    filtered_data = df_data[[outcome, target]].dropna()
    # Sanity output: every count should be 0 after dropna().
    print(filtered_data.isna().sum())
    sns.countplot(x=target, hue=outcome, data=filtered_data)

In [14]:
# plot boxplot
# filtered_data need to check not have nan
def plotBoxplot(df_data, target, outcome="outcome"):
    """Draw a box plot of `target` over the rows with no missing values.

    Rows where either `target` or `outcome` is missing are dropped first; the
    plot itself shows a single box for `target` and is not split by outcome.

    Parameters
    ----------
    df_data : DataFrame holding both columns.
    target : name of the column to plot.
    outcome : name of the outcome column (only used for the row filter).
    """
    # NOTE(review): `sns` is not imported in the cells visible here; an
    # `import seaborn as sns` is needed before this function is called.
    # Use the `outcome` argument for the column selection too, so a
    # non-default outcome column works.
    filtered_data = df_data[[outcome, target]].dropna()
    # Sanity output: every count should be 0 after dropna().
    print(filtered_data.isna().sum())
    sns.boxplot(x=filtered_data[target], data=filtered_data)

# Load Data (From Project1)

In [None]:
# Search spaces for the baseline models. Keys carry the 'model__' prefix
# because gridsearch() wraps the classifier in a pipeline step named 'model'.

# Random forest: three alternative depth / max_features ranges.
p_grid_rf = {'model__max_depth': [i for i in range(5, 10)],
             'model__max_features': [i for i in range(20, 30)]}

p_grid_rf_2 = {'model__max_depth': [i for i in range(1, 20)],
               'model__max_features': [i for i in range(5, 15)]}

p_grid_rf_3 = {'model__max_depth': [i for i in range(5, 25)],
               'model__max_features': [i for i in range(10, 20)]}

# k-nearest neighbours: k from 1 to 14.
p_grid_knn = {'model__n_neighbors': [i for i in range(1, 15)]}

# Logistic regression: penalty only, then penalty combined with solvers.
p_grid_lr_1 = {'model__penalty':['l1', 'l2']}

p_grid_lr_2 = {'model__penalty':['l1'], 'model__solver':['liblinear', 'saga']}

p_grid_lr_3 = {'model__penalty':['l2'], 'model__solver':['newton-cg', 'lbfgs', 'sag', 'saga']}

# NOTE(review): this cell has no execution count and cannot run as written:
#   * gridsearch() as defined in this notebook also requires X_train, y_train,
#     X_test and y_test, which are not passed here (TypeError);
#   * the imports of KNeighborsClassifier, RandomForestClassifier and
#     LogisticRegression are commented out in the import cell (NameError).
# Presumably carried over from Project1, where gridsearch had a different
# signature - confirm and pass the data splits explicitly.
# Each gridsearch() call returns (grid_search, name, acc, f1, auc).
base_lines = [
    gridsearch(KNeighborsClassifier(), p_grid_knn),
    
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_2),
    gridsearch(RandomForestClassifier(random_state=42), p_grid_rf_3),
    
    gridsearch(LogisticRegression(random_state=42), p_grid_lr_1),
    gridsearch(LogisticRegression(random_state=42), p_grid_lr_2),
    gridsearch(LogisticRegression(random_state=42), p_grid_lr_3),

]

In [16]:
# Manual wandb smoke test: start a run in the project, name it "test1" and
# save it. gridsearch() performs the same three steps itself with its
# `run_name` argument.
# NOTE(review): the project and entity are hard-coded here and in
# gridsearch(); consider moving them to a shared configuration constant.
wandb.init(project="DataMining_Project2", entity="oscarchencs10")
wandb.run.name = "test1"
wandb.run.save()

[34m[1mwandb[0m: Currently logged in as: [33moscarchencs10[0m (use `wandb login --relogin` to force relogin)




True