# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
# from sklearn.neural_network import MLPClassifier, MLPRegressor
# from sklearn.svm import SVC
# from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

In [3]:
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline

In [4]:
from collections import Counter

In [5]:
import wandb

In [6]:
from sklearn import preprocessing

In [7]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import onnxruntime as rt

ModuleNotFoundError: No module named 'skl2onnx'

In [None]:
# 用來保存print的階段(長時間)
# %%capture stored_output

# Function(Utility)

In [8]:
# check Duplicate file
def checkDuplicateFile(file_path):
    import os
    if os.path.isfile(file_path):
        print("Caution: File existed!")
        ans = input("Do you want to cover it?(Y/others)")
        if ans == "Y":
            return False
        else:
            print("Canceled....")
            return True
    else:
        return False

In [31]:
def gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test", scoring="f1"):
    # clf: model (classifier)
    # p_grid: search space
    # X_train: training data
    # y_train: training data(target)
    # X_test: testing data
    # y_test: testing data(target)
    
    #########
    ##wandb##
    #########
    wandb.init(project="DataMining_Project2", entity="oscarchencs10")
    wandb.run.name = run_name
    wandb.run.save()
    
    pipe = Pipeline(steps=[
        ('model', clf)]
    )

    #採用F1-Score最高為標準
    grid_search = GridSearchCV(
        estimator=pipe, param_grid=p_grid, cv=5, n_jobs=12, scoring=scoring, refit=True)
    grid_search.fit(X_train, y_train)

    print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
    print(grid_search.best_params_)
    print('\nBest estimator')
    print(grid_search.best_estimator_)
    # print(grid_search.cv_results_)

    #############
    ### train ###
    #############
    print('\n\ntrain')
    y_pred = grid_search.predict(X_train)
    report = classification_report(y_train, y_pred, labels=[
                                   1, 0], output_dict=True)  # , target_names=['0', '1']
    acc = report.pop('accuracy')
    target_f1_score = report['1']['f1-score']
    y_pred_prob = grid_search.predict_proba(X_train)
    auc = roc_auc_score(y_train, y_pred_prob[:, 1])

    print(confusion_matrix(y_train, y_pred, labels=[1, 0]))
    display(pd.DataFrame(report).T)
    print(
        f'Accuracy: {acc:.3f}, AUC: {auc:.3f}, f1-score: {target_f1_score:.3f} \n\n')
    
    print(">>>> Wandb(Train)....")
    
    wandb.log({"Train-Acc": acc, "Train-AUC":auc, "Train-f1(pos)":target_f1_score})
    
    #<wandb> train#
#     wandb.sklearn.plot_classifier(grid_search, X_train, X_test, y_train, y_test, y_pred, 
#                                   y_pred_prob, labels=["infected","non-infected"], 
#                                   model_name='RandomForest', 
#                                   feature_names=None) ###train
#     wandb.sklearn.plot_learning_curve(grid_search, X_train, y_train)

    print(">>>> Wandb(Train)(End)....")
    ############
    ### test ###
    ############
    print('test')
    y_pred = grid_search.predict(X_test)
    report = classification_report(y_test, y_pred, labels=[
                                   1, 0], output_dict=True)  # , target_names=['0', '1']
    acc = report.pop('accuracy')
    target_f1_score = report['1']['f1-score']
    y_pred_prob = grid_search.predict_proba(X_test)
    auc = roc_auc_score(y_test, y_pred_prob[:, 1])

    print(confusion_matrix(y_test, y_pred, labels=[1, 0]))
    display(pd.DataFrame(report).T)
    print(
        f'Accuracy: {acc:.3f}, AUC: {auc:.3f}, f1-score: {target_f1_score:.3f} \n\n')
    
    print(">>>> Wandb(Test)....")
    
#     #<wandb> test#
#     wandb.sklearn.plot_classifier(grid_search, X_train, X_test, y_train, y_test, y_pred, 
#                                   y_pred_prob, labels=["infected","non-infected"], 
#                                   model_name='RandomForest', feature_names=None) ###test
    wandb.sklearn.plot_confusion_matrix(y_test, y_pred, labels=["Non-infected","infected"])
    wandb.sklearn.plot_roc(y_test, y_pred_prob, labels=["Non-infected","infected"])
    wandb.sklearn.plot_precision_recall(y_test, y_pred_prob, labels=["Non-infected","infected"])
    wandb.log({"Test-Acc":acc, "Test-AUC":auc, "Test-f1(pos)":target_f1_score})
    print(">>>> Wandb(Test)(End)....")


    if grid_search.best_estimator_.steps[0][1].__class__.__name__ == 'RandomForestClassifier':
        name = 'RandomForest (dep=' + str(grid_search.best_params_['model__max_depth']) + \
            ' feature=' + \
            str(grid_search.best_params_['model__max_features']) + ')'
    elif grid_search.best_estimator_.steps[0][1].__class__.__name__ == 'KNeighborsClassifier':
        name = 'KNeighbors (k=' + \
            str(grid_search.best_params_['model__n_neighbors']) + ')'
    elif grid_search.best_estimator_.steps[0][1].__class__.__name__ == 'LogisticRegression':
        name = 'LogisticRegression (penalty=' + \
            grid_search.best_params_['model__penalty']
        if len(grid_search.best_params_) == 2:
            name = name + ' solver=' + \
                grid_search.best_params_['model__solver']
        name = name + ')'
    else:
        name = ''
    
    wandb.finish()
    return grid_search, name, acc, target_f1_score, auc

# Function(Use)

In [10]:
# Test for checking ICU_id missing in Lab_1103_csv
def getMissingIDinLab(Lab_file, show=True):
    test = sorted(Lab_file.ICU_id.unique())
    s = 0
    error_list = list()
    for i in test:
        #         print(i)
        s += 1
        if s != i:
            if show:
                print(f"error! : {s}")
            error_list.append(s)
            s += 1
    if show:
        print(f"Missing ID Result: {error_list}")
    if show:
        print(f"Missing Length:{len(error_list)}")
    return error_list

In [11]:
# store Dataframe to CSV
def store2CSV(data, target_name, target_loc_prefix='./'):
    file_path = target_loc_prefix+target_name+".csv"
    if checkDuplicateFile(file_path):
        print("store2CSV failed")
        return
    data.to_csv(file_path)
    print("store2CSV Successful!")

In [12]:
# store Datastruc. to pickle
def store2Pickle(data, target_name, target_loc_prefix='./'):
    import pickle
    file_path = target_loc_prefix+target_name+'.pickle'
    if checkDuplicateFile(file_path):
        print("store2Pickle failed")
        return
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)
    print("store2Pickle Successful!")

In [13]:
def readFPickle(target_name, target_loc_prefix='./'):
    import pickle
    file_name = target_loc_prefix+target_name+'.pickle'
    with open(file_name, 'rb') as f:
        temp = pickle.load(f)
    return temp

# Function(Data preprocessing)

In [14]:
#Function: 補值
# 將針對輸入的df_data直接進行inplace插補
# 須確保df_data的缺失值位置有放np.nan
def handleMissing(df_data, df_feature, outFeature=["outcome"], cate_astype = "int"):
    for featureName in df_data.columns:
        if featureName not in outFeature:
            if df_data[featureName].isna().sum() == 0:
                print(f"{featureName}: Not need to fill.")
                continue
            else:
                # 先去看是連續與否 (1代表連續,0代表離散)
                kindValue = df_feature.loc[df_feature["features name"]
                                           == featureName, "kind"].values[0]
                if kindValue == 1:
                    # continuous
                    # mean filling
                    targetMean = df_data[featureName].mean()
                    df_data[featureName].fillna(value=targetMean, inplace=True)
                    print(f"{featureName}: Fill, Continuous.")

                else:
                    # categorical
                    # mode filling
                    targetMode = df_data[featureName].mode()[0]
                    df_data[featureName].fillna(value=targetMode, inplace=True)
                    df_data[featureName] = df_data[featureName].astype(cate_astype)
                    print(f"{featureName}: Fill, Categorical. (astype to {cate_astype})")
            
    print("---handleMissing Finish---")

In [15]:
# plot hist
# filtered_data need to check not have nan
def plotHist(df_data, target, outcome="outcome", bins=20):
    filtered_data = pd.concat([df_data["outcome"], df_data[target]], axis=1)
    filtered_data = filtered_data.dropna()
    print(filtered_data.isna().sum())
    plt.hist(filtered_data.loc[filtered_data[outcome]==0, target], bins=bins, alpha=0.5, label='0')
    plt.hist(filtered_data.loc[filtered_data[outcome]==1, target], bins=bins, alpha=0.5, label='1')
    plt.xlabel(target)
    plt.ylabel('count')
    plt.legend(title=outcome)

In [16]:
# plot countplot
# filtered_data need to check not have nan
def plotCountplot(df_data, target, outcome="outcome"):
    filtered_data = pd.concat([df_data["outcome"], df_data[target]], axis=1)
    filtered_data = filtered_data.dropna()
    print(filtered_data.isna().sum())
    sns.countplot(x=target, hue=outcome, data=filtered_data)

In [17]:
# plot boxplot
# filtered_data need to check not have nan
def plotBoxplot(df_data, target, outcome="outcome"):
    filtered_data = pd.concat([df_data["outcome"], df_data[target]], axis=1)
    filtered_data = filtered_data.dropna()
    print(filtered_data.isna().sum())
    sns.boxplot(x=filtered_data[target], data=filtered_data)

# Function(Model)

In [None]:
def serializeModel(model, modelName, featureNum):
    initial_type = [('float_input', FloatTensorType([None, featureNum]))]
    onx = convert_sklearn(model, initial_types=initial_type)
    with open(modelName + ".onnx", "wb") as f:
        f.write(onx.SerializeToString())

In [None]:
def modelPredict(modelName, testData):
    sess = rt.InferenceSession(modelName + '.onnx')#load the onnx
    input_name = sess.get_inputs()[0].name
    label_name = sess.get_outputs()[0].name
    pred_onx = sess.run([label_name], {input_name: testData.astype(np.float32)})[0]#predict testData
    print(pred_onx)

# Load Data(From Project1)

In [18]:
raw_p1_training_csv = pd.read_csv("./data/p1_training.csv")

In [19]:
raw_p1_validation_csv = pd.read_csv("./data/p1_validation.csv")

In [20]:
raw_p1_training_csv

Unnamed: 0.1,Unnamed: 0,outcome,AGE,SEX,LOS,Joint,Drain,Commercial_ALBC,Non_commercial_ALBC,cci_index,...,Weight Loss,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
0,12660,1,60,1,5,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15988,0,75,0,7,0,1,0,1,2,...,0,0,0,0,0,0,0,0,0,0
2,31224,1,50,0,5,0,1,0,0,3,...,0,0,0,0,0,0,0,0,0,0
3,10398,0,39,1,5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,41231,1,55,1,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43412,6265,0,80,1,4,0,1,0,1,3,...,0,0,0,0,0,0,0,0,0,0
43413,11284,0,73,0,4,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
43414,38158,1,78,1,4,0,1,0,1,2,...,0,0,0,0,0,0,0,0,0,0
43415,860,0,64,0,5,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
raw_p1_validation_csv

Unnamed: 0.1,Unnamed: 0,outcome,AGE,SEX,LOS,Joint,Drain,Commercial_ALBC,Non_commercial_ALBC,cci_index,...,Weight Loss,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
0,24690,0,77.0,0,7,1,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
1,18416,0,63.0,0,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,23808,0,55.0,1,4,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15647,0,63.0,1,8,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,24418,0,69.0,0,5,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7631,33892,0,81.0,0,7,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7632,22026,0,69.0,0,4,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7633,50989,0,82.0,0,4,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7634,13424,0,70.0,0,4,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
raw_p1_training_csv.drop("Unnamed: 0", axis=1, inplace=True)

In [23]:
raw_p1_validation_csv.drop("Unnamed: 0", axis=1, inplace=True)

In [24]:
raw_p1_X_train = raw_p1_training_csv.copy()
raw_p1_y_train = raw_p1_X_train.pop("outcome")

In [25]:
raw_p1_X_val = raw_p1_validation_csv.copy()
raw_p1_y_val = raw_p1_X_val.pop("outcome")

# Test case (Baseline)

In [24]:
raw_p1_X_train

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,Blood_trans,...,Weight Loss,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
0,60,1,5,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,75,0,7,0,1,0,1,2,3,1,...,0,0,0,0,0,0,0,0,0,0
2,50,0,5,0,1,0,0,3,2,0,...,0,0,0,0,0,0,0,0,0,0
3,39,1,5,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,55,1,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43412,80,1,4,0,1,0,1,3,4,0,...,0,0,0,0,0,0,0,0,0,0
43413,73,0,4,0,1,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
43414,78,1,4,0,1,0,1,2,1,0,...,0,0,0,0,0,0,0,0,0,0
43415,64,0,5,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
raw_p1_y_train

0        1
1        0
2        1
3        0
4        1
        ..
43412    0
43413    0
43414    1
43415    0
43416    0
Name: outcome, Length: 43417, dtype: int64

In [26]:
raw_p1_X_val 

Unnamed: 0,AGE,SEX,LOS,Joint,Drain,Commercial_ALBC,Non_commercial_ALBC,cci_index,elx_index,Blood_trans,...,Weight Loss,Fluid and Electrolyte Disorders,Blood Loss Anemia,Deficiency Anemia,Anemia,Alcohol Abuse,Drug Abuse,Psychoses,Depression,Psyciatric disorder
0,77.0,0,7,1,0,0,0,2,2,0,...,0,0,0,0,0,0,0,0,0,0
1,63.0,0,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,55.0,1,4,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,63.0,1,8,0,1,0,1,1,2,0,...,0,0,0,0,0,0,0,0,0,0
4,69.0,0,5,0,1,0,1,1,3,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7631,81.0,0,7,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7632,69.0,0,4,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
7633,82.0,0,4,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
7634,70.0,0,4,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
raw_p1_y_val

0       0
1       0
2       0
3       0
4       0
       ..
7631    0
7632    0
7633    0
7634    0
7635    0
Name: outcome, Length: 7636, dtype: int64

In [26]:
raw_p1_y_train.value_counts()

0    29943
1    13474
Name: outcome, dtype: int64

## Exp 1
    - 不處理正規化，但讓最大疊代數調整成3000
    - f1

In [32]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test", scoring="f1")
p_grid_lr_1 = {'model__penalty':['l1', 'l2']}

p_grid_lr_2 = {'model__penalty':['l1'], 'model__solver':['liblinear', 'saga']}

p_grid_lr_3 = {'model__penalty':['l2'], 'model__solver':['newton-cg', 'lbfgs', 'sag', 'saga']}

base_lines_LR1 = [
    gridsearch(LogisticRegression(random_state=42, max_iter=3000), p_grid_lr_1, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="LR Conf.1(E1)"),
    gridsearch(LogisticRegression(random_state=42, max_iter=3000), p_grid_lr_2, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="LR Conf.2(E1)"),
    gridsearch(LogisticRegression(random_state=42, max_iter=3000), p_grid_lr_3, raw_p1_X_train, raw_p1_y_train, 
               raw_p1_X_val, raw_p1_y_val, run_name="LR Conf.3(E1)")
]

[34m[1mwandb[0m: Currently logged in as: [33moscarchencs10[0m (use `wandb login --relogin` to force relogin)


5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/oscarchencs10/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/oscarchencs10/anaconda3/lib/python3.7/site-packages/imblearn/pipeline.py", line 266, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/home/oscarchencs10/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/oscarchencs10/anaconda3/lib/python3.

Best parameter (CV score=0.654):
{'model__penalty': 'l2'}

Best estimator
Pipeline(steps=[('model', LogisticRegression(max_iter=3000, random_state=42))])


train
[[ 7119  6355]
 [ 1174 28769]]


Unnamed: 0,precision,recall,f1-score,support
1,0.858435,0.528351,0.654109,13474.0
0,0.81907,0.960792,0.884289,29943.0
macro avg,0.838752,0.744572,0.769199,43417.0
weighted avg,0.831286,0.826589,0.812855,43417.0


Accuracy: 0.827, AUC: 0.872, f1-score: 0.654 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[   7  143]
 [ 271 7215]]


Unnamed: 0,precision,recall,f1-score,support
1,0.02518,0.046667,0.03271,150.0
0,0.980565,0.963799,0.97211,7486.0
macro avg,0.502873,0.505233,0.50241,7636.0
weighted avg,0.961798,0.945783,0.953657,7636.0


Accuracy: 0.946, AUC: 0.522, f1-score: 0.033 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.06MB of 0.06MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98047282969…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.52175
Test-Acc,0.94578
Test-f1(pos),0.03271
Train-AUC,0.87182
Train-Acc,0.82659
Train-f1(pos),0.65411




Best parameter (CV score=0.655):
{'model__penalty': 'l1', 'model__solver': 'liblinear'}

Best estimator
Pipeline(steps=[('model',
                 LogisticRegression(max_iter=3000, penalty='l1',
                                    random_state=42, solver='liblinear'))])


train
[[ 7129  6345]
 [ 1177 28766]]


Unnamed: 0,precision,recall,f1-score,support
1,0.858295,0.529093,0.654637,13474.0
0,0.819287,0.960692,0.884373,29943.0
macro avg,0.838791,0.744893,0.769505,43417.0
weighted avg,0.831393,0.82675,0.813077,43417.0


Accuracy: 0.827, AUC: 0.872, f1-score: 0.655 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[   7  143]
 [ 271 7215]]


Unnamed: 0,precision,recall,f1-score,support
1,0.02518,0.046667,0.03271,150.0
0,0.980565,0.963799,0.97211,7486.0
macro avg,0.502873,0.505233,0.50241,7636.0
weighted avg,0.961798,0.945783,0.953657,7636.0


Accuracy: 0.946, AUC: 0.522, f1-score: 0.033 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.06MB of 0.06MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98111194109…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.52178
Test-Acc,0.94578
Test-f1(pos),0.03271
Train-AUC,0.87207
Train-Acc,0.82675
Train-f1(pos),0.65464




Best parameter (CV score=0.654):
{'model__penalty': 'l2', 'model__solver': 'newton-cg'}

Best estimator
Pipeline(steps=[('model',
                 LogisticRegression(max_iter=3000, random_state=42,
                                    solver='newton-cg'))])


train
[[ 7122  6352]
 [ 1172 28771]]


Unnamed: 0,precision,recall,f1-score,support
1,0.858693,0.528574,0.654355,13474.0
0,0.81915,0.960859,0.884364,29943.0
macro avg,0.838921,0.744716,0.769359,43417.0
weighted avg,0.831422,0.826704,0.812983,43417.0


Accuracy: 0.827, AUC: 0.872, f1-score: 0.654 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[   7  143]
 [ 271 7215]]


Unnamed: 0,precision,recall,f1-score,support
1,0.02518,0.046667,0.03271,150.0
0,0.980565,0.963799,0.97211,7486.0
macro avg,0.502873,0.505233,0.50241,7636.0
weighted avg,0.961798,0.945783,0.953657,7636.0


Accuracy: 0.946, AUC: 0.522, f1-score: 0.033 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.06MB of 0.06MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98027490751…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.52168
Test-Acc,0.94578
Test-f1(pos),0.03271
Train-AUC,0.87184
Train-Acc,0.8267
Train-f1(pos),0.65436


In [40]:
pd.DataFrame(base_lines_LR1,
             columns=['Grid', 'Classifier', 'Accuracy', 'f1-score', 'AUC']).sort_values('f1-score', ascending=False)

Unnamed: 0,Grid,Classifier,Accuracy,f1-score,AUC
0,"GridSearchCV(cv=5,\n estimator=Pip...",LogisticRegression (penalty=l2),0.945783,0.03271,0.521747
1,"GridSearchCV(cv=5,\n estimator=Pip...",LogisticRegression (penalty=l1 solver=liblinear),0.945783,0.03271,0.521779
2,"GridSearchCV(cv=5,\n estimator=Pip...",LogisticRegression (penalty=l2 solver=newton-cg),0.945783,0.03271,0.521683


## Exp 2
    - 處理正規化，最大疊代數調整成3000

In [33]:
scaling_p1_X_train_scaler = preprocessing.StandardScaler().fit(raw_p1_X_train)

In [34]:
scaling_p1_X_train = scaling_p1_X_train_scaler.transform(raw_p1_X_train)

In [35]:
scaling_p1_X_train

array([[-0.53776507,  1.52577628, -0.14182997, ..., -0.04923688,
        -0.11625849, -0.12454251],
       [ 0.8442127 , -0.65540408,  0.97069608, ..., -0.04923688,
        -0.11625849, -0.12454251],
       [-1.45908358, -0.65540408, -0.14182997, ..., -0.04923688,
        -0.11625849, -0.12454251],
       ...,
       [ 1.12060826,  1.52577628, -0.69809299, ..., -0.04923688,
        -0.11625849, -0.12454251],
       [-0.16923766, -0.65540408, -0.14182997, ..., -0.04923688,
        -0.11625849, -0.12454251],
       [ 0.8442127 , -0.65540408, -0.69809299, ..., -0.04923688,
        -0.11625849, -0.12454251]])

In [36]:
scaling_p1_X_val = scaling_p1_X_train_scaler.transform(raw_p1_X_val)

In [37]:
scaling_p1_X_val 

array([[ 1.02847641, -0.65540408,  0.97069608, ..., -0.04923688,
        -0.11625849, -0.12454251],
       [-0.26136951, -0.65540408, -0.69809299, ..., -0.04923688,
        -0.11625849, -0.12454251],
       [-0.99842432,  1.52577628, -0.69809299, ..., -0.04923688,
        -0.11625849, -0.12454251],
       ...,
       [ 1.48913566, -0.65540408, -0.69809299, ..., -0.04923688,
        -0.11625849, -0.12454251],
       [ 0.38355345, -0.65540408, -0.69809299, ..., -0.04923688,
        -0.11625849, -0.12454251],
       [ 0.659949  , -0.65540408,  0.41443306, ..., -0.04923688,
        -0.11625849, -0.12454251]])

In [38]:
# gridsearch(clf, p_grid, X_train, y_train, X_test, y_test, run_name="Test", scoring="f1")
p_grid_lr_1 = {'model__penalty':['l1', 'l2']}

p_grid_lr_2 = {'model__penalty':['l1'], 'model__solver':['liblinear', 'saga']}

p_grid_lr_3 = {'model__penalty':['l2'], 'model__solver':['newton-cg', 'lbfgs', 'sag', 'saga']}

base_lines_LR2 = [
    gridsearch(LogisticRegression(random_state=42, max_iter=3000), p_grid_lr_1, scaling_p1_X_train, raw_p1_y_train, 
               scaling_p1_X_val, raw_p1_y_val, run_name="LR Conf.1(E2)"),
    gridsearch(LogisticRegression(random_state=42, max_iter=3000), p_grid_lr_2, scaling_p1_X_train, raw_p1_y_train, 
               scaling_p1_X_val, raw_p1_y_val, run_name="LR Conf.2(E2)"),
    gridsearch(LogisticRegression(random_state=42, max_iter=3000), p_grid_lr_3, scaling_p1_X_train, raw_p1_y_train, 
               scaling_p1_X_val, raw_p1_y_val, run_name="LR Conf.3(E2)")
]

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/oscarchencs10/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/oscarchencs10/anaconda3/lib/python3.7/site-packages/imblearn/pipeline.py", line 266, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/home/oscarchencs10/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/oscarchencs10/anaconda3/lib/python3.

Best parameter (CV score=0.655):
{'model__penalty': 'l2'}

Best estimator
Pipeline(steps=[('model', LogisticRegression(max_iter=3000, random_state=42))])


train
[[ 7138  6336]
 [ 1187 28756]]


Unnamed: 0,precision,recall,f1-score,support
1,0.857417,0.529761,0.654892,13474.0
0,0.819446,0.960358,0.884324,29943.0
macro avg,0.838432,0.74506,0.769608,43417.0
weighted avg,0.83123,0.826727,0.813122,43417.0


Accuracy: 0.827, AUC: 0.872, f1-score: 0.655 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[   7  143]
 [ 273 7213]]


Unnamed: 0,precision,recall,f1-score,support
1,0.025,0.046667,0.032558,150.0
0,0.98056,0.963532,0.971971,7486.0
macro avg,0.50278,0.505099,0.502265,7636.0
weighted avg,0.961789,0.945521,0.953518,7636.0


Accuracy: 0.946, AUC: 0.521, f1-score: 0.033 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.06MB of 0.06MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98071685368…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.52116
Test-Acc,0.94552
Test-f1(pos),0.03256
Train-AUC,0.8722
Train-Acc,0.82673
Train-f1(pos),0.65489




Best parameter (CV score=0.655):
{'model__penalty': 'l1', 'model__solver': 'liblinear'}

Best estimator
Pipeline(steps=[('model',
                 LogisticRegression(max_iter=3000, penalty='l1',
                                    random_state=42, solver='liblinear'))])


train
[[ 7138  6336]
 [ 1186 28757]]


Unnamed: 0,precision,recall,f1-score,support
1,0.85752,0.529761,0.654922,13474.0
0,0.819451,0.960391,0.884341,29943.0
macro avg,0.838486,0.745076,0.769632,43417.0
weighted avg,0.831266,0.82675,0.813143,43417.0


Accuracy: 0.827, AUC: 0.872, f1-score: 0.655 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[   7  143]
 [ 273 7213]]


Unnamed: 0,precision,recall,f1-score,support
1,0.025,0.046667,0.032558,150.0
0,0.98056,0.963532,0.971971,7486.0
macro avg,0.50278,0.505099,0.502265,7636.0
weighted avg,0.961789,0.945521,0.953518,7636.0


Accuracy: 0.946, AUC: 0.521, f1-score: 0.033 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.06MB of 0.06MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98060014393…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.52124
Test-Acc,0.94552
Test-f1(pos),0.03256
Train-AUC,0.87218
Train-Acc,0.82675
Train-f1(pos),0.65492




Best parameter (CV score=0.655):
{'model__penalty': 'l2', 'model__solver': 'newton-cg'}

Best estimator
Pipeline(steps=[('model',
                 LogisticRegression(max_iter=3000, random_state=42,
                                    solver='newton-cg'))])


train
[[ 7138  6336]
 [ 1187 28756]]


Unnamed: 0,precision,recall,f1-score,support
1,0.857417,0.529761,0.654892,13474.0
0,0.819446,0.960358,0.884324,29943.0
macro avg,0.838432,0.74506,0.769608,43417.0
weighted avg,0.83123,0.826727,0.813122,43417.0


Accuracy: 0.827, AUC: 0.872, f1-score: 0.655 


>>>> Wandb(Train)....
>>>> Wandb(Train)(End)....
test
[[   7  143]
 [ 273 7213]]


Unnamed: 0,precision,recall,f1-score,support
1,0.025,0.046667,0.032558,150.0
0,0.98056,0.963532,0.971971,7486.0
macro avg,0.50278,0.505099,0.502265,7636.0
weighted avg,0.961789,0.945521,0.953518,7636.0


Accuracy: 0.946, AUC: 0.521, f1-score: 0.033 


>>>> Wandb(Test)....
>>>> Wandb(Test)(End)....


VBox(children=(Label(value=' 0.06MB of 0.06MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.98057506584…

0,1
Test-AUC,▁
Test-Acc,▁
Test-f1(pos),▁
Train-AUC,▁
Train-Acc,▁
Train-f1(pos),▁

0,1
Test-AUC,0.52117
Test-Acc,0.94552
Test-f1(pos),0.03256
Train-AUC,0.8722
Train-Acc,0.82673
Train-f1(pos),0.65489


In [39]:
pd.DataFrame(base_lines_LR2,
             columns=['Grid', 'Classifier', 'Accuracy', 'f1-score', 'AUC']).sort_values('f1-score', ascending=False)

Unnamed: 0,Grid,Classifier,Accuracy,f1-score,AUC
0,"GridSearchCV(cv=5,\n estimator=Pip...",LogisticRegression (penalty=l2),0.945521,0.032558,0.521162
1,"GridSearchCV(cv=5,\n estimator=Pip...",LogisticRegression (penalty=l1 solver=liblinear),0.945521,0.032558,0.521245
2,"GridSearchCV(cv=5,\n estimator=Pip...",LogisticRegression (penalty=l2 solver=newton-cg),0.945521,0.032558,0.521168
