In [1]:
# Widen the notebook container to 80% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import os
import csv
import random
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor, as_completed
import matplotlib.pyplot as plt
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.svm import SVR, SVC

### train data: map file paths with labels

In [None]:
# Load the pickled (label, train, test) triple; `label` maps a case name to a
# '+'-joined label string.
label_pkl = "label_train_test.pkl"
with open(label_pkl, 'rb') as f:
    label, train, test = pickle.load(f)
print(len(train), len(test))

# Build the list of training file paths and gather every distinct label token.
data_path = './train15/data2800'
all_data = os.listdir(data_path)
print('# files', len(all_data))
data = []
all_labels = set()
for entry in all_data:
    data.append(os.path.join(data_path, entry))
    case_name = os.path.splitext(entry)[0]
    # a case may carry several labels joined by '+'
    all_labels.update(label[case_name].split('+'))
print(len(all_labels), all_labels)

### test data

In [None]:
# Parse the ROI summary file into `area_map`.
# Each non-blank line holds three whitespace-separated fields:
#   <file name> <patch count> <area>
# The file extension is stripped from the name before it is used as the key.
area_file = './roi_results.txt'
area_map = {}
with open(area_file, 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # tolerate blank lines (e.g. an extra newline at the end of the file)
            continue
        name, patches, area = fields
        name = os.path.splitext(name)[0]
        patches = int(patches)
        area = float(area)
        area_map[name] = {'patches': patches, 'area': area}
# Report the size only; printing the whole dict floods the cell output.
print('# roi entries:', len(area_map))

In [None]:
# Load the test-set label sheet and map each case number to its list of labels.
label_file = './test1521.xlsx'
df_l = pd.read_excel(label_file)

# 'old_label' may hold several labels joined by '+'
label_map = {
    case_no: old_label.split('+')
    for case_no, old_label in zip(df_l['case_no'], df_l['old_label'])
}

# The preview goes last: a bare expression is only rendered when it ends the cell.
df_l.head(10)

In [None]:
# Collect the per-case CSV paths of the test set.
# os.listdir returns entries in arbitrary order, so sort them to keep `data`
# (and anything indexed from it) stable across runs and machines.
data_path = './train15/test1521'
data = [os.path.join(data_path, entry) for entry in sorted(os.listdir(data_path))]

In [None]:
# Spot-check a single file: look up its area in `area_map` and preview the rows.
f = data[5]
df = pd.read_csv(f)
# the area_map key is the part of the file name that precedes '_BATCH'
case_id = os.path.basename(f).split('_BATCH')[0]
area = area_map[case_id]['area']
print(area)
# Alternative kept for reference: read the value from the CSV itself, e.g.
#   area = float(df.area[df.area.notnull()])
#   patches = float(df.patches[df.patches.notnull()])
df.head(10)

### features

In [3]:
# Detector label -> set of classifier labels accepted as agreeing with it.
tolerate = {
    "AGC": {"AGC_A", "AGC_B"},
    "LSIL": {"ASCUS", "LSIL_E", "LSIL_F"},
    "ASCUS": {"ASCUS", "LSIL_E", "LSIL_F"},
    "HSIL-SCC_G": {"HSIL_B", "HSIL_M", "HSIL_S", "SCC_G"},
    "SCC_R": {"SCC_R"},
    "EC": {"EC"},
    "CC": {"CC"},
    "VIRUS": {"VIRUS", "HSV"},
    "FUNGI": {"FUNGI", "CANDIDA"},
    "ACTINO": {"ACTINO"},
    "TRI": {"TRI"},
    "PH": {"PH"},
    "SC": {"SC", "RC", "MC", "GEC"},
}

# Probability thresholds swept for the detector (dtct_p) and the classifier (clas_p).
dtct_p = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
clas_p = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99, 0.999]

# One feature name per (detector label, detector threshold, classifier threshold),
# sorted lexicographically; header_map gives each name's column index.
header = sorted(
    "{}_{:.2f}_{:.3f}".format(key, dp, cp)
    for key in tolerate
    for dp in dtct_p
    for cp in clas_p
)
header_map = dict(zip(header, range(len(header))))

# Slide-level label -> class index (FUNGI/CANDIDA and VIRUS/HSV share an index).
all_labels = {
    'ACTINO': 0, 'AGC': 1, 'ASCH': 2, 'ASCUS': 3, 'CC': 4, 'EC': 5,
    'FUNGI': 6, 'CANDIDA': 6, 'HSIL': 7, 'VIRUS': 8, 'HSV': 8,
    'LSIL': 9, 'NILM': 10, 'SCC': 11, 'TRI': 12,
}
# Binary target: NILM is 0, every other label is 1.
bin_labels = {label_name: (0 if label_name == 'NILM' else 1) for label_name in all_labels}
print(all_labels)
print(bin_labels)

{'ASCUS': 3, 'HSIL': 7, 'SCC': 11, 'CC': 4, 'VIRUS': 8, 'NILM': 10, 'FUNGI': 6, 'ACTINO': 0, 'TRI': 12, 'HSV': 8, 'ASCH': 2, 'LSIL': 9, 'AGC': 1, 'CANDIDA': 6, 'EC': 5}
{'ASCUS': 1, 'HSIL': 1, 'SCC': 1, 'VIRUS': 1, 'CC': 1, 'NILM': 0, 'ACTINO': 1, 'ASCH': 1, 'HSV': 1, 'CANDIDA': 1, 'TRI': 1, 'LSIL': 1, 'AGC': 1, 'FUNGI': 1, 'EC': 1}


In [None]:
# header_imap = {(key, dp, cp):header_map["{}_{:.2f}_{:.3f}".format(key, dp, cp)] for key in tolerate for dp in dtct_p for cp in clas_p}
# print(header_imap)

# with open('header_imap.pkl', 'wb') as f:
#     pickle.dump(header_imap, f)
    
# # # read pkl file
# # with open('header_imap.pkl', 'rb') as f:
# #     header_imap = pickle.load(f)

In [None]:
def extract_oldfashion(f):
    """Build the feature vector for one old-format detection CSV.

    The CSV must provide the columns ``yolo_cell_class``,
    ``yolo_cell_class_det``, ``xcp_cell_class`` and ``xcp_cell_class_det``.
    For every (detector threshold, classifier threshold) pair, a row is counted
    under its detector label when the classifier label is in
    ``tolerate[detector label]`` and both probabilities exceed the thresholds.

    Args:
        f: path of the CSV file; the part of its base name before ``'_BATCH'``
            is the key looked up in ``area_map``.

    Returns:
        A list of ``2 * len(header)`` numbers: the raw counts followed by the
        counts rescaled by ``2850000000 / area``. If any detector label is
        missing the list is all zeros. If the area is unknown or zero, the path
        is printed and the raw counts are repeated in place of the rescaled ones.
    """
    df = pd.read_csv(f)
    features = [0] * len(header)
    # a missing detector label marks the file as empty
    if df.yolo_cell_class.isnull().values.any():
        return features * 2

    # cross levelup features
    for _, row in df.iterrows():
        det_label = row['yolo_cell_class']
        # Agreement between the two labels does not depend on the thresholds,
        # so test it once per row instead of once per (dp, cp) pair.
        if row['xcp_cell_class'] not in tolerate[det_label]:
            continue
        det_prob = row['yolo_cell_class_det']
        cls_prob = row['xcp_cell_class_det']
        for dp in dtct_p:
            if not det_prob > dp:
                continue
            for cp in clas_p:
                if cls_prob > cp:
                    key = "{}_{:.2f}_{:.3f}".format(det_label, dp, cp)
                    features[header_map[key]] += 1

    # area balanced numbers
    area_mark = 2850000000
    try:
        area = area_map[os.path.basename(f).split('_BATCH')[0]]['area']
        features_ab = [count * area_mark / area for count in features]
    except (KeyError, ZeroDivisionError):
        # Only the expected failures (case absent from area_map, zero area) fall
        # back to unscaled counts; anything else should surface as an error.
        print(f)
        features_ab = list(features)

    return features + features_ab

def extract(f):
    """Build the feature vector for one new-format detection CSV.

    The CSV must provide the columns ``detect_label``, ``detect_probability``,
    ``classify_label`` and ``classify_probability``; the area is read from its
    ``area`` column, which must hold exactly one non-null value.
    For every (detector threshold, classifier threshold) pair, a row is counted
    under its detector label when the classifier label is in
    ``tolerate[detector label]`` and both probabilities exceed the thresholds.

    Args:
        f: path of the CSV file.

    Returns:
        A list of ``2 * len(header)`` numbers: the raw counts followed by the
        counts rescaled by ``2850000000 / area``. If any detector label is
        missing the list is all zeros. If no single usable area value is found,
        the path is printed and the raw counts are repeated instead.
    """
    df = pd.read_csv(f)
    features = [0] * len(header)
    # a missing detector label marks the file as empty
    if df.detect_label.isnull().values.any():
        return features * 2

    # cross levelup features
    for _, row in df.iterrows():
        det_label = row['detect_label']
        # Agreement between the two labels does not depend on the thresholds,
        # so test it once per row instead of once per (dp, cp) pair.
        if row['classify_label'] not in tolerate[det_label]:
            continue
        det_prob = row['detect_probability']
        cls_prob = row['classify_probability']
        for dp in dtct_p:
            if not det_prob > dp:
                continue
            for cp in clas_p:
                if cls_prob > cp:
                    key = "{}_{:.2f}_{:.3f}".format(det_label, dp, cp)
                    features[header_map[key]] += 1

    # area balanced numbers
    area_mark = 2850000000
    try:
        areas = df['area'].dropna()
        if len(areas) != 1:
            # same outcome as before: only a single non-null value is usable
            raise ValueError("expected exactly one non-null area value")
        area = float(areas.iloc[0])
        features_ab = [count * area_mark / area for count in features]
    except (KeyError, TypeError, ValueError, ZeroDivisionError):
        # Missing column, unusable value or zero area: fall back to unscaled
        # counts. Other errors are no longer swallowed.
        print(f)
        features_ab = list(features)

    return features + features_ab

def collect(data, test=True):
    """Extract features and labels for a batch of detection CSV files.

    Args:
        data: iterable of CSV paths.
        test: when True, the case name is the part of the file name before
            ``'_BATCH'`` and labels come from ``label_map``; files whose case is
            not in ``label_map`` are skipped. When False, the case name is the
            file name without extension and labels come from ``label``
            (a '+'-joined string).

    Returns:
        ``(X, ya, yb, names)``: feature vectors, multi-class indices, binary
        indices and source paths. A file with several labels contributes one
        entry per label; files whose features sum to zero are dropped.
    """
    X = []
    ya = []  # all labels
    yb = []  # binary labels
    names = []
    for f in data:
        # Resolve the labels first: the lookup is cheap, whereas feature
        # extraction parses the whole CSV, so unlabelled files are skipped early.
        if test:
            basename = os.path.basename(f).split('_BATCH')[0]
            if basename not in label_map:
                continue
            ls = label_map[basename]
        else:
            basename = os.path.splitext(os.path.basename(f))[0]
            ls = label[basename].split('+')
        features = extract_oldfashion(f)
        if sum(features) == 0:
            continue
        for l in ls:
            X.append(features)
            ya.append(all_labels[l])
            yb.append(bin_labels[l])
            names.append(f)
    return X, ya, yb, names

def worker():
    """Extract features for every path in the global ``data`` using a process pool.

    The paths are shuffled, cut into batches of 50 and each batch is handed to
    ``collect`` in a worker process. Results are concatenated in completion
    order.

    Returns:
        ``(X, ya, yb, names)`` where X, ya and yb are numpy arrays and names is
        a list of source paths aligned with them.
    """
    # Work on a copy so that shuffling does not reorder the global `data` list.
    files = list(data)
    random.shuffle(files)
    print("# files:", len(files))

    X, ya, yb, names = [], [], [], []
    batch_size = 50

    # The context manager shuts the pool down even when a batch raises.
    with ProcessPoolExecutor(max_workers=36) as executor:
        tasks = [
            executor.submit(collect, files[i : i + batch_size])
            for i in range(0, len(files), batch_size)
        ]

        job_count = len(tasks)
        for future in as_completed(tasks):
            X_, ya_, yb_, names_ = future.result()  # re-raises any error from the batch
            X += X_
            ya += ya_
            yb += yb_
            names += names_
            job_count -= 1
            if job_count % 8 == 0:
                print("One Job Done, Remaining Job Count: %s" % (job_count))

    X = np.asarray(X)
    ya = np.asarray(ya)
    yb = np.asarray(yb)
    print(X.shape, ya.shape, yb.shape)

    return X, ya, yb, names

# Run the feature extraction; worker() returns the feature matrix, the
# multi-class labels, the binary labels and the source file names.
X, ya, yb, names = worker()

In [36]:
# with open('train15test1500.pkl', 'wb') as f:
#     pickle.dump(X, f)
#     pickle.dump(ya, f)
#     pickle.dump(yb, f)
#     pickle.dump(names, f)
    
# Load the cached feature matrix, labels and file names (four consecutive pickles).
# NOTE: pickle.load can execute arbitrary code; only load caches you produced.
with open('./train15test1500.pkl', 'rb') as f:
    X = pickle.load(f)
    ya = pickle.load(f)
    yb = pickle.load(f)
    names = pickle.load(f)
print(X.shape, ya.shape, yb.shape)

# test designated 1000 test data
pd_t = pd.read_excel('./test1000.xlsx')
nlist = set(pd_t.case_no.values)
# A misaligned cache would otherwise be silently truncated by the filter below.
assert len(X) == len(ya) == len(yb) == len(names), "features, labels and names are misaligned"
# True for every row whose case number (file name before '_BATCH') is listed.
keep = np.array(
    [os.path.basename(nn).split('_BATCH')[0] in nlist for nn in names],
    dtype=bool,
)
# Boolean-mask selection keeps dtype and the 2-D shape even when nothing matches.
X = np.asarray(X)[keep]
ya = np.asarray(ya)[keep]
yb = np.asarray(yb)[keep]
names = [nn for nn, kept in zip(names, keep) if kept]
print(X.shape, ya.shape, yb.shape)

# # load augmented train data
# with open('/home/ssd_array0/Develop/liyu/codect/set1/feature_dict.pkl', 'rb') as f:
#     feature_dict = pickle.load(f)
    
# X, ya, yb = [], [], []
# for key,value in feature_dict.items():
#     ya += [all_labels[key]] * len(value)
#     yb += [0 if key == 'NILM' else 1] * len(value)
#     X += value
# X = np.asarray(X)
# ya = np.asarray(ya)
# yb = np.asarray(yb)
# print(X.shape, ya.shape, yb.shape)

(1420, 2340) (1420,) (1420,)
(1045, 2340) (1045,) (1045,)


### classification

In [8]:
class RFESVM:
    """Recursive feature elimination (RFE) driven by a linear-kernel SVR."""

    def __init__(self):
        self.estimator = SVR(kernel="linear")
        self.selector = None  # fitted RFE object, set by select()

    def select(self, X, y, num_feature):
        """Fit RFE on (X, y) and return the mask of retained features.

        Args:
            X: training feature matrix.
            y: training targets.
            num_feature: number of features to keep.

        Returns:
            The fitted selector's ``support_`` (boolean mask, one entry per
            column of X).
        """
        # Pass the count by keyword: scikit-learn releases that make the
        # arguments after `estimator` keyword-only reject the positional form.
        self.selector = RFE(self.estimator, n_features_to_select=num_feature, step=1)
        self.selector = self.selector.fit(X, y)
        selected_feature_indices = self.selector.support_  # ndarray of True/False
        return selected_feature_indices

def split(X, y, mode, test_size, seed):
    """Per-class train/validation split that keeps the original sample order.

    For each class (in ascending label order) the last
    ``int(n * test_size)`` samples go to the validation set and the rest to
    the training set.

    Args:
        X: numpy feature matrix, indexed by row.
        y: numpy array of class labels aligned with X.
        mode: "bin" or "all"; kept for interface compatibility, the classes are
            taken from the labels actually present in ``y``.
        test_size: fraction of each class assigned to validation.
        seed: seeds the global ``random`` module. No shuffling is performed
            here, so the split itself does not depend on it.

    Returns:
        ``(X_train, X_valid, y_train, y_valid)``.
    """
    random.seed(seed)
    # group row indices by class, preserving their original order
    groups = {}
    for i, c in enumerate(y):
        groups.setdefault(c, []).append(i)

    idx_t, idx_v = [], []
    for c in sorted(groups):
        indices = groups[c]
        n_valid = int(len(indices) * test_size)
        # Slice from the front with an explicit cut point: with n_valid == 0,
        # `indices[:-0]` is empty and `indices[-0:]` is the whole list, which
        # sent every sample of a small class to validation.
        cut = len(indices) - n_valid
        idx_t += indices[:cut]
        idx_v += indices[cut:]

    # integer dtype keeps empty index lists valid for fancy indexing
    idx_t = np.asarray(idx_t, dtype=int)
    idx_v = np.asarray(idx_v, dtype=int)
    X_train = X[idx_t]
    X_valid = X[idx_v]
    y_train = y[idx_t]
    y_valid = y[idx_v]
    return X_train, X_valid, y_train, y_valid

def rfe(X_train, X_valid, y_train, y_valid, num_features):
    """Reduce both feature matrices to the columns chosen by RFESVM.

    The selector is fitted on the training data only; the resulting boolean
    mask is applied to the columns of X_train and X_valid. The label arrays
    are returned unchanged.
    """
    keep_mask = RFESVM().select(X_train, y_train, num_features)
    # boolean-mask column selection
    return X_train[:, keep_mask], X_valid[:, keep_mask], y_train, y_valid

def evaluate(y_valid, y_pred, mode):
    """Print support, recall and precision for every class.

    Args:
        y_valid: true class indices.
        y_pred: predicted class indices, aligned with y_valid.
        mode: "bin" uses the names {0: 'NILM', 1: 'ABN'}; any other value uses
            the inverse of the global ``all_labels`` mapping.

    One line is printed per class index; a ratio with a zero denominator is
    reported as 0.0. Nothing is returned.
    """
    if mode == "bin":
        cur_labels = {0: 'NILM', 1: 'ABN'}
    else:
        cur_labels = {index: name for name, index in all_labels.items()}

    n_classes = len(cur_labels)
    # per-class tallies keyed by class index
    tp = dict.fromkeys(range(n_classes), 0)
    fn = dict.fromkeys(range(n_classes), 0)
    fp = dict.fromkeys(range(n_classes), 0)
    for truth, pred in zip(y_valid, y_pred):
        if truth == pred:
            tp[truth] += 1
            continue
        fn[truth] += 1  # missed for the true class
        fp[pred] += 1   # wrongly attributed to the predicted class

    for cls in range(n_classes):
        support = tp[cls] + fn[cls]
        predicted = tp[cls] + fp[cls]
        recall = tp[cls] / support if support != 0 else 0.0
        precision = tp[cls] / predicted if predicted != 0 else 0.0
        print(cur_labels[cls], support, ' recall = {:.4f}'.format(recall), 'precision = {:.4f}'.format(precision))
        
    
def classify(mode="bin"):  # mode = "bin" or "all"
    """Train an XGBoost classifier on a single train/validation split.

    Reads the global feature matrix ``X`` and the label vector ``yb``
    (mode "bin") or ``ya`` (any other mode), splits them with ``split``, fits
    the model with early stopping, prints the validation accuracy and the
    per-class report, and returns the fitted model.
    """
    seed = 2019
    test_size = 0.2

    y = ya if mode != "bin" else yb
    X_train, X_valid, y_train, y_valid = split(X, y, mode, test_size, seed)

    hyper_params = dict(
        max_depth=15,
        n_jobs=36,
        subsample=0.8,
        colsample_bylevel=1,
        colsample_bytree=0.6,
        scale_pos_weight=1,
        n_estimators=500,
        min_child_weight=1,
        learning_rate=0.1,
        gamma=0,
        random_state=seed,
    )
    model = XGBClassifier(**hyper_params)

    # metrics are tracked on both the training and the validation split
    eval_metric = ["auc", "error"] if mode == "bin" else ["merror"]
    model.fit(X_train, y_train,
              early_stopping_rounds=50,
              eval_metric=eval_metric,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              verbose=True)

    y_pred = model.predict(X_valid)
    print("accuracy: {:.4f}".format(accuracy_score(y_valid, y_pred)))
    evaluate(y_valid, y_pred, mode)

    return model

# Train the binary model first, then the multi-class one.
print('binary classification')
bin_model = classify("bin")
print('\nmultilabel classification')
all_model = classify("all")

binary classification
[0]	validation_0-auc:0.930266	validation_0-error:0.022692	validation_1-auc:0.906534	validation_1-error:0.21641
Multiple eval metrics have been passed: 'validation_1-error' will be used for early stopping.

Will train until validation_1-error hasn't improved in 50 rounds.
[1]	validation_0-auc:0.957291	validation_0-error:0.023077	validation_1-auc:0.93054	validation_1-error:0.150769
[2]	validation_0-auc:0.971692	validation_0-error:0.019295	validation_1-auc:0.933187	validation_1-error:0.192051
[3]	validation_0-auc:0.982395	validation_0-error:0.019231	validation_1-auc:0.942594	validation_1-error:0.181026
[4]	validation_0-auc:0.983084	validation_0-error:0.019103	validation_1-auc:0.940202	validation_1-error:0.167949
[5]	validation_0-auc:0.984584	validation_0-error:0.018205	validation_1-auc:0.944289	validation_1-error:0.168205
[6]	validation_0-auc:0.994121	validation_0-error:0.017756	validation_1-auc:0.948122	validation_1-error:0.167692
[7]	validation_0-auc:0.995584	valid

Stopping. Best iteration:
[21]	validation_0-auc:0.999362	validation_0-error:0.007564	validation_1-auc:0.976648	validation_1-error:0.15

accuracy: 0.8500
NILM 300  recall = 0.9300 precision = 0.3310
ABN 3600  recall = 0.8433 precision = 0.9931

multilabel classification
[0]	validation_0-merror:0.036154	validation_1-merror:0.017436
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 50 rounds.
[1]	validation_0-merror:0.028462	validation_1-merror:0.013077
[2]	validation_0-merror:0.025705	validation_1-merror:0.011538
[3]	validation_0-merror:0.02359	validation_1-merror:0.008205
[4]	validation_0-merror:0.022692	validation_1-merror:0.008205
[5]	validation_0-merror:0.021795	validation_1-merror:0.008462
[6]	validation_0-merror:0.021026	validation_1-merror:0.006923
[7]	validation_0-merror:0.019615	validation_1-merror:0.006667
[8]	validation_0-merror:0.019038	validation_1-merror:0.006667
[9]	valida

In [29]:
# Class index -> display name for the binary and the 13-class model.
bin_imap = dict(enumerate(['NILM', 'ABN']))
all_imap = dict(enumerate([
    'ACTINO', 'AGC', 'ASCH', 'ASCUS', 'CC', 'EC', 'CANDIDA',
    'HSIL', 'HSV', 'LSIL', 'NILM', 'SCC', 'TRI',
]))

# with open("aug1500.pkl", 'wb') as f:
#     pickle.dump(bin_model, f)
#     pickle.dump(all_model, f)
#     pickle.dump(bin_imap, f)
#     pickle.dump(all_imap, f)


# with open("train15models.pkl", 'rb') as f:
#     bin_model = pickle.load(f)
#     all_model = pickle.load(f)
#     bin_imap = pickle.load(f)
#     all_imap = pickle.load(f)

#### release

In [40]:
class Diagnoser:
    """Release wrapper around the pickled binary and multi-class models.

    Besides prediction and evaluation helpers it offers two feature extractors
    (old and new CSV column layout) that return a ``(1, 2 * len(header))``
    array built from the notebook globals ``header``, ``header_map``,
    ``tolerate``, ``dtct_p`` and ``clas_p``.
    """

    # reference area the raw counts are rescaled to
    AREA_MARK = 2850000000

    def __init__(self, pkl_file):
        """Load the models and their index->name maps.

        Args:
            pkl_file: file holding four consecutive pickles: binary model,
                multi-class model, binary index map, multi-class index map.
        """
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(pkl_file, 'rb') as f:
            self.bin_model = pickle.load(f)
            self.all_model = pickle.load(f)
            self.bin_imap = pickle.load(f)
            self.all_imap = pickle.load(f)
        # Size the class ranges from the maps shipped with the models rather
        # than from notebook globals that may be undefined or out of sync.
        self.bin_cls = range(len(self.bin_imap))
        self.all_cls = range(len(self.all_imap))

    @staticmethod
    def _count_hits(df, det_label_col, det_prob_col, cls_label_col, cls_prob_col):
        """Count, per header entry, the rows passing each threshold pair."""
        counts = [0] * len(header)
        for _, row in df.iterrows():
            det_label = row[det_label_col]
            # label agreement is threshold-independent: check it once per row
            if row[cls_label_col] not in tolerate[det_label]:
                continue
            det_prob = row[det_prob_col]
            cls_prob = row[cls_prob_col]
            for dp in dtct_p:
                if not det_prob > dp:
                    continue
                for cp in clas_p:
                    if cls_prob > cp:
                        key = "{}_{:.2f}_{:.3f}".format(det_label, dp, cp)
                        counts[header_map[key]] += 1
        return counts

    @staticmethod
    def _empty_features():
        """All-zero feature row, shaped like a regular extraction result."""
        return np.array([[0] * (2 * len(header))])

    def extract_old(self, csv_file):
        """Extract features from an old-format CSV.

        Uses the columns ``yolo_cell_class``, ``yolo_cell_class_det``,
        ``xcp_cell_class`` and ``xcp_cell_class_det``; the area comes from the
        global ``area_map``, keyed by the file name before ``'_BATCH'``.

        Returns:
            Array of shape ``(1, 2 * len(header))``: raw counts followed by
            area-rescaled counts. All zeros if any detector label is missing.
            If the area is unknown or zero, the path is printed and the raw
            counts are repeated instead.
        """
        df = pd.read_csv(csv_file)
        # a missing detector label marks the file as empty
        if df.yolo_cell_class.isnull().values.any():
            return self._empty_features()
        counts = self._count_hits(df, 'yolo_cell_class', 'yolo_cell_class_det',
                                  'xcp_cell_class', 'xcp_cell_class_det')

        # area balanced numbers
        try:
            # key the lookup on this method's own argument
            area = area_map[os.path.basename(csv_file).split('_BATCH')[0]]['area']
            balanced = [c * self.AREA_MARK / area for c in counts]
        except (KeyError, ZeroDivisionError):
            print(csv_file)
            balanced = list(counts)

        return np.array([counts + balanced])

    def extract_new(self, csv_file):
        """Extract features from a new-format CSV.

        Uses the columns ``detect_label``, ``detect_probability``,
        ``classify_label`` and ``classify_probability``; the area is read from
        the ``area`` column, which must hold exactly one non-null value.

        Returns:
            Array of shape ``(1, 2 * len(header))``: raw counts followed by
            area-rescaled counts. All zeros if any detector label is missing.
            If no single usable area value is found, the path is printed and
            the raw counts are repeated instead.
        """
        df = pd.read_csv(csv_file)
        # a missing detector label marks the file as empty
        if df.detect_label.isnull().values.any():
            return self._empty_features()
        counts = self._count_hits(df, 'detect_label', 'detect_probability',
                                  'classify_label', 'classify_probability')

        # area balanced numbers
        try:
            areas = df['area'].dropna()
            if len(areas) != 1:
                raise ValueError("expected exactly one non-null area value")
            area = float(areas.iloc[0])
            balanced = [c * self.AREA_MARK / area for c in counts]
        except (KeyError, TypeError, ValueError, ZeroDivisionError):
            print(csv_file)
            balanced = list(counts)

        return np.array([counts + balanced])

    def bin_and_all_predict(self, X):
        """Predict with both models and return the label names.

        Returns:
            ``(lb, la)``: lists of binary and multi-class label names, one per
            row of X.
        """
        pb = self.bin_model.predict(X)
        lb = [self.bin_imap[p] for p in pb]
        pa = self.all_model.predict(X)
        la = [self.all_imap[p] for p in pa]
        return lb, la

    def bin_predict(self, X, y):
        """Evaluate the binary model on (X, y).

        Prints accuracy and the per-class report, and returns the confusion
        matrix over ``self.bin_cls``.
        """
        y_pred = self.bin_model.predict(X)
        accuracy = accuracy_score(y, y_pred)
        print("accuracy: {:.4f}".format(accuracy))
        evaluate(y, y_pred, 'bin')
        cm = confusion_matrix(y, y_pred, labels=self.bin_cls)
        return cm

    def all_predict(self, X, y):
        """Evaluate the multi-class model on (X, y).

        Prints accuracy and the per-class report, and returns the confusion
        matrix over ``self.all_cls``.
        """
        y_pred = self.all_model.predict(X)
        accuracy = accuracy_score(y, y_pred)
        print("accuracy: {:.4f}".format(accuracy))
        evaluate(y, y_pred, 'all')
        cm = confusion_matrix(y, y_pred, labels=self.all_cls)
        return cm

In [43]:
# Score X with the released models and collect predictions next to the labels.
pkl_file = "train15models.pkl"
d = Diagnoser(pkl_file)

# confusion matrices for the binary and the multi-class model
b_cm = d.bin_predict(X, yb)
a_cm = d.all_predict(X, ya)

lb, la = d.bin_and_all_predict(X)
data_p = {
    # case number = file name up to '_BATCH'
    'case_no': [os.path.basename(n).split('_BATCH')[0] for n in names],
    'diagnosis_b': lb,
    'diagnosis_m': la,
    'label_b': [bin_imap[i] for i in yb],
    'label_m': [all_imap[i] for i in ya],
}

df_p = pd.DataFrame(data=data_p)
df_p.to_csv('train15p.csv')
df_p.head(10)

accuracy: 0.8086
NILM 917  recall = 0.8310 precision = 0.9442
ABN 128  recall = 0.6484 precision = 0.3487
accuracy: 0.8545
ACTINO 0  recall = 0.0000 precision = 0.0000
AGC 0  recall = 0.0000 precision = 0.0000
ASCH 7  recall = 0.0000 precision = 0.0000
ASCUS 58  recall = 0.2586 precision = 0.2308
CC 18  recall = 0.7778 precision = 0.7368
EC 2  recall = 0.0000 precision = 0.0000
CANDIDA 22  recall = 0.5000 precision = 0.4783
HSIL 2  recall = 1.0000 precision = 0.2000
HSV 0  recall = 0.0000 precision = 0.0000
LSIL 7  recall = 0.7143 precision = 0.5000
NILM 917  recall = 0.9128 precision = 0.9352
SCC 1  recall = 0.0000 precision = 0.0000
TRI 11  recall = 0.8182 precision = 0.6429


Unnamed: 0,case_no,diagnosis_b,diagnosis_m,label_b,label_m
0,TC19011610,NILM,NILM,NILM,NILM
1,TC19014704,NILM,NILM,ABN,CC
2,TC19010360,NILM,NILM,NILM,NILM
3,TC19010258,NILM,NILM,NILM,NILM
4,TC19014744,NILM,NILM,ABN,CANDIDA
5,TC19005059,NILM,NILM,NILM,NILM
6,TC19005411,NILM,NILM,NILM,NILM
7,TC19010375,NILM,NILM,NILM,NILM
8,TC19012698,NILM,NILM,NILM,NILM
9,TC19012821,NILM,NILM,NILM,NILM


In [38]:
def write_cm(cm, label_imap, csv_name):
    """Write a confusion matrix to a CSV file with class names as headers.

    Args:
        cm: 2-D numpy array; row ``i`` is written next to the name of class ``i``.
        label_imap: mapping from class index ``0..len-1`` to class name.
        csv_name: output path (overwritten if it exists).

    The first row is ``'-'`` followed by the class names; each following row
    starts with the class name of that matrix row.
    """
    n_classes = len(label_imap)
    class_names = [label_imap[i] for i in range(n_classes)]
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; otherwise blank rows appear on Windows.
    with open(csv_name, 'w', newline='') as csvf:
        writer = csv.writer(csvf, delimiter=',')
        writer.writerow(['-'] + class_names)
        for i, class_name in enumerate(class_names):
            writer.writerow([class_name] + list(cm[i, :]))
    
# Persist both confusion matrices as CSV files.
for cm_, imap_, out_name_ in (
    (b_cm, bin_imap, 'cm-train15bin.csv'),
    (a_cm, all_imap, 'cm-train15all.csv'),
):
    write_cm(cm_, imap_, out_name_)

In [39]:
# Collapse the multi-class result to "abnormal vs NILM" and count the outcomes.
pred_abn = df_p.diagnosis_m != 'NILM'
true_abn = df_p.label_m != 'NILM'
TP = int((pred_abn & true_abn).sum())
FN = int((~pred_abn & true_abn).sum())
FP = int((pred_abn & ~true_abn).sum())
TN = int((~pred_abn & ~true_abn).sum())

negatives = TN + FP
positives = TP + FN
paiyin = TN / negatives    # true-negative rate (specificity)
jiayin = FN / positives    # false-negative rate
jiayang = FP / negatives   # false-positive rate
accuracy = (TN + TP) / (TP + FN + FP + TN)
sensitivity = TP / positives
print(paiyin, jiayin, jiayang, accuracy, sensitivity)

0.9127589967284624 0.453125 0.08724100327153762 0.8679425837320575 0.546875


In [27]:
# Export the case numbers whose multi-class diagnosis differs from the label.
is_mismatch = df_p.diagnosis_m != df_p.label_m
nono = df_p[is_mismatch]
nono.case_no.to_csv('nono.csv', index=False)

  


### cross validation

In [None]:
def classify(mode="bin"):  # mode = "bin" or "all"
    """5-fold cross-validation of the XGBoost classifier.

    Reads the global feature matrix ``X`` and the label vector ``yb``
    (mode "bin") or ``ya`` (any other mode). The data is cut into five
    consecutive folds with ``np.array_split``; each fold in turn serves as
    validation set while the model is trained on the others. The accuracy of
    every fold is printed, followed by ``[best accuracy, fold index]``.
    Nothing is returned.

    NOTE: this definition reuses the name of the single-split ``classify``
    defined earlier in the notebook and replaces it once this cell has run.
    """
    seed = 2018
    num_folds = 5

    y = ya if mode != "bin" else yb
    X_folds = np.array_split(X, num_folds)
    y_folds = np.array_split(y, num_folds)
    eval_metric = ["auc", "error"] if mode == "bin" else ["merror"]

    best = [0.0, None]  # accuracy, i
    for i in range(num_folds):
        # fold i validates, all remaining folds train
        X_valid, y_valid = X_folds[i], y_folds[i]
        X_train = np.concatenate([fold for j, fold in enumerate(X_folds) if j != i])
        y_train = np.concatenate([fold for j, fold in enumerate(y_folds) if j != i])

        hyper_params = dict(
            max_depth=15,
            n_jobs=24,
            subsample=0.8,
            colsample_bylevel=1,
            colsample_bytree=0.6,
            scale_pos_weight=1,
            n_estimators=500,
            min_child_weight=1,
            learning_rate=0.1,
            gamma=0,
            random_state=seed,
        )
        model = XGBClassifier(**hyper_params)
        model.fit(X_train, y_train,
                  early_stopping_rounds=50,
                  eval_metric=eval_metric,
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  verbose=False)

        accuracy = accuracy_score(y_valid, model.predict(X_valid))
        print("accuracy: {:.4f}".format(accuracy))

        if accuracy > best[0]:
            best = [accuracy, i]

    print(best)

In [None]:
# Cross-validate the binary task first, then the multi-class one.
print('binary classification')
classify("bin")
print('\nmultilabel classification')
classify("all")