In [1]:
import os
import gc
import re
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from xgboost import XGBClassifier, XGBRegressor
from collections import Counter



In [2]:
# Data Loading (original note: mortality-at-surgery data)
# NOTE(review): the column names shown by later cells (cd40, cd420, karnof,
# zprior, ...) do not obviously match that description -- confirm what this
# dataset actually is.
DATA_URL = "https://raw.githubusercontent.com/GonieAhn/Data-Science-online-course-from-gonie/main/Data%20Store/example_data.csv"

# Fetch the CSV over HTTPS and parse it into a DataFrame.
data = pd.read_csv(DATA_URL)

In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
# as a quick sanity check of value ranges right after loading.
data.describe()

Unnamed: 0,censor,event,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,...,gender,str2,strat,symptom,cd40,cd420,cd496,r,cd80,cd820
count,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,...,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0
mean,0.340226,801.236842,35.225564,76.061855,0.078947,0.640977,0.118421,95.432331,0.030075,0.546992,...,0.81203,0.580827,1.981203,0.167293,353.204887,336.139098,173.146617,0.603383,987.25,928.214286
std,0.474231,326.887929,8.852094,13.224698,0.26991,0.480165,0.32341,5.981856,0.170955,0.498255,...,0.391056,0.493888,0.905946,0.373589,114.105253,130.961573,191.455406,0.489656,475.223907,438.569798
min,0.0,33.0,13.0,47.401,0.0,0.0,0.0,70.0,0.0,0.0,...,0.0,0.0,1.0,0.0,103.0,49.0,-1.0,0.0,221.0,150.0
25%,0.0,535.75,29.0,67.5,0.0,0.0,0.0,90.0,0.0,0.0,...,1.0,0.0,1.0,0.0,271.0,243.75,-1.0,0.0,653.25,626.5
50%,0.0,933.5,34.0,74.6,0.0,1.0,0.0,100.0,0.0,1.0,...,1.0,1.0,2.0,0.0,346.0,330.5,113.0,1.0,881.0,818.0
75%,1.0,1081.0,40.0,83.502,0.0,1.0,0.0,100.0,0.0,1.0,...,1.0,1.0,3.0,0.0,422.0,418.0,324.0,1.0,1190.0,1164.0
max,1.0,1231.0,70.0,149.0,1.0,1.0,1.0,100.0,1.0,1.0,...,1.0,1.0,3.0,1.0,771.0,909.0,857.0,1.0,4255.0,3130.0


In [4]:
# Feature Name Cleaning
# Replace every '[', ']' and '<' in the column labels with '_'.
# NOTE(review): presumably required because XGBoost (imported above) rejects
# these characters in feature names -- confirm.
#
# The pattern contains no letters, so no case-insensitive flag is needed.
regex = re.compile(r"\[|\]|<")

# `sub` already returns a string unchanged when nothing matches, so no
# separate "does it contain one of these characters" pre-check is required.
# Non-string labels (e.g. integer column names) are passed through untouched
# instead of being handed to the regex, which would raise TypeError.
data.columns = [
    regex.sub("_", col) if isinstance(col, str) else col
    for col in data.columns.values
]

In [5]:
# Show the column labels as they stand after the name-cleaning step.
data.columns

Index(['censor', 'event', 'age', 'wtkg', 'hemo', 'homo', 'drugs', 'karnof',
       'oprior', 'z30', 'zprior', 'preanti', 'race', 'gender', 'str2', 'strat',
       'symptom', 'cd40', 'cd420', 'cd496', 'r', 'cd80', 'cd820'],
      dtype='object')

In [6]:
# Data Quality Checking
# Build a per-column quality summary and drop columns that carry no usable
# information: columns with at most one distinct non-null value, or columns
# that are mostly missing.
MISSING_RATIO_LIMIT = 0.8  # drop when the missing share is >= this value
MIN_USEFUL_LEVELS = 2      # drop when fewer distinct non-null values than this

summary = pd.DataFrame({
    'name': list(data.columns),
    # Share of null cells per column (a 0-1 ratio despite the column title),
    # rounded to 4 decimals.
    'Missing Percentage': data.isnull().mean().round(4).to_numpy(),
    # Number of distinct non-null values per column; nunique() ignores NaN
    # by default.
    'Level': data.nunique().to_numpy(),
})

too_few_levels = summary['Level'] < MIN_USEFUL_LEVELS
too_many_missing = summary['Missing Percentage'] >= MISSING_RATIO_LIMIT
drop_col = summary.loc[too_few_levels | too_many_missing, 'name']

# Rebind `data` rather than mutating it with inplace=True, so the frame that
# earlier cells displayed is not silently altered underneath them.
data = data.drop(columns=drop_col)
print(">>>> Data Shape : {}".format(data.shape))

>>>> Data Shape : (532, 22)
