In [1]:
import pandas as pd
import numpy as np
from scipy.signal import find_peaks, peak_prominences
import matplotlib.pyplot as plt


In [2]:
dg_df = pd.read_csv('./big-ideas-lab-glycemic-variability-and-wearable-device-data-1.1.2/Demographics.csv')
dg_df.head(16)

Unnamed: 0,ID,Gender,HbA1c
0,13,MALE,5.7
1,1,FEMALE,5.5
2,3,FEMALE,5.9
3,4,FEMALE,6.4
4,5,FEMALE,5.7
5,2,MALE,5.6
6,6,FEMALE,5.8
7,7,FEMALE,5.3
8,8,FEMALE,5.6
9,10,FEMALE,6.0


추후 Feature Enginnering에 이용될 계산 함수 정의

In [3]:
df = pd.read_csv(r'./features.csv', sep='\t', encoding='utf-8')

In [4]:
df = df.dropna().drop(columns=['Unnamed: 0']).reset_index(drop=True)

In [5]:
df.isna().sum()

calories2hr       0
protein2hr        0
sugar2hr          0
carbs2hr          0
calories8hr       0
                 ..
ID                0
HbA1c             0
Biological Sex    0
WakeTime          0
label             0
Length: 72, dtype: int64

In [6]:
df['Biological Sex'] = df['Biological Sex'].map({'FEMALE': 0, 'MALE': 1})

In [7]:
df['label'] = df['label'].map({'PersLow': 0, 'PersNorm': 1, 'PersHigh':2})

In [8]:
##추가
df['datetime'] = pd.to_datetime(df['datetime']) #datetime 변환

# 변수와 라벨 분리 (glucose 제외) -> 실제 모델을 이용한 예측 시에는 , cgm 데이터(직접 뽑아낸 혈당값)을 사용하지 않음
df = df.drop(columns=['datetime', 'glucose'])

## pycaret 해보기

In [9]:
from pycaret.classification import *

# PyCaret 환경 설정 (7:3 비율로 일단 설정) (train이랑 fold랑 중복으로 필요한지 다시 체크)
exp_clf = setup(data=df, 
                target='label', 
                session_id=123, 
                train_size=0.7, 
                fold_strategy='stratifiedkfold',
                fix_imbalance=True,
                fix_imbalance_method='smote'
                )

# 모델 비교 및 최적 모델 선택
best_model = compare_models() #모델 여러개 적용시켜 보고, 각 모델 별 수치값 뽑기 작업


Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Multiclass
3,Original data shape,"(30061, 70)"
4,Transformed data shape,"(53908, 70)"
5,Transformed train set shape,"(44889, 70)"
6,Transformed test set shape,"(9019, 70)"
7,Numeric features,69
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8846,0.949,0.8846,0.8834,0.8837,0.7404,0.7409,2.941
rf,Random Forest Classifier,0.8771,0.9449,0.8771,0.8779,0.8774,0.7297,0.7299,7.711
xgboost,Extreme Gradient Boosting,0.8484,0.9203,0.8484,0.8459,0.8464,0.6549,0.6561,2.331
lightgbm,Light Gradient Boosting Machine,0.8156,0.8933,0.8156,0.8127,0.8136,0.5818,0.5826,4.877
dt,Decision Tree Classifier,0.7974,0.7935,0.7974,0.8083,0.8015,0.573,0.575,1.168
knn,K Neighbors Classifier,0.7834,0.901,0.7834,0.8342,0.7945,0.5913,0.612,1.876
gbc,Gradient Boosting Classifier,0.6741,0.0,0.6741,0.7208,0.6893,0.3689,0.3782,56.934
ada,Ada Boost Classifier,0.5298,0.0,0.5298,0.6573,0.562,0.2061,0.2268,3.596
lda,Linear Discriminant Analysis,0.491,0.0,0.491,0.663,0.5276,0.1887,0.2173,0.299
lr,Logistic Regression,0.4797,0.0,0.4797,0.6652,0.517,0.1838,0.2147,3.368


In [10]:
# 최적 모델 학습
final_model = finalize_model(best_model)

# 모델 성능 평가
predictions = predict_model(final_model)
print(predictions)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0


       calories2hr  protein2hr  sugar2hr   carbs2hr  calories8hr  protein8hr  \
17140        183.5        12.4      17.9  24.700001  2251.300049  142.800003   
10525          0.0         0.0       0.0   0.000000     0.000000    0.000000   
16645          0.0         0.0       0.0   0.000000   690.700012   36.000000   
23398          0.0         0.0       0.0   0.000000     0.000000    0.000000   
27632          0.0         0.0       0.0   0.000000     0.000000    0.000000   
...            ...         ...       ...        ...          ...         ...   
20362          0.0         0.0       0.0   0.000000   262.000000   11.400000   
23788          0.0         0.0       0.0   0.000000     0.000000    0.000000   
15162          0.0         0.0       0.0   0.000000   542.099976   16.400000   
9266           0.0         0.0       0.0   0.000000   853.500000   29.200001   
26141          0.0         0.0       0.0   0.000000  1360.000000   53.299999   

        sugar8hr    carbs8hr  calories2

In [11]:
# 모델 저장
save_model(final_model, 'best_pycaret_model')


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['calories2hr', 'protein2hr',
                                              'sugar2hr', 'carbs2hr',
                                              'calories8hr', 'protein8hr',
                                              'sugar8hr', 'carbs8hr',
                                              'calories24hr', 'protein24hr',
                                              'sugar24hr', 'carbs24hr', 'eat',
                                              'eatcnt2hr', 'eatcnt8hr',
                                              'eatcnt24hr', 'eatmean2hr',
                                              'eatmean8hr', 'eatmean24hr',
                                              'acc_mean...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini

In [14]:
# 혼동행렬을 생성합니다.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# 실제 값과 예측 값을 가져옵니다.
y_true = predictions['label']
y_pred = predictions['Label']

# 혼동행렬을 생성합니다.
cm = confusion_matrix(y_true, y_pred)

# 혼동행렬을 출력합니다.
print('Confusion Matrix:')
print(cm)

# 혼동행렬을 시각화합니다.
import matplotlib.pyplot as plt

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

KeyError: 'Label'