In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, KFold
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
import joblib

In [7]:
# Training data is read from db1.csv and evaluation data from db2.csv.
db1 = pd.read_csv('../data/db1.csv')
db2 = pd.read_csv('../data/db2.csv')

# Name of the target column shared by both files.
TARGET_COL = 'Diabetes_binary'
# Label 2 is folded into class 1 so the target only takes the values the
# binary classifier is trained on.
LABEL_MAP = {2: 1}

X_train = db1.drop(TARGET_COL, axis=1)
y_train = db1[TARGET_COL].replace(LABEL_MAP)

X_test = db2.drop(TARGET_COL, axis=1)
# Apply the same mapping as for y_train so both targets share one encoding;
# this is a no-op when db2 contains no label 2.
y_test = db2[TARGET_COL].replace(LABEL_MAP)


In [5]:
# Steps are listed in the order the pipeline applies them:
# ADASYN oversampling -> standardisation -> XGBoost classifier.
# NOTE(review): the oversampler is placed before the scaler, so it presumably
# works on unscaled features — confirm this ordering is intended.
pipeline_steps = [
    ('adasyn', ADASYN(n_neighbors=2, random_state=42)),
    ('scaler', StandardScaler()),
    (
        'model',
        XGBClassifier(
            n_estimators=400,
            learning_rate=0.01,
            scale_pos_weight=1,
            random_state=42,
        ),
    ),
]
pipeline = Pipeline(steps=pipeline_steps)

# No fit is called in this cell, so what gets written to disk here is the
# pipeline definition as constructed above.
joblib.dump(pipeline, "../models/pipeline_side.pkl")

['../models/pipeline_side.pkl']

In [9]:
# 10-fold splitter; shuffling is seeded so the fold assignment is repeatable.
# NOTE(review): plain KFold does not stratify by label — confirm that is
# acceptable for this target's class balance.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
print(kf)

# No `scoring` argument is given, so the estimator's default scorer is used.
cross_val_res = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=kf,
)
print(cross_val_res)

KFold(n_splits=10, random_state=42, shuffle=True)
[0.82229581 0.82848471 0.82284768 0.82698675 0.82643488 0.82071902
 0.82304478 0.82430621 0.82075844 0.8230842 ]


In [10]:
# Per-fold scores, numbered from 1.
print('CVR: ')
for fold_no, fold_score in enumerate(cross_val_res, start=1):
    print(f"Fold {fold_no}: {fold_score}")

# Mean of the fold scores, reported under the "ACC" label.
mean_score = cross_val_res.mean()
print("ACC:", mean_score)

# Echo the splitter configuration alongside the scores.
print(kf)

CVR: 
Fold 1: 0.8222958057395143
Fold 2: 0.8284847051403342
Fold 3: 0.8228476821192053
Fold 4: 0.8269867549668874
Fold 5: 0.8264348785871964
Fold 6: 0.8207190160832545
Fold 7: 0.8230447808262378
Fold 8: 0.8243062125512457
Fold 9: 0.820758435824661
Fold 10: 0.8230842005676443
ACC: 0.8238962472406183
KFold(n_splits=10, random_state=42, shuffle=True)


In [None]:
# Persist the splitter object and the array of fold scores.
# NOTE(review): the file is named "k_fold5" but the KFold built in this
# notebook uses n_splits=10 — confirm the name before relying on it.
joblib.dump(value=kf, filename="../models/k_fold5.pkl")
joblib.dump(value=cross_val_res, filename="../models/cv_res.pkl")

['../models/cv_res.pkl']