## Load Dataset Skrining Diabetes

In [None]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# Download UCI dataset 529 ("Early Stage Diabetes Risk Prediction").
early_stage_diabetes_risk_prediction = fetch_ucirepo(id=529)

# Split the payload into the feature table and the target table
# (both are pandas DataFrames).
dataset_tables = early_stage_diabetes_risk_prediction.data
X, y = dataset_tables.features, dataset_tables.targets

# Print the dataset-level metadata first, then the per-variable description.
for section_name in ("metadata", "variables"):
    print(getattr(early_stage_diabetes_risk_prediction, section_name))


{'uci_id': 529, 'name': 'Early Stage Diabetes Risk Prediction', 'repository_url': 'https://archive.ics.uci.edu/dataset/529/early+stage+diabetes+risk+prediction+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/529/data.csv', 'abstract': 'This dataset contains the sign and symptpom data of newly diabetic or would be diabetic patient. ', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 520, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Gender'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Mon Mar 04 2024', 'dataset_doi': '10.24432/C5VG8H', 'creators': [], 'intro_paper': {'ID': 397, 'type': 'NATIVE', 'title': 'Likelihood Prediction of Diabetes at Early Stage Using Data Mining Techniques', 'authors': 'M. M. F. Islam, Rahatara Ferdousi, Sadikur Rahman, Humayra Yas

In [None]:
from IPython.display import display

# Show each raw table under its own caption, features first and targets second.
for caption, frame in (("Features (X):", X), ("Targets (y):", y)):
    print(caption)
    display(frame)

Features (X):


Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No
516,48,Female,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes
518,32,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No


Targets (y):


Unnamed: 0,class
0,Positive
1,Positive
2,Positive
3,Positive
4,Positive
...,...
515,Positive
516,Positive
517,Positive
518,Negative


## Preprocess Data




In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from IPython.display import display

# One-hot encode the categorical features. With drop_first=True each two-level
# column collapses to a single indicator (e.g. gender -> gender_Male,
# polyuria -> polyuria_Yes); the numeric `age` column passes through unchanged.
X_encoded = pd.get_dummies(X, drop_first=True)

# Turn the string target into integer codes and keep the fitted encoder (`le`)
# so predictions can be mapped back to the original labels later.
le = LabelEncoder()
y_encoded = pd.DataFrame({'class': le.fit_transform(y.iloc[:, 0])})

print("Encoded Features (X_encoded):")
display(X_encoded.head())

print("Encoded Target (y_encoded):")
display(y_encoded.head())

Encoded Features (X_encoded):


Unnamed: 0,age,gender_Male,polyuria_Yes,polydipsia_Yes,sudden_weight_loss_Yes,weakness_Yes,polyphagia_Yes,genital_thrush_Yes,visual_blurring_Yes,itching_Yes,irritability_Yes,delayed_healing_Yes,partial_paresis_Yes,muscle_stiffness_Yes,alopecia_Yes,obesity_Yes
0,40,True,False,True,False,True,False,False,False,True,False,True,False,True,True,True
1,58,True,False,False,False,True,False,False,True,False,False,False,True,False,True,False
2,41,True,True,False,False,True,True,False,False,True,False,True,False,True,True,False
3,45,True,False,False,True,True,True,True,False,True,False,True,False,False,False,False
4,60,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True


Encoded Target (y_encoded):


Unnamed: 0,class
0,1
1,1
2,1
3,1
4,1


## Train XGBoost Model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Report the size of every partition as a quick sanity check.
for split_name, split_frame in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_frame.shape)

X_train shape: (416, 16)
X_test shape: (104, 16)
y_train shape: (416, 1)
y_test shape: (104, 1)


In [None]:
import xgboost as xgb

# Classifier settings: log-loss as the evaluation metric and a fixed seed
# so repeated runs train the same model.
xgb_params = {"eval_metric": "logloss", "random_state": 42}
model = xgb.XGBClassifier(**xgb_params)

# Fit on the training partition only; the test partition stays unseen.
model.fit(X_train, y_train)

print("XGBoost model training complete.")

XGBoost model training complete.


## Evaluate Model Performance



In [None]:
# Predict class codes for the held-out test partition; `y_pred` feeds the
# metric calculations in the next cell.
y_pred = model.predict(X_test)
print("Predictions generated successfully.")

Predictions generated successfully.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the test-set predictions with each metric, in the order they are reported.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print every metric rounded to four decimal places.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 0.9808
Precision: 1.0000
Recall: 0.9718
F1-Score: 0.9857


## Save Model



In [None]:
import joblib

# Everything inference needs: the fitted classifier, the label encoder that maps
# class codes back to label strings, and the exact training column order.
artifacts = (
    ("XGBoost model", model, 'xgboost_model.joblib'),
    ("LabelEncoder", le, 'label_encoder.joblib'),
    ("Feature column names", X_encoded.columns.tolist(), 'feature_columns.joblib'),
)

# Persist each artifact to its own joblib file and confirm it.
for artifact_label, artifact, artifact_path in artifacts:
    joblib.dump(artifact, artifact_path)
    print(f"{artifact_label} saved as '{artifact_path}'")

XGBoost model saved as 'xgboost_model.joblib'
LabelEncoder saved as 'label_encoder.joblib'
Feature column names saved as 'feature_columns.joblib'


In [None]:
import zipfile
import os

# Artifacts that together make up a deployable model bundle.
files_to_zip = ['xgboost_model.joblib', 'label_encoder.joblib', 'feature_columns.joblib']
zip_file_name = 'xgboost_deployment_artifacts.zip'

# Artifacts that could not be found; used to report an honest final status.
missing_files = []

# Bundle every artifact that exists (stored flat, without directory components).
# Missing artifacts are skipped with a warning rather than aborting the cell.
with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for file in files_to_zip:
        if os.path.exists(file):
            zipf.write(file, os.path.basename(file))
            print(f"Added {file} to {zip_file_name}")
        else:
            missing_files.append(file)
            print(f"Warning: {file} not found and will not be added to the zip.")

# Only claim success when the archive really contains every artifact; an
# incomplete bundle must not be mistaken for a deployable one.
if missing_files:
    print(f"Warning: {zip_file_name} is incomplete; missing: {', '.join(missing_files)}")
else:
    print(f"Successfully created {zip_file_name} for deployment artifacts.")

Added xgboost_model.joblib to xgboost_deployment_artifacts.zip
Added label_encoder.joblib to xgboost_deployment_artifacts.zip
Added feature_columns.joblib to xgboost_deployment_artifacts.zip
Successfully created xgboost_deployment_artifacts.zip for deployment artifacts.


## Tes Prediksi Model

In [None]:
import pandas as pd

# Column order mirrors the raw feature table: age, gender, then the 14 Yes/No symptom flags.
feature_names = [
    'age', 'gender', 'polyuria', 'polydipsia', 'sudden_weight_loss', 'weakness',
    'polyphagia', 'genital_thrush', 'visual_blurring', 'itching', 'irritability',
    'delayed_healing', 'partial_paresis', 'muscle_stiffness', 'alopecia', 'obesity',
]
symptom_count = len(feature_names) - 2

# Three hand-written samples: every symptom present, no symptom present,
# and a mixed profile.
sample_rows = [
    (50, 'Male', *['Yes'] * symptom_count),
    (30, 'Female', *['No'] * symptom_count),
    (45, 'Male',
     'No', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes',
     'No', 'No', 'No', 'No', 'No', 'No', 'No'),
]
new_data = pd.DataFrame(sample_rows, columns=feature_names)

print("New Sample Data:")
display(new_data)

# Encode the new samples into exactly the columns the model was trained on.
#
# Do NOT use drop_first=True here: the dropped level is chosen from the batch
# being encoded, so a batch in which a column holds only 'Yes' (e.g. a single
# patient) would lose that level and the symptom would silently become False.
# Instead, expand every level and keep only the training columns:
#   * levels that were dropped at training time (e.g. *_No) are discarded,
#   * training columns absent from this batch are filled with False,
#   * dtypes are cast to match the training frame.
new_data_encoded = (
    pd.get_dummies(new_data)
    .reindex(columns=X_encoded.columns, fill_value=False)
    .astype(X_encoded.dtypes.to_dict())
)

print("\nPreprocessed New Sample Data (aligned columns):")
display(new_data_encoded)

# Predict integer class codes, then map them back to the original label
# strings with the encoder fitted on the training target.
new_predictions_numeric = model.predict(new_data_encoded)
new_predictions_labels = le.inverse_transform(new_predictions_numeric)

print("\nPredictions for New Sample Data:")
for sample_number, predicted_label in enumerate(new_predictions_labels, start=1):
    print(f"Sample {sample_number}: Predicted Class = {predicted_label}")


New Sample Data:


Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity
0,50,Male,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes
1,30,Female,No,No,No,No,No,No,No,No,No,No,No,No,No,No
2,45,Male,No,Yes,No,Yes,No,No,Yes,No,No,No,No,No,No,No



Preprocessed New Sample Data (aligned columns):


Unnamed: 0,age,gender_Male,polyuria_Yes,polydipsia_Yes,sudden_weight_loss_Yes,weakness_Yes,polyphagia_Yes,genital_thrush_Yes,visual_blurring_Yes,itching_Yes,irritability_Yes,delayed_healing_Yes,partial_paresis_Yes,muscle_stiffness_Yes,alopecia_Yes,obesity_Yes
0,50,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,30,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,45,True,False,True,False,True,False,False,True,False,False,False,False,False,False,False



Predictions for New Sample Data:
Sample 1: Predicted Class = Positive
Sample 2: Predicted Class = Negative
Sample 3: Predicted Class = Positive
