In [1]:
#import library
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

In [2]:
df = pd.read_csv('osteoporosis.csv')
df.head()

Unnamed: 0,Id,Age,Gender,Hormonal Changes,Family History,Race/Ethnicity,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis
0,104866,69,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,Yes,Moderate,Rheumatoid Arthritis,Corticosteroids,Yes,1
1,101999,32,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,No,,,,Yes,1
2,106567,89,Female,Postmenopausal,No,Caucasian,Normal,Adequate,Sufficient,Active,No,Moderate,Hyperthyroidism,Corticosteroids,No,1
3,102316,78,Female,Normal,No,Caucasian,Underweight,Adequate,Insufficient,Sedentary,Yes,,Rheumatoid Arthritis,Corticosteroids,No,1
4,101944,38,Male,Postmenopausal,Yes,African American,Normal,Low,Sufficient,Active,Yes,,Rheumatoid Arthritis,,Yes,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1958 entries, 0 to 1957
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Id                   1958 non-null   int64 
 1   Age                  1958 non-null   int64 
 2   Gender               1958 non-null   object
 3   Hormonal Changes     1958 non-null   object
 4   Family History       1958 non-null   object
 5   Race/Ethnicity       1958 non-null   object
 6   Body Weight          1958 non-null   object
 7   Calcium Intake       1958 non-null   object
 8   Vitamin D Intake     1958 non-null   object
 9   Physical Activity    1958 non-null   object
 10  Smoking              1958 non-null   object
 11  Alcohol Consumption  970 non-null    object
 12  Medical Conditions   1311 non-null   object
 13  Medications          973 non-null    object
 14  Prior Fractures      1958 non-null   object
 15  Osteoporosis         1958 non-null   int64 
dtypes: int

In [4]:
#ukuran data
df.shape

(1958, 16)

In [5]:
  #melihat unique Value
  df.nunique()

Id                     1749
Age                      73
Gender                    2
Hormonal Changes          2
Family History            2
Race/Ethnicity            3
Body Weight               2
Calcium Intake            2
Vitamin D Intake          2
Physical Activity         2
Smoking                   2
Alcohol Consumption       1
Medical Conditions        2
Medications               1
Prior Fractures           2
Osteoporosis              2
dtype: int64

In [6]:
#menghapus kolom id
df=df.drop(columns=['Id'])

In [7]:
#jumlah baris kosong pada kolom alkohol, kondisi medis, dan obat
nan_count = df[['Alcohol Consumption', 'Medical Conditions', 'Medications']].isna().all(axis=1).sum()

print("Jumlah baris dengan NaN di ketiga kolom:", nan_count)

Jumlah baris dengan NaN di ketiga kolom: 171


In [8]:
#menghapus baris yang kosong
df_clean=df.dropna(subset=['Alcohol Consumption', 'Medical Conditions', 'Medications'], how='all')
df_clean.shape

(1787, 15)

In [9]:
#mengubah isian kolom 
df_clean['Alcohol Consumption'] = df_clean['Alcohol Consumption'].fillna('No').replace('Moderate', 'Yes')
df_clean['Medical Conditions'] = df_clean['Medical Conditions'].fillna('No')
df_clean['Medications'] = df_clean['Medications'].fillna('No')
df_clean.head()

Unnamed: 0,Age,Gender,Hormonal Changes,Family History,Race/Ethnicity,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis
0,69,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,Yes,Yes,Rheumatoid Arthritis,Corticosteroids,Yes,1
2,89,Female,Postmenopausal,No,Caucasian,Normal,Adequate,Sufficient,Active,No,Yes,Hyperthyroidism,Corticosteroids,No,1
3,78,Female,Normal,No,Caucasian,Underweight,Adequate,Insufficient,Sedentary,Yes,No,Rheumatoid Arthritis,Corticosteroids,No,1
4,38,Male,Postmenopausal,Yes,African American,Normal,Low,Sufficient,Active,Yes,No,Rheumatoid Arthritis,No,Yes,1
5,41,Male,Normal,Yes,Caucasian,Normal,Low,Sufficient,Active,Yes,Yes,Rheumatoid Arthritis,Corticosteroids,Yes,1


In [10]:
# Ubah nilai 'Postmenopausal' menjadi 'Normal' untuk baris dengan 'gender' adalah 'male' dan 'hormonal changes' adalah 'Postmenopausal'
df_clean.loc[(df_clean['Gender'] == 'Male') & (df_clean['Hormonal Changes'] == 'Postmenopausal'), 'Hormonal Changes'] = 'Normal'

# Tampilkan dataframe setelah perubahan
df_clean

Unnamed: 0,Age,Gender,Hormonal Changes,Family History,Race/Ethnicity,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis
0,69,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,Yes,Yes,Rheumatoid Arthritis,Corticosteroids,Yes,1
2,89,Female,Postmenopausal,No,Caucasian,Normal,Adequate,Sufficient,Active,No,Yes,Hyperthyroidism,Corticosteroids,No,1
3,78,Female,Normal,No,Caucasian,Underweight,Adequate,Insufficient,Sedentary,Yes,No,Rheumatoid Arthritis,Corticosteroids,No,1
4,38,Male,Normal,Yes,African American,Normal,Low,Sufficient,Active,Yes,No,Rheumatoid Arthritis,No,Yes,1
5,41,Male,Normal,Yes,Caucasian,Normal,Low,Sufficient,Active,Yes,Yes,Rheumatoid Arthritis,Corticosteroids,Yes,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1953,19,Female,Normal,Yes,African American,Normal,Adequate,Sufficient,Sedentary,Yes,Yes,Rheumatoid Arthritis,No,Yes,0
1954,23,Female,Postmenopausal,Yes,Caucasian,Underweight,Low,Insufficient,Active,No,No,No,Corticosteroids,No,0
1955,34,Female,Postmenopausal,No,African American,Underweight,Low,Sufficient,Sedentary,No,No,Hyperthyroidism,No,No,0
1956,25,Male,Normal,No,African American,Normal,Low,Insufficient,Sedentary,Yes,No,Rheumatoid Arthritis,Corticosteroids,Yes,0


In [11]:
df_clean.duplicated().sum()

6

In [12]:
# Menghapus baris yang duplikat dari DataFrame df_clean
df_clean = df_clean.drop_duplicates()

# Jumlah total baris yang telah dihapus
total_duplicates = df_clean.duplicated().sum()

print("Total baris duplikat:", total_duplicates)

Total baris duplikat: 0


In [13]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1781 entries, 0 to 1957
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  1781 non-null   int64 
 1   Gender               1781 non-null   object
 2   Hormonal Changes     1781 non-null   object
 3   Family History       1781 non-null   object
 4   Race/Ethnicity       1781 non-null   object
 5   Body Weight          1781 non-null   object
 6   Calcium Intake       1781 non-null   object
 7   Vitamin D Intake     1781 non-null   object
 8   Physical Activity    1781 non-null   object
 9   Smoking              1781 non-null   object
 10  Alcohol Consumption  1781 non-null   object
 11  Medical Conditions   1781 non-null   object
 12  Medications          1781 non-null   object
 13  Prior Fractures      1781 non-null   object
 14  Osteoporosis         1781 non-null   int64 
dtypes: int64(2), object(13)
memory usage: 222.6+ KB


## Feature Encoding

In [14]:
label_encoder = LabelEncoder()
binary_cols = ['Gender', 'Hormonal Changes', 'Family History', 'Body Weight','Prior Fractures',
               'Calcium Intake', 'Vitamin D Intake', 'Physical Activity', 'Smoking', 'Alcohol Consumption', 'Medications']
for col in binary_cols:
    df_clean[col] = label_encoder.fit_transform(df_clean[col])

# One-Hot Encoding untuk kolom-kolom dengan lebih dari dua nilai unik
df_clean = pd.get_dummies(df_clean, columns=['Race/Ethnicity', 'Medical Conditions'], dtype=int)
df_clean

Unnamed: 0,Age,Gender,Hormonal Changes,Family History,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medications,Prior Fractures,Osteoporosis,Race/Ethnicity_African American,Race/Ethnicity_Asian,Race/Ethnicity_Caucasian,Medical Conditions_Hyperthyroidism,Medical Conditions_No,Medical Conditions_Rheumatoid Arthritis
0,69,0,0,1,1,1,1,1,1,1,0,1,1,0,1,0,0,0,1
2,89,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,1,0,0
3,78,0,0,0,1,0,0,1,1,0,0,0,1,0,0,1,0,0,1
4,38,1,0,1,0,1,1,0,1,0,1,1,1,1,0,0,0,0,1
5,41,1,0,1,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1953,19,0,0,1,0,0,1,1,1,1,1,1,0,1,0,0,0,0,1
1954,23,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0
1955,34,0,1,0,1,1,1,1,0,0,1,0,0,1,0,0,1,0,0
1956,25,1,0,0,0,1,0,1,1,0,0,1,0,1,0,0,0,0,1


## Scaling & Split data

In [15]:
from sklearn.preprocessing import StandardScaler
import pickle

# Membuat objek scaler
scaler = StandardScaler()

df_scaled = df_clean.copy()  # Membuat salinan DataFrame
columns_to_scale = [col for col in df_scaled.columns if col != 'Osteoporosis']

# Ambil nilai kolom 'osteoporosis' sebelum scaling
osteoporosis_column = df_scaled['Osteoporosis']

# Fit dan transformasi data menggunakan StandardScaler hanya pada kolom fitur
df_scaled[columns_to_scale] = scaler.fit_transform(df_scaled[columns_to_scale])

# Masukkan kembali kolom 'osteoporosis' yang tidak di-scaling
df_scaled['Osteoporosis'] = osteoporosis_column

# Simpan scaler ke dalam file menggunakan pickle
with open('scaler.sav', 'wb') as f:
    pickle.dump(scaler, f)

df_scaled

Unnamed: 0,Age,Gender,Hormonal Changes,Family History,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medications,Prior Fractures,Osteoporosis,Race/Ethnicity_African American,Race/Ethnicity_Asian,Race/Ethnicity_Caucasian,Medical Conditions_Hyperthyroidism,Medical Conditions_No,Medical Conditions_Rheumatoid Arthritis
0,1.389115,-1.011861,-0.576270,1.023292,1.046563,0.961980,0.960899,1.044211,0.996077,0.920603,-0.915411,1.002811,1,-0.731673,1.451965,-0.701160,-0.782154,-0.601349,1.348346
2,2.324382,-1.011861,1.735299,-0.977238,-0.955509,-1.039523,0.960899,-0.957661,-1.003938,0.920603,-0.915411,-0.997197,1,-0.731673,-0.688722,1.426209,1.278521,-0.601349,-0.741649
3,1.809986,-1.011861,-0.576270,-0.977238,1.046563,-1.039523,-1.040693,1.044211,0.996077,-1.086244,-0.915411,-0.997197,1,-0.731673,-0.688722,1.426209,-0.782154,-0.601349,1.348346
4,-0.060548,0.988278,-0.576270,1.023292,-0.955509,0.961980,0.960899,-0.957661,0.996077,-1.086244,1.092405,1.002811,1,1.366731,-0.688722,-0.701160,-0.782154,-0.601349,1.348346
5,0.079742,0.988278,-0.576270,1.023292,-0.955509,0.961980,0.960899,-0.957661,0.996077,0.920603,-0.915411,1.002811,1,-0.731673,-0.688722,1.426209,-0.782154,-0.601349,1.348346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1953,-0.949052,-1.011861,-0.576270,1.023292,-0.955509,-1.039523,0.960899,1.044211,0.996077,0.920603,1.092405,1.002811,0,1.366731,-0.688722,-0.701160,-0.782154,-0.601349,1.348346
1954,-0.761998,-1.011861,1.735299,1.023292,1.046563,0.961980,-1.040693,-0.957661,-1.003938,-1.086244,-0.915411,-0.997197,0,-0.731673,-0.688722,1.426209,-0.782154,1.662927,-0.741649
1955,-0.247602,-1.011861,1.735299,-0.977238,1.046563,0.961980,0.960899,1.044211,-1.003938,-1.086244,1.092405,-0.997197,0,1.366731,-0.688722,-0.701160,1.278521,-0.601349,-0.741649
1956,-0.668472,0.988278,-0.576270,-0.977238,-0.955509,0.961980,-1.040693,1.044211,0.996077,-1.086244,-0.915411,1.002811,0,1.366731,-0.688722,-0.701160,-0.782154,-0.601349,1.348346


In [16]:
# Memisahkan data menjadi fitur (X) dan target (y)
X = df_scaled.drop(columns=['Osteoporosis'])  # Mengambil semua kolom kecuali kolom target
y = df_scaled['Osteoporosis']  # Mengambil kolom target

In [17]:
# Train Test Split
from sklearn.model_selection import train_test_split

X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X, y, test_size=0.4, random_state=42)

In [18]:
split_list = [X_train_3, X_test_3, y_train_3, y_test_3]

# Looping untuk mengecek ukuran variabel dan nama variabel
for var in split_list:
    # Mendapatkan nama variabel dengan menggunakan globals() atau locals()
    var_name = [name for name, value in locals().items() if value is var][0]

    # Menggunakan shape untuk mendapatkan ukuran variabel
    var_shape = var.shape

    print(f"Variable name: {var_name}, Shape: {var_shape}")

Variable name: X_train_3, Shape: (1068, 18)
Variable name: X_test_3, Shape: (713, 18)
Variable name: y_train_3, Shape: (1068,)
Variable name: y_test_3, Shape: (713,)


In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_validate


def eval_classification(model):
    y_pred_3 = model.predict(X_test_3)
    y_pred_train_3 = model.predict(X_train_3)
    y_pred_proba_3 = model.predict_proba(X_test_3)
    y_pred_proba_train_3 = model.predict_proba(X_train_3)

    print("Accuracy (Train Set): {:.2f}%".format(accuracy_score(y_train_3, y_pred_train_3) * 100))
    print("Accuracy (Test Set): {:.2f}%".format(accuracy_score(y_test_3, y_pred_3) * 100))
    print("Precision (Train Set): {:.2f}%".format(precision_score(y_train_3, y_pred_train_3, average='weighted') * 100))
    print("Precision (Test Set): {:.2f}%".format(precision_score(y_test_3, y_pred_3, average='weighted') * 100))
    print("Recall (Train Set): {:.2f}%".format(recall_score(y_train_3, y_pred_train_3, average='weighted') * 100))
    print("Recall (Test Set): {:.2f}%".format(recall_score(y_test_3, y_pred_3, average='weighted') * 100))
    print("F1-Score (Train Set): {:.2f}%".format(f1_score(y_train_3, y_pred_train_3, average='weighted') * 100))
    print("F1-Score (Test Set): {:.2f}%".format(f1_score(y_test_3, y_pred_3, average='weighted') * 100))


In [20]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Mengatur seed acak untuk reproducibility
np.random.seed(42)

base_estimator = DecisionTreeClassifier(max_depth=1, splitter='best', random_state=42)
clf_2 = AdaBoostClassifier(base_estimator=base_estimator, random_state=42)
clf_2.fit(X_train_3, y_train_3)
eval_classification(clf_2)

Accuracy (Train Set): 90.17%
Accuracy (Test Set): 92.01%
Precision (Train Set): 91.79%
Precision (Test Set): 93.13%
Recall (Train Set): 90.17%
Recall (Test Set): 92.01%
F1-Score (Train Set): 90.08%
F1-Score (Test Set): 91.97%


## Model Prediksi

In [21]:
input_data = (26, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, True, False, False, False, False, True)
input_data_as_numpy = np.array(input_data)
input_data_reshape = input_data_as_numpy.reshape(1,-1)

std_data = scaler.transform(input_data_reshape)
print(std_data)

prediction = clf_2.predict(std_data)
print(prediction)

if(prediction[0] == 0):
    print('Pasien tidak terkena resiko osteoporosis')
else:
    print('Pasien terkena resiko osteoporosis')

[[-0.62170827 -1.01186147  1.73529861 -0.97723821  1.046563   -1.03952272
   0.96089861  1.04421051  0.99607732 -1.08624425 -0.915411   -0.99719652
   1.36673147 -0.68872182 -0.7011596  -0.78215389 -0.60134915  1.34834639]]
[0]
Pasien tidak terkena resiko osteoporosis


In [22]:
input_data = (26, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1)
input_data_as_numpy = np.array(input_data)
input_data_reshape = input_data_as_numpy.reshape(1,-1)

std_data = scaler.transform(input_data_reshape)
print(std_data)

prediction = clf_2.predict(std_data)
print(prediction)

if(prediction[0] == 0):
    print('Pasien tidak terkena osteoporosis')
else:
    print('Pasien terkena osteoporosis')

[[-0.62170827 -1.01186147  1.73529861 -0.97723821  1.046563   -1.03952272
   0.96089861  1.04421051  0.99607732 -1.08624425 -0.915411   -0.99719652
   1.36673147 -0.68872182 -0.7011596  -0.78215389 -0.60134915  1.34834639]]
[0]
Pasien tidak terkena osteoporosis


In [23]:
input_data = (78,0,0,0,1,0,0,1,1,0,0,0,0,0,1,0,0,1)
input_data_as_numpy = np.array(input_data)
input_data_reshape = input_data_as_numpy.reshape(1,-1)

std_data = scaler.transform(input_data_reshape)
print(std_data)

prediction = clf_2.predict(std_data)
print(prediction)

if(prediction[0] == 0):
    print('Pasien tidak terkena osteoporosis')
else:
    print('Pasien terkena osteoporosis')

[[ 1.80998555 -1.01186147 -0.5762697  -0.97723821  1.046563   -1.03952272
  -1.04069253  1.04421051  0.99607732 -1.08624425 -0.915411   -0.99719652
  -0.73167262 -0.68872182  1.4262088  -0.78215389 -0.60134915  1.34834639]]
[1]
Pasien terkena osteoporosis


## Simpan Model

In [24]:
import pickle

In [25]:
#filename = 'osteoporosis_model.sav'
#pickle.dump(clf,open(filename,'wb'))
with open('osteoporosis_model.sav', 'wb') as file:
    pickle.dump(clf_2, file)