<a href="https://colab.research.google.com/github/mariskaiz/TugasML/blob/main/ML2_Ind.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# penginputan dataset
data_path = '/content/smoking_health_data_final.csv'
data = pd.read_csv(data_path)

# tampilkan tabel awal
data.head()

Unnamed: 0,age,sex,current_smoker,heart_rate,blood_pressure,cigs_per_day,chol
0,54,male,yes,95,110/72,,219.0
1,45,male,yes,64,121/72,,248.0
2,58,male,yes,81,127.5/76,,235.0
3,42,male,yes,90,122.5/80,,225.0
4,42,male,yes,62,119/80,,226.0


In [2]:
#membuat salinan dataframe untuk processing
data_clean = data.copy()

#mengatasi missing value dengan mengisinya dengan median
#pengecekan missing value
missing_values = data_clean.isnull().sum()
print("Missing values per column:\n", missing_values)

Missing values per column:
 age                0
sex                0
current_smoker     0
heart_rate         0
blood_pressure     0
cigs_per_day      14
chol               7
dtype: int64


In [3]:
#pengisian median
for column in ['heart_rate', 'cigs_per_day', 'chol']:
    median_value = data_clean[column].median()
    data_clean[column].fillna(median_value, inplace=True)

# pembagian 'blood_pressure' menjadi dua kolom yaitu 'systolic_bp' dan 'diastolic_bp'
data_clean[['systolic_bp', 'diastolic_bp']] = data_clean['blood_pressure'].str.split('/', expand=True).astype(float)

# menghilangkan kolom 'blood_pressure'
data_clean.drop(columns=['blood_pressure'], inplace=True)

# Summarize key statistics untuk setiap variabel
summary_statistics = data_clean.describe().T
summary_statistics['range'] = summary_statistics['max'] - summary_statistics['min']
print("\nSummary Statistics:\n", summary_statistics)


Summary Statistics:
                count        mean        std    min    25%    50%    75%  \
age           3900.0   49.537949   8.559275   32.0   42.0   49.0   56.0   
heart_rate    3900.0   75.688974  12.015376   44.0   68.0   75.0   82.0   
cigs_per_day  3900.0    9.135641  12.028875    0.0    0.0    0.0   20.0   
chol          3900.0  236.591282  44.335649  113.0  206.0  234.0  263.0   
systolic_bp   3900.0  132.395385  21.976335   83.5  117.0  128.0  144.0   
diastolic_bp  3900.0   82.987179  11.918623   48.0   75.0   82.0   90.0   

                max  range  
age            70.0   38.0  
heart_rate    143.0   99.0  
cigs_per_day   70.0   70.0  
chol          696.0  583.0  
systolic_bp   295.0  211.5  
diastolic_bp  142.5   94.5  


In [9]:
#pengecekan data baru
print("jumlah baris dan kolom")
print(data_clean.shape)
print(sorted(data_clean.isna().sum()))

jumlah baris dan kolom
(3900, 8)
[0, 0, 0, 0, 0, 0, 0, 0]


In [10]:
data_clean.head()

Unnamed: 0,age,sex,current_smoker,heart_rate,cigs_per_day,chol,systolic_bp,diastolic_bp
0,54,male,yes,95,0.0,219.0,110.0,72.0
1,45,male,yes,64,0.0,248.0,121.0,72.0
2,58,male,yes,81,0.0,235.0,127.5,76.0
3,42,male,yes,90,0.0,225.0,122.5,80.0
4,42,male,yes,62,0.0,226.0,119.0,80.0


In [21]:
# Membuat DataFrame contoh
data_clean = pd.DataFrame({
    'age': [65, 45, 58, 42, 42],
    'heart_rate': [105, 64, 81, 90, 62],
    'cigs_per_day': [16, 0, 0, 0, 0],
    'chol': [248, 248, 235, 225, 226],
    'systolic_bp': [140, 121, 127, 122, 119],
    'diastolic_bp': [72, 72, 76, 80, 80]
})

# Fungsi untuk menentukan kondisi kesehatan berdasarkan kriteria tertentu
def tentukan_kondisi(row):
    if row['age'] > 50 and row['systolic_bp'] > 140:
        return 'Hipertensi'
    elif row['cigs_per_day'] > 15:
        return 'Perokok Berat'
    elif row['chol'] > 240:
        return 'Kolesterol Tinggi'
    elif (row['systolic_bp'] > 120 and row['systolic_bp'] < 140) or (row['diastolic_bp'] > 80 and row['diastolic_bp'] < 90):
        return 'Prehipertensi'
    elif (row['age'] > 65) or (row['heart_rate'] < 60 or row['heart_rate'] > 100):
        return 'Tidak Sehat'
    else:
        return 'Sehat'

# Membuat kolom baru "kondisi_kesehatan" dengan menggunakan fungsi di atas
data_clean['Kondisi_Kesehatan'] = data.apply(tentukan_kondisi, axis=1)

# Menampilkan DataFrame hasil
print(data_clean)

   age  heart_rate  cigs_per_day  chol  systolic_bp  diastolic_bp  \
0   65         105            16   248          140            72   
1   45          64             0   248          121            72   
2   58          81             0   235          127            76   
3   42          90             0   225          122            80   
4   42          62             0   226          119            80   

   Kondisi_Kesehatan  
0              Sehat  
1  Kolesterol Tinggi  
2      Prehipertensi  
3      Prehipertensi  
4              Sehat  


In [23]:
# Memisahkan variabel target dan prediktor
X = data_clean.drop('Kondisi_Kesehatan', axis=1)  # Variabel prediktor
y = data_clean['Kondisi_Kesehatan']  # Variabel target

# Menampilkan variabel prediktor
print("Variabel Prediktor (Features):")
print(X)

# Menampilkan variabel target
print("\nVariabel Target:")
print(y)

Variabel Prediktor (Features):
   age  heart_rate  cigs_per_day  chol  systolic_bp  diastolic_bp
0   65         105            16   248          140            72
1   45          64             0   248          121            72
2   58          81             0   235          127            76
3   42          90             0   225          122            80
4   42          62             0   226          119            80

Variabel Target:
0                Sehat
1    Kolesterol Tinggi
2        Prehipertensi
3        Prehipertensi
4                Sehat
Name: Kondisi_Kesehatan, dtype: object


In [24]:
# Pembagian data menjadi data pelatihan dan data pengujian
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardisasi data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Membangun model Regresi Logistik
model = LogisticRegression()

# Pelatihan model
model.fit(X_train_scaled, y_train)

# Prediksi menggunakan data pengujian
predictions = model.predict(X_test_scaled)

# Evaluasi model
accuracy = accuracy_score(y_test, predictions)
print("Akurasi:", accuracy)
print("\nLaporan Klasifikasi:")
print(classification_report(y_test, predictions))

Akurasi: 0.0

Laporan Klasifikasi:
                   precision    recall  f1-score   support

Kolesterol Tinggi       0.00      0.00      0.00       1.0
            Sehat       0.00      0.00      0.00       0.0

         accuracy                           0.00       1.0
        macro avg       0.00      0.00      0.00       1.0
     weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
