In [1]:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [2]:
# 1) Data construction
# Collect and prepare the data that will be used to train and test the model.
# The CSV path is relative, so it is resolved against the kernel's working directory.
data_path = 'hungary_chickenpox.csv'
df = pd.read_csv(data_path)

In [3]:

# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line after the summary.
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      522 non-null    object
 1   BUDAPEST  522 non-null    int64 
 2   BARANYA   522 non-null    int64 
 3   BACS      522 non-null    int64 
 4   BEKES     522 non-null    int64 
 5   BORSOD    522 non-null    int64 
 6   CSONGRAD  522 non-null    int64 
 7   FEJER     522 non-null    int64 
 8   GYOR      522 non-null    int64 
 9   HAJDU     522 non-null    int64 
 10  HEVES     522 non-null    int64 
 11  JASZ      522 non-null    int64 
 12  KOMAROM   522 non-null    int64 
 13  NOGRAD    522 non-null    int64 
 14  PEST      522 non-null    int64 
 15  SOMOGY    522 non-null    int64 
 16  SZABOLCS  522 non-null    int64 
 17  TOLNA     522 non-null    int64 
 18  VAS       522 non-null    int64 
 19  VESZPREM  522 non-null    int64 
 20  ZALA      522 non-null    int64 
dtypes: int64(20), ob

In [4]:

# Count the missing cells in each column and show them beneath a header line.
missing_per_column = df.isna().sum()
print("Jumlah Missing Values per Kolom:", missing_per_column, sep="\n")


Jumlah Missing Values per Kolom:
Date        0
BUDAPEST    0
BARANYA     0
BACS        0
BEKES       0
BORSOD      0
CSONGRAD    0
FEJER       0
GYOR        0
HAJDU       0
HEVES       0
JASZ        0
KOMAROM     0
NOGRAD      0
PEST        0
SOMOGY      0
SZABOLCS    0
TOLNA       0
VAS         0
VESZPREM    0
ZALA        0
dtype: int64


In [5]:

# BUDAPEST is the regression target; every other column except the
# object-dtype Date column is used as a feature.
target_column, non_float = 'BUDAPEST', 'Date'
y = df[target_column]
X = df.drop(columns=[target_column, non_float])
X, y

(     BARANYA  BACS  BEKES  BORSOD  CSONGRAD  FEJER  GYOR  HAJDU  HEVES  JASZ   
 0         79    30    173     169        42    136   120    162     36   130  \
 1         60    30     92     200        53     51    70     84     28    80   
 2         44    31     86      93        30     93    84    191     51    64   
 3         49    43    126      46        39     52   114    107     42    63   
 4         78    53     87     103        34     95   131    172     40    61   
 ..       ...   ...    ...     ...       ...    ...   ...    ...    ...   ...   
 517       12    41      6      39         0     16    15     14     10    56   
 518       39    31     10      34         3      2    30     25     19    34   
 519        7    15      0       0         0      7     7      4      2    30   
 520       23     8      0      11         4      1     9     10     17    27   
 521       42    49     32      38        15     11    98     61     38   112   
 
      KOMAROM  NOGRAD  PES

In [6]:
# Replace missing feature values with the per-column mean.
# NOTE(review): the missing-value count printed earlier is 0 for every column, so on
# this data the step presumably only casts the integer counts to float.
# NOTE(review): the imputer is fitted on all rows, before the train/test split that
# happens later in the notebook; if the data ever contains gaps, fit it on the
# training rows only so test-set statistics do not leak into training.
imputer = SimpleImputer(strategy='mean')
# By default fit_transform returns a plain array, so the column labels and the row
# index are re-attached explicitly; without index=..., a non-default index on X
# would be silently replaced by 0..n-1.
X_imputed = pd.DataFrame(
    imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_imputed

Unnamed: 0,BARANYA,BACS,BEKES,BORSOD,CSONGRAD,FEJER,GYOR,HAJDU,HEVES,JASZ,KOMAROM,NOGRAD,PEST,SOMOGY,SZABOLCS,TOLNA,VAS,VESZPREM,ZALA
0,79.0,30.0,173.0,169.0,42.0,136.0,120.0,162.0,36.0,130.0,57.0,2.0,178.0,66.0,64.0,11.0,29.0,87.0,68.0
1,60.0,30.0,92.0,200.0,53.0,51.0,70.0,84.0,28.0,80.0,50.0,29.0,141.0,48.0,29.0,58.0,53.0,68.0,26.0
2,44.0,31.0,86.0,93.0,30.0,93.0,84.0,191.0,51.0,64.0,46.0,4.0,157.0,33.0,33.0,24.0,18.0,62.0,44.0
3,49.0,43.0,126.0,46.0,39.0,52.0,114.0,107.0,42.0,63.0,54.0,14.0,107.0,66.0,50.0,25.0,21.0,43.0,31.0
4,78.0,53.0,87.0,103.0,34.0,95.0,131.0,172.0,40.0,61.0,49.0,11.0,124.0,63.0,56.0,7.0,47.0,85.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,12.0,41.0,6.0,39.0,0.0,16.0,15.0,14.0,10.0,56.0,7.0,13.0,122.0,4.0,23.0,4.0,11.0,110.0,10.0
518,39.0,31.0,10.0,34.0,3.0,2.0,30.0,25.0,19.0,34.0,20.0,18.0,70.0,36.0,5.0,23.0,22.0,63.0,9.0
519,7.0,15.0,0.0,0.0,0.0,7.0,7.0,4.0,2.0,30.0,36.0,4.0,72.0,5.0,21.0,14.0,0.0,17.0,10.0
520,23.0,8.0,0.0,11.0,4.0,1.0,9.0,10.0,17.0,27.0,17.0,21.0,12.0,5.0,17.0,1.0,1.0,83.0,2.0


In [10]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# 2) Build the model
# Choose and build a regression model, such as Linear Regression or Random Forest Regressor.

# A forest of 100 trees; the fixed random_state makes the fitted model reproducible.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [12]:
# 3) Model evaluation and conclusion
# Evaluate the model's performance and draw a conclusion.

# Score the fitted model on the held-out rows only.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)  # mean of the squared prediction errors
r2 = r2_score(y_test, y_pred)  # coefficient of determination on the test rows

print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

Mean Squared Error: 2699.0754076190474
R-squared Score: 0.6606608187736691
