In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

In [22]:
# Load the data
data = pd.read_csv('dataset/praprocessed_dataset.csv')
data.head()

# Check for missing values
print(data.isnull().sum())

tanggal       0
stasiun       0
pm10        758
so2         568
co          392
o3          646
no2         528
max           0
critical      0
categori      0
dtype: int64


In [23]:
# Fill missing values using interpolation
data['pm10'].interpolate(method='linear', inplace=True)
data['so2'].interpolate(method='linear', inplace=True)
data['co'].interpolate(method='linear', inplace=True)
data['o3'].interpolate(method='linear', inplace=True)
data['no2'].interpolate(method='linear', inplace=True)

# Verify that there are no more missing values
print(data.isnull().sum())

tanggal     0
stasiun     0
pm10        0
so2         0
co          0
o3          0
no2         0
max         0
critical    0
categori    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['pm10'].interpolate(method='linear', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['so2'].interpolate(method='linear', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

In [24]:
# Preprocess the data
# Convert categorical data to numeric if necessary
data['stasiun'] = data['stasiun'].astype('category').cat.codes

# Handle any missing values if necessary (simple example: fill with median)
# data.fillna(data.median(), inplace=True)

# Define features and target
X = data.drop(['tanggal', 'categori', 'critical'], axis=1)
y = data['categori']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=20, min_samples_leaf=10)

# Train the model
dtree.fit(X_train, y_train)

# Make predictions
y_pred = dtree.predict(X_test)

In [25]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9987205731832139

Classification Report:
                    precision    recall  f1-score   support

              BAIK       1.00      1.00      1.00       729
SANGAT TIDAK SEHAT       1.00      1.00      1.00        34
            SEDANG       1.00      1.00      1.00      2607
       TIDAK SEHAT       1.00      1.00      1.00       538

          accuracy                           1.00      3908
         macro avg       1.00      1.00      1.00      3908
      weighted avg       1.00      1.00      1.00      3908



In [26]:
# Feature Importance
print("\nFeature Importance:")
importances = pd.Series(dtree.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))


Feature Importance:
max        0.973654
no2        0.025600
so2        0.000533
pm10       0.000189
o3         0.000024
stasiun    0.000000
co         0.000000
dtype: float64


In [27]:
# Cross-validation
scores = cross_val_score(dtree, X, y, cv=5)
print("\nCross-validated scores:", scores)


Cross-validated scores: [0.99974411 0.99846429 0.99820834 0.99872025 0.98566675]


