In [53]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [54]:
df = pd.read_csv("dataset_sekolah_feature_engineered.csv")

In [55]:
df.columns = df.columns.map(lambda x: str(x).strip())

print("===== NAMA KOLOM DATASET =====")
print(df.columns.tolist())

===== NAMA KOLOM DATASET =====
['nama_sekolah', 'jumlah_siswa', 'mengulang', 'putus_sekolah', 'kepala_guru', 'tendik', 'rombel', 'ruang_kelas', 'total_guru', 'rasio_siswa_guru', 'rasio_siswa_rombel', 'rasio_siswa_kelas', 'status_sekolah_Swasta', 'kepadatan_kelas_Sedang', 'kepadatan_kelas_Tinggi', 'kecamatan_Kec. Bontoala', 'kecamatan_Kec. Makasar', 'kecamatan_Kec. Mamajang', 'kecamatan_Kec. Manggala', 'kecamatan_Kec. Mariso', 'kecamatan_Kec. Panakukkang', 'kecamatan_Kec. Rappocini', 'kecamatan_Kec. Tallo', 'kecamatan_Kec. Tamalanrea', 'kecamatan_Kec. Tamalate', 'kecamatan_Kec. Ujung Pandang', 'kecamatan_Kec. Ujung Tanah', 'kecamatan_Kec. Wajo', 'kecamatan_Sumber:', 'kecamatan_Tanggal cutoff:']


In [56]:
kolom_siswa = [c for c in df.columns if "siswa" in c.lower()]
kolom_ruang = [c for c in df.columns if "ruang" in c.lower() and "kelas" in c.lower()]

if not kolom_siswa:
    raise ValueError("Kolom yang mengandung kata 'siswa' tidak ditemukan di dataset.")
if not kolom_ruang:
    raise ValueError("Kolom yang mengandung kata 'ruang' dan 'kelas' tidak ditemukan di dataset.")

col_siswa = kolom_siswa[0]
col_ruang = kolom_ruang[0]

print("\nKolom siswa yang digunakan      :", col_siswa)
print("Kolom ruang kelas yang digunakan:", col_ruang)


Kolom siswa yang digunakan      : jumlah_siswa
Kolom ruang kelas yang digunakan: ruang_kelas


In [57]:
df[col_siswa] = pd.to_numeric(df[col_siswa], errors="coerce")
df[col_ruang] = pd.to_numeric(df[col_ruang], errors="coerce")
df = df.dropna(subset=[col_siswa, col_ruang])

In [58]:
df["rasio_siswa_kelas"] = df[col_siswa] / df[col_ruang]

def kategori_kepadatan(x):
    if x < 20:
        return "Rendah"
    elif x < 30:
        return "Sedang"
    else:
        return "Tinggi"

df["kepadatan_kelas"] = df["rasio_siswa_kelas"].apply(kategori_kepadatan)
print("\n===== CONTOH DATA RASIO & LABEL =====")
print(df[[col_siswa, col_ruang, "rasio_siswa_kelas", "kepadatan_kelas"]].head())


===== CONTOH DATA RASIO & LABEL =====
   jumlah_siswa  ruang_kelas  rasio_siswa_kelas kepadatan_kelas
0     -0.380029    -0.592339           0.641573          Rendah
1      0.298084     0.176771           1.686270          Rendah
2      1.739309     1.231196           1.412699          Rendah
3      0.535598     0.623351           0.859224          Rendah
4     -0.139287    -0.369049           0.377420          Rendah


In [59]:
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("\nFitur numerik yang digunakan sebagai X:")
print(numeric_cols)

X = df[numeric_cols]
y = df["kepadatan_kelas"]


Fitur numerik yang digunakan sebagai X:
['jumlah_siswa', 'mengulang', 'putus_sekolah', 'kepala_guru', 'tendik', 'rombel', 'ruang_kelas', 'total_guru', 'rasio_siswa_guru', 'rasio_siswa_rombel', 'rasio_siswa_kelas']


In [60]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [61]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y  # supaya proporsi kelas seimbang
)

print("\n===== INFO SPLIT DATA =====")
print("Jumlah data training:", X_train.shape[0])
print("Jumlah data testing :", X_test.shape[0])


===== INFO SPLIT DATA =====
Jumlah data training: 22
Jumlah data testing : 6


In [62]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

print("\n=========== HASIL KNN ===========")
print("Accuracy KNN:", accuracy_score(y_test, y_pred_knn))
print("\nConfusion Matrix (KNN):")
print(confusion_matrix(y_test, y_pred_knn))
print("\nClassification Report (KNN):")
print(classification_report(y_test, y_pred_knn))


Accuracy KNN: 1.0

Confusion Matrix (KNN):
[[6]]

Classification Report (KNN):
              precision    recall  f1-score   support

      Rendah       1.00      1.00      1.00         6

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6





In [63]:
nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred_nb = nb.predict(X_test)

print("\n======= HASIL NAIVE BAYES =======")
print("Accuracy Naive Bayes:", accuracy_score(y_test, y_pred_nb))
print("\nConfusion Matrix (NB):")
print(confusion_matrix(y_test, y_pred_nb))
print("\nClassification Report (NB):")
print(classification_report(y_test, y_pred_nb))


Accuracy Naive Bayes: 1.0

Confusion Matrix (NB):
[[6]]

Classification Report (NB):
              precision    recall  f1-score   support

      Rendah       1.00      1.00      1.00         6

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6





In [64]:
print("\n======= RINGKASAN AKURASI =======")
print("Accuracy KNN        :", accuracy_score(y_test, y_pred_knn))
print("Accuracy NaiveBayes :", accuracy_score(y_test, y_pred_nb))


Accuracy KNN        : 1.0
Accuracy NaiveBayes : 1.0
