In [1]:
import pandas as pd

df = pd.read_csv('data/kaltim-fix.csv')
df.dropna(inplace=True)

print(len(df))
df.tail()

16315


Unnamed: 0.1,Unnamed: 0,Tn,Tx,Tavg,RH_avg,RR,ss,ff_x,ddd_x,ff_avg,...,stasiun,Evaporasi,SPEI-30,Kategori-30,SPEI-60,Kategori-60,SPEI-90,Kategori-90,SPEI-120,Kategori-120
16667,16667,24.8,32.6,27.2,88.0,1.6,6.8,3.0,280.0,1.0,...,96529,0.00404,1.08328,NO-DROUGHT,0.508479,NO-DROUGHT,-0.301065,NO-DROUGHT,-0.621656,MILD
16668,16668,23.6,33.6,27.9,85.0,41.3,6.0,5.0,230.0,1.0,...,96529,0.004645,1.130926,NO-DROUGHT,0.755852,NO-DROUGHT,-0.030161,NO-DROUGHT,-0.391918,NO-DROUGHT
16669,16669,23.4,34.9,27.7,86.0,31.6,10.2,3.0,280.0,1.0,...,96529,0.00496,1.325941,NO-DROUGHT,0.933588,NO-DROUGHT,0.090963,NO-DROUGHT,-0.283589,NO-DROUGHT
16670,16670,23.6,31.9,27.3,86.0,0.8,8.5,3.0,100.0,1.0,...,96529,0.004176,1.299163,NO-DROUGHT,0.937887,NO-DROUGHT,-0.056339,NO-DROUGHT,-0.279025,NO-DROUGHT
16671,16671,23.6,34.8,28.6,82.0,0.0,5.4,3.0,300.0,1.0,...,96529,0.004991,1.299158,NO-DROUGHT,0.741692,NO-DROUGHT,-0.085874,NO-DROUGHT,-0.279027,NO-DROUGHT


In [47]:
print(df.isnull().sum())

Unnamed: 0      0
Tn              0
Tx              0
Tavg            0
RH_avg          0
RR              0
ss              0
ff_x            0
ddd_x           0
ff_avg          0
ddd_car         0
lat             0
long            0
el              0
stasiun         0
Evaporasi       0
SPEI-30         0
Kategori-30     0
SPEI-60         0
Kategori-60     0
SPEI-90         0
Kategori-90     0
SPEI-120        0
Kategori-120    0
dtype: int64


In [48]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predictor variables
X = df[['RR', 'RH_avg', 'ff_avg', 'ss', 'Tx', 'Tn']]

# Target variable
y = df['Kategori-30']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [49]:
from sklearn.model_selection import cross_val_score
import numpy as np

k_values = list(range(1, 21))

cv_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)

    scores = cross_val_score(knn, X_train, y_train)
    mean_score = np.mean(scores)

    cv_scores.append(mean_score)

optimal_k = k_values[np.argmax(cv_scores)]
print("Optimal k:", optimal_k)

Optimal k: 20


In [52]:
# # Tingkat penting berdasarkan persentase
# weights = [100, 80, 70, 40, 20, 0]

# def custom_distance(x, y, w):
#     return np.sum(w * np.abs(x - y))

# # Mengubah bobot pada data training
# X_train_weighted = X_train * (weights_percent / 100.0)

# knn = KNeighborsClassifier(n_neighbors=18, metric=custom_distance, metric_params={'w': weights})
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)

In [53]:
# Make predictions on the testing data
y_pred = knn.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.6808988764044944
Precision: 0.5563158051466244
Recall: 0.6808988764044944
F1 Score: 0.5617174650689146


In [54]:
from sklearn.metrics import classification_report

# Generate classification report
report = classification_report(y_test, y_pred, zero_division=1)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

     EXTREME       1.00      0.00      0.00        88
        MILD       0.27      0.01      0.03       732
    MODERATE       0.24      0.01      0.02       466
  NO-DROUGHT       0.69      0.99      0.81      3347
      SEVERE       0.09      0.00      0.01       262

    accuracy                           0.68      4895
   macro avg       0.46      0.20      0.17      4895
weighted avg       0.56      0.68      0.56      4895



In [None]:
import pickle

with open('models/knn.pkl', 'wb') as file:
    pickle.dump(knn, file)

In [None]:
# with open('models/knn.pkl', 'rb') as file:
#     knn_loaded = pickle.load(file)

df_test = pd.DataFrame({
    'RR': [0, 0, 0],  # No rainfall
    'RH_avg': [0, 0, 0],
    'ff_avg': [0, 0, 0],
    'ss': [0, 0, 0],
    'Tx': [50, 48, 52],  # Extreme drought values
    'Tn': [40, 42, 45],  # Extreme drought values
})

prediction = knn.predict(df_test)
prediction

array(['NO-DROUGHT', 'NO-DROUGHT', 'NO-DROUGHT'], dtype=object)

In [None]:
prediction_mode = pd.DataFrame(prediction).mode(axis=0, dropna=False)
print(prediction_mode[0][0])

NO-DROUGHT
