## **K-Nearest Neighbor Algorithm**

#### **1. Load Dataset**

In [2]:
import pandas as pd

df = pd.read_csv('../data/test/combine.csv')
df.dropna(inplace=True)

print(len(df))
df.tail()

7553


Unnamed: 0,Tanggal,Stasiun,Tn,Tx,Tavg,RH_avg,RR,ss,ff_x,ddd_x,ff_avg,ddd_car,Lat,Long,El,Evaporasi,SPEI,Kategori
7548,2020-12-21,96633,24.8,31.1,27.8,90.0,3.7,1.0,5.0,210.0,2.0,SW,-1.26,116.9,3,0.003533,0.289738,D0
7549,2020-12-24,96633,24.8,30.7,26.9,85.0,5.7,1.7,7.0,260.0,2.0,W,-1.26,116.9,3,0.003351,0.379628,D0
7550,2020-12-25,96633,24.7,30.2,27.1,84.0,0.0,1.9,3.0,250.0,1.0,C,-1.26,116.9,3,0.00325,-1.304603,D2
7551,2020-12-29,96633,24.0,31.0,27.3,86.0,1.1,1.0,4.0,240.0,2.0,S,-1.26,116.9,3,0.003683,0.034694,D0
7552,2020-12-30,96633,25.0,30.1,27.4,86.0,0.0,0.7,6.0,220.0,2.0,W,-1.26,116.9,3,0.003151,-1.290964,D2


#### **2. Null Identification**

In [3]:
print(df.isnull().sum())

Tanggal      0
Stasiun      0
Tn           0
Tx           0
Tavg         0
RH_avg       0
RR           0
ss           0
ff_x         0
ddd_x        0
ff_avg       0
ddd_car      0
Lat          0
Long         0
El           0
Evaporasi    0
SPEI         0
Kategori     0
dtype: int64


#### **3. Spliting Data Train & Test**

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Variabel Prediktor
# 1. Curah Hujan, 
# 2. Kelembaban (Rata-rata), 
# 3. Kecepatan Angin Maksimum (m/s)
# 4. Lamanya penyinaran matahari, 
# 5. Suhu (Max), 
# 6. Suhu (Min)
X = df[['RR', 'RH_avg', 'ff_avg', 'ss', 'Tx', 'Tn']]

# Variable Target
y = df['Kategori']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### **4. Finding K-value**

In [5]:
from sklearn.model_selection import cross_val_score
import numpy as np

k_values = list(range(1, 21))
cv_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train)
    mean_score = np.mean(scores)
    cv_scores.append(mean_score)

optimal_k = k_values[np.argmax(cv_scores)]
print("Optimal k:", optimal_k)

Optimal k: 20


#### **5. Train Models**

In [6]:
# @TODO : Pembobotan

# weights = [100, 80, 70, 40, 20, 0]
# def custom_distance(x, y, w):
#     return np.sum(w * np.abs(x - y))
# # Mengubah bobot pada data training
# X_train_weighted = X_train * (weights_percent / 100.0)
# knn = KNeighborsClassifier(n_neighbors=18, metric=custom_distance, metric_params={'w': weights})

In [7]:
knn = KNeighborsClassifier(n_neighbors=optimal_k)
knn.fit(X_train, y_train)

#### **6. Performance Measures**

In [8]:
# Make predictions on the testing data
y_pred = knn.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.7511032656663724
Precision: 0.7538100804483724
Recall: 0.7511032656663724
F1 Score: 0.7352198939781792


In [9]:
from sklearn.metrics import classification_report

# Generate classification report
report = classification_report(y_test, y_pred, zero_division=1)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

          D0       0.96      0.91      0.93      1251
          D1       0.48      0.31      0.38       222
          D2       0.54      0.82      0.65       575
          D3       0.36      0.13      0.19       206
          D4       1.00      0.00      0.00        12

    accuracy                           0.75      2266
   macro avg       0.67      0.43      0.43      2266
weighted avg       0.75      0.75      0.74      2266



#### **7. Generate Models**

In [10]:
import pickle

with open('../models/knn.pkl', 'wb') as file:
    pickle.dump(knn, file)

#### **8. Manual Predict**

In [11]:
# with open('../models/knn.pkl', 'rb') as file:
#     knn_loaded = pickle.load(file)

def map_drought_condition(di):
    condition_map = {
        'D0': 'No-Drought',
        'D1': 'MILD',
        'D2': 'MODERATE',
        'D3': 'SEVERE',
        'D4': 'EXTREME'
    }
    return condition_map.get(di, 'Unknown')

In [15]:
# Contoh Prediksi
# ============================
# 'Curah Hujan': 0,
# 'Kelembaban (Rata-rata)': 0,
# 'Kecepatan Angin Maksimum (m/s)': 0,
# 'Lama Sinar Matahari': 0,
# 'Suhu (Max)': 50, 
# 'Suhu (Min)': 40,
input_data = np.array([[0, 0, 0, 0, 50, 40]])

input_df = pd.DataFrame(input_data, columns=['RR', 'RH_avg', 'ff_avg', 'ss', 'Tx', 'Tn'])
prediction = knn.predict(input_df)

print(map_drought_condition(prediction[0]))

MODERATE
