In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_recall_curve, average_precision_score

In [30]:
# Load the data
data = pd.read_csv('https://raw.githubusercontent.com/miftahudinfaiz/RainfallPrediction/main/Rainfall.csv')


In [31]:
# Preprocessing steps (drop columns, handle missing values, etc.)

# Drop irrelecant data
data = data.drop(columns=['Stasiun','Tanggal'],

                 axis=1)
data = data.dropna(how='any')

# Split the data into features (X) and target variable (Y)
Y = data['Besok_hujan']
X = data.drop(columns=['Besok_hujan', 'Hari_hujan'])

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=9)


print(data.shape)
data.head()

(923, 6)


Unnamed: 0,Tn,Tx,Tavg,RR,Hari_hujan,Besok_hujan
0,20.3,31.1,25.8,108.0,1,1
1,25.0,31.8,29.2,106.0,1,1
2,23.4,29.6,25.8,100.6,1,1
3,23.6,28.0,25.6,93.9,1,1
4,21.1,29.4,25.5,93.0,1,1


In [32]:
# KNN Classifier
knncla = KNeighborsClassifier(n_neighbors=7, n_jobs=-1)
knncla.fit(X_train, Y_train)
Y_pred_knncla = knncla.predict(X_test)
accuracy_knncla = accuracy_score(Y_test, Y_pred_knncla)
confusion_matrix_knncla = confusion_matrix(Y_test, Y_pred_knncla)
classification_report_knncla = classification_report(Y_test, Y_pred_knncla)


In [33]:
# LinearSVC and Feature Selection
lsvc = LinearSVC(C=0.05, penalty="l1", dual=False, random_state=9).fit(X, Y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
selected_features = list(X.columns[model.get_support(indices=True)])
X1 = data[selected_features]
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y, test_size=0.3, random_state=9)





In [34]:
# PCA
pca = PCA().fit(X1)
X1_pca = pca.transform(X1)
variance = pd.Series(list(np.cumsum(pca.explained_variance_ratio_)), index=list(range(1, len(pca.explained_variance_ratio_) + 1)))


In [35]:
# Define the ensemble of classifiers
ensemble_model = VotingClassifier([('knn', knncla), ('svm', LinearSVC()), ('pca_knn', KNeighborsClassifier(n_neighbors=5))])


In [36]:
# Train ensemble model
ensemble_model.fit(X_train, Y_train)




In [37]:
# Predict using ensemble model
Y_pred_ensemble = ensemble_model.predict(X_test)
accuracy_ensemble = accuracy_score(Y_test, Y_pred_ensemble)
confusion_matrix_ensemble = confusion_matrix(Y_test, Y_pred_ensemble)
classification_report_ensemble = classification_report(Y_test, Y_pred_ensemble)


In [38]:
# Evaluate ensemble model
print("Ensemble Classification Report:")
print(classification_report_ensemble)
print("Ensemble Accuracy:", accuracy_ensemble)
print("Ensemble Confusion Matrix:")
print(confusion_matrix_ensemble)


Ensemble Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.90      0.95       205
           1       0.78      1.00      0.88        72

    accuracy                           0.93       277
   macro avg       0.89      0.95      0.91       277
weighted avg       0.94      0.93      0.93       277

Ensemble Accuracy: 0.927797833935018
Ensemble Confusion Matrix:
[[185  20]
 [  0  72]]


In [39]:
# Average precision-recall score
average_precision = average_precision_score(Y_test, Y_pred_ensemble)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))



Average precision-recall score: 0.78


In [40]:
# Precision-Recall curve
precision, recall, _ = precision_recall_curve(Y_test, ensemble_model.predict_proba(X_test)[:, 1])
average_precision = average_precision_score(Y_test, ensemble_model.predict_proba(X_test)[:, 1])


AttributeError: ignored

In [None]:
plt.plot(recall, precision, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))
plt.legend(loc="lower left")
plt.show()


In [None]:
# Individual models evaluation
model_scores = pd.DataFrame(columns=['Model', 'Train Score', 'Test Score'])


In [None]:
# KNN alone
train_score_knncla = knncla.score(X_train, Y_train) * 100
test_score_knncla = knncla.score(X_test, Y_test) * 100
model_scores = model_scores.append({'Model': 'KNN', 'Train Score': train_score_knncla, 'Test Score': test_score_knncla}, ignore_index=True)


In [None]:
# Ensemble (KNN + SVM + PCA + KNN)
train_score_ensemble = ensemble_model.score(X_train, Y_train) * 100
test_score_ensemble = ensemble_model.score(X_test, Y_test) * 100
model_scores = model_scores.append({'Model': 'Ensemble', 'Train Score': train_score_ensemble, 'Test Score': test_score_ensemble}, ignore_index=True)


In [None]:
print("\nModel Scores:")
print(model_scores)