# Check Data

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
import warnings

# Menonaktifkan semua warning
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv('/content/ecoli.csv')

In [None]:
#melakukan pengecekan missing value
df.isnull().sum()

mcg      0
gvh      0
lip      0
chg      0
aac      0
alm1     0
alm2     0
label    0
dtype: int64

In [None]:
df.dtypes

mcg      float64
gvh      float64
lip      float64
chg      float64
aac      float64
alm1     float64
alm2     float64
label     object
dtype: object

In [None]:
#membaca data
df

Unnamed: 0,mcg,gvh,lip,chg,aac,alm1,alm2,label
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,0.07,0.40,0.48,0.5,0.54,0.35,0.44,cp
2,0.56,0.40,0.48,0.5,0.49,0.37,0.46,cp
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp
...,...,...,...,...,...,...,...,...
331,0.74,0.56,0.48,0.5,0.47,0.68,0.30,pp
332,0.71,0.57,0.48,0.5,0.48,0.35,0.32,pp
333,0.61,0.60,0.48,0.5,0.44,0.39,0.38,pp
334,0.59,0.61,0.48,0.5,0.42,0.42,0.37,pp


# Feature Selection

In [None]:
#Memilih feature/ciri yang digunakan, disini saya menggunakan semua data kolom kecuali kolom label sebagai kolom label
X = df.drop(['label'], axis=1)
#memilih kolom label untuk digunakan sebagai label
Y = df['label']

* **Melakukan Oversampling Menggunakan RandomOverSampler dari fitur yang telah dipilih**

In [None]:
# Oversampling dengan RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, Y)

# Splitting data menjadi data train dan data testing
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


# KNN Base Classifier

* **Mencari Jumlah N yang optimal untuk dijadikan base classifier**

In [None]:
n_values = [3, 5, 7, 9, 11]
knn = KNeighborsClassifier()
for neighbors in n_values:
    knn.n_neighbors = neighbors
    scores = cross_val_score(knn, X, Y, cv=5)
    mean_score = scores.mean()
    print(f"n_neighbors: {neighbors}, mean accuracy: {mean_score}")

n_neighbors: 3, mean accuracy: 0.8242756804214224
n_neighbors: 5, mean accuracy: 0.836215978928885
n_neighbors: 7, mean accuracy: 0.8570676031606673
n_neighbors: 9, mean accuracy: 0.8510096575943811
n_neighbors: 11, mean accuracy: 0.8392010535557507


* **Menginisiasi Model KNN Yang akan digunakan sebagai base classifier**

In [None]:
# Inisialisasi classifier base (ganti dengan classifier yang Anda inginkan)
knn_classifier = KNeighborsClassifier(n_neighbors=7, metric='euclidean')

# Inisialisasi Bagging Classifier
bagging_knn = BaggingClassifier(base_estimator=knn_classifier, n_estimators=10, random_state=42)

# Latih model
bagging_knn.fit(X_train, y_train)

* **Mencoba Prediksi antara hasil prediksi dengan data sebenarnya**

In [None]:
# Lakukan prediksi
y_pred_knn = bagging_knn.predict(X_test)
print('Predict:',y_pred_knn[0])
print('Actual:',y_test.values[0])

Predict: im
Actual: im


* **Evaluasi Kinerja dari bagging method**

In [None]:
# Evaluasi kinerja model
accuracy = accuracy_score(y_test, y_pred_knn)
print("Akurasi Bagging Classifier:", accuracy)

Akurasi Bagging Classifier: 0.925764192139738


*   **Classification Report**

In [None]:
print(classification_report(y_test,y_pred_knn))

              precision    recall  f1-score   support

          cp       0.97      0.94      0.95        32
          im       0.90      0.68      0.78        28
         imL       1.00      1.00      1.00        29
         imS       0.97      1.00      0.98        31
         imU       0.77      1.00      0.87        27
          om       0.88      1.00      0.94        29
         omL       1.00      1.00      1.00        19
          pp       0.97      0.82      0.89        34

    accuracy                           0.93       229
   macro avg       0.93      0.93      0.93       229
weighted avg       0.93      0.93      0.92       229



*   **Confusion Matrix**

In [None]:
print(confusion_matrix(y_test,y_pred_knn))

[[30  1  0  0  0  0  0  1]
 [ 0 19  0  1  8  0  0  0]
 [ 0  0 29  0  0  0  0  0]
 [ 0  0  0 31  0  0  0  0]
 [ 0  0  0  0 27  0  0  0]
 [ 0  0  0  0  0 29  0  0]
 [ 0  0  0  0  0  0 19  0]
 [ 1  1  0  0  0  4  0 28]]


# Decision Tree Classifier

* **Mencari Max depth yang optimal untuk dijadikan base classifier**

In [None]:
max_depth_values = [3, 5, 7, 9, 11]
dcf = DecisionTreeClassifier()
for depth in max_depth_values:
    dcf.max_depth = depth
    scores = cross_val_score(dcf, X, Y, cv=5)
    mean_score = scores.mean()
    print(f"max_depth: {depth}, mean accuracy: {mean_score}")

max_depth: 3, mean accuracy: 0.797585601404741
max_depth: 5, mean accuracy: 0.7828358208955224
max_depth: 7, mean accuracy: 0.8035996488147499
max_depth: 9, mean accuracy: 0.8035996488147499
max_depth: 11, mean accuracy: 0.7886742756804215


* **Menginisiasi Model Yang akan digunakan sebagai base classifier**

In [None]:
# Inisialisasi classifier base (ganti dengan classifier yang Anda inginkan)
base_classifier = DecisionTreeClassifier(criterion='entropy',max_depth=7,random_state=42)

# Inisialisasi Bagging Classifier
bagging_classifier = BaggingClassifier(base_estimator=base_classifier, n_estimators=10, random_state=42)

# Latih model
bagging_classifier.fit(X_train, y_train)

* **Mencoba Prediksi antara hasil prediksi dengan data sebenarnya**

In [None]:
# Lakukan prediksi
y_pred = bagging_classifier.predict(X_test)
print('Predict:',y_pred[0])
print('Actual:',y_test.values[0])

Predict: im
Actual: im


* **Evaluasi Kinerja dari bagging method dengan base DecisionTree**

In [None]:
# Evaluasi kinerja model
accuracy = accuracy_score(y_test, y_pred)
print("Akurasi Bagging Classifier:", accuracy)

Akurasi Bagging Classifier: 0.9606986899563319


*   **Classification Report**

In [None]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          cp       0.94      0.94      0.94        32
          im       1.00      0.82      0.90        28
         imL       1.00      1.00      1.00        29
         imS       1.00      1.00      1.00        31
         imU       0.84      1.00      0.92        27
          om       0.97      1.00      0.98        29
         omL       1.00      1.00      1.00        19
          pp       0.97      0.94      0.96        34

    accuracy                           0.96       229
   macro avg       0.96      0.96      0.96       229
weighted avg       0.96      0.96      0.96       229



*   **Confusion Matrix**

In [None]:
print(confusion_matrix(y_test,y_pred))

[[30  0  0  0  0  1  0  1]
 [ 0 23  0  0  5  0  0  0]
 [ 0  0 29  0  0  0  0  0]
 [ 0  0  0 31  0  0  0  0]
 [ 0  0  0  0 27  0  0  0]
 [ 0  0  0  0  0 29  0  0]
 [ 0  0  0  0  0  0 19  0]
 [ 2  0  0  0  0  0  0 32]]
