In [29]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


In [30]:
# Read the mushroom dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='mushrooms.csv')

# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [31]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [32]:
# Encode every categorical (object-dtype) column as integer codes.
#
# One LabelEncoder is fitted per column and kept in `label_encoders`, so the
# code -> category mapping of each column stays available afterwards, e.g.
# label_encoders['class'].classes_ or label_encoders['class'].inverse_transform(...).
# Refitting a single shared encoder would retain only the last column's mapping.
#
# NOTE: this cell overwrites the columns of `df` in place. Re-run it only after
# re-running the data-loading cell, otherwise no object columns remain and
# `label_encoders` ends up empty.
label_encoders = {}

# Defined up front so the name exists even when there is nothing to encode;
# after the loop it refers to the encoder fitted on the last categorical column.
label_encoder = LabelEncoder()

# Identify the categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Fit a fresh encoder on each categorical column and replace it with its codes
for col in categorical_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder


In [33]:
# Feature selection

# Features are every column except the target. Dropping 'class' by name keeps
# the target out of X regardless of where that column sits in the frame
# (a positional slice would silently leak it if the column order changed).
X = df.drop(columns=['class'])
y = df['class']

# Check the number of instances and features
X.shape

(8124, 22)

Split data training dan testing

In [34]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
# Hyperparameter grid for the decision tree (2 * 6 * 3 * 3 * 3 = 324 candidates)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Initialise the DecisionTreeClassifier.
# random_state is fixed because 'sqrt' / 'log2' sample a random subset of
# features at every split; without a seed the cross-validation scores and the
# selected hyperparameters can differ from one run of the notebook to the next.
dt = DecisionTreeClassifier(random_state=1)

# Initialise GridSearchCV: 5-fold cross-validation, using all available cores
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search on the training data only
grid_search.fit(X_train, y_train)

# Show the best hyperparameters found
print("Best hyperparameters:", grid_search.best_params_)

# Use the best model (refitted on the full training set) to predict the test labels
best_dt = grid_search.best_estimator_
y_pred_dt = best_dt.predict(X_test)

# Compute the test-set accuracy: rounded to two decimals first, then the exact value
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best hyperparameters: {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Test set accuracy: 1.00
Test set accuracy: 1.0


In [37]:
from sklearn.ensemble import AdaBoostClassifier

In [39]:
# Hyperparameter grid for AdaBoost (4 * 4 * 2 = 32 candidates).
# The 'estimator' entries are the weak learners to boost: a depth-1 stump and a
# depth-3 tree.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
    'estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=3)]
}
# Create the AdaBoostClassifier.
# random_state is fixed so the seed handed to the base estimator at each
# boosting iteration is the same on every run, keeping the search reproducible.
ada = AdaBoostClassifier(random_state=1)

# Initialise GridSearchCV: 5-fold cross-validation, using all available cores
grid_search = GridSearchCV(estimator=ada, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search on the training data only
grid_search.fit(X_train, y_train)

# Show the best hyperparameters found
print("Best hyperparameters:", grid_search.best_params_)

# Use the best model (refitted on the full training set) to predict the test labels
best_ada = grid_search.best_estimator_
y_pred_ada = best_ada.predict(X_test)

# Compute the test-set accuracy: rounded to two decimals first, then the exact value
acc_ada = accuracy_score(y_test, y_pred_ada)
print("Test set accuracy: {:.2f}".format(acc_ada))
print(f"Test set accuracy: {acc_ada}")

Fitting 5 folds for each of 32 candidates, totalling 160 fits




Best hyperparameters: {'estimator': DecisionTreeClassifier(max_depth=1), 'learning_rate': 0.5, 'n_estimators': 100}
Test set accuracy: 1.00
Test set accuracy: 1.0
