In [11]:
import pandas as pd
import warnings; warnings.simplefilter('ignore')

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier

## Tugas 1
Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan performa antara algoritma Decision Tree dan Random Forest. Gunakan tuning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [10]:
# Task 1: compare a tuned Decision Tree against a tuned Random Forest on the
# mushroom dataset. Both models are tuned with a 5-fold grid search on the
# training split and scored on a held-out 20% test split.
path = "csv/mushrooms.csv"
df = pd.read_csv(path)

# Split off the non-target columns once so the encoder input and the generated
# column names are guaranteed to come from the same frame.
feature_df = df.drop(columns=['class'])

# One-hot encode all non-target columns; .toarray() densifies the encoder's
# sparse output so it can be wrapped in a DataFrame.
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(feature_df).toarray()

# Reuse df's index so the 'class' assignment below aligns row-for-row even if
# df does not carry a default RangeIndex.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(feature_df.columns),
    index=df.index,
)

encoded_df['class'] = df['class']

X = encoded_df.drop(columns=['class'])
y = encoded_df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# random_state is fixed on both estimators (same seed as the split) so the grid
# search winners and the reported accuracies are reproducible across re-runs.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_params = {'max_depth': [None, 5, 10, 15]}
dt_grid_search = GridSearchCV(dt_classifier, dt_params, cv=5)
dt_grid_search.fit(X_train, y_train)
dt_best_params = dt_grid_search.best_params_
dt_model = dt_grid_search.best_estimator_

rf_classifier = RandomForestClassifier(random_state=42)
rf_params = {'n_estimators': [50, 100, 150], 'max_depth': [None, 5, 10, 15]}
rf_grid_search = GridSearchCV(rf_classifier, rf_params, cv=5)
rf_grid_search.fit(X_train, y_train)
rf_best_params = rf_grid_search.best_params_
rf_model = rf_grid_search.best_estimator_

# Score the best estimator of each search on the held-out test split.
dt_pred = dt_model.predict(X_test)
rf_pred = rf_model.predict(X_test)

dt_accuracy = accuracy_score(y_test, dt_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)

print(f"Akurasi Decision Tree: {dt_accuracy:.2f}")
print(f"Akurasi RandomForest: {rf_accuracy:.2f}")
print(f"Parameter terbaik Decision Tree: {dt_best_params}")
print(f"Parameter terbaik RandomForest: {rf_best_params}")

Akurasi Decision Tree: 1.00
Akurasi RandomForest: 1.00
Parameter terbaik Decision Tree: {'max_depth': None}
Parameter terbaik RandomForest: {'max_depth': None, 'n_estimators': 50}


## Tugas 2
Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan performa antara algoritma Decision Tree dan AdaBoost. Gunakan tuning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [9]:
# Task 2: compare a tuned Decision Tree against a tuned AdaBoost ensemble on
# the mushroom dataset. Both models are tuned with a 5-fold grid search on the
# training split and scored on a held-out 20% test split.
path = "csv/mushrooms.csv"
df = pd.read_csv(path)

# Integer-encode every column (features and target) in place, keeping each
# fitted encoder so the codes can be mapped back to the original labels.
label_encoders = {}
for column in df.columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

X = df.drop(columns=['class'])
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# random_state is fixed on every estimator (same seed as the split) so the grid
# search winners and the reported accuracies are reproducible across re-runs.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_params = {'max_depth': [None, 5, 10, 15]}
dt_grid_search = GridSearchCV(dt_classifier, dt_params, cv=5)
dt_grid_search.fit(X_train, y_train)
dt_best_params = dt_grid_search.best_params_
dt_model = dt_grid_search.best_estimator_

# The base learner is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot is the base learner in both.
# NOTE(review): the base tree has no depth limit; boosting is usually applied
# to shallow trees (e.g. max_depth=1) - confirm this is the intended setup.
adaboost_classifier = AdaBoostClassifier(
    DecisionTreeClassifier(random_state=42),
    random_state=42,
)
adaboost_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.1, 0.5, 1.0]}
adaboost_grid_search = GridSearchCV(adaboost_classifier, adaboost_params, cv=5)
adaboost_grid_search.fit(X_train, y_train)
adaboost_best_params = adaboost_grid_search.best_params_
adaboost_model = adaboost_grid_search.best_estimator_

# Score the best estimator of each search on the held-out test split.
dt_pred = dt_model.predict(X_test)
adaboost_pred = adaboost_model.predict(X_test)

dt_accuracy = accuracy_score(y_test, dt_pred)
adaboost_accuracy = accuracy_score(y_test, adaboost_pred)

print(f"Akurasi Decision Tree: {dt_accuracy:.2f}")
print(f"Akurasi AdaBoost: {adaboost_accuracy:.2f}")
print(f"Parameter terbaik Decision Tree: {dt_best_params}")
print(f"Parameter terbaik AdaBoost: {adaboost_best_params}")

Akurasi Decision Tree: 1.00
Akurasi AdaBoost: 1.00
Parameter terbaik Decision Tree: {'max_depth': None}
Parameter terbaik AdaBoost: {'learning_rate': 0.1, 'n_estimators': 50}


## Tugas 3
Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma
-> Logistic Regression
-> SVM kernel polynomial
-> Decision Tree

In [14]:
# Task 3: hard-voting ensemble (Logistic Regression + polynomial-kernel SVM +
# Decision Tree) on the diabetes dataset, scored on a held-out 20% test split.
path = "csv/diabetes.csv"

# Load the table and coerce every column to a numeric dtype in one step.
df = pd.read_csv(path).apply(pd.to_numeric)

# 'Outcome' is the target; every remaining column is used as a feature.
y = df['Outcome']
X = df.drop(columns=['Outcome'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The three base learners, each seeded for reproducibility.
logreg_model = LogisticRegression(random_state=42)
svm_model = SVC(kernel='poly', degree=3, random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)

# voting='hard' takes the majority vote over the predicted class labels.
base_estimators = [
    ('lr', logreg_model),
    ('svm', svm_model),
    ('dt', dt_model),
]
voting_classifier = VotingClassifier(estimators=base_estimators, voting='hard')

# Fit on the training split, then predict the held-out split.
voting_classifier.fit(X_train, y_train)
y_pred = voting_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Hasil Akurasi: {accuracy:.2f}")

Hasil Akurasi: 0.77
