<a href="https://colab.research.google.com/github/makhlufiaero338/tugas-machine-learning/blob/main/tugasperbaikan/Tugas_perbaikan_bab6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Load the Iris dataset as a worked example and hold out 30% of the rows
# for testing. The fixed random_state keeps the split identical across runs.
data = load_iris()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [None]:
# 1. Parameter Selection with Preprocessing
# Chain the preprocessing (standard scaling) and the SVC model into one
# estimator so that both are fitted together on the same data.
pipeline_1 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC()),
    ]
)

# Fit every step on the training split only.
pipeline_1.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split and report it.
pipeline_1_accuracy = pipeline_1.score(X_test, y_test)
print("Pipeline 1 Accuracy:", pipeline_1_accuracy)

Pipeline 1 Accuracy: 1.0


In [None]:
# 2. Building Pipelines
# A more complex pipeline: the scaling is routed through a ColumnTransformer
# that is told explicitly which columns to standardize.
numeric_columns = [0, 1, 2, 3]  # positional column indices; covers all features
preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), numeric_columns)]
)

# The step names 'preprocessor' and 'svc' are the prefixes used by the
# parameter grids in the grid-search cells.
pipeline_2 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("svc", SVC()),
    ]
)

# Fit on the training split.
pipeline_2.fit(X_train, y_train)

# Score on the held-out split and report it.
pipeline_2_accuracy = pipeline_2.score(X_test, y_test)
print("Pipeline 2 Accuracy:", pipeline_2_accuracy)

Pipeline 2 Accuracy: 1.0


In [None]:
# 3. Using Pipelines in Grid Searches
# Grid keys follow the "<step name>__<parameter>" convention, so "svc__C"
# addresses the C parameter of the pipeline step named 'svc'.
param_grid = dict(
    svc__C=[0.1, 1, 10],
    svc__kernel=["linear", "rbf"],
)

# 5-fold cross-validated search over the whole pipeline.
grid = GridSearchCV(estimator=pipeline_2, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Report the winning combination and how it scores on the held-out split.
print("\nBest Parameters from Grid Search:", grid.best_params_)
grid_test_accuracy = grid.score(X_test, y_test)
print("Grid Search Accuracy:", grid_test_accuracy)


Best Parameters from Grid Search: {'svc__C': 10, 'svc__kernel': 'linear'}
Grid Search Accuracy: 0.9777777777777777


In [None]:
# 4. The General Pipeline Interface
# A pipeline is driven through the same calls as a single estimator: here it
# is fitted on the training split and then asked to predict the test split.
predictions = pipeline_2.fit(X_train, y_train).predict(X_test)

# Summarize the predictions per class (precision / recall / f1 / support).
report = classification_report(y_test, predictions)
print("\nClassification Report (Pipeline 2):\n", report)



Classification Report (Pipeline 2):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [None]:
# 5. Convenient Pipeline Creation with make_pipeline
# make_pipeline takes just the estimators, in order; no step names are
# supplied by hand here.
pipeline_3 = make_pipeline(StandardScaler(), SVC()).fit(X_train, y_train)

pipeline_3_accuracy = pipeline_3.score(X_test, y_test)
print("Pipeline 3 Accuracy (make_pipeline):", pipeline_3_accuracy)

Pipeline 3 Accuracy (make_pipeline): 1.0


In [None]:
# 6. Accessing Step Attributes
# Retrieve the 'scaler' step of pipeline_1 by name and print the mean_
# attribute it holds after fitting.
scaler = pipeline_1.named_steps.scaler
print("\nScaler Mean (Pipeline 1):", scaler.mean_)


Scaler Mean (Pipeline 1): [5.84285714 3.00952381 3.87047619 1.23904762]


In [None]:
# 7. Accessing Attributes in a Grid-Searched Pipeline
# The search result exposes its best pipeline as best_estimator_; pull the
# 'svc' step out of it and list that step's parameters.
best_pipeline = grid.best_estimator_
best_model = best_pipeline.named_steps["svc"]
print("Best SVC Parameters from Grid Search:", best_model.get_params())

Best SVC Parameters from Grid Search: {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [None]:
# 8. Grid-Searching Preprocessing Steps and Model Parameters
# The grid now also reaches into the preprocessing: the key
# "preprocessor__num__with_mean" addresses the with_mean option of the 'num'
# transformer inside the 'preprocessor' step, alongside the SVC settings.
param_grid_2 = dict(
    preprocessor__num__with_mean=[True, False],
    svc__C=[0.1, 1, 10],
    svc__kernel=["linear", "rbf"],
)

# 5-fold cross-validated search over preprocessing and model options together.
grid_2 = GridSearchCV(estimator=pipeline_2, param_grid=param_grid_2, cv=5)
grid_2.fit(X_train, y_train)

# Report the winning combination and its score on the held-out split.
print("\nBest Parameters from Grid Search with Preprocessing:", grid_2.best_params_)
grid_2_test_accuracy = grid_2.score(X_test, y_test)
print("Grid Search Accuracy (with Preprocessing):", grid_2_test_accuracy)


Best Parameters from Grid Search with Preprocessing: {'preprocessor__num__with_mean': True, 'svc__C': 10, 'svc__kernel': 'linear'}
Grid Search Accuracy (with Preprocessing): 0.9777777777777777


In [None]:
# Grid-Searching Which Model To Use (Revised)
# Runs one grid search per candidate model (SVC and RandomForest), reports
# both, and keeps whichever has the higher mean cross-validated score.
#
# GridSearchCV, Pipeline, StandardScaler, SVC and RandomForestClassifier come
# from the notebook's import cell, so they are not re-imported here.

# Define individual pipelines for each model. Both use the step names
# 'scaler' and 'classifier', which the parameter grids below rely on.
pipeline_svc = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC())
])

pipeline_rf = Pipeline([
    ('scaler', StandardScaler()),
    # Fixed seed: without it the forest differs on every run, so the search
    # result and the model comparison below would not be reproducible.
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grids for each pipeline ("<step name>__<parameter>" keys)
param_grid_svc = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}

param_grid_rf = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [None, 10, 20]
}

# Perform grid search for SVC (5-fold cross-validation on the training split)
grid_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=5)
grid_svc.fit(X_train, y_train)

# Perform grid search for RandomForest (same protocol)
grid_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5)
grid_rf.fit(X_train, y_train)

# Compare results. The accuracies printed here are measured on the held-out
# test split and are for reporting only.
print("Best parameters for SVC:", grid_svc.best_params_)
print("Best SVC accuracy:", grid_svc.score(X_test, y_test))

print("\nBest parameters for RandomForest:", grid_rf.best_params_)
print("Best RandomForest accuracy:", grid_rf.score(X_test, y_test))

# Choose the best model by mean cross-validated score (best_score_), not by
# the test accuracies printed above: picking on the test split would make the
# final test figure optimistic. On a tie the RandomForest branch is taken.
if grid_svc.best_score_ > grid_rf.best_score_:
    print("\nBest Model: SVC")
    best_model = grid_svc.best_estimator_
else:
    print("\nBest Model: RandomForest")
    best_model = grid_rf.best_estimator_

# Test the best model on the test set
print("\nTest accuracy of the best model:", best_model.score(X_test, y_test))


Best parameters for SVC: {'classifier__C': 10, 'classifier__kernel': 'linear'}
Best SVC accuracy: 0.9777777777777777

Best parameters for RandomForest: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
Best RandomForest accuracy: 1.0

Best Model: SVC

Test accuracy of the best model: 0.9777777777777777
