<a href="https://colab.research.google.com/github/oaravind/BDA_ICP1_A/blob/main/ICP_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# 1. Load dataset
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Create pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', SVC())
])

# 3. Define parameter grid
param_grid = {
    'pca__n_components': [2, 3],
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}

# 4. GridSearchCV
grid = GridSearchCV(pipe, param_grid)
grid.fit(X_train, y_train)

# 5. Results
print("Best parameters found:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))



Best parameters found: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best cross-validation score: 0.96
Test set score: 1.00


In [None]:
# 3-Fold CV
grid_3 = GridSearchCV(pipe, param_grid, cv=3)
grid_3.fit(X_train, y_train)

print("🔁 3-Fold CV Results:")
print("Best Parameters:", grid_3.best_params_)
print("Best Cross-Validation Score: {:.2f}".format(grid_3.best_score_))
print("Test Set Score: {:.2f}".format(grid_3.score(X_test, y_test)))


# 5-Fold CV
grid_5 = GridSearchCV(pipe, param_grid, cv=5)
grid_5.fit(X_train, y_train)

print("🔁 5-Fold CV Results:")
print("Best Parameters:", grid_5.best_params_)
print("Best Cross-Validation Score: {:.2f}".format(grid_5.best_score_))
print("Test Set Score: {:.2f}".format(grid_5.score(X_test, y_test)))


# 7-Fold CV
grid_7 = GridSearchCV(pipe, param_grid, cv=7)
grid_7.fit(X_train, y_train)

print("🔁 7-Fold CV Results:")
print("Best Parameters:", grid_7.best_params_)
print("Best Cross-Validation Score: {:.2f}".format(grid_7.best_score_))
print("Test Set Score: {:.2f}".format(grid_7.score(X_test, y_test)))

🔁 3-Fold CV Results:
Best Parameters: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best Cross-Validation Score: 0.97
Test Set Score: 1.00
🔁 5-Fold CV Results:
Best Parameters: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best Cross-Validation Score: 0.96
Test Set Score: 1.00
🔁 7-Fold CV Results:
Best Parameters: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best Cross-Validation Score: 0.97
Test Set Score: 1.00


Check for 3 fold, 5 fold and 7 fold cross validation

Replace classifier, SVC with RandomForestClassifier and LogisticRegression, Perceptron, knn .

Update the param_grid accordingly (e.g., for RandomForestClassifier, use n_estimators, max_depth, etc.)

Also replace Gridsearch with randomnsearch function.

Relplace with with your own csv dataset using code below:

In [1]:
from google.colab import files
uploaded = files.upload()


Saving Iris.csv to Iris.csv


In [2]:
import pandas as pd

# data = pd.read_csv("Loan_Default.csv")
data = pd.read_csv("Iris.csv").sample(50, random_state=42)
X = data.drop("Species", axis=1)
y = data["Species"]


In [3]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score

# Identify column types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipelines
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
## Random Forest

from sklearn.ensemble import RandomForestClassifier

pipe_rf = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

param_grid_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 5, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

search_rf = RandomizedSearchCV(pipe_rf, param_distributions=param_grid_rf, cv=3, n_iter=5, random_state=42, verbose=1)
search_rf.fit(X_train, y_train)

print("\nRandom Forest Best Params:", search_rf.best_params_)
print("CV Score:", search_rf.best_score_)
print("Test Accuracy:", accuracy_score(y_test, search_rf.predict(X_test)))


Fitting 3 folds for each of 5 candidates, totalling 15 fits

Random Forest Best Params: {'classifier__n_estimators': 50, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 10}
CV Score: 1.0
Test Accuracy: 1.0


In [5]:
##Logistic regression
from sklearn.linear_model import LogisticRegression

pipe_lr = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression(max_iter=200))
])

param_grid_lr = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['lbfgs', 'liblinear']
}

search_lr = RandomizedSearchCV(pipe_lr, param_distributions=param_grid_lr, cv=3, n_iter=5, random_state=42, verbose=1)
search_lr.fit(X_train, y_train)

print("\nLogistic Regression Best Params:", search_lr.best_params_)
print("CV Score:", search_lr.best_score_)
print("Test Accuracy:", accuracy_score(y_test, search_lr.predict(X_test)))



Fitting 3 folds for each of 5 candidates, totalling 15 fits

Logistic Regression Best Params: {'classifier__solver': 'lbfgs', 'classifier__penalty': 'l2', 'classifier__C': 0.1}
CV Score: 0.9487179487179488
Test Accuracy: 1.0


In [6]:
## Perceptron
from sklearn.linear_model import Perceptron

pipe_per = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', Perceptron(max_iter=1000, tol=1e-3, random_state=42))
])

param_grid_per = {
    'classifier__penalty': [None, 'l2', 'l1', 'elasticnet'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__fit_intercept': [True, False]
}

search_per = RandomizedSearchCV(pipe_per, param_distributions=param_grid_per, cv=3, n_iter=5, random_state=42, verbose=1)
search_per.fit(X_train, y_train)

print("\nPerceptron Best Params:", search_per.best_params_)
print("CV Score:", search_per.best_score_)
print("Test Accuracy:", accuracy_score(y_test, search_per.predict(X_test)))


Fitting 3 folds for each of 5 candidates, totalling 15 fits

Perceptron Best Params: {'classifier__penalty': None, 'classifier__fit_intercept': True, 'classifier__alpha': 0.001}
CV Score: 0.923076923076923
Test Accuracy: 0.9


In [7]:
##KNN

from sklearn.neighbors import KNeighborsClassifier

pipe_knn = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', KNeighborsClassifier())
])

param_grid_knn = {
    'classifier__n_neighbors': [3, 5, 7, 9],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # Manhattan (1), Euclidean (2)
}

search_knn = RandomizedSearchCV(pipe_knn, param_distributions=param_grid_knn, cv=3, n_iter=5, random_state=42, verbose=1)
search_knn.fit(X_train, y_train)

print("\nKNN Best Params:", search_knn.best_params_)
print("CV Score:", search_knn.best_score_)
print("Test Accuracy:", accuracy_score(y_test, search_knn.predict(X_test)))


Fitting 3 folds for each of 5 candidates, totalling 15 fits

KNN Best Params: {'classifier__weights': 'uniform', 'classifier__p': 1, 'classifier__n_neighbors': 3}
CV Score: 1.0
Test Accuracy: 1.0
