<a href="https://colab.research.google.com/github/mandadih/BDA-ICP/blob/main/ICP_5_700772591.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [40]:
# Read the Iris CSV and discard the row-identifier column in one chained step.
# errors="ignore" keeps this working when the file has no 'Id' column.
df = pd.read_csv("/content/Iris.csv").drop(columns=["Id"], errors="ignore")

# Feature matrix: every remaining column except the label
X = df.drop(columns="Species")

# Target vector: the species label
y = df["Species"]


In [41]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [42]:
# Three-stage pipeline; the step names are the prefixes used by the
# hyperparameter search keys ("pca__...", "classifier__...").
pipe = Pipeline(
    steps=[
        # Standardize the features before projecting them
        ("scaler", StandardScaler()),
        # Number of components is left unset here; the search chooses it
        ("pca", PCA()),
        # Seeded so repeated fits give the same forest
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)


In [43]:
# Candidate values sampled by the randomized search. Keys follow the
# "<pipeline step>__<parameter>" convention.
param_grid = dict(
    pca__n_components=[2, 3, 4],
    classifier__n_estimators=[50, 100, 150],
    classifier__max_depth=[3, 5, None],
    classifier__min_samples_split=[2, 4, 6],
)


In [44]:
# Randomized search over 10 sampled configurations, scored with 3-fold CV.
# fit() returns the fitted search object, so construction and fitting are chained.
search_3 = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    random_state=42,
).fit(X_train, y_train)

print("CV=3")
print("Best Params:", search_3.best_params_)
# best_score_ is the mean cross-validated score of the best configuration
print("Train CV Score: {:.2f}".format(search_3.best_score_))
# score() evaluates the refit best pipeline on the held-out test split
print("Test Score: {:.2f}".format(search_3.score(X_test, y_test)))


CV=3
Best Params: {'pca__n_components': 3, 'classifier__n_estimators': 150, 'classifier__min_samples_split': 4, 'classifier__max_depth': None}
Train CV Score: 0.92
Test Score: 0.90


In [45]:
# Randomized search over 10 sampled configurations, scored with 5-fold CV.
# fit() returns the fitted search object, so construction and fitting are chained.
search_5 = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    random_state=42,
).fit(X_train, y_train)

print("CV=5")
print("Best Params:", search_5.best_params_)
# best_score_ is the mean cross-validated score of the best configuration
print("Train CV Score: {:.2f}".format(search_5.best_score_))
# score() evaluates the refit best pipeline on the held-out test split
print("Test Score: {:.2f}".format(search_5.score(X_test, y_test)))


CV=5
Best Params: {'pca__n_components': 3, 'classifier__n_estimators': 50, 'classifier__min_samples_split': 2, 'classifier__max_depth': 5}
Train CV Score: 0.93
Test Score: 0.90


In [46]:
# Randomized search over 10 sampled configurations, scored with 7-fold CV.
# fit() returns the fitted search object, so construction and fitting are chained.
search_7 = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    n_iter=10,
    cv=7,
    random_state=42,
).fit(X_train, y_train)

print("CV=7")
print("Best Params:", search_7.best_params_)
# best_score_ is the mean cross-validated score of the best configuration
print("Train CV Score: {:.2f}".format(search_7.best_score_))
# score() evaluates the refit best pipeline on the held-out test split
print("Test Score: {:.2f}".format(search_7.score(X_test, y_test)))


CV=7
Best Params: {'pca__n_components': 3, 'classifier__n_estimators': 100, 'classifier__min_samples_split': 2, 'classifier__max_depth': 5}
Train CV Score: 0.93
Test Score: 0.93


Logistic Regression - RandomizedSearchCV with 3, 5, and 7 Fold Cross-Validation

In this section, we apply Logistic Regression on the Iris dataset using a pipeline that includes StandardScaler, PCA, and the classifier. We use RandomizedSearchCV to tune hyperparameters and evaluate performance using 3-fold, 5-fold, and 7-fold cross-validation.

We compare:
- Best parameters found
- Mean cross-validation accuracy on the training split (printed as "Train CV Score")
- Final test accuracy on hold-out data


In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Reload the Iris data for this section and drop the 'Id' column when present
df = pd.read_csv("/content/Iris.csv").drop(columns=["Id"], errors="ignore")

# Features are every remaining column except the label
X = df.drop(columns="Species")
y = df["Species"]

# Stratified hold-out split (20% test) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [48]:
# Scale -> PCA -> logistic regression. The step names are the prefixes
# used by the search keys below.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        ("classifier", LogisticRegression(max_iter=1000, random_state=42)),
    ]
)

# Candidate hyperparameter values for the randomized search
param_grid = dict(
    pca__n_components=[2, 3],
    classifier__C=[0.01, 0.1, 1, 10],
    classifier__penalty=["l2"],
    classifier__solver=["lbfgs", "saga"],
)


In [49]:
# Repeat the same randomized search with 3, 5 and 7 cross-validation folds
for folds in (3, 5, 7):
    # fit() returns the fitted search object, so build and fit in one expression
    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_grid,
        n_iter=10,
        cv=folds,
        random_state=42,
    ).fit(X_train, y_train)

    print(f"\nCV = {folds}")
    print("Best Params:", search.best_params_)
    # Mean cross-validated score of the best configuration
    print("Train CV Score: {:.2f}".format(search.best_score_))
    # Score of the refit best pipeline on the held-out test split
    print("Test Score: {:.2f}".format(search.score(X_test, y_test)))



CV = 3
Best Params: {'pca__n_components': 3, 'classifier__solver': 'lbfgs', 'classifier__penalty': 'l2', 'classifier__C': 10}
Train CV Score: 0.96
Test Score: 0.97

CV = 5
Best Params: {'pca__n_components': 3, 'classifier__solver': 'lbfgs', 'classifier__penalty': 'l2', 'classifier__C': 10}
Train CV Score: 0.97
Test Score: 0.97

CV = 7
Best Params: {'pca__n_components': 3, 'classifier__solver': 'lbfgs', 'classifier__penalty': 'l2', 'classifier__C': 10}
Train CV Score: 0.97
Test Score: 0.97


Perceptron - RandomizedSearchCV with 3, 5, and 7 Fold Cross-Validation

In this section, we apply the Perceptron classifier on the Iris dataset using a pipeline that includes StandardScaler, PCA, and the classifier. We use RandomizedSearchCV to explore hyperparameter combinations and evaluate performance with 3-fold, 5-fold, and 7-fold cross-validation.

We will report:
- Best hyperparameters
- Mean cross-validation accuracy on the training split (printed as "Train CV Score")
- Final test accuracy

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

# Reload the Iris data for this section and drop the 'Id' column when present
df = pd.read_csv("/content/Iris.csv").drop(columns=["Id"], errors="ignore")

# Features are every remaining column except the label
X = df.drop(columns="Species")
y = df["Species"]

# Stratified hold-out split (20% test) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [51]:
# Scale -> PCA -> Perceptron. The step names are the prefixes used by
# the search keys below.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        ("classifier", Perceptron(random_state=42)),
    ]
)

# Candidate hyperparameter values for the randomized search
param_grid = dict(
    pca__n_components=[2, 3],
    classifier__penalty=[None, "l2", "l1", "elasticnet"],
    classifier__alpha=[0.0001, 0.001, 0.01],
    classifier__max_iter=[500, 1000],
)


In [52]:
# Repeat the same randomized search with 3, 5 and 7 cross-validation folds
for folds in (3, 5, 7):
    # fit() returns the fitted search object, so build and fit in one expression
    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_grid,
        n_iter=10,
        cv=folds,
        random_state=42,
    ).fit(X_train, y_train)

    print(f"\nCV = {folds}")
    print("Best Params:", search.best_params_)
    # Mean cross-validated score of the best configuration
    print("Train CV Score: {:.2f}".format(search.best_score_))
    # Score of the refit best pipeline on the held-out test split
    print("Test Score: {:.2f}".format(search.score(X_test, y_test)))



CV = 3
Best Params: {'pca__n_components': 3, 'classifier__penalty': 'l2', 'classifier__max_iter': 1000, 'classifier__alpha': 0.001}
Train CV Score: 0.90
Test Score: 0.77

CV = 5
Best Params: {'pca__n_components': 3, 'classifier__penalty': 'l2', 'classifier__max_iter': 1000, 'classifier__alpha': 0.001}
Train CV Score: 0.92
Test Score: 0.77

CV = 7
Best Params: {'pca__n_components': 3, 'classifier__penalty': None, 'classifier__max_iter': 1000, 'classifier__alpha': 0.001}
Train CV Score: 0.91
Test Score: 0.77


KNN - RandomizedSearchCV with 3, 5, and 7 Fold Cross-Validation

In this section, we apply the KNN classifier on the Iris dataset using a pipeline that includes StandardScaler, PCA, and the classifier. We use RandomizedSearchCV to explore hyperparameter combinations and evaluate performance with 3-fold, 5-fold, and 7-fold cross-validation.

We will report:

- Best hyperparameters
- Mean cross-validation accuracy on the training split (printed as "Train CV Score")
- Final test accuracy

In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Reload the Iris data for this section and drop the 'Id' column when present
df = pd.read_csv("/content/Iris.csv").drop(columns=["Id"], errors="ignore")

# Name of the label column, used to separate features from target
target_column = "Species"

X = df.drop(columns=target_column)
y = df[target_column]

# Stratified hold-out split (20% test) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



In [54]:
# Scale -> PCA -> k-nearest neighbours. The step names are the prefixes
# used by the search keys below.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        ("classifier", KNeighborsClassifier()),
    ]
)

# Candidate hyperparameter values for the randomized search
param_grid = dict(
    pca__n_components=[2, 3, 4],
    classifier__n_neighbors=[3, 5, 7, 9],
    classifier__weights=["uniform", "distance"],
    # 1 = Manhattan distance, 2 = Euclidean distance
    classifier__p=[1, 2],
)


In [55]:
# Repeat the same randomized search with 3, 5 and 7 cross-validation folds
for folds in (3, 5, 7):
    # fit() returns the fitted search object, so build and fit in one expression
    search = RandomizedSearchCV(
        estimator=pipe, param_distributions=param_grid, n_iter=10, cv=folds, random_state=42
    ).fit(X_train, y_train)

    print(f"\nCV = {folds}")
    print("Best Params:", search.best_params_)
    # Mean cross-validated score of the best configuration
    print(f"Train CV Score: {search.best_score_:.2f}")
    # Score of the refit best pipeline on the held-out test split
    print(f"Test Score: {search.score(X_test, y_test):.2f}")



CV = 3
Best Params: {'pca__n_components': 3, 'classifier__weights': 'distance', 'classifier__p': 1, 'classifier__n_neighbors': 3}
Train CV Score: 0.97
Test Score: 0.97

CV = 5
Best Params: {'pca__n_components': 3, 'classifier__weights': 'distance', 'classifier__p': 1, 'classifier__n_neighbors': 9}
Train CV Score: 0.97
Test Score: 0.97

CV = 7
Best Params: {'pca__n_components': 3, 'classifier__weights': 'distance', 'classifier__p': 1, 'classifier__n_neighbors': 9}
Train CV Score: 0.96
Test Score: 0.97
