<a href="https://colab.research.google.com/github/manideep099/big-data-assignments/blob/main/ICP5_Pothuraju_Manideep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# 1. Load the Iris features/labels and hold out 20% of the rows for testing
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is repeatable
)

# 2. Pipeline: standardize, project with PCA, then classify with an SVM
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classifier', SVC()),
    ]
)

# 3. Candidate hyperparameters; "<step>__<param>" addresses a pipeline step
param_grid = dict(
    pca__n_components=[2, 3],
    classifier__C=[0.1, 1, 10],
    classifier__kernel=['linear', 'rbf'],
)

# 4. Exhaustive search over the grid (cv is left at the GridSearchCV default)
grid = GridSearchCV(estimator=pipe, param_grid=param_grid).fit(X_train, y_train)

# 5. Report the winning combination, its CV score, and the held-out score
print("Best parameters found:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))


Best parameters found: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best cross-validation score: 0.96
Test set score: 1.00


In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Load Iris and keep 20% of the rows aside as a test set (seeded split)
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pipeline: standardize, project with PCA, then classify with an SVM
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classifier', SVC()),
    ]
)

# Hyperparameter candidates, keyed as "<step>__<param>"
param_grid = dict(
    pca__n_components=[2, 3],
    classifier__C=[0.1, 1, 10],
    classifier__kernel=['linear', 'rbf'],
)

# Repeat the same grid search with 3, 5 and 7 folds to compare the outcome.
# After the loop, `grid` holds the search from the last fold count.
for cv_val in (3, 5, 7):
    print(f"\n=== CV={cv_val} ===")
    grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv_val).fit(X_train, y_train)
    print("Best parameters:", grid.best_params_)
    print("Best CV score: {:.2f}".format(grid.best_score_))
    print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))


=== CV=3 ===
Best parameters: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best CV score: 0.97
Test set score: 1.00

=== CV=5 ===
Best parameters: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best CV score: 0.96
Test set score: 1.00

=== CV=7 ===
Best parameters: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best CV score: 0.97
Test set score: 1.00


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Pipeline: standardize, project with PCA, then classify with a random forest.
# The forest is seeded: without random_state every run grows different trees,
# so the "best" parameters printed below could change from run to run.
pipe_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter candidates, keyed as "<step>__<param>"
param_grid_rf = {
    'pca__n_components': [2, 3],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 5, 10]
}

# 5-fold grid search on the training split; relies on X_train / y_train
# defined by an earlier cell.
grid_rf = GridSearchCV(pipe_rf, param_grid_rf, cv=5)
grid_rf.fit(X_train, y_train)
print("RandomForest Best Params:", grid_rf.best_params_)

RandomForest Best Params: {'classifier__max_depth': 5, 'classifier__n_estimators': 50, 'pca__n_components': 3}


In [None]:
from sklearn.linear_model import LogisticRegression

# Pipeline: standardize, project with PCA, then fit logistic regression
pipe_lr = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classifier', LogisticRegression()),
    ]
)

# Search varies the PCA size and C; penalty and solver each have one value
param_grid_lr = dict(
    pca__n_components=[2, 3],
    classifier__C=[0.01, 0.1, 1, 10],
    classifier__penalty=['l2'],
    classifier__solver=['lbfgs'],
)

# 5-fold grid search on the training split from an earlier cell
grid_lr = GridSearchCV(estimator=pipe_lr, param_grid=param_grid_lr, cv=5).fit(X_train, y_train)
print("LogisticRegression Best Params:", grid_lr.best_params_)

LogisticRegression Best Params: {'classifier__C': 1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs', 'pca__n_components': 3}


In [None]:
from sklearn.linear_model import Perceptron

# Pipeline: standardize, project with PCA, then fit a Perceptron
pipe_perceptron = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classifier', Perceptron()),
    ]
)

# Search varies the PCA size, the penalty type (None = no penalty) and alpha
param_grid_perceptron = dict(
    pca__n_components=[2, 3],
    classifier__penalty=[None, 'l2', 'l1', 'elasticnet'],
    classifier__alpha=[0.0001, 0.001, 0.01],
)

# 5-fold grid search on the training split from an earlier cell
grid_perceptron = GridSearchCV(
    estimator=pipe_perceptron,
    param_grid=param_grid_perceptron,
    cv=5,
).fit(X_train, y_train)
print("Perceptron Best Params:", grid_perceptron.best_params_)

Perceptron Best Params: {'classifier__alpha': 0.0001, 'classifier__penalty': 'l1', 'pca__n_components': 3}


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Pipeline: standardize, project with PCA, then classify with k-nearest neighbours
pipe_knn = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classifier', KNeighborsClassifier()),
    ]
)

# Search varies the PCA size, the neighbour count and the vote weighting
param_grid_knn = dict(
    pca__n_components=[2, 3],
    classifier__n_neighbors=[3, 5, 7],
    classifier__weights=['uniform', 'distance'],
)

# 5-fold grid search on the training split from an earlier cell
grid_knn = GridSearchCV(estimator=pipe_knn, param_grid=param_grid_knn, cv=5).fit(X_train, y_train)
print("KNN Best Params:", grid_knn.best_params_)

KNN Best Params: {'classifier__n_neighbors': 3, 'classifier__weights': 'uniform', 'pca__n_components': 3}


In [None]:
from sklearn.model_selection import RandomizedSearchCV

from sklearn.base import clone  # local import: only this cell needs it

# Randomized search with the RandomForest pipeline: tries n_iter=5 candidates
# sampled from param_grid_rf instead of the full grid.
# random_state on the search only fixes which candidates are sampled; the
# forest's own random_state is pinned on a clone so this cell is reproducible
# on its own and the shared pipe_rf object is not mutated.
random_search_rf = RandomizedSearchCV(
    estimator=clone(pipe_rf).set_params(classifier__random_state=42),
    param_distributions=param_grid_rf,
    n_iter=5,
    cv=5,
    random_state=42,
)
random_search_rf.fit(X_train, y_train)
print("RandomizedSearch Best Params (RF):", random_search_rf.best_params_)



RandomizedSearch Best Params (RF): {'pca__n_components': 2, 'classifier__n_estimators': 100, 'classifier__max_depth': 5}


In [None]:
import pandas as pd

# Load the speech-features CSV (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='pd_speech_features.csv')


In [None]:
# Read custom dataset
import pandas as pd

# Load the speech-features CSV (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='pd_speech_features.csv')

# Features: every column except 'numPulses'.
# NOTE(review): 'numPulses' is used as the prediction target here — confirm
# that this is the intended label column for the dataset.
X = df.drop(columns=['numPulses'])
print("Successfully Replaced target_columns")

# Target: the 'numPulses' column on its own
y = df.loc[:, 'numPulses']

# Train/test split: 20% of the rows held out, seeded for repeatability.
# This rebinds X, y and the split variables used by earlier cells.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Successfully Replaced target_columns
