In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from hyperopt import fmin, tpe, hp
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [3]:
# pip install hyperoptn 


In [4]:
# 1. Reading data from CSV
def read_csv(file_path):
    return pd.read_csv(file_path)

# 2. Creating features
def create_features(data):
    # No feature creation for this example
    return data

# Create features
data = create_features(data=read_csv('iris.csv'))

    # Split data into features and target
X = data.drop('Species', axis=1)
y = data['Species']


# 3. Training a classifier model
def train_classifier(data):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return model, accuracy

# 4. Hyperparameter tuning with Hyperopt
def objective(params):
    model = RandomForestClassifier(**params)
    score = cross_val_score(model, X, y, cv=5).mean()
    return -score  # Minimize negative accuracy

# 5. Evaluating the model on the test set
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [5]:
file_path ="Iris.csv"
data = read_csv(file_path)

In [6]:
data

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [7]:
# if __name__ == "__main__":
# Load data
file_path ="Iris.csv"
data = read_csv(file_path)

# Create features
data = create_features(data)

# Split data into features and target
X = data.drop('Species', axis=1)
y = data['Species']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), X.columns)
        ],
        remainder='passthrough'
    )),
    ('classifier', RandomForestClassifier())
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on test set: {accuracy}")

#Brute force appraoch try with 2 for loops

# Hyperparameter tuning using Tree of Parzen Estimators (TPE)
space = {
    'n_estimators': hp.choice('n_estimators', range(10, 101)),
    'max_depth': hp.choice('max_depth', range(1, 21))
}

best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100)

# Update the pipeline with the best hyperparameters

# Train model with best parameters

# Train the model with the best hyperparameters


# Evaluate the updated model

best_clf = RandomForestClassifier(
     n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']),
    random_state=42
    )

best_pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(
         transformers=[('num', StandardScaler(), X.columns)],
         remainder='passthrough'
    )),
    ('classifier', best_clf)
])

best_pipeline.fit(X_train, y_train)
y_pred_best = best_pipeline.predict(X_test)
best_accuracy = accuracy_score(y_test, y_pred_best)
print(f"Tuned model accuracy: {best_accuracy}")





Model accuracy on test set: 1.0
100%|██████████| 100/100 [00:23<00:00,  4.30trial/s, best loss: -1.0]              
Tuned model accuracy: 1.0


In [8]:
best_params

{'max_depth': 12, 'n_estimators': 5}

In [9]:
pipeline

In [10]:
{'classifier__'+k:v for k, v in best_params.items()}

{'classifier__max_depth': 12, 'classifier__n_estimators': 5}

In [11]:
updated_pipeline =pipeline.set_params(**{'classifier__'+k:v for k, v in best_params.items()})

In [12]:
updated_pipeline

In [13]:
updated_pipeline.fit(X_train, y_train)

In [14]:
y_pred_updated = updated_pipeline.predict(X_test)
updated_accuracy = accuracy_score(y_test, y_pred_updated)
print(f"Updated model accuracy: {updated_accuracy}")

Updated model accuracy: 1.0


In [None]:
# dummy={'classifier__'+k:v for k, v in best_params.items()}

In [None]:
# dummy_pipeline = updated_pipeline.fit(X_train, y_train)

In [None]:
# dummy_y_pred = dummy_pipeline.predict(X_test)
# dummy_accuracy = accuracy_score(y_test, dummy_y_pred)
# print(f"Dummy model accuracy: {dummy_accuracy}")

Dummy model accuracy: 1.0
