1. Implement Classification Models:

Importing necessary libraries

In [19]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, f1_score, mean_squared_error, r2_score

load the dataset from sklearn

In [20]:
# Load the wine dataset
data = load_wine()

# Convert to pandas DataFrame
df = pd.DataFrame(data.data, columns=data.feature_names)

# Add the target column
df['target'] = data.target

# Display the first ten rows
print(df.head(10))

   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   
5    14.20        1.76  2.45               15.2      112.0           3.27   
6    14.39        1.87  2.45               14.6       96.0           2.50   
7    14.06        2.15  2.61               17.6      121.0           2.60   
8    14.83        1.64  2.17               14.0       97.0           2.80   
9    13.86        1.35  2.27               16.0       98.0           2.98   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.

Define features and use train-test-split from sklearn

In [21]:
# Define X and y
X = df.drop(columns=['target'])
y = df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [22]:
# Train Decision Tree Classifier
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_test)

# Train Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

# Evaluate and compare models based on F1 score
f1_dt = f1_score(y_test, y_pred_dt, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

print(f"F1 Score (Decision Tree): {f1_dt:.4f}")
print(f"F1 Score (Random Forest): {f1_rf:.4f}")

# Determine the better model
if f1_rf > f1_dt:
    print("Random Forest Classifier performs better.")
else:
    print("Decision Tree Classifier performs better.")

F1 Score (Decision Tree): 0.9628
F1 Score (Random Forest): 1.0000
Random Forest Classifier performs better.


Hyperparameter tuning:

In [23]:
# Define parameter grid for Random Forest Classifier
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),param_grid=param_grid,cv=5,scoring='f1_weighted', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters and evaluation
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("\nBest Hyperparameters for Random Forest Classifier:", best_params)
print("\nBest F1 Score for Random Forest Classifier:", best_score)

# Evaluate the best model on the test set
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 36 candidates, totalling 180 fits

Best Hyperparameters for Random Forest Classifier: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}

Best F1 Score for Random Forest Classifier: 0.9680809081527346
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        14

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54



Implememt regression model

In [24]:
 # Using the alcohol feature as the target for regression
y_reg = df['alcohol']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.3, random_state=42)

# Train Decision Tree Regressor
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train_reg, y_train_reg)
dt_reg_predictions = dt_regressor.predict(X_test_reg)

# Train a Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train_reg, y_train_reg)
rf_reg_predictions = rf_regressor.predict(X_test_reg)

# Evaluate Regression Models
dt_mse = mean_squared_error(y_test_reg, dt_reg_predictions)
rf_mse = mean_squared_error(y_test_reg, rf_reg_predictions)
dt_r2 = r2_score(y_test_reg, dt_reg_predictions)
rf_r2 = r2_score(y_test_reg, rf_reg_predictions)

print("Decision Tree Regressor MSE:", dt_mse)
print("Decision Tree Regressor R2 Score:", dt_r2)
print("\nRandom Forest Regressor MSE:", rf_mse)
print("Random Forest Regressor R2 Score:", rf_r2)

Decision Tree Regressor MSE: 0.0017592592592592462
Decision Tree Regressor R2 Score: 0.9967228563711633

Random Forest Regressor MSE: 0.0013293277777777996
Random Forest Regressor R2 Score: 0.9975237316304281


 Identify three parameters for Random Forest Regression and Perform hyperparameter tuning using RandomSearchCV to optimize these parameters.

In [18]:
# Hyperparameter Tuning for Random Forest Regressor
param_distributions = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=42), param_distributions=param_distributions,
                                   n_iter=50,
                                   cv=5,
                                   scoring='neg_mean_squared_error',
                                   verbose=1,
                                   n_jobs=-1,
                                   random_state=42)
random_search.fit(X_train_reg, y_train_reg)

# Best hyperparameters
best_params_reg = random_search.best_params_
print("\nBest Hyperparameters for Random Forest Regressor:", best_params_reg)

# model evaluation
best_rf_regressor = random_search.best_estimator_
optimized_reg_predictions = best_rf_regressor.predict(X_test_reg)

optimized_mse = mean_squared_error(y_test_reg, optimized_reg_predictions)
optimized_r2 = r2_score(y_test_reg, optimized_reg_predictions)

print("Optimized Random Forest Regressor MSE:", optimized_mse)
print("Optimized Random Forest Regressor R2 Score:", optimized_r2)

Fitting 5 folds for each of 50 candidates, totalling 250 fits

Best Hyperparameters for Random Forest Regressor: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 30}
Optimized Random Forest Regressor MSE: 0.0013222566871488896
Optimized Random Forest Regressor R2 Score: 0.9975369036398869
