# Assignment7_Model_fitting

## Let's start by importing the libraries we will need.

In [15]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier

## Load the Synthesized data

In [17]:
data = pd.read_csv(r'C:\Users\91970\Downloads\DM\Week9\Assign\synthetic_classification_data.csv')
data

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,target
0,1.555730,0.628147,-0.567278,-2.548786,-1.575909,2
1,-3.194052,1.395636,3.033943,-0.280255,-2.172309,1
2,-0.329064,-0.461760,-1.183120,1.436410,-1.228790,0
3,-0.706467,0.648464,0.842007,-0.746024,-1.300526,1
4,0.543973,-0.595228,-1.202407,0.872089,0.185406,0
...,...,...,...,...,...,...
995,1.988782,-0.486354,-1.976167,-0.538039,-0.149938,2
996,-1.312794,0.714122,1.131945,-0.359647,-1.616572,1
997,1.207255,0.396648,0.093145,-1.914773,0.195360,1
998,1.018303,-0.730303,-1.842049,0.814984,-0.133006,1


## Explore data

In [18]:
print('**************************')
print(data.isnull().sum())#checking for missing values
print('**************************')
print(data.dtypes) #checking for datatypes of each column
print('**************************')
print(data.shape)# total rows and columns
print('**************************')
print(data.describe())
print('**************************')

**************************
feature1    6
feature2    6
feature3    4
feature4    6
feature5    7
target      0
dtype: int64
**************************
feature1    float64
feature2    float64
feature3    float64
feature4    float64
feature5    float64
target        int64
dtype: object
**************************
(1000, 6)
**************************
         feature1    feature2    feature3    feature4    feature5       target
count  994.000000  994.000000  996.000000  994.000000  993.000000  1000.000000
mean     0.286714   -0.128035   -0.343629    0.054966    0.059751     1.002000
std      1.497375    0.596076    1.381923    1.351252    1.241555     0.816494
min     -4.496088   -2.064734   -4.659960   -3.680880   -3.818368     0.000000
25%     -0.537595   -0.497248   -1.286116   -1.010132   -0.907865     0.000000
50%      0.281220   -0.170927   -0.605052    0.060282    0.235148     1.000000
75%      1.360429    0.209666    0.626663    1.058671    1.029208     2.000000
max      4.655887  

## Filling missing values

In [19]:
df_filled = data.fillna(data.mean())

In [20]:
df_filled

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,target
0,1.555730,0.628147,-0.567278,-2.548786,-1.575909,2
1,-3.194052,1.395636,3.033943,-0.280255,-2.172309,1
2,-0.329064,-0.461760,-1.183120,1.436410,-1.228790,0
3,-0.706467,0.648464,0.842007,-0.746024,-1.300526,1
4,0.543973,-0.595228,-1.202407,0.872089,0.185406,0
...,...,...,...,...,...,...
995,1.988782,-0.486354,-1.976167,-0.538039,-0.149938,2
996,-1.312794,0.714122,1.131945,-0.359647,-1.616572,1
997,1.207255,0.396648,0.093145,-1.914773,0.195360,1
998,1.018303,-0.730303,-1.842049,0.814984,-0.133006,1


## Again checking for missing values

In [21]:
print(df_filled.isnull().sum())

feature1    0
feature2    0
feature3    0
feature4    0
feature5    0
target      0
dtype: int64


### Split and normalize data

In [22]:
features = df_filled.drop(columns=['target'])
target = df_filled['target']

# split the data into validation and training set
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=100)

# create a standard scaler and fit it to the training set of predictors
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

# Transform the predictors of training and validation sets
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Neural Network Model with Grid Search and Randomized Search

In [23]:


# Define a range of hyperparameters for grid search
param_grid = {
    'classifier__hidden_layer_sizes': [(50, 50), (100, 100), (100,)],
    'classifier__alpha': [0.0001, 0.001, 0.01],
}

# Define a range of hyperparameters for randomized search
param_dist = {
    'classifier__hidden_layer_sizes': [(50, 50), (100, 100), (100,)],
    'classifier__alpha': [0.0001, 0.001, 0.01],
}

# Create a pipeline for the Neural Network model
nn_pipeline = Pipeline([
    ('classifier', MLPClassifier(random_state=42))
])

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(nn_pipeline, param_grid, cv=5, scoring='f1_weighted', verbose=1)
grid_search.fit(X_train, y_train)
best_nn_model_grid = grid_search.best_estimator_

# Perform randomized search to find the best hyperparameters
randomized_search = RandomizedSearchCV(nn_pipeline, param_distributions=param_dist, n_iter=10, cv=5, scoring='f1_weighted', verbose=1, random_state=42)
randomized_search.fit(X_train, y_train)
best_nn_model_randomized = randomized_search.best_estimator_

# Evaluate and compare the models
def evaluate_and_compare_models(model, X_test, y_test):
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    return f1

f1_nn_grid = evaluate_and_compare_models(best_nn_model_grid, X_test, y_test)
f1_nn_randomized = evaluate_and_compare_models(best_nn_model_randomized, X_test, y_test)

print(f"Neural Network Weighted Macro F1 Score (Grid Search): {f1_nn_grid}")
print(f"Neural Network Weighted Macro F1 Score (Randomized Search): {f1_nn_randomized}")


Fitting 5 folds for each of 9 candidates, totalling 45 fits




Fitting 5 folds for each of 9 candidates, totalling 45 fits




Neural Network Weighted Macro F1 Score (Grid Search): 0.9036273779778996
Neural Network Weighted Macro F1 Score (Randomized Search): 0.9036273779778996




## XGBoost Model with Grid Search and Randomized Search

In [24]:
# Define a range of hyperparameters for grid search
param_grid = {
    'classifier__n_estimators': [50, 100, 200, 300],
    'classifier__max_depth': [3, 4, 5, 6],
    'classifier__learning_rate': [0.01, 0.1, 0.2, 0.3],
}

# Define a range of hyperparameters for randomized search
param_dist = {
    'classifier__n_estimators': [50, 100, 200, 300],
    'classifier__max_depth': [3, 4, 5, 6],
    'classifier__learning_rate': [0.01, 0.1, 0.2, 0.3],
}

# Create a pipeline for the XGBoost model
xgboost_pipeline = Pipeline([
    ('classifier', XGBClassifier(random_state=42))
])

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(xgboost_pipeline, param_grid, cv=5, scoring='f1_weighted', verbose=1)
grid_search.fit(X_train, y_train)
best_xgboost_model_grid = grid_search.best_estimator_

# Perform randomized search to find the best hyperparameters
randomized_search = RandomizedSearchCV(xgboost_pipeline, param_distributions=param_dist, n_iter=10, cv=5, scoring='f1_weighted', verbose=1, random_state=42)
randomized_search.fit(X_train, y_train)
best_xgboost_model_randomized = randomized_search.best_estimator_

# Evaluate and compare the models
f1_xgboost_grid = evaluate_and_compare_models(best_xgboost_model_grid, X_test, y_test)
f1_xgboost_randomized = evaluate_and_compare_models(best_xgboost_model_randomized, X_test, y_test)

print(f"XGBoost Weighted Macro F1 Score (Grid Search): {f1_xgboost_grid}")
print(f"XGBoost Weighted Macro F1 Score (Randomized Search): {f1_xgboost_randomized}")

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
XGBoost Weighted Macro F1 Score (Grid Search): 0.8774197722567288
XGBoost Weighted Macro F1 Score (Randomized Search): 0.8706555356342449


## Random Forest Model with Grid Search and Randomized Search

In [25]:
# Define a range of hyperparameters for grid search
param_grid = {
    'classifier__n_estimators': [50, 100, 200, 300],
    'classifier__max_depth': [3, 4, 5, 6],
    'classifier__min_samples_split': [2, 5, 10],
}

# Define a range of hyperparameters for randomized search
param_dist = {
    'classifier__n_estimators': [50, 100, 200, 300],
    'classifier__max_depth': [3, 4, 5, 6],
    'classifier__min_samples_split': [2, 5, 10],
}

# Create a pipeline for the Random Forest model
rf_pipeline = Pipeline([
    ('classifier', RandomForestClassifier(random_state=42))
])

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='f1_weighted', verbose=1)
grid_search.fit(X_train, y_train)
best_rf_model_grid = grid_search.best_estimator_

# Perform randomized search to find the best hyperparameters
randomized_search = RandomizedSearchCV(rf_pipeline, param_distributions=param_dist, n_iter=10, cv=5, scoring='f1_weighted', verbose=1, random_state=42)
randomized_search.fit(X_train, y_train)
best_rf_model_randomized = randomized_search.best_estimator_

# Evaluate and compare the models
f1_rf_grid = evaluate_and_compare_models(best_rf_model_grid, X_test, y_test)
f1_rf_randomized = evaluate_and_compare_models(best_rf_model_randomized, X_test, y_test)

print(f"Random Forest Weighted Macro F1 Score (Grid Search): {f1_rf_grid}")
print(f"Random Forest Weighted Macro F1 Score (Randomized Search): {f1_rf_randomized}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Random Forest Weighted Macro F1 Score (Grid Search): 0.8735925799863852
Random Forest Weighted Macro F1 Score (Randomized Search): 0.883595396123051


# Conclusion Summary

To choose the best model based on the provided Weighted Macro F1 Scores, we should consider both the performance metrics and the search method (Grid Search or Randomized Search). Let's analyze the scores:

### Neural Network:

Grid Search: F1 Score = 0.9036

Randomized Search: F1 Score = 0.9036

For the Neural Network, Grid Search and Randomised Search produced the same F1 Score. As a result, we can rate both approaches as equally effective in this instance.

### XGBoost:

Grid Search: F1 Score = 0.8774

Randomized Search: F1 Score = 0.8707

Grid Search produced a higher F1 Score (0.8774) compared to Randomized Search (0.8707) for XGBoost. For XGBoost hyperparameter tuning, Grid Search seems to be a superior option, and the model with the greatest F1 Score ought to be chosen.

### Random Forest:

Grid Search: F1 Score = 0.8736

Randomized Search: F1 Score = 0.8836

For Random Forest, Randomised Search yielded a higher F1 Score (0.8836) than Grid Search (0.8736). Consequently, it appears that Randomised Search is a more effective method for optimising Random Forest hyperparameters; the model with the greatest F1 Score ought to be selected.

****************************************************************************************
****************************************************************************************
In summary, based on the provided F1 Scores, the best model selection depends on the specific model type:

* For Neural Network, both Grid Search and Randomized Search produced the same F1   Score.

* For XGBoost, Grid Search is preferred as it yielded a higher F1 Score (0.8774).

* For Random Forest, Randomized Search is preferred due to the higher F1 Score (0.8836).

Thus, given its better F1 Score performance, the Random Forest model with Randomised Search ought to be chosen as the optimal model for this particular dataset.