In [None]:
# Importing required library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from scipy.stats import uniform as sp_uniform
from scipy.stats import randint as sp_randint

In [None]:
# Load the Dry Bean Dataset CSV file into a Pandas dataframe.
# The path is written as a raw string so that every backslash is taken literally;
# the original literal mixed escaped "\\" separators with a bare "\D", which Python
# reports as an invalid escape sequence. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific path — the notebook will only
# run where this exact file exists.
dataset = pd.read_csv(r"C:\Users\Purvesh\OneDrive\Documents\MACHINE LEARNING\Dry_Bean_Dataset_11.csv")

In [None]:
# Print a summary of the dataframe: column names, data types, and non-null counts
# (a non-null count lower than the row count reveals missing values in that column)
dataset.info()

In [None]:
# Heatmap of dataset.isnull(): one cell per dataset entry, coloured by whether it is missing
missing_fig, missing_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(dataset.isnull(), cmap='Blues', yticklabels=False, ax=missing_ax)

# Title and axis labels, then render the figure
missing_ax.set_title('Missing Values in Dry Bean Dataset', fontsize=16)
missing_ax.set_xlabel('Columns Names', fontsize=12)
missing_ax.set_ylabel('Distribution', fontsize=12)
plt.show()

#### The heatmap shows that the dataset contains no null values

In [None]:
# Bar chart of how many rows fall into each value of the 'Class' column
class_count_ax = sns.countplot(data=dataset, x='Class')

# Title the chart on the axes Seaborn returned, then render it
class_count_ax.set_title('Count of Classes in the Dataset')
plt.show()

In [None]:
# Histogram grid of the dataset's columns: 20 bins each, drawn on a 15x15 figure
hist_options = {'bins': 20, 'figsize': (15, 15)}
dataset.hist(**hist_options)

# Render the grid
plt.show()

### Label Encoding

In [None]:
# Replace the labels in the 'Class' column with the integer codes from a LabelEncoder.
# The fitted `labelencoder` is kept so later cells can map codes back to the original labels.
# NOTE: dataset["Class"] is overwritten in place, so re-running this cell without
# reloading the CSV fits the encoder on the integer codes rather than the original labels.
labelencoder = LabelEncoder()
labelencoder.fit(dataset['Class'])
dataset["Class"] = labelencoder.transform(dataset['Class'])


### Train Test Split

In [None]:
# Target vector: the encoded 'Class' column
y = dataset['Class']
# Feature matrix: every column except 'Class'
X = dataset.drop(columns='Class')

# 80% of the rows go to training, the rest to testing; random_state=42 fixes the shuffle
split_parts = train_test_split(X, y, train_size=0.8, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Number of rows in the training split (passed to the first MLP as its batch size)
train_size = len(X_train)

### Feature Importance

In [None]:
# Rank the features with an Extra Trees classifier (500 trees, seed fixed at 42)
model = ExtraTreesClassifier(n_estimators=500, random_state=42)

# Fit on the training split only. The original call used the full X, y, which let the
# held-out test rows influence the importances even though the comment said "training data".
model.fit(X_train, y_train)

# Print every feature name with its importance, in column order
for feature_name, feature_importance in zip(X_train.columns, model.feature_importances_):
    print(f"{feature_name}: {feature_importance}")

# Same importances as a Series indexed by feature name
feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)

# Keep the 10 largest importances, sorted from highest to lowest
feat_importances_sorted = feat_importances.nlargest(10)


In [None]:
# Horizontal bars: importance along x, one bar per feature name along y
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    x=feat_importances_sorted,
    y=feat_importances_sorted.index,
    palette='coolwarm',
    ax=ax,
)

# Title and axis labels in one call, then render
ax.set(
    title='ExtraTreesClassifier Top 10 Feature Importances',
    xlabel='Importance',
    ylabel='Feature',
)
plt.show()

### Standardized Features

In [None]:
# Fit the scaler on the training split only, so the test split does not affect its statistics
scaler_X = preprocessing.StandardScaler()
scaler_X.fit(X_train)

# Transform both splits with the scaler fitted above
X_train_scaled, X_test_scaled = (scaler_X.transform(split) for split in (X_train, X_test))

In [None]:
# Shared helper: plot the confusion matrix of any fitted classifier on the test split
def plot_confusion_matrix(model, X_test_scaled, y_test, labelencoder):
    """
    Plot the confusion matrix of a fitted classifier, labelling both axes with the
    original class labels.

    Parameters:
        model (object): The trained classification model to evaluate
        X_test_scaled (numpy array): The testing feature matrix (after standardization)
        y_test (numpy array): The testing target vector (encoded labels)
        labelencoder (object): The fitted LabelEncoder used to encode the target variable

    Returns:
        None. The figure is rendered with plt.show().
    """
    # Take the tick labels from the encoder that was passed in. The original read the
    # notebook-level variable `y`, so the function silently depended on global state.
    # `classes_` holds the original labels ordered by their integer code.
    class_names = labelencoder.classes_

    disp = ConfusionMatrixDisplay.from_estimator(
        model,
        X_test_scaled,
        y_test,
        # One row/column per encoded class (0 .. n_classes-1), so the matrix always has
        # as many entries as `class_names`, even if a class is missing from the test split.
        labels=np.arange(len(class_names)),
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize=None,  # raw counts, not proportions
        xticks_rotation='vertical',
        values_format='.0f',
    )
    disp.ax_.set_title("Confusion matrix")

    plt.show()


### Multi Layer Perceptron Model

In [None]:
# Baseline MLP: a single hidden layer of 50 ReLU units, trained with Adam.
# batch_size is set to the number of training rows, i.e. one batch covers the whole training split.
mlp_settings = dict(
    hidden_layer_sizes=(50, ),
    activation='relu',
    solver='adam',
    batch_size=train_size,
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=42,
)
mlp_model = MLPClassifier(**mlp_settings)

# Train on the standardized training split
mlp_model.fit(X_train_scaled, y_train)

# Predict the standardized test split
y_pred_mlp = mlp_model.predict(X_test_scaled)

In [None]:
# Test-set accuracy of the baseline MLP, and its complement as the error rate
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
print('Accuracy: %.8f' % mlp_accuracy)
print('Error Rate', 1 - mlp_accuracy)

# Per-class precision, recall, F1-score and support.
# Row names are the unique original labels, recovered by decoding y with the LabelEncoder.
mlp_class_names = np.unique(labelencoder.inverse_transform(y))
print(classification_report(y_test, y_pred_mlp, target_names=mlp_class_names))

In [None]:
# Confusion matrix of the baseline MLP on the test split.
# Diagonal cells count correct predictions per class; every other cell counts one
# particular true-class / predicted-class mix-up.
plot_confusion_matrix(
    model=mlp_model,
    X_test_scaled=X_test_scaled,
    y_test=y_test,
    labelencoder=labelencoder,
)

### Hyperparameter Tuning of MLP

In [None]:
# Search space for the randomized search.
# Integer ranges use scipy's randint, which this notebook imports as `sp_randint`;
# the bare name `randint` used originally is never defined and raised a NameError.
param_dist = {
    # An integer is treated by MLPClassifier as one hidden layer of that width (10..100)
    'hidden_layer_sizes': sp_randint(10, 101),
    'activation': ['relu', 'logistic'],
    'solver': ['adam', 'sgd'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    # Upper bound of randint is exclusive, so this samples 100..2000
    'max_iter': sp_randint(100, 2001),
}

# Base estimator for the search. It gets its own name so the fitted baseline
# `mlp_model` from the earlier cell is not overwritten by an unfitted model.
mlp_search_estimator = MLPClassifier(random_state=42)

# Randomized search with 5-fold cross-validation; random_state makes the sampled
# hyperparameter candidates the same on every run.
random_search = RandomizedSearchCV(mlp_search_estimator,
                                   param_distributions=param_dist,
                                   cv=5,
                                   random_state=42)

# Fit the randomized search object to the training data after standardization
random_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and their mean cross-validated score
print("Best hyperparameters: ", random_search.best_params_)
print("Accuracy: %.5f" % random_search.best_score_)


### MLP Model with Hyperparameter Tuning

In [None]:
# MLP built from hard-coded tuned values: 80 logistic hidden units, Adam,
# initial learning rate 0.001, at most 837 iterations
tuned_mlp_settings = {
    'hidden_layer_sizes': (80, ),
    'activation': 'logistic',
    'solver': 'adam',
    'learning_rate_init': 0.001,
    'max_iter': 837,
    'random_state': 42,
}
mlp_model_tuned = MLPClassifier(**tuned_mlp_settings)

# Train on the standardized training split
mlp_model_tuned.fit(X_train_scaled, y_train)

# Predict the standardized test split with the tuned model
y_pred_mlp_tuned = mlp_model_tuned.predict(X_test_scaled)

In [None]:
# Test-set accuracy of the tuned MLP, and its complement as the error rate
mlp_tuned_accuracy = accuracy_score(y_test, y_pred_mlp_tuned)
print('Accuracy: %.8f' % mlp_tuned_accuracy)
print('Error Rate', 1 - mlp_tuned_accuracy)

# Per-class precision, recall, F1-score and support for the tuned MLP.
# Row names are the unique original labels, recovered by decoding y with the LabelEncoder.
mlp_tuned_class_names = np.unique(labelencoder.inverse_transform(y))
print(classification_report(y_test, y_pred_mlp_tuned, target_names=mlp_tuned_class_names))

In [None]:
# Confusion matrix of the tuned MLP on the test split.
# Diagonal cells count correct predictions per class; every other cell counts one
# particular true-class / predicted-class mix-up.
plot_confusion_matrix(
    model=mlp_model_tuned,
    X_test_scaled=X_test_scaled,
    y_test=y_test,
    labelencoder=labelencoder,
)

### SVM 

In [None]:
# Baseline SVM with an RBF kernel; every hyperparameter is spelled out explicitly
svm_settings = dict(
    C=1.0,
    kernel='rbf',
    degree=3,
    gamma='scale',
    coef0=0.0,
    cache_size=200,
    class_weight=None,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=42,
)
svm_model = SVC(**svm_settings)

# Train on the standardized training split
svm_model.fit(X_train_scaled, y_train)

# Predict the standardized test split
y_pred_svm = svm_model.predict(X_test_scaled)

In [None]:
# Test-set accuracy of the baseline SVM, and its complement as the error rate
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print('Accuracy: %.8f' % svm_accuracy)
print('Error Rate', 1 - svm_accuracy)

# Unique decoded labels, each converted to str, for use as report row names
decoded_labels = np.unique(labelencoder.inverse_transform(y))
target_names = np.array(list(map(str, decoded_labels)))

# Per-class precision, recall, F1-score and support for the baseline SVM
print(classification_report(y_test, y_pred_svm, target_names=target_names))

In [None]:
# Confusion matrix of the baseline SVM on the test split
plot_confusion_matrix(
    model=svm_model,
    X_test_scaled=X_test_scaled,
    y_test=y_test,
    labelencoder=labelencoder,
)

### Hyperparameter Tuning of SVM

In [None]:
# Search space for the randomized search.
# The five numeric gamma candidates are drawn once, up front, from U(0, 1). The draw is
# seeded so the candidate list (and therefore the whole search) is the same on every run;
# the original unseeded .rvs(5) produced a different search space each time.
gamma_candidates = list(sp_uniform(loc=0, scale=1).rvs(5, random_state=42))
param_dist = {'C': sp_uniform(loc=0, scale=10),
              'gamma': ['scale', 'auto'] + gamma_candidates,
              'kernel': ['linear', 'poly', 'rbf'],  # the 'sigmoid' kernel is intentionally left out
              'degree': sp_randint(1, 6),           # upper bound is exclusive: samples 1..5
              'coef0': sp_uniform(loc=0, scale=10)}

# Define the model
svm = SVC()

# Randomized search: 100 sampled candidates, 5-fold cross-validation, all CPU cores
random_search = RandomizedSearchCV(estimator=svm,
                                   param_distributions=param_dist,
                                   n_iter=100,
                                   cv=5,
                                   random_state=42,
                                   n_jobs=-1)

# Fit the search on the standardized training split
random_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters: ", random_search.best_params_)


### SVM Model with Hyperparameter Tuning

In [None]:
# SVM built from hard-coded tuned hyperparameter values (RBF kernel)
tuned_svm_settings = dict(
    C=1.9579113478929644,
    kernel='rbf',
    degree=2,
    gamma=0.19561646205339078,
    coef0=0.6936130087516545,
    cache_size=200,
    class_weight=None,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=42,
)
svm_model_tuned = SVC(**tuned_svm_settings)

# Train the tuned SVM on the standardized training split
svm_model_tuned.fit(X_train_scaled, y_train)

# Predict the standardized test split with the tuned SVM
y_pred_svm_tuned = svm_model_tuned.predict(X_test_scaled)


In [None]:
# Test-set accuracy of the tuned SVM, and its complement as the error rate
svm_tuned_accuracy = accuracy_score(y_test, y_pred_svm_tuned)
print('Accuracy: %.8f' % svm_tuned_accuracy)
print('Error Rate', 1 - svm_tuned_accuracy)

# Unique decoded labels, each converted to str, for use as report row names
decoded_labels_tuned = np.unique(labelencoder.inverse_transform(y))
target_names = np.array(list(map(str, decoded_labels_tuned)))

# Per-class precision, recall, F1-score and support for the tuned SVM
print(classification_report(y_test, y_pred_svm_tuned, target_names=target_names))

In [None]:
# Confusion matrix of the tuned SVM on the test split
plot_confusion_matrix(
    model=svm_model_tuned,
    X_test_scaled=X_test_scaled,
    y_test=y_test,
    labelencoder=labelencoder,
)

### Random forest

In [None]:
# Random forest with hard-coded hyperparameters: 242 trees, unlimited depth,
# 14 features considered per split, at least 3 samples per leaf and 8 to split a node
rf_settings = {
    'n_estimators': 242,
    'random_state': 42,
    'max_depth': None,
    'max_features': 14,
    'min_samples_leaf': 3,
    'min_samples_split': 8,
}
RF_model = RandomForestClassifier(**rf_settings)

# Train on the standardized training split
RF_model.fit(X_train_scaled, y_train)

# Predict the standardized test split
y_pred_rf = RF_model.predict(X_test_scaled)


In [None]:
# Test-set accuracy of the random forest, and its complement as the error rate
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print('Accuracy: %.8f' % rf_accuracy)
print('Error Rate', 1 - rf_accuracy)

# Unique decoded labels, each converted to str, for use as report row names
decoded_labels_rf = np.unique(labelencoder.inverse_transform(y))
target_names_rf = np.array(list(map(str, decoded_labels_rf)))

# Per-class precision, recall, F1-score and support for the random forest
print(classification_report(y_test, y_pred_rf, target_names=target_names_rf))

In [None]:
# Confusion matrix of the random forest on the test split
plot_confusion_matrix(
    model=RF_model,
    X_test_scaled=X_test_scaled,
    y_test=y_test,
    labelencoder=labelencoder,
)

### Automatic Prediction Model

In [None]:
# Use the index values of X_test as the instance ids
ids = X_test.index.values

# Pair each test instance with the label predicted by the baseline SVM (`y_pred_svm`).
# These are the integer codes the models were trained on, not decoded class names.
# NOTE(review): the tuned SVM's predictions are not the ones exported — confirm this is intended.
prediction_columns = {'instance_id': ids, 'predicted_label': y_pred_svm}
df = pd.DataFrame(prediction_columns)

# Write the table to 260867.csv, leaving out the dataframe index
df.to_csv('260867.csv', index=False)