The main aim of this lab is to practise the **pipeline** technique and the **MultilayerPerceptron** algorithm.

*   **Deadline: 23:59, 06/5/2024**



# Import libraries

In [55]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.base import clone
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import CategoricalNB, GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, KBinsDiscretizer, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from prettytable import PrettyTable

warnings.filterwarnings('ignore')
%pylab inline

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [65]:
def my_pipeline(X, y, preprocessing, classifiers, test_size=0.2, random_state=1):
    """Evaluate several classifiers behind the same preprocessing pipeline.

    The data is split once into train and test parts. For every classifier a
    fresh ``Pipeline`` made of the preprocessing steps followed by that
    classifier is fitted on the training part and scored on the test part.

    Parameters
    ----------
    X, y : features and targets, passed to ``train_test_split``.
    preprocessing : sklearn ``Pipeline``
        Its ``steps`` attribute is read to label the rows of the table.
    classifiers : dict
        Maps a label to an estimator; only the estimators are used.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 1
        Forwarded to ``train_test_split``.

    Returns
    -------
    PrettyTable
        One row per classifier: class name, preprocessing step names,
        accuracy, and macro-averaged precision, recall and F1.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    table = PrettyTable(["Classifier", "Preprocessing Steps", "Accuracy", "Precision", "Recall", "F1_Score"])
    preprocessing_steps = [name for name, _ in preprocessing.steps]

    for model in classifiers.values():
        # Clone both parts so the caller's estimator objects are never fitted
        # in place and no fitted state is carried from one iteration to the next.
        pipeline = Pipeline([
            ('preprocessing', clone(preprocessing)),
            ('classifier', clone(model))
        ])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        pre = precision_score(y_test, y_pred, average='macro')
        rec = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')

        table.add_row([model.__class__.__name__, preprocessing_steps, acc, pre, rec, f1])

    return table

# Task 1. With the **iris** dataset
*  Apply **pipeline** including preprocessing steps (i.e., **StandardScaler**, **SimpleImputer**, **feature selection**, **KBinsDiscretizer**, …) and classification algorithms (i.e., **Random forest, kNN, Naïve Bayes**).


In [58]:
# Load the iris bunch and expose its feature matrix and target vector.
iris = load_iris()
X = iris.data
y = iris.target

In [59]:
# Classifiers compared in Task 1, keyed by a short label.
models = {
    # Seeded so that repeated runs of the notebook build the same forest.
    'RandomForest': RandomForestClassifier(random_state=1),
    'kNN': KNeighborsClassifier(),
    'NaiveBayes': GaussianNB()
}

In [60]:
# Preprocessing chain shared by all Task 1 classifiers:
# scale -> discretize into 3 ordinal bins -> project onto 2 principal components.
preprocessing = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        (
            'discretizer',
            KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform'),
        ),
        ('pca', PCA(n_components=2)),
    ]
)

In [67]:
# Despite its name, rf_pipeline holds the results table for *all* classifiers.
rf_pipeline = my_pipeline(
    X=X,
    y=y,
    preprocessing=preprocessing,
    classifiers=models,
)
rf_pipeline

Classifier,Preprocessing Steps,Accuracy,Precision,Recall,F1_Score
RandomForestClassifier,"['scaler', 'discretizer', 'pca']",0.9666666666666668,0.9523809523809524,0.9743589743589745,0.9610256410256413
KNeighborsClassifier,"['scaler', 'discretizer', 'pca']",0.9666666666666668,0.9523809523809524,0.9743589743589745,0.9610256410256413
GaussianNB,"['scaler', 'discretizer', 'pca']",0.9666666666666668,0.9523809523809524,0.9743589743589745,0.9610256410256413


# Task 2. With the **fashion** dataset
*   2.1. Apply **MultilayerPerceptron** classification with 1 hidden layer
having 10 nodes

In [73]:
def my_MLPClassifier(X_train, X_test, y_train, y_test, h_layer_szs):
    """Fit an MLP with the given hidden layers and tabulate its test scores.

    The network uses the 'lbfgs' solver, ``alpha=1e-5`` and ``random_state=1``.

    Parameters
    ----------
    X_train, y_train : data the network is fitted on.
    X_test, y_test : data the fitted network is scored on.
    h_layer_szs : tuple
        Forwarded as ``hidden_layer_sizes``.

    Returns
    -------
    PrettyTable
        A single row with the algorithm label, the hidden layer sizes,
        accuracy, and macro-averaged precision, recall and F1.
    """
    network = MLPClassifier(
        solver='lbfgs',
        alpha=1e-5,
        hidden_layer_sizes=h_layer_szs,
        random_state=1,
    )
    network.fit(X_train, y_train)
    predictions = network.predict(X_test)

    scores = [
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, average='macro'),
        recall_score(y_test, predictions, average='macro'),
        f1_score(y_test, predictions, average='macro'),
    ]

    report = PrettyTable(["AlgoName", "Hidden layer sizes", "Accuracy", "Precision", "Recall", "F1_Score"])
    report.add_row(['MultilayerPerceptron', h_layer_szs, *scores])
    return report

In [74]:
# Read the pre-split fashion data; the label column is named 'y'.
data_train = pd.read_csv('../data/fashion_train.csv')
data_test = pd.read_csv('../data/fashion_test.csv')

X_train, y_train = data_train.drop(columns=['y']), data_train['y']
X_test, y_test = data_test.drop(columns=['y']), data_test['y']

# Task 2.1: one hidden layer with 10 nodes. `clf` receives the results table.
clf = my_MLPClassifier(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    h_layer_szs=(10,),
)
print(clf)

+----------------------+--------------------+----------+---------------------+---------------------+---------------------+
|       AlgoName       | Hidden layer sizes | Accuracy |      Precision      |        Recall       |       F1_Score      |
+----------------------+--------------------+----------+---------------------+---------------------+---------------------+
| MultilayerPerceptron |       (10,)        |  0.151   | 0.14817212558889442 | 0.16557886557886559 | 0.09551360613906014 |
+----------------------+--------------------+----------+---------------------+---------------------+---------------------+


*   2.2. Apply the **MultilayerPerceptron** algorithm with the following settings: the first hidden layer has 250 neurons and the second one has 100 neurons.

In [75]:
# Task 2.2: two hidden layers (250 then 100 neurons). `clf` receives the results table.
clf = my_MLPClassifier(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    h_layer_szs=(250, 100),
)
print(clf)

+----------------------+--------------------+----------+--------------------+--------------------+--------------------+
|       AlgoName       | Hidden layer sizes | Accuracy |     Precision      |       Recall       |      F1_Score      |
+----------------------+--------------------+----------+--------------------+--------------------+--------------------+
| MultilayerPerceptron |     (250, 100)     |  0.764   | 0.7602001475797889 | 0.7614751901798809 | 0.7573455568059595 |
+----------------------+--------------------+----------+--------------------+--------------------+--------------------+


*   2.3. Find the best hyperparameters using **GridSearchCV**

In [79]:
def myGridSearchCV(X_train, y_train, X_test, y_test, classifier, params, cv=10, scoring='accuracy'):
    """Grid-search a classifier and score the refitted best model on test data.

    Note the argument order: ``y_train`` comes *before* ``X_test`` here.

    Parameters
    ----------
    X_train, y_train : data used for the cross-validated search.
    X_test, y_test : held-out data used for the reported metrics.
    classifier : estimator handed to ``GridSearchCV``.
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    cv : int, default 10
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, default 'accuracy'
        Metric used to rank the candidates.

    Returns
    -------
    tuple
        ``(algoName, best_params, accuracy, precision, recall, f1)`` where
        precision, recall and F1 are macro-averaged on the test data.
    """
    # Training-fold scores are deliberately not requested: the search object
    # never leaves this function, so they would be computed and thrown away.
    grid_search = GridSearchCV(
        estimator=classifier,
        param_grid=params,
        scoring=scoring,
        refit=True,
        cv=cv,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    # refit=True means predict() uses the best candidate retrained on all of X_train.
    y_pred = grid_search.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    algoName = classifier.__class__.__name__
    best_params = grid_search.best_params_

    return algoName, best_params, accuracy, precision, recall, f1

In [80]:
# Hyperparameter grid searched for the MLP.
param_grid = dict(
    hidden_layer_sizes=[(150, 100, 50), (120, 80, 40), (100, 50, 30)],
    max_iter=[50, 100, 150],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
)

In [82]:
# Task 2.3: tune the MLP; keyword arguments make the train/test order explicit.
algoName, best_params, accuracy, precision, recall, f1 = myGridSearchCV(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    classifier=MLPClassifier(random_state=1),
    params=param_grid,
)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: MLPClassifier, Params: {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (150, 100, 50), 'learning_rate': 'constant', 'max_iter': 150, 'solver': 'sgd'}


In [83]:
# Start the comparison table with the scores unpacked from the latest grid search.
table = PrettyTable(
    field_names=[
        "Classifier with the best hyperparameters",
        "Accuracy",
        "Precision",
        "Recall",
        "F1_Score",
    ]
)
table.add_row([algoName, accuracy, precision, recall, f1])

*   2.4. Compare the **MultilayerPerceptron** using the best hyperparameters from 2.3 with other classification algorithms (i.e., Random forest, kNN, Naïve Bayes) in terms of accuracy, precision, recall, and F1

In [84]:
# Hyperparameter grid searched for kNN: odd neighbour counts from 5 to 15.
grid_params = dict(
    n_neighbors=list(range(5, 16, 2)),
    weights=['uniform', 'distance'],
    metric=['minkowski', 'euclidean', 'manhattan'],
)

In [85]:
# Tune kNN; keyword arguments make the train/test order explicit.
algoName, best_params, accuracy, precision, recall, f1 = myGridSearchCV(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    classifier=KNeighborsClassifier(),
    params=grid_params,
)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: KNeighborsClassifier, Params: {'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'distance'}


In [86]:
# Append the scores unpacked from the most recent myGridSearchCV call.
# These names are overwritten by every search, so this cell must run right after it.
table.add_row([algoName, accuracy, precision, recall, f1])

In [87]:
# Hyperparameter grid searched for the random forest.
# This rebinds `param_grid`, replacing the grid previously stored under that name.
param_grid = dict(
    n_estimators=[25, 50, 100, 150],
    max_features=['sqrt', 'log2', None],
    max_depth=[3, 6, 9],
    max_leaf_nodes=[3, 6, 9],
)

In [88]:
# Tune the random forest. The estimator is seeded so that the selected
# hyperparameters and the reported scores are reproducible across runs.
algoName, best_params, accuracy, precision, recall, f1 = myGridSearchCV(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    classifier=RandomForestClassifier(random_state=1),
    params=param_grid,
)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: RandomForestClassifier, Params: {'max_depth': 6, 'max_features': 'log2', 'max_leaf_nodes': 9, 'n_estimators': 100}


In [89]:
# Append the scores unpacked from the most recent myGridSearchCV call.
# These names are overwritten by every search, so this cell must run right after it.
table.add_row([algoName, accuracy, precision, recall, f1])

In [90]:
# Naive Bayes baseline: fitted directly with default settings, no grid search.
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# Precision, recall and F1 are all macro-averaged.
precision, recall, f1 = (
    scorer(y_test, y_pred, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

table.add_row([type(model).__name__, accuracy, precision, recall, f1])

In [91]:
# Show the accumulated comparison table as plain text.
print(table)

+------------------------------------------+----------+--------------------+--------------------+--------------------+
| Classifier with the best hyperparameters | Accuracy |     Precision      |       Recall       |      F1_Score      |
+------------------------------------------+----------+--------------------+--------------------+--------------------+
|              MLPClassifier               |  0.767   | 0.7694117201272791 | 0.7656622650561749 | 0.759167966189757  |
|           KNeighborsClassifier           |  0.761   | 0.7744516182441685 | 0.7612940338424302 | 0.7579477874924644 |
|          RandomForestClassifier          |   0.69   | 0.6859005769763701 | 0.6899556592478538 | 0.6354385534790379 |
|                GaussianNB                |  0.556   | 0.5788628371304589 | 0.559496772854223  | 0.5256907025966637 |
+------------------------------------------+----------+--------------------+--------------------+--------------------+


# Task 3. With the **breast cancer** dataset

*   3.1. Apply **GridSearchCV** to **MultilayerPerceptron** to find the best hyperparameters (the hyperparameter settings are chosen by the students)

In [92]:
# Hyperparameter grid searched for the MLP.
param_grid = dict(
    hidden_layer_sizes=[(150, 100, 50), (120, 80, 40), (100, 50, 30)],
    max_iter=[50, 100, 150],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
)

In [93]:
# Load the breast cancer data as pandas objects and hold out 20% for testing.
cancer = load_breast_cancer(as_frame=True)
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [94]:
# Task 3.1: tune the MLP; keyword arguments make the train/test order explicit.
algoName, best_params, accuracy, precision, recall, f1 = myGridSearchCV(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    classifier=MLPClassifier(random_state=1),
    params=param_grid,
)
print(f"Classifier: {algoName}, Params: {best_params}")


Classifier: MLPClassifier, Params: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (150, 100, 50), 'learning_rate': 'constant', 'max_iter': 50, 'solver': 'adam'}


In [95]:
# Start a new comparison table with the scores unpacked from the latest grid search.
table = PrettyTable(
    field_names=[
        "Classifier with the best hyperparameters",
        "Accuracy",
        "Precision",
        "Recall",
        "F1_Score",
    ]
)
table.add_row([algoName, accuracy, precision, recall, f1])

*   3.2. Compare the **MultilayerPerceptron** using the best hyperparameters from 3.1 with other classification algorithms (i.e., Random forest, kNN, Naïve Bayes) in terms of accuracy, precision, recall, and F1

In [96]:
def evaluate_model(classifier, X_train, X_test, y_train, y_test):
    """Fit a classifier and return one table row of its test metrics.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict``; fitted in place.
    X_train, y_train : training data. ``y_train`` is flattened to 1-D first.
    X_test, y_test : data the fitted classifier is scored on.

    Returns
    -------
    list
        ``[class name, accuracy, precision, recall, f1]`` with precision,
        recall and F1 macro-averaged.
    """
    # np.ravel accepts pandas objects, NumPy arrays and plain lists alike,
    # whereas `.values` exists only on pandas objects.
    classifier.fit(X_train, np.ravel(y_train))
    y_pred = classifier.predict(X_test)

    ac = metrics.accuracy_score(y_test, y_pred)
    pre = metrics.precision_score(y_test, y_pred, average="macro")
    recall = metrics.recall_score(y_test, y_pred, average="macro")
    f1 = metrics.f1_score(y_test, y_pred, average="macro")

    return [classifier.__class__.__name__, ac, pre, recall, f1]

In [97]:
# kNN with fixed hyperparameters.
# NOTE(review): no search producing these values is shown in this notebook; confirm their origin.
table.add_row(
    evaluate_model(
        classifier=KNeighborsClassifier(n_neighbors=13, metric='manhattan', weights='uniform'),
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )
)

In [98]:
# Random forest with fixed hyperparameters, seeded so the reported row is reproducible.
# NOTE(review): no search producing these values is shown in this notebook; confirm their origin.
table.add_row(
    evaluate_model(
        classifier=RandomForestClassifier(
            max_depth=6,
            max_features=None,
            max_leaf_nodes=9,
            n_estimators=50,
            random_state=1,
        ),
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )
)

In [99]:
# Naive Bayes baseline with default settings.
table.add_row(
    evaluate_model(
        classifier=GaussianNB(),
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )
)

In [100]:
# Show the accumulated comparison table as plain text.
print(table)

+------------------------------------------+--------------------+--------------------+--------------------+--------------------+
| Classifier with the best hyperparameters |      Accuracy      |     Precision      |       Recall       |      F1_Score      |
+------------------------------------------+--------------------+--------------------+--------------------+--------------------+
|              MLPClassifier               | 0.9298245614035088 | 0.9412393162393162 | 0.9097222222222223 | 0.9220512820512821 |
|           KNeighborsClassifier           | 0.9298245614035088 | 0.9412393162393162 | 0.9097222222222223 | 0.9220512820512821 |
|          RandomForestClassifier          | 0.9649122807017544 | 0.9736842105263157 | 0.9523809523809523 | 0.9614864864864865 |
|                GaussianNB                | 0.9473684210526315 | 0.9479729729729729 | 0.9384920634920635 | 0.942866688940862  |
+------------------------------------------+--------------------+--------------------+-----------

# Task 4. With the **mobile price classification** dataset


*   4.1. Build your own Neural Network using **MultilayerPerceptron**  



In [101]:
# Read the mobile data; 'price_range' is the target column. Hold out 20% for testing.
mobile = pd.read_csv('../data/mobile.csv')
X, y = mobile.drop(columns=['price_range']), mobile['price_range']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [102]:
# Task 4.1, first network: hidden layers of 250 and 100 neurons. `clf` receives the results table.
clf = my_MLPClassifier(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    h_layer_szs=(250, 100),
)
print(clf)

+----------------------+--------------------+----------+--------------------+--------------------+--------------------+
|       AlgoName       | Hidden layer sizes | Accuracy |     Precision      |       Recall       |      F1_Score      |
+----------------------+--------------------+----------+--------------------+--------------------+--------------------+
| MultilayerPerceptron |     (250, 100)     |  0.6025  | 0.6158269016067182 | 0.6048588323762647 | 0.6092813569733209 |
+----------------------+--------------------+----------+--------------------+--------------------+--------------------+


In [103]:
# Task 4.1, second network: hidden layers of 150, 100 and 50 neurons. `clf` receives the results table.
clf = my_MLPClassifier(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    h_layer_szs=(150, 100, 50),
)
print(clf)

+----------------------+--------------------+----------+---------------------+--------------------+--------------------+
|       AlgoName       | Hidden layer sizes | Accuracy |      Precision      |       Recall       |      F1_Score      |
+----------------------+--------------------+----------+---------------------+--------------------+--------------------+
| MultilayerPerceptron |   (150, 100, 50)   |  0.3825  | 0.30566557778685177 | 0.3736242138364779 | 0.3223608363609721 |
+----------------------+--------------------+----------+---------------------+--------------------+--------------------+


*   4.2. Apply **GridSearchCV** to **MultilayerPerceptron** to find the best hyperparameters (the hyperparameter settings are chosen by the students)

In [104]:
# Hyperparameter grid searched for the MLP.
param_grid = dict(
    hidden_layer_sizes=[(150, 100, 50), (120, 80, 40), (100, 50, 30)],
    max_iter=[50, 100, 150],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
)

In [105]:
# Task 4.2: tune the MLP; keyword arguments make the train/test order explicit.
# Only the best parameters are printed; the unpacked test scores are not displayed here.
algoName, best_params, accuracy, precision, recall, f1 = myGridSearchCV(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    classifier=MLPClassifier(random_state=1),
    params=param_grid,
)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: MLPClassifier, Params: {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (120, 80, 40), 'learning_rate': 'constant', 'max_iter': 100, 'solver': 'adam'}


# Finally,
Save a copy in your GitHub repository. Remember to rename the notebook.