# Lab7: Neural Networks

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

!pip install dmba
from dmba import regressionSummary


Colab environment detected.


## 1. Preprocessing

In [2]:
# List of columns to be processed
all_columns = ['Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Automatic', 'Doors',
               'Quarterly_Tax', 'Mfr_Guarantee', 'Guarantee_Period', 'Airco',
               'Automatic_airco', 'CD_Player','Powered_Windows', 'Sport_Model',
               'Tow_Bar', 'Price']

In [3]:
# Read dataframe considering only the specified columns
toyota = pd.read_csv('ToyotaCorolla.csv', usecols=all_columns)
toyota.head()

Unnamed: 0,Price,Age_08_04,KM,Fuel_Type,HP,Automatic,Doors,Quarterly_Tax,Mfr_Guarantee,Guarantee_Period,Airco,Automatic_airco,CD_Player,Powered_Windows,Sport_Model,Tow_Bar
0,13500,23,46986,Diesel,90,0,3,210,0,3,0,0,0,1,0,0
1,13750,23,72937,Diesel,90,0,3,210,0,3,1,0,1,0,0,0
2,13950,24,41711,Diesel,90,0,3,210,1,3,0,0,0,0,0,0
3,14950,26,48000,Diesel,90,0,3,210,1,3,0,0,0,0,0,0
4,13750,30,38500,Diesel,90,0,3,210,1,3,1,0,0,1,0,0


In [4]:
toyota.describe()

Unnamed: 0,Price,Age_08_04,KM,HP,Automatic,Doors,Quarterly_Tax,Mfr_Guarantee,Guarantee_Period,Airco,Automatic_airco,CD_Player,Powered_Windows,Sport_Model,Tow_Bar
count,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0
mean,10730.824513,55.947075,68533.259749,101.502089,0.05571,4.033426,87.122563,0.409471,3.81546,0.508357,0.056407,0.218663,0.561978,0.300139,0.277855
std,3626.964585,18.599988,37506.448872,14.98108,0.229441,0.952677,41.128611,0.491907,3.011025,0.500104,0.230786,0.413483,0.496317,0.458478,0.448098
min,4350.0,1.0,1.0,69.0,0.0,2.0,19.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8450.0,44.0,43000.0,90.0,0.0,3.0,69.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,9900.0,61.0,63389.5,110.0,0.0,4.0,85.0,0.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0
75%,11950.0,70.0,87020.75,110.0,0.0,5.0,85.0,1.0,3.0,1.0,0.0,0.0,1.0,1.0,1.0
max,32500.0,80.0,243000.0,192.0,1.0,5.0,283.0,1.0,36.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Print information about datatypes, nulls and size
toyota.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Price             1436 non-null   int64 
 1   Age_08_04         1436 non-null   int64 
 2   KM                1436 non-null   int64 
 3   Fuel_Type         1436 non-null   object
 4   HP                1436 non-null   int64 
 5   Automatic         1436 non-null   int64 
 6   Doors             1436 non-null   int64 
 7   Quarterly_Tax     1436 non-null   int64 
 8   Mfr_Guarantee     1436 non-null   int64 
 9   Guarantee_Period  1436 non-null   int64 
 10  Airco             1436 non-null   int64 
 11  Automatic_airco   1436 non-null   int64 
 12  CD_Player         1436 non-null   int64 
 13  Powered_Windows   1436 non-null   int64 
 14  Sport_Model       1436 non-null   int64 
 15  Tow_Bar           1436 non-null   int64 
dtypes: int64(15), object(1)
memory usage: 179.6+ KB


In [6]:
# Transform into categorical
toyota['Fuel_Type'] = toyota['Fuel_Type'].astype('category')

In [7]:
# New dataset of dummies
toyota2 = pd.get_dummies(toyota, drop_first=True, dtype='int')

In [8]:
# OPTIONAL: Creates a list of column names for standardization
# I want to rename scaled columns adding a prefix
cols = toyota2.columns.tolist()
cols = ["scale_" + c for c in cols if c!='Price']
cols

['scale_Age_08_04',
 'scale_KM',
 'scale_HP',
 'scale_Automatic',
 'scale_Doors',
 'scale_Quarterly_Tax',
 'scale_Mfr_Guarantee',
 'scale_Guarantee_Period',
 'scale_Airco',
 'scale_Automatic_airco',
 'scale_CD_Player',
 'scale_Powered_Windows',
 'scale_Sport_Model',
 'scale_Tow_Bar',
 'scale_Fuel_Type_Diesel',
 'scale_Fuel_Type_Petrol']

In [9]:
# Scale data excuding the target variable 'Price'
scaler = MinMaxScaler()

# Data Partition
outcome='Price'
predictors=[c for c in toyota2.columns if c!=outcome]

X=pd.DataFrame(scaler.fit_transform(toyota2[predictors]), columns=cols)
y=toyota2[outcome]
train_X, valid_X, train_y, valid_y = train_test_split(X,y,test_size=0.4,random_state=1)

## 2. Fitting the model

In [10]:
# Created and fit model
reg = MLPRegressor(random_state=1, activation='relu', solver='lbfgs',
                    hidden_layer_sizes=2, max_iter=1000)
reg.fit(train_X,train_y)

## 3. RMSE

In [11]:
# print performance measures
regressionSummary(train_y, reg.predict(train_X))
regressionSummary(valid_y, reg.predict(valid_X))


Regression statistics

                      Mean Error (ME) : -0.3666
       Root Mean Squared Error (RMSE) : 1169.1859
            Mean Absolute Error (MAE) : 875.4044
          Mean Percentage Error (MPE) : -1.0926
Mean Absolute Percentage Error (MAPE) : 8.6187

Regression statistics

                      Mean Error (ME) : 9.3591
       Root Mean Squared Error (RMSE) : 1132.6952
            Mean Absolute Error (MAE) : 906.0641
          Mean Percentage Error (MPE) : -1.0638
Mean Absolute Percentage Error (MAPE) : 9.2774


## 4. Grid Search

In [12]:
param_grid={'hidden_layer_sizes':[1,2,3,4,5,6,7,8]}

In [13]:
reg2=GridSearchCV(
    MLPRegressor(random_state=1, activation='relu', solver='lbfgs', max_iter=1000),
    param_grid=param_grid, cv=5
    )
reg2.fit(train_X,train_y)

In [14]:
reg2.best_params_

{'hidden_layer_sizes': 8}

The best number of hidden layers is: 8