# Machine Learning Project

##### Creating Regression Models for data available on https://archive.ics.uci.edu/ml/datasets/Computer+Hardware

##### Analysing and comparing with the values published and the estimated in the dataset


In [137]:
#Importing the Modules used here for the primary use
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#### Importing the dataset used

In [138]:
data=pd.read_csv('dataset/machine.data',names=['vendor_name','model_name','MYCT','MMIN','MMAX','CACH',
                                      'CHMIN','CHMAX','PRP','ERP'])
data_trn=data.copy()
data.head()

Unnamed: 0,vendor_name,model_name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132


In [139]:
'''Encondind the label vendor_name for optimization purposes'''
transformer=preprocessing.LabelEncoder()
vendor_name=transformer.fit_transform(data_trn.vendor_name)
data_trn.vendor_name=vendor_name
data_trn.head(3)

Unnamed: 0,vendor_name,model_name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,0,32/60,125,256,6000,256,16,128,198,199
1,1,470v/7,29,8000,32000,32,8,32,269,253
2,1,470v/7a,29,8000,32000,32,8,32,220,253


### Splitting the dataset

In [140]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

X=data_trn[['vendor_name','MYCT','MMIN','MMAX','CACH','CHMIN','CHMAX']]#Features
y=data_trn['PRP']#Label
#Splitting the data_set
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.15)

#Normalizated features for future models
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### First Model: KNeighborsRegressor
###### For this Model we'll use the KN Regression, without normalization to see how this affects the model

In [141]:
from sklearn.neighbors import KNeighborsRegressor
#finding the best parameters to build the model 
for i in range(1,10):
    knnreg=KNeighborsRegressor(n_neighbors=i).fit(X_train,y_train)
    print(f'For this number of neighbors {i}, this is the score of the model: {knnreg.score(X_test,y_test)}')

For this number of neighbors 1, this is the score of the model: 0.9319133094864283
For this number of neighbors 2, this is the score of the model: 0.9444422413932635
For this number of neighbors 3, this is the score of the model: 0.959928027666726
For this number of neighbors 4, this is the score of the model: 0.9455338644934691
For this number of neighbors 5, this is the score of the model: 0.9093684023113154
For this number of neighbors 6, this is the score of the model: 0.8863751625312248
For this number of neighbors 7, this is the score of the model: 0.8708104765626368
For this number of neighbors 8, this is the score of the model: 0.8402558399964392
For this number of neighbors 9, this is the score of the model: 0.7999883702876436


In [142]:
#Considerating the feature above, we'll use 3 as the number of neighbors for this model
knnreg=KNeighborsRegressor(n_neighbors=3).fit(X_train,y_train)
x=zip(list(knnreg.predict(X_test)),y_test.tolist())
print('Comparing the prediction against the published value of perfomance: \n',list(x)[1:10])


Comparing the prediction against the published value of perfomance: 
 [(39.333333333333336, 16), (49.0, 63), (324.3333333333333, 318), (68.0, 75), (36.333333333333336, 16), (98.0, 66), (9.666666666666666, 12), (145.66666666666666, 141), (183.66666666666666, 214)]


### Second Model: Linear regression
###### For this Model we'll use the Linear regression (based on the squared error), with and without normalization to see how this affects the model

In [143]:
from sklearn.linear_model import LinearRegression

linreg= LinearRegression().fit(X_train,y_train)

print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))

R-squared score (training): 0.859
R-squared score (test): 0.905


In [144]:
#Using normalization
linreg= LinearRegression().fit(X_train_scaled,y_train)

print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test_scaled, y_test)))

R-squared score (training): 0.859
R-squared score (test): 0.905


In [145]:
linreg= LinearRegression().fit(X_train,y_train)
x=zip(list(abs(linreg.predict(X_test))),y_test.tolist())
print('Comparing the prediction against the published value of perfomance: \n',list(x)[1:10])

Comparing the prediction against the published value of perfomance: 
 [(5.634793336639916, 16), (88.617527673032, 63), (329.4614687399462, 318), (91.76017692143772, 75), (13.954745233956508, 16), (92.85222357620522, 66), (43.78858601717292, 12), (267.6093839013894, 141), (198.64301799478852, 214)]


### Third Model: Ridge regression
###### For this Model we'll use the Ridge regression (based on the squared error of the weights used), with and without normalization to see how this affects the model and variating the regularization of the model for avoid overfitting.

In [146]:
from sklearn.linear_model import Ridge
#values that control the regularization of the model
alpha_values=[1e-3,0.1,1,2,5,10,50,100]
for i in range(0,len(alpha_values)):
    linridge= Ridge(alpha=alpha_values[i]).fit(X_train,y_train)
    print(f'For this value of alpha: {alpha_values[i]}, those are the scores:')
    print('R-squared score (training): {:.3f}'
     .format(linridge.score(X_train, y_train)))
    print('R-squared score (test): {:.3f} \n'
     .format(linridge.score(X_test, y_test)))

For this value of alpha: 0.001, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 0.1, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 1, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 2, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 5, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 10, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 50, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 100, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 



In [147]:
#Using normalization
alpha_values=[1e-3,0.1,1,2,5,10,50,100]
for i in range(0,len(alpha_values)):
    linridge= Ridge(alpha=alpha_values[i]).fit(X_train_scaled,y_train)
    print(f'For this value of alpha: {alpha_values[i]}, those are the scores with normalization:')
    print('R-squared score (training): {:.3f}'
     .format(linridge.score(X_train_scaled, y_train)))
    print('R-squared score (test): {:.3f} \n'
     .format(linridge.score(X_test_scaled, y_test)))

For this value of alpha: 0.001, those are the scores with normalization:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 0.1, those are the scores with normalization:
R-squared score (training): 0.859
R-squared score (test): 0.907 

For this value of alpha: 1, those are the scores with normalization:
R-squared score (training): 0.846
R-squared score (test): 0.910 

For this value of alpha: 2, those are the scores with normalization:
R-squared score (training): 0.827
R-squared score (test): 0.899 

For this value of alpha: 5, those are the scores with normalization:
R-squared score (training): 0.765
R-squared score (test): 0.844 

For this value of alpha: 10, those are the scores with normalization:
R-squared score (training): 0.669
R-squared score (test): 0.745 

For this value of alpha: 50, those are the scores with normalization:
R-squared score (training): 0.311
R-squared score (test): 0.349 

For this value of alpha: 100, those are the sco

In [148]:
#As we saw above, the best value was confirming using normalization and is around 1.0
linridge= Ridge(alpha=1.0).fit(X_train_scaled,y_train)
x=zip(list(abs(linridge.predict(X_test_scaled))),y_test.tolist())
print('Comparing the prediction against the published value of perfomance: \n',list(x)[1:10])

Comparing the prediction against the published value of perfomance: 
 [(2.0693701973432397, 16), (107.79144895858254, 63), (313.65263960223393, 318), (91.639864155838, 75), (7.213240952566601, 16), (96.90001725378589, 66), (13.035056872323935, 12), (256.98068918347934, 141), (198.90363813074646, 214)]


### Fourth Model: Lasso regression
###### For this Model we'll use the Lasso regression (based on the absolute value of the weights used) with and without normalization to see how this affects the model and variating the regularization of the model for avoid overfitting.

In [149]:
from sklearn.linear_model import Lasso

#values that control the regularization of the model
alpha_values=[1e-3,0.1,1,2,5,10,50,100,1000]
for i in range(0,len(alpha_values)):
    linlasso= Lasso(alpha=alpha_values[i],max_iter=10000).fit(X_train,y_train)
    print(f'For this value of alpha: {alpha_values[i]}, those are the scores:')
    print('R-squared score (training): {:.3f}'
     .format(linlasso.score(X_train, y_train)))
    print('R-squared score (test): {:.3f} \n'
     .format(linlasso.score(X_test, y_test)))

For this value of alpha: 0.001, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 0.1, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 1, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 2, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 5, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.906 

For this value of alpha: 10, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.908 

For this value of alpha: 50, those are the scores:
R-squared score (training): 0.858
R-squared score (test): 0.911 

For this value of alpha: 100, those are the scores:
R-squared score (training): 0.858
R-squared score (test): 0.912 

For this value of alpha: 1000, those are the scores:
R-squared

In [150]:
#using normalization
alpha_values=[1e-3,0.1,1,2,5,10,50,100,1000]
for i in range(0,len(alpha_values)):
    linlasso= Lasso(alpha=alpha_values[i],max_iter=10000).fit(X_train_scaled,y_train)
    print(f'For this value of alpha: {alpha_values[i]}, those are the scores:')
    print('R-squared score (training): {:.3f}'
     .format(linlasso.score(X_train_scaled, y_train)))
    print('R-squared score (test): {:.3f} \n'
     .format(linlasso.score(X_test_scaled, y_test)))

For this value of alpha: 0.001, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 0.1, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.909 

For this value of alpha: 1, those are the scores:
R-squared score (training): 0.850
R-squared score (test): 0.910 

For this value of alpha: 2, those are the scores:
R-squared score (training): 0.835
R-squared score (test): 0.886 

For this value of alpha: 5, those are the scores:
R-squared score (training): 0.762
R-squared score (test): 0.769 

For this value of alpha: 10, those are the scores:
R-squared score (training): 0.639
R-squared score (test): 0.641 

For this value of alpha: 50, those are the scores:
R-squared score (training): 0.000
R-squared score (test): -0.001 

For this value of alpha: 100, those are the scores:
R-squared score (training): 0.000
R-squared score (test): -0.001 

For this value of alpha: 1000, those are the scores:
R-squar

In [151]:
#As we saw above, the best way to build a model usind Lasso for this dataset is use Normalization
#for memorie usage and processing purposes, and takes alpha around the value of 1.0
linlasso= Ridge(alpha=1.0,max_iter=10000).fit(X_train_scaled,y_train)
x=zip(list(abs(linlasso.predict(X_test_scaled))),y_test.tolist())
print('Comparing the prediction against the published value of perfomance: \n',list(x)[1:10])

Comparing the prediction against the published value of perfomance: 
 [(2.0693701973432397, 16), (107.79144895858254, 63), (313.65263960223393, 318), (91.639864155838, 75), (7.213240952566601, 16), (96.90001725378589, 66), (13.035056872323935, 12), (256.98068918347934, 141), (198.90363813074646, 214)]


### Polynomial Features for Linear Regression
###### It's Possible to add some polynomial features to a linear regression. Basically the number of variables or features will order the degree of polymonial expression.
###### We can combine this features with normal regressions as Linear,Ridge and Lasso.

In [152]:
from sklearn.preprocessing import PolynomialFeatures

In [153]:
for i in range(1,8):
    poly=PolynomialFeatures(degree=i)
    #splitting the data with the addition of polynomial features
    X_train_polynomial=poly.fit_transform(X)
    X_train,X_test,y_train,y_test=train_test_split(X_train_polynomial,y,
                                                   random_state=0,test_size=0.15)
    #Normalizated features for future models
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    #creating the models again
    knnreg=KNeighborsRegressor(n_neighbors=3).fit(X_train,y_train)
    linreg= LinearRegression().fit(X_train,y_train)
    linridge= Ridge(alpha=1.0).fit(X_train_scaled,y_train)
    linlasso= Ridge(alpha=1.0,max_iter=10000).fit(X_train_scaled,y_train)
    
    print(f'For the degree "{i}" of the polynomial those are the scores for the models:\n')
    
    print(f'KN regression:    \t (training):{knnreg.score(X_train,y_train):.3f}\t(test):{knnreg.score(X_test,y_test):.3f}')
    
    print(f'Linear regression:\t (training):{linreg.score(X_train,y_train):.3f}\t(test):{linreg.score(X_test,y_test):.3f}')
    
    print(f'Ridge regression:\t (training):{linridge.score(X_train_scaled,y_train):.3f}\t(test):{linridge.score(X_test_scaled,y_test):.3f}')
    
    print(f'Lasso regression:\t (training):{linlasso.score(X_train_scaled,y_train):.3f}\t(test):{linlasso.score(X_test_scaled,y_test):.3f}\n')
    

For the degree "1" of the polynomial those are the scores for the models:

KN regression:    	 (training):0.932	(test):0.960
Linear regression:	 (training):0.859	(test):0.905
Ridge regression:	 (training):0.846	(test):0.910
Lasso regression:	 (training):0.846	(test):0.910

For the degree "2" of the polynomial those are the scores for the models:

KN regression:    	 (training):0.932	(test):0.960
Linear regression:	 (training):0.974	(test):0.816
Ridge regression:	 (training):0.946	(test):0.893
Lasso regression:	 (training):0.946	(test):0.893

For the degree "3" of the polynomial those are the scores for the models:

KN regression:    	 (training):0.932	(test):0.960
Linear regression:	 (training):0.980	(test):-14.019
Ridge regression:	 (training):0.960	(test):0.882
Lasso regression:	 (training):0.960	(test):0.882

For the degree "4" of the polynomial those are the scores for the models:

KN regression:    	 (training):0.932	(test):0.960
Linear regression:	 (training):0.997	(test):-3970.1

##### The cell above shows how complex the model are, usually regard to the degree, more it's tends to overfitting and in some cases, to not classify right, as we see in the Linear Regression model. 

In [154]:
poly=PolynomialFeatures(degree=i)
#splitting the data with the addition of polynomial features
X_train_polynomial=poly.fit_transform(X)
X_train,X_test,y_train,y_test=train_test_split(X_train_polynomial,y,
                                               random_state=0,test_size=0.15)
#Normalizated features for future models
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#creating the models again
linridge= Ridge(alpha=1.0).fit(X_train_scaled,y_train)
linlasso= Ridge(alpha=1.0,max_iter=10000).fit(X_train_scaled,y_train)

x=zip(linridge.predict(X_test_scaled),linlasso.predict(X_test_scaled),y_test.tolist())
print('Comparing the prediction[Ridge and Lasso] against the published value of perfomance: \n \n',list(x)[1:10])

Comparing the prediction[Ridge and Lasso] against the published value of perfomance: 
 
 [(27.56156819616463, 27.56156819616463, 16), (103.12885624632241, 103.12885624632241, 63), (229.29990425515183, 229.29990425515183, 318), (82.72720282129607, 82.72720282129607, 75), (25.005919025838416, 25.005919025838416, 16), (79.40490277223498, 79.40490277223498, 66), (8.039566100948413, 8.039566100948413, 12), (182.60385968116648, 182.60385968116648, 141), (155.14093099668125, 155.14093099668125, 214)]


### Neural Networks Regressor
#### Getting a different prediction using Neural Networks

In [155]:
#Splitting the data_set
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)

#Normalizated features for future models
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [156]:
from sklearn.neural_network import MLPRegressor
#Defining the parameters for a better perfomance and data usage
for  thisactivation in ['tanh']:
    for thisalpha in [100,110,125,135,150,200,500,1000]:
        mlpreg = MLPRegressor(hidden_layer_sizes = [50,50],
                             activation = thisactivation,
                             alpha = thisalpha,
                             solver = 'lbfgs',max_iter=5000,random_state=0).fit(X_train_scaled, y_train)
        print(f'For the activation type "{thisactivation}" and setting the alpha regularization parameter as {thisalpha} the scores are:\
        \ntraining:{mlpreg.score(X_train_scaled,y_train)}\
        \t test:{mlpreg.score(X_test_scaled,y_test)} \n')

For the activation type "tanh" and setting the alpha regularization parameter as 100 the scores are:        
training:0.9379326005931539        	 test:0.9341754910322173 

For the activation type "tanh" and setting the alpha regularization parameter as 110 the scores are:        
training:0.931044097774645        	 test:0.9293675680429859 

For the activation type "tanh" and setting the alpha regularization parameter as 125 the scores are:        
training:0.9403445552899986        	 test:0.9266088584193414 

For the activation type "tanh" and setting the alpha regularization parameter as 135 the scores are:        
training:0.9408937438871837        	 test:0.9311781303176222 

For the activation type "tanh" and setting the alpha regularization parameter as 150 the scores are:        
training:0.9356071521705652        	 test:0.9291640807989081 

For the activation type "tanh" and setting the alpha regularization parameter as 200 the scores are:        
training:0.8726680210059334     

In [157]:
#Looking to the results above,we'll use the parameter alpha as 100, to look to the results
mlpreg = MLPRegressor(hidden_layer_sizes = [50,50],
                             activation = 'tanh',
                             alpha = 100,
                             solver = 'lbfgs',max_iter=5000,random_state=0).fit(X_train_scaled, y_train)

x=zip(mlpreg.predict(X_test_scaled),y_test.tolist())
print('Comparing the prediction[Neura Network Regressor] against the published value of perfomance: \n \n',list(x)[1:10])

Comparing the prediction[Neura Network Regressor] against the published value of perfomance: 
 
 [(38.14779644958861, 16), (58.224130536113364, 63), (268.3595863289079, 318), (58.01905744201565, 75), (37.91415948744907, 16), (63.211961307260935, 66), (36.45713264005627, 12), (192.03110864070942, 141), (177.17096173365934, 214)]


# Comparing models
#### Defined all the models, we'll try to compare all the predictions to the published and estimated perfomance of the computers 

In [187]:
#Feeding all the models with the entire dataset to estimate the result

#Splitting the data_set
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.15)

#Normalizated features for future models
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Creating a frame to facilitating the visualization
comparison=pd.DataFrame()

#Normalizating the X 
X_scaled = scaler.fit_transform(X)

#evocating the models
linreg= LinearRegression().fit(X_train,y_train)
linear=[int(x) for x in linreg.predict(X)]

knnreg=KNeighborsRegressor(n_neighbors=3).fit(X_train,y_train)
kneighbor=[int(x) for x in knnreg.predict(X)]

linridge= Ridge(alpha=1.0).fit(X_train_scaled,y_train)
ridge=[int(x) for x in linridge.predict(X_scaled)]

linlasso= Ridge(alpha=1.0,max_iter=10000).fit(X_train_scaled,y_train)
lasso=[int(x) for x in linlasso.predict(X_scaled)]

mlpreg = MLPRegressor(hidden_layer_sizes = [50,50],
                             activation = 'tanh',
                             alpha = 100,
                             solver = 'lbfgs',max_iter=5000,random_state=0).fit(X_train_scaled, y_train)
neural=[int(x) for x in mlpreg.predict(X_scaled)]

comparison['Published']=data['PRP']
comparison['Estimated']=data['ERP']
comparison['Linear Regression']=linear
comparison['KNeighbors Regression']=kneighbor
comparison['Ridge Regression']=ridge
comparison['Lasso Regression']=lasso
comparison['Neural Network Regression']=neural
comparison

Unnamed: 0,Published,Estimated,Linear Regression,KNeighbors Regression,Ridge Regression,Lasso Regression,Neural Network Regression
0,198,199,343,84,338,338,196
1,269,253,306,220,291,291,218
2,220,253,306,220,291,291,218
3,172,253,306,220,291,291,218
4,132,132,188,160,178,178,125
...,...,...,...,...,...,...,...
204,42,37,24,28,30,30,41
205,46,50,44,24,52,52,65
206,52,41,48,53,47,47,51
207,67,47,49,43,47,47,64
