# Machine Learning Project

##### Creating Regression Models for data available on https://archive.ics.uci.edu/ml/datasets/Computer+Hardware

In [35]:
#Importing the Modules used here for the primary use
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#### Importing the dataset used

In [6]:
data=pd.read_csv('machine.data',names=['vendor_name','model_name','MYCT','MMIN','MMAX','CACH',
                                      'CHMIN','CHMAX','PRP','ERP'])
data_trn=data.copy()
data.head()

Unnamed: 0,vendor_name,model_name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132


In [10]:
'''Encondind the label vendor_name for optimization purposes'''
transformer=preprocessing.LabelEncoder()
vendor_name=transformer.fit_transform(data_trn.vendor_name)
data_trn.vendor_name=vendor_name
data_trn.head(3)

Unnamed: 0,vendor_name,model_name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,0,32/60,125,256,6000,256,16,128,198,199
1,1,470v/7,29,8000,32000,32,8,32,269,253
2,1,470v/7a,29,8000,32000,32,8,32,220,253


### Splitting the dataset

In [40]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

X=data_trn[['vendor_name','MYCT','MMIN','MMAX','CACH','CHMIN','CHMAX']]#Features
y=data_trn['PRP']#Label
#Splitting the data_set
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.15)

#Normalizated features for future models
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### First Model: KNeighborsRegressor
###### For this Model we'll use the KN Regression, without normalization to see how this affects the model

In [41]:
from sklearn.neighbors import KNeighborsRegressor
#finding the best parameters to build the model 
for i in range(1,10):
    knnreg=KNeighborsRegressor(n_neighbors=i).fit(X_train,y_train)
    print(f'For this number of neighbors {i}, this is the score of the model: {knnreg.score(X_test,y_test)}')

For this number of neighbors 1, this is the score of the model: 0.9319133094864283
For this number of neighbors 2, this is the score of the model: 0.9444422413932635
For this number of neighbors 3, this is the score of the model: 0.959928027666726
For this number of neighbors 4, this is the score of the model: 0.9455338644934691
For this number of neighbors 5, this is the score of the model: 0.9093684023113154
For this number of neighbors 6, this is the score of the model: 0.8863751625312248
For this number of neighbors 7, this is the score of the model: 0.8708104765626368
For this number of neighbors 8, this is the score of the model: 0.8402558399964392
For this number of neighbors 9, this is the score of the model: 0.7999883702876436


In [42]:
#Considerating the feature above, we'll use 3 as the number of neighbors for this model
knnreg=KNeighborsRegressor(n_neighbors=3).fit(X_train,y_train)
x=zip(list(knnreg.predict(X_test)),y_test.tolist())
print('Comparing the prediction against the published value of perfomance: \n',list(x)[1:10])


Comparing the prediction against the published value of perfomance: 
 [(39.333333333333336, 16), (49.0, 63), (324.3333333333333, 318), (68.0, 75), (36.333333333333336, 16), (98.0, 66), (9.666666666666666, 12), (145.66666666666666, 141), (183.66666666666666, 214)]


#### Second Model: Linear regression
###### For this Model we'll use the Linear regression (based on the squared error), with and without normalization to see how this affects the model

In [45]:
from sklearn.linear_model import LinearRegression

linreg= LinearRegression().fit(X_train,y_train)

print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))

R-squared score (training): 0.859
R-squared score (test): 0.905


In [46]:
#Using normalization
linreg= LinearRegression().fit(X_train_scaled,y_train)

print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test_scaled, y_test)))

R-squared score (training): 0.859
R-squared score (test): 0.905


In [48]:
linreg= LinearRegression().fit(X_train,y_train)
x=zip(list(abs(linreg.predict(X_test))),y_test.tolist())
print('Comparing the prediction against the published value of perfomance: \n',list(x)[1:10])

Comparing the prediction against the published value of perfomance: 
 [(5.634793336639916, 16), (88.617527673032, 63), (329.4614687399462, 318), (91.76017692143772, 75), (13.954745233956508, 16), (92.85222357620522, 66), (43.78858601717292, 12), (267.6093839013894, 141), (198.64301799478852, 214)]


#### Third Model: Ridge regression
###### For this Model we'll use the Ridge regression (based on the squared error of the weights used), with and without normalization to see how this affects the model and variating the regularization of the model for avoid overfitting.

In [57]:
from sklearn.linear_model import Ridge
#values that control the regularization of the model
alpha_values=[1e-3,0.1,1,2,5,10,50,100]
for i in range(0,len(alpha_values)):
    linridge= Ridge(alpha=alpha_values[i]).fit(X_train,y_train)
    print(f'For this value of alpha: {alpha_values[i]}, those are the scores:')
    print('R-squared score (training): {:.3f}'
     .format(linridge.score(X_train, y_train)))
    print('R-squared score (test): {:.3f} \n'
     .format(linridge.score(X_test, y_test)))

For this value of alpha: 0.001, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 0.1, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 1, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 2, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 5, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 10, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 50, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 100, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 



In [59]:
#Using normalization
alpha_values=[1e-3,0.1,1,2,5,10,50,100]
for i in range(0,len(alpha_values)):
    linridge= Ridge(alpha=alpha_values[i]).fit(X_train_scaled,y_train)
    print(f'For this value of alpha: {alpha_values[i]}, those are the scores with normalization:')
    print('R-squared score (training): {:.3f}'
     .format(linridge.score(X_train_scaled, y_train)))
    print('R-squared score (test): {:.3f} \n'
     .format(linridge.score(X_test_scaled, y_test)))

For this value of alpha: 0.001, those are the scores with normalization:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 0.1, those are the scores with normalization:
R-squared score (training): 0.859
R-squared score (test): 0.907 

For this value of alpha: 1, those are the scores with normalization:
R-squared score (training): 0.846
R-squared score (test): 0.910 

For this value of alpha: 2, those are the scores with normalization:
R-squared score (training): 0.827
R-squared score (test): 0.899 

For this value of alpha: 5, those are the scores with normalization:
R-squared score (training): 0.765
R-squared score (test): 0.844 

For this value of alpha: 10, those are the scores with normalization:
R-squared score (training): 0.669
R-squared score (test): 0.745 

For this value of alpha: 50, those are the scores with normalization:
R-squared score (training): 0.311
R-squared score (test): 0.349 

For this value of alpha: 100, those are the sco

In [61]:
#As we saw above, the best value was confirming using normalization and is around 1.0
linridge= Ridge(alpha=1.0).fit(X_train_scaled,y_train)
x=zip(list(abs(linridge.predict(X_test_scaled))),y_test.tolist())
print('Comparing the prediction against the published value of perfomance: \n',list(x)[1:10])

Comparing the prediction against the published value of perfomance: 
 [(2.0693701973432397, 16), (107.79144895858254, 63), (313.65263960223393, 318), (91.639864155838, 75), (7.213240952566601, 16), (96.90001725378589, 66), (13.035056872323935, 12), (256.98068918347934, 141), (198.90363813074646, 214)]


#### Third Model: Lasso regression
###### For this Model we'll use the Lasso regression (based on the absolute value of the weights used) with and without normalization to see how this affects the model and variating the regularization of the model for avoid overfitting.

In [69]:
from sklearn.linear_model import Lasso

#values that control the regularization of the model
alpha_values=[1e-3,0.1,1,2,5,10,50,100,1000]
for i in range(0,len(alpha_values)):
    linlasso= Lasso(alpha=alpha_values[i],max_iter=10000).fit(X_train,y_train)
    print(f'For this value of alpha: {alpha_values[i]}, those are the scores:')
    print('R-squared score (training): {:.3f}'
     .format(linlasso.score(X_train, y_train)))
    print('R-squared score (test): {:.3f} \n'
     .format(linlasso.score(X_test, y_test)))

For this value of alpha: 0.001, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 0.1, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 1, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 2, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 5, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.906 

For this value of alpha: 10, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.908 

For this value of alpha: 50, those are the scores:
R-squared score (training): 0.858
R-squared score (test): 0.911 

For this value of alpha: 100, those are the scores:
R-squared score (training): 0.858
R-squared score (test): 0.912 

For this value of alpha: 1000, those are the scores:
R-squared

In [70]:
#using normalization
alpha_values=[1e-3,0.1,1,2,5,10,50,100,1000]
for i in range(0,len(alpha_values)):
    linlasso= Lasso(alpha=alpha_values[i],max_iter=10000).fit(X_train_scaled,y_train)
    print(f'For this value of alpha: {alpha_values[i]}, those are the scores:')
    print('R-squared score (training): {:.3f}'
     .format(linlasso.score(X_train_scaled, y_train)))
    print('R-squared score (test): {:.3f} \n'
     .format(linlasso.score(X_test_scaled, y_test)))

For this value of alpha: 0.001, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.905 

For this value of alpha: 0.1, those are the scores:
R-squared score (training): 0.859
R-squared score (test): 0.909 

For this value of alpha: 1, those are the scores:
R-squared score (training): 0.850
R-squared score (test): 0.910 

For this value of alpha: 2, those are the scores:
R-squared score (training): 0.835
R-squared score (test): 0.886 

For this value of alpha: 5, those are the scores:
R-squared score (training): 0.762
R-squared score (test): 0.769 

For this value of alpha: 10, those are the scores:
R-squared score (training): 0.639
R-squared score (test): 0.641 

For this value of alpha: 50, those are the scores:
R-squared score (training): 0.000
R-squared score (test): -0.001 

For this value of alpha: 100, those are the scores:
R-squared score (training): 0.000
R-squared score (test): -0.001 

For this value of alpha: 1000, those are the scores:
R-squar

In [71]:
#As we saw above, the best way to build a model usind Lasso for this dataset is use Normalization
#for memorie usage and processing purposes, and takes alpha around the value of 1.0
linlasso= Ridge(alpha=1.0,max_iter=10000).fit(X_train_scaled,y_train)
x=zip(list(abs(linlasso.predict(X_test_scaled))),y_test.tolist())
print('Comparing the prediction against the published value of perfomance: \n',list(x)[1:10])

Comparing the prediction against the published value of perfomance: 
 [(2.0693701973432397, 16), (107.79144895858254, 63), (313.65263960223393, 318), (91.639864155838, 75), (7.213240952566601, 16), (96.90001725378589, 66), (13.035056872323935, 12), (256.98068918347934, 141), (198.90363813074646, 214)]
