In [1]:
import numpy as np
import pandas as pd
from copy import copy
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib
import warnings
warnings.filterwarnings("ignore")


In [2]:
#Importing regression.py class
from Regression import Regression

In [3]:
#Importing data frame
df = pd.read_csv("Dataset.data",delimiter=" ",header=None)
df 


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [4]:
#One hot encoding implementation, So, that all attributes have numeric values
features=df.columns
one_hot_features=pd.get_dummies(df)
one_hot_features

Unnamed: 0,1,2,3,4,5,6,7,8,0_F,0_I,0_M
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,0,0,1
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,0,0,1
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,1,0,0
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,0,0,1
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,1,0,0
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,0,0,1
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,0,0,1
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,1,0,0


In [5]:
#Mean Square Error Calculating Function
def MSE(y_actual,y_predicted):
      y_actual = y_actual.to_numpy()
      difference = y_actual - y_predicted
      difference = np.square(difference)
      mse = np.average(difference,axis=0)

      return mse

In [6]:
#Generic Function which will give n-folds of the data
def split_n_folds(df,n):
  splitDataframe=[]
  split = int(np.ceil(df.shape[0]/n))
  for x in np.arange(0,df.shape[0],split):
    a=x+split
    splitDataframe.append(df[x:a])
  return splitDataframe

In [7]:
#Call to n-fold function to divide it in 5-folds
data_list = split_n_folds(one_hot_features,5)
data_list

[         1      2      3       4       5       6       7   8  0_F  0_I  0_M
 0    0.455  0.365  0.095  0.5140  0.2245  0.1010  0.1500  15    0    0    1
 1    0.350  0.265  0.090  0.2255  0.0995  0.0485  0.0700   7    0    0    1
 2    0.530  0.420  0.135  0.6770  0.2565  0.1415  0.2100   9    1    0    0
 3    0.440  0.365  0.125  0.5160  0.2155  0.1140  0.1550  10    0    0    1
 4    0.330  0.255  0.080  0.2050  0.0895  0.0395  0.0550   7    0    1    0
 ..     ...    ...    ...     ...     ...     ...     ...  ..  ...  ...  ...
 831  0.425  0.325  0.100  0.3980  0.1185  0.0645  0.0945   6    0    1    0
 832  0.440  0.365  0.115  0.5010  0.2435  0.0840  0.1465   9    0    1    0
 833  0.445  0.335  0.100  0.4895  0.2745  0.0860  0.1105   7    0    1    0
 834  0.445  0.325  0.100  0.3780  0.1795  0.1000  0.0890   7    0    1    0
 835  0.450  0.350  0.130  0.5470  0.2450  0.1405  0.1405   8    0    1    0
 
 [836 rows x 11 columns],
           1      2      3       4       5      

Making the model and fitting and then calculating mse. Saving foldwise models using joblib as well

In [29]:
#List to store train and validation mse values for each folds
n_fold_train = []
n_fold_test = []
i =0
#Loop to implement functions of regression class on each 5-folds
for x in range(len(data_list)):
  #Making regression class object
  reg = Regression()

  #Making copy of the data. So, that in every fold main data is not altered
  data = copy(data_list)

  #Taking the data foldwise for implementation
  test = data.pop(x)
  train = pd.concat(data)
  train_X=train[['0_F','0_I','0_M',1,2,3,4,5,6,7]]
  train_y= train[8]
  test_X=test[['0_F','0_I','0_M',1,2,3,4,5,6,7]]
  test_y= test[8]

  #Fitting the model on training data
  reg.fit(train_X,train_y)

  #Saving the model foldwise
  name = "model"+str(i)+".sav"
  filename = name
  joblib.dump(reg,filename)

  #Calling predict function on training and validation data 
  trainy_predicted = reg.predict(train_X)
  testy_predicted = reg.predict(test_X)

  #Calculatin mse on both training and validation data
  Train_mse = MSE(train_y,trainy_predicted)
  Test_mse = MSE(test_y,testy_predicted)

  #Appending foldwise mse values of training and validation data 
  n_fold_train.append(Train_mse)
  n_fold_test.append(Test_mse)
  i = i+1

#Printing the list which contains foldwise mse values for training and validation set  
print('Train mse',n_fold_train,'\n','Validation mse',n_fold_test,sep='\t')
  


Train mse	[3.7621029631458547, 5.310196776905305, 4.625097919316821, 5.074881093796767, 5.0323620663875595]	
	Validation mse	[9.77379641008149, 3.019181977048445, 5.849455180921052, 3.8387597188995217, 3.939529327355942]


Using Saved model for mse calculation

In [8]:
#List to store train and validation mse values for each folds
n_fold_train = []
n_fold_test = []
i =0
#Loop to implement functions of regression class on each 5-folds
for x in range(len(data_list)):

  #Making copy of the data. So, that in every fold main data is not altered
  data = copy(data_list)

  #Taking the data foldwise for implementation
  test = data.pop(x)
  train = pd.concat(data)
  train_X=train[['0_F','0_I','0_M',1,2,3,4,5,6,7]]
  train_y= train[8]
  test_X=test[['0_F','0_I','0_M',1,2,3,4,5,6,7]]
  test_y= test[8]

  # Fitting the model on training data
  # reg.fit(train_X,train_y)

  #Using saved the model foldwise
  name = "model"+str(i)+".sav"
  filename = name
  stored_model = joblib.load(filename)

  #Calling predict function on training and validation data 
  trainy_predicted = stored_model.predict(train_X)
  testy_predicted = stored_model.predict(test_X)

  #Calculatin mse on both training and validation data
  Train_mse = MSE(train_y,trainy_predicted)
  Test_mse = MSE(test_y,testy_predicted)

  #Appending foldwise mse values of training and validation data 
  n_fold_train.append(Train_mse)
  n_fold_test.append(Test_mse)
  i = i+1
  

#Printing the list which contains foldwise mse values for training and validation set  
print('Train mse',n_fold_train,'\n','Validation mse',n_fold_test,sep='\t')
  


Train mse	[3.762134947344344, 5.310142665521644, 4.625581743045682, 5.074301542974596, 5.03173857328424]	
	Validation mse	[9.773657764544327, 3.021925857762971, 5.847498259475927, 3.836889823658044, 3.9336617459483794]


In [9]:
#Calculating  mean of training mse values
training_mean = sum(n_fold_train)/len(n_fold_train)

#Calculating mean of validation mse values
validation_mean = sum(n_fold_test)/len(n_fold_test)

#Printing the mean value of training mse and validation mse
print('Training mean',training_mean,'validation mean',validation_mean)

Training mean 4.760779894434101 validation mean 5.28272669027793


Using Saved Model for sklearn mse calculation

In [10]:
#Using sklearn mean_squared_error function and calculating mse for each fold on training and validation set

#List to store mse values foldwise for training set and validation set
train_inbuilt = []
test_inbuilt = []
i=0

#Loop to implement mse functions of sklearn regression class on each 5-folds
for x in range(len(data_list)):

  #Making copy of the data. So, that in every fold main data is not altered
  data = copy(data_list)

  #Taking the data foldwise for implementation
  test = data.pop(x)
  train = pd.concat(data)
  train_X=train[['0_F','0_I','0_M',1,2,3,4,5,6,7]]
  train_y= train[8]
  test_X=test[['0_F','0_I','0_M',1,2,3,4,5,6,7]]
  test_y= test[8]

  #Loading saved models and then using them further
  name = "model"+str(i)+".sav"
  filename = name
  stored_model = joblib.load(filename)

  #Calling predict function on training and validation data 
  basic_predicted_train = stored_model.predict(train_X)
  basic_predicted = stored_model.predict(test_X)

  #Calculatin sklearn mse on both training and validation data
  basic_test_mse = mean_squared_error(test_y, basic_predicted, squared=True)
  basic_train_mse = mean_squared_error(train_y, basic_predicted_train, squared=True)

  #Appending foldwise sklearn mse values of training and validation data 
  train_inbuilt.append(basic_train_mse)
  test_inbuilt.append(basic_test_mse)
  i = i+1

#Printing the list which contains foldwise sklearn mse values for training and validation set  
print('Train mse',train_inbuilt,'\n','Test mse',test_inbuilt,sep='\t')

Train mse	[3.762134947344344, 5.310142665521644, 4.625581743045682, 5.074301542974596, 5.03173857328424]	
	Test mse	[9.773657764544327, 3.021925857762971, 5.847498259475927, 3.836889823658044, 3.9336617459483794]


In [11]:
#Calculating  mean of training sklearn mse values
training_mean_inbuilt = sum(train_inbuilt)/len(train_inbuilt)

#Calculating mean of validation sklearn mse values
validation_mean_inbuilt = sum(test_inbuilt)/len(test_inbuilt)

#Printing the mean value of training sklearn mse and validation mse
print('Training mean',training_mean_inbuilt,'validation mean',validation_mean_inbuilt)

Training mean 4.760779894434101 validation mean 5.28272669027793


C. Normal Equations

In [12]:
#Function to calculate coefficients on given data using normal equation and returning those coefficients
def normal_equation_coefficients(X_data,y_data):
  X_data_numpy = X_data.to_numpy()
  y_data_numpy = y_data.to_numpy()
  product_x_value = np.dot(X_data_numpy.T,X_data_numpy)
  inverse_value = np.linalg.inv(product_x_value)
  product_x_y = np.dot(X_data_numpy.T,y_data_numpy)
  normal_coefficients = np.dot(inverse_value,product_x_y)
  
  return normal_coefficients 

In [13]:
#Function to give predicted values on given data using normal coefficients
def normal_predict(test_data,coeff_normal):
  test_data_numpy = test_data.to_numpy()
  y_normal_predict = np.dot(test_data_numpy,coeff_normal.T)

  return y_normal_predict

In [14]:
#List to store mse values foldwise for training set and validation set  
train_normal_fold=[]
test_normal_fold =[]

#Loop to implement mse function on each 5-folds
for x in range(len(data_list)):

  #making copy of the data
  data = copy(data_list)

  #Data splitting in training and validation set. At every fold values get change
  test = data.pop(x)
  train = pd.concat(data)
  train_X = train[['0_F','0_I','0_M',1,2,3,4,5,6,7]]
  train_y = train[8]
  test_X = test[['0_F','0_I','0_M',1,2,3,4,5,6,7]]
  test_y = test[8]

  #Calculating normal coefficients on training data
  normal_coeff = normal_equation_coefficients(train_X,train_y)

  #Calling normal predict function to get predicted values for training and validation set
  y_normal_predict_train = normal_predict(train_X,normal_coeff)
  y_normal_predict_test = normal_predict(test_X,normal_coeff)

  #Calculating mse on training and validation set
  train_normal_mse = MSE(train_y,y_normal_predict_train)
  test_normal_mse = MSE(test_y,y_normal_predict_test)

  #Appending foldwise values of mse
  train_normal_fold.append(train_normal_mse)
  test_normal_fold.append(test_normal_mse)

#Printing foldwise values of training and validation set  
print("Train mse",train_normal_fold,'\n','Test mse',test_normal_fold)

Train mse [3.7620280521240543, 5.307209308009528, 4.6140454527084085, 5.064279618471935, 5.0294375742481945] 
 Test mse [9.774942685472633, 2.9885931775415915, 5.80649735600861, 3.8227890471281936, 3.9534433229186567]


In [15]:
#Calculating mean on training set
training_mean_normal = sum(train_normal_fold)/len(train_normal_fold)

#Calculating mean on validation set
validation_mean_normal = sum(test_normal_fold)/len(test_normal_fold)

#Printing training and validation mean
print('Training mean',training_mean_normal,'validation mean',validation_mean_normal)

Training mean 4.755400001112425 validation mean 5.269253117813937


D. Sklearn Linear Regression model

Making model and fitting using sklearn regression class and saving those models using joblib

In [25]:
#List to store mse values foldwise for training set and validation set  
train_sklearn_fold=[]
test_sklearn_fold =[]

i = 0

for x in range(len(data_list)):

  #Making the copy of the data
  data = copy(data_list)

  #Data splitting in training and validation set. At every fold values get change
  test = data.pop(x)
  train = pd.concat(data)
  train_X = train[['0_F','0_I','0_M',1,2,3,4,5,6,7]]
  train_y = train[8]
  test_X = test[['0_F','0_I','0_M',1,2,3,4,5,6,7]]
  test_y = test[8]

  #Fitting the model using sklearn linear regression on training set
  sklearn_linear = LinearRegression().fit(train_X, train_y)

  #Saving the model foldwise
  name = "sklearnmodel"+str(i)+".sav"
  filename = name
  joblib.dump(sklearn_linear,filename)

  #Calling cklearn predict function to get predicted values on training and validation set
  sklearn_predicted_train = sklearn_linear.predict(train_X)
  sklearn_predicted_test = sklearn_linear.predict(test_X) 

  #Calculating mse using sklearn mean squared error on training and validation set
  sklearn_test_mse = mean_squared_error(test_y, sklearn_predicted_test, squared=True)
  sklearn_train_mse = mean_squared_error(train_y, sklearn_predicted_train, squared=True)

  #Appneding mse values foldwise
  train_sklearn_fold.append(sklearn_train_mse)
  test_sklearn_fold.append(sklearn_test_mse)
  i = i+1
  

#Printing mse training and validation values foldwise
print("Train mse",train_sklearn_fold,'\n','Validation mse',test_sklearn_fold)

Train mse [3.7614291207371484, 5.311082069833134, 4.651528827446872, 5.078056310329991, 5.03635660198887] 
 Validation mse [9.791404322574014, 2.9840286473908493, 5.87013704582835, 3.8447499252392343, 3.9812899769282715]


Using sklearn linear regression class as above but instead of training using saved model those were saved earlier using joblib

In [16]:
#List to store mse values foldwise for training set and validation set  
train_sklearn_fold=[]
test_sklearn_fold =[]

i = 0

for x in range(len(data_list)):

  #Making the copy of the data
  data = copy(data_list)

  #Data splitting in training and validation set. At every fold values get change
  test = data.pop(x)
  train = pd.concat(data)
  train_X = train[['0_F','0_I','0_M',1,2,3,4,5,6,7]]
  train_y = train[8]
  test_X = test[['0_F','0_I','0_M',1,2,3,4,5,6,7]]
  test_y = test[8]

  # Fitting the model using sklearn linear regression on training set
  # sklearn_linear = LinearRegression().fit(train_X, train_y)

  #Loading saved models and then using them further
  name = "sklearnmodel"+str(i)+".sav"
  filename = name
  stored_model = joblib.load(filename)

  #Calling cklearn predict function to get predicted values on training and validation set
  sklearn_predicted_train = stored_model.predict(train_X)
  sklearn_predicted_test = stored_model.predict(test_X) 

  #Calculating mse using sklearn mean squared error on training and validation set
  sklearn_test_mse = mean_squared_error(test_y, sklearn_predicted_test, squared=True)
  sklearn_train_mse = mean_squared_error(train_y, sklearn_predicted_train, squared=True)

  #Appneding mse values foldwise
  train_sklearn_fold.append(sklearn_train_mse)
  test_sklearn_fold.append(sklearn_test_mse)
  i = i+1
  

#Printing mse training and validation values foldwise
print("Train mse",train_sklearn_fold,'\n','Validation mse',test_sklearn_fold)

Train mse [3.7614291207371484, 5.311082069833134, 4.651528827446872, 5.078056310329991, 5.03635660198887] 
 Validation mse [9.791404322574014, 2.9840286473908493, 5.87013704582835, 3.8447499252392343, 3.9812899769282715]


In [17]:
training_mean_sklearn = sum(train_sklearn_fold)/len(train_sklearn_fold)
validation_mean_sklearn = sum(test_sklearn_fold)/len(test_sklearn_fold)

print('Training mean',training_mean_sklearn,'validation mean',validation_mean_sklearn)

Training mean 4.767690586067203 validation mean 5.294321983592143
