# 1. Creating a Multiple Linear Regression Model for the Cereals Dataset

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import random

In [2]:
# Load the semicolon-delimited cereal file and discard the row labelled 0.
# NOTE(review): presumably row 0 is a per-column type descriptor rather than a
# real cereal (every column is still read as object dtype) -- confirm against cereal.csv.
cereal_data = pd.read_csv("cereal.csv", sep=";").drop(index=0)

In [3]:
# Show the column dtypes and non-null counts of the loaded frame.
cereal_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77 entries, 1 to 77
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      77 non-null     object
 1   mfr       77 non-null     object
 2   type      77 non-null     object
 3   calories  77 non-null     object
 4   protein   77 non-null     object
 5   fat       77 non-null     object
 6   sodium    77 non-null     object
 7   fiber     77 non-null     object
 8   carbo     77 non-null     object
 9   sugars    77 non-null     object
 10  potass    77 non-null     object
 11  vitamins  77 non-null     object
 12  shelf     77 non-null     object
 13  weight    77 non-null     object
 14  cups      77 non-null     object
 15  rating    77 non-null     object
dtypes: object(16)
memory usage: 10.2+ KB


In [4]:
# Columns holding the two-valued cereal type code that gets encoded as 0/1.
type_list = ['type']
def binary_map(x):
    """Encode a Series of type codes as integers: 'H' -> 1, 'C' -> 0.

    Any value that is not one of these two codes is mapped to NaN by
    ``Series.map``.
    """
    code_to_flag = {'H': 1, "C": 0}
    return x.map(code_to_flag)
# Apply the encoding column by column and write the result back in place.
cereal_data[type_list] = cereal_data[type_list].apply(binary_map)

In [5]:
# One-hot encode the manufacturer code into one indicator column per value,
# append those columns, then discard the original 'mfr' column and the
# 'name' column so they are not carried into the model.
mfr_data = pd.get_dummies(cereal_data['mfr'])
cereal_data = pd.concat([cereal_data, mfr_data], axis=1).drop(['mfr', 'name'], axis=1)
cereal_data

Unnamed: 0,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,...,weight,cups,rating,A,G,K,N,P,Q,R
1,0,70,4,1,130,10,5,6,280,25,...,1,0.33,68.402973,0,0,0,1,0,0,0
2,0,120,3,5,15,2,8,8,135,0,...,1,1,33.983679,0,0,0,0,0,1,0
3,0,70,4,1,260,9,7,5,320,25,...,1,0.33,59.425505,0,0,1,0,0,0,0
4,0,50,4,0,140,14,8,0,330,25,...,1,0.5,93.704912,0,0,1,0,0,0,0
5,0,110,2,2,200,1,14,8,-1,25,...,1,0.75,34.384843,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,0,110,2,1,250,0,21,3,60,25,...,1,0.75,39.106174,0,1,0,0,0,0,0
74,0,110,1,1,140,0,13,12,25,25,...,1,1,27.753301,0,1,0,0,0,0,0
75,0,100,3,1,230,3,17,3,115,25,...,1,0.67,49.787445,0,0,0,0,0,0,1
76,0,100,3,1,200,3,17,3,110,25,...,1,1,51.592193,0,1,0,0,0,0,0


In [6]:
# Splitting test and train data (80% train / 20% test, fixed random_state for reproducibility)
from sklearn.model_selection import train_test_split
np.random.seed(0)
df_train, df_test = train_test_split(cereal_data, train_size = 0.8, test_size = 0.2, random_state = 100)
# The splits come back as slices of cereal_data; assigning scaled columns to
# them later triggers pandas' SettingWithCopyWarning. Take explicit copies so
# the train/test frames own their data and can be modified safely.
df_train = df_train.copy()
df_test = df_test.copy()

In [7]:
# Scaling the Train data
# The min-max scaler is fitted here, on the training split only.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# NOTE: 'rating' (the regression target) is part of this list, so the target
# is scaled together with the predictors.
num_vars = ['type','calories','protein','fat','sodium','fiber','carbo','sugars','potass','vitamins','shelf','weight','cups','rating','A','G','K','N','P','Q','R']
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [8]:
# Dividing the training data set into X and Y
# pop() removes 'rating' from df_train in place and returns it as the target.
y_train = df_train.pop('rating')
# X_train is the same object as df_train (no copy), now without 'rating'.
X_train = df_train

In [9]:
# Getting best features to predict the value
# Recursive feature elimination with a linear model, keeping the 10 top-ranked features.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, y_train)
# n_features_to_select is passed by keyword: passing it positionally is
# deprecated and rejected with a TypeError by newer scikit-learn releases.
rfe = RFE(lm, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)
# For each column: (name, selected?, elimination rank) -- rank 1 means selected.
list(zip(X_train.columns,rfe.support_,rfe.ranking_))



[('type', False, 4),
 ('calories', True, 1),
 ('protein', True, 1),
 ('fat', True, 1),
 ('sodium', True, 1),
 ('fiber', True, 1),
 ('carbo', True, 1),
 ('sugars', True, 1),
 ('potass', True, 1),
 ('vitamins', True, 1),
 ('shelf', False, 6),
 ('weight', False, 2),
 ('cups', True, 1),
 ('A', False, 3),
 ('G', False, 8),
 ('K', False, 7),
 ('N', False, 9),
 ('P', False, 5),
 ('Q', False, 11),
 ('R', False, 10)]

In [10]:
# Keep only the ten columns that RFE marked as selected (rank 1) in the listing above.
X_train = df_train[['calories','protein','fat','sodium','fiber','carbo','sugars','potass','vitamins','cups']]

In [11]:
# Model Building on train data
import statsmodels.api as sm
# statsmodels' OLS does not add an intercept by itself, so prepend a constant column.
X_train_lm = sm.add_constant(X_train)
# Cast both sides to float before fitting the ordinary least squares model.
lr_1 = sm.OLS(y_train.astype(float), X_train_lm.astype(float)).fit()
lr_1.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,rating,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,1.224e+16
Date:,"Fri, 25 Feb 2022",Prob (F-statistic):,0.0
Time:,22:10:49,Log-Likelihood:,1100.9
No. Observations:,61,AIC:,-2180.0
Df Residuals:,50,BIC:,-2157.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3792,4.04e-09,9.38e+07,0.000,0.379,0.379
calories,-0.2944,6.48e-09,-4.54e+07,0.000,-0.294,-0.294
protein,0.2163,3.96e-09,5.46e+07,0.000,0.216,0.216
fat,-0.1118,3.94e-09,-2.84e+07,0.000,-0.112,-0.112
sodium,-0.2305,2.41e-09,-9.58e+07,0.000,-0.230,-0.230
fiber,0.6372,8.33e-09,7.64e+07,0.000,0.637,0.637
carbo,0.3465,5.71e-09,6.07e+07,0.000,0.347,0.347
sugars,-0.1533,3.98e-09,-3.85e+07,0.000,-0.153,-0.153
potass,-0.1487,6.61e-09,-2.25e+07,0.000,-0.149,-0.149

0,1,2,3
Omnibus:,11.444,Durbin-Watson:,1.98
Prob(Omnibus):,0.003,Jarque-Bera (JB):,3.931
Skew:,-0.294,Prob(JB):,0.14
Kurtosis:,1.905,Cond. No.,36.6


In [12]:
# Scaling Testing data
# Same column list as used for the training split.
num_vars = ['type','calories','protein','fat','sodium','fiber','carbo','sugars','potass','vitamins','shelf','weight','cups','rating','A','G','K','N','P','Q','R']
# transform() (not fit_transform) reuses the scaler already fitted on the
# training data, so the test split is scaled with the training min/max.
df_test[num_vars] = scaler.transform(df_test[num_vars])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [13]:
# Dividing test data into X and Y
# pop() removes 'rating' from df_test in place and returns it as the target.
y_test = df_test.pop('rating')
# Same ten feature columns that were used to fit the model.
X_test = df_test[['calories','protein','fat','sodium','fiber','carbo','sugars','potass','vitamins','cups']]

In [14]:
# Adding constant variable to test dataframe (must match the design used to fit lr_1)
X_test_m1 = sm.add_constant(X_test)
# Predicting on test data using the model built above
y_pred_m1 = lr_1.predict(X_test_m1)

  x = pd.concat(x[::order], 1)


# 2. Function that finds the optimal Beta vector (coefficients)

In [15]:
# Function calculating the beta values (least-squares coefficients) with numpy
def estimate_coef_multiple_linear_regression(X, y):
    """Estimate the coefficient vector of a linear regression by least squares.

    Finds the beta that minimizes ||X @ beta - y||^2. For a full-rank design
    matrix this is the normal-equation solution (X^T X)^-1 X^T y, but it is
    computed with ``np.linalg.lstsq`` instead of an explicit matrix inverse:
    that is numerically more stable and, for a rank-deficient (collinear)
    design, returns the minimum-norm solution instead of raising
    ``LinAlgError``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Design matrix. No intercept column is added; include a column of
        ones in ``X`` if an intercept is wanted.
    y : array-like of shape (n_samples,) or (n_samples, n_targets)
        Target values.

    Returns
    -------
    numpy.ndarray
        Coefficients, shape (n_features,) for 1-D ``y``.
    """
    # Coerce to float arrays so DataFrames/Series (including object-dtype
    # columns holding numbers) are handled uniformly.
    design = np.asarray(X, dtype=float)
    target = np.asarray(y, dtype=float)
    beta, _residuals, _rank, _singular_values = np.linalg.lstsq(design, target, rcond=None)
    return beta

In [16]:
# Calling the function with training data
# NOTE: X_train has no constant column (unlike X_train_lm used for the
# statsmodels fit), so these coefficients describe a model without an
# intercept and are not directly comparable to lr_1's coefficients.
beta = estimate_coef_multiple_linear_regression(X_train, y_train)
beta

array([-0.48870871,  0.38347529,  0.0562027 , -0.22650288,  0.88150513,
        0.73170953,  0.07157783, -0.27132475, -0.10874613,  0.08994756])

# 3. Predicting the values based on the new coefficients

In [17]:
# Prediction helper for the manually estimated coefficients (beta)
def manual_predict(X_test, beta):
    """Return the linear predictions for ``X_test`` given coefficients ``beta``.

    Computes the dot product of the feature matrix with the coefficient
    vector; no intercept term is added.
    """
    predictions = np.dot(X_test, beta)
    return predictions

In [18]:
# Predicting the values with test data, using the manually estimated beta
# (X_test carries no constant column, matching how beta was estimated).
manual_predictions = manual_predict(X_test, beta)
manual_predictions

array([0.12906688, 0.34261621, 0.56033034, 0.74339698, 0.31499017,
       0.37614   , 0.62551548, 0.50015845, 0.26598208, 0.65304563,
       0.11117531, 0.38472664, 0.13232029, 0.72731447, 0.20078175,
       0.2082445 ])

In [22]:
def rss(y_hat, y):
    """Return the residual sum of squares between ``y_hat`` and ``y``.

    The result is the sum of the squared element-wise differences, so the
    order of the two arguments does not affect it.
    """
    residuals = y_hat - y
    return np.sum(np.square(residuals))
    
# Residual sum of squares on the test split for both sets of predictions:
# the manually estimated coefficients vs. the statsmodels OLS model.
manual_rss = rss(y_test,manual_predictions)
reported_rss = rss(y_test,y_pred_m1)
print(manual_rss)
print(reported_rss)

0.044291851454093416
3.744276046977224e-16


# 4. Function to calculate the R-squared value

In [20]:
def r_squared(y_hat, y):
    """Return the coefficient of determination, 1 - RSS / SST.

    RSS is the sum of squared differences between the two arguments.
    SST is the sum of squared deviations of the FIRST argument from its own
    mean, so despite its name ``y_hat`` must hold the observed values and
    ``y`` the predictions (this is how the notebook calls it, passing
    ``y_test`` first).
    """
    residual_ss = np.sum(np.square(y_hat - y))
    centered = y_hat - np.mean(y_hat)
    total_ss = np.sum(np.square(centered))
    return 1 - (residual_ss / total_ss)

# 5. Comparing the R-squared values of the statsmodels-reported model and the manually calculated model after reducing coefficients

In [21]:
# R-squared on the test split for both sets of predictions; the observed
# values (y_test) are passed first because r_squared centres on its first argument.
manual_model = r_squared(y_test,manual_predictions)
reported_model = r_squared(y_test,y_pred_m1)
print(manual_model)
print(reported_model)

0.9374554503397715
0.9999999999999994
