In [1]:
# importing required libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the labelled sales file and hold out 20% of its rows for evaluation.
# random_state=0 pins the shuffle, so the split is reproducible across runs.
data = pd.read_csv('data/bigmart_sales_train.csv')
train_data, test_data = train_test_split(data, test_size=0.2, random_state=0)

# The separately supplied test file is kept under its own name (`test_`).
test_ = pd.read_csv('data/bigmart_sales_test.csv')

# Keep only the int/float columns of each split, then discard every row that
# has a missing value in any of the remaining columns.
train_data = train_data.select_dtypes(include=[int, float]).dropna(how='any')
test_data = test_data.select_dtypes(include=[int, float]).dropna(how='any')

# Report how many rows and columns are left in each split after the clean-up.
print('\nShape of training data :', train_data.shape)
print('\nShape of testing data :', test_data.shape)

# Last expression of the cell: preview the first rows of the training split.
train_data.head()


Shape of training data : (5645, 5)

Shape of testing data : (1415, 5)


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
5820,18.7,0.014623,52.0324,2002,1090.5804
48,15.85,0.121633,60.622,1997,2576.646
3791,21.0,0.049296,194.4478,2004,968.739
321,9.695,0.128483,223.9404,1999,4950.8888
4724,12.1,0.0,177.3002,2004,3044.7034


In [2]:
# Item_Outlet_Sales is the target column the model learns to predict; all
# remaining columns of each frame are used as the input features.

# Training split: pull out the target, then drop it to get the features.
train_y = train_data['Item_Outlet_Sales']
train_x = train_data.drop('Item_Outlet_Sales', axis='columns')

# Held-out (test) split: same separation as for the training split.
test_y = test_data['Item_Outlet_Sales']
test_x = test_data.drop('Item_Outlet_Sales', axis='columns')

# Last expression of the cell: preview the training feature matrix.
train_x.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year
5820,18.7,0.014623,52.0324,2002
48,15.85,0.121633,60.622,1997
3791,21.0,0.049296,194.4478,2004
321,9.695,0.128483,223.9404,1999
4724,12.1,0.0,177.3002,2004


In [3]:
# Build a LinearRegression model with its default settings and fit it to the
# training split (fit() returns the fitted estimator itself).
# Constructor options such as fit_intercept can be experimented with here.
# NOTE: older tutorials also mention `normalize`; that parameter was removed
# from LinearRegression in scikit-learn 1.2 and is no longer accepted.
# Documentation of sklearn LinearRegression:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
model = LinearRegression().fit(train_x, train_y)

# Learned parameters: one coefficient per feature column, then the intercept.
print('\nCoefficient of model :', model.coef_)
print('\nIntercept of model', model.intercept_)

# In-sample check: predict on the rows the model was trained on and report the
# root mean squared error (square root of sklearn's mean_squared_error).
predict_train = model.predict(train_x)
print('\nItem_Outlet_Sales on training data', predict_train)
rmse_train = mean_squared_error(y_true=train_y, y_pred=predict_train) ** 0.5
print('\nRMSE on train dataset : ', rmse_train)

# Out-of-sample check: the same two steps on the held-out split.
predict_test = model.predict(test_x)
print('\nItem_Outlet_Sales on test data', predict_test)
rmse_test = mean_squared_error(y_true=test_y, y_pred=predict_test) ** 0.5
print('\nRMSE on test dataset : ', rmse_test)


Coefficient of model : [-2.04441926e+00 -2.52601124e+03  1.53133042e+01  3.94544528e+00]

Intercept of model -7738.708720453751

Item_Outlet_Sales on training data [ 881.69314706  729.01896856 2978.14767631 ... 1340.86575977 1716.03307948
 2803.14263762]

RMSE on train dataset :  1188.267230257051

Item_Outlet_Sales on test data [2346.58497078 2097.03543399 2350.5267619  ... 3539.67073753 1701.98837953
 3926.79576188]

RMSE on test dataset :  1222.9612200449396


In [5]:
# Display the frame read from data/bigmart_sales_test.csv in the first cell.
# Its columns (see the rendered output below) do not include Item_Outlet_Sales.
# NOTE(review): none of the cells shown here pass `test_` to the model — confirm
# whether a prediction step for this file exists elsewhere in the notebook.
test_

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.750,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.300,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.600,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.0340,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.2300,OUT027,1985,Medium,Tier 3,Supermarket Type3
...,...,...,...,...,...,...,...,...,...,...,...
5676,FDB58,10.500,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1
5677,FDD47,7.600,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2
5678,NCO17,10.000,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,,Tier 2,Supermarket Type1
5679,FDJ26,15.300,Regular,0.000000,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1
