# **preprocessing the data**

In [1]:
import numpy as np
import pandas as pd
from math import sqrt 
from statistics import variance 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


### **load the data**

In [2]:
#load the dataset
data = pd.read_csv('/content/drive/MyDrive/dataset.csv')

In [3]:
#examine the data
data.head(10)

#check number of rows, cols and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   I_Id        8523 non-null   object 
 1   I_W         7060 non-null   float64
 2   I_Fat_C     8523 non-null   object 
 3   I_Vis       8523 non-null   float64
 4   I_MRP_JD    8523 non-null   float64
 5   I_category  8523 non-null   object 
 6   I_MRP_US    8523 non-null   float64
 7   O_Id        8523 non-null   object 
 8   O_Establ_Y  8523 non-null   int64  
 9   O_Size      6113 non-null   object 
 10  I_Recalled  8523 non-null   object 
 11  O_Loc_T     8523 non-null   object 
 12  O_T         8523 non-null   object 
 13  I_O_Sales   8523 non-null   float64
dtypes: float64(5), int64(1), object(8)
memory usage: 932.3+ KB


### **check the values**

In [4]:
# only null values we have are I_W and O_Size
data.isnull().sum()

I_Id             0
I_W           1463
I_Fat_C          0
I_Vis            0
I_MRP_JD         0
I_category       0
I_MRP_US         0
O_Id             0
O_Establ_Y       0
O_Size        2410
I_Recalled       0
O_Loc_T          0
O_T              0
I_O_Sales        0
dtype: int64

In [5]:
#lets check the values for each columns
data['I_Id'].value_counts(dropna = False)
data['I_W'].value_counts(dropna = False)  
data['I_Vis'].value_counts(dropna = False)   
data['I_MRP_JD'].value_counts(dropna = False)    
data['I_category'].value_counts(dropna = False)    
data['I_MRP_US'].value_counts(dropna = False) 
data['I_Fat_C'].value_counts(dropna = False)
data['O_Id'].value_counts(dropna = False)  
data['O_Establ_Y'].value_counts(dropna = False) 
data['O_Size'].value_counts(dropna = False)   
data['I_Recalled'].value_counts(dropna = False)  
data['O_Loc_T'].value_counts(dropna = False)
data['O_T'].value_counts(dropna = False) 
data['I_O_Sales'].value_counts(dropna = False)   

1138.7520    17
1522.2528    16
883.0848     15
2025.5976    15
1458.3360    14
             ..
4304.6310     1
6802.7126     1
1794.5650     1
5782.7070     1
2958.3834     1
Name: I_O_Sales, Length: 3493, dtype: int64

### **removing the null values and fixing the values**

In [6]:
#make all the I_Fat_C values either Regular or Low Fat
data['I_Fat_C'].replace({'LF':'Low Fat','low fat':'Low Fat','reg':'Regular'},inplace = True)

In [7]:
# to put the missing values in I_W as the mean of the same product weights 
data['I_W'].fillna(data.groupby(['I_Id'])['I_W'].transform('mean'),inplace=True)

In [8]:
# check if all missing values have been gone from I_W
data.isnull().sum()

I_Id             0
I_W              4
I_Fat_C          0
I_Vis            0
I_MRP_JD         0
I_category       0
I_MRP_US         0
O_Id             0
O_Establ_Y       0
O_Size        2410
I_Recalled       0
O_Loc_T          0
O_T              0
I_O_Sales        0
dtype: int64

In [9]:
# fill the rest of the 4 empty weight values as the mean of all the weight column
data['I_W'].fillna(0, inplace=True)

In [10]:
# since all the null values exist with the store Ids size values all null we
# will use the store name and take the mode sizes of each one to fill out the null values
# This will fill the missing values with the first element of the mode of each group, even if the group is empty

# i will create a dictionary of mode values for each store name
mode_storeName_size = data.groupby('O_T')['O_Size'].apply(lambda x: x.mode()[0]).to_dict()
# fill missing values using the dictionary
data['O_Size'].fillna(data['O_T'].map(mode_storeName_size), inplace=True)

In [11]:
# check if all missing values have been gone from O_Size
data.isnull().sum()

I_Id          0
I_W           0
I_Fat_C       0
I_Vis         0
I_MRP_JD      0
I_category    0
I_MRP_US      0
O_Id          0
O_Establ_Y    0
O_Size        0
I_Recalled    0
O_Loc_T       0
O_T           0
I_O_Sales     0
dtype: int64

In [12]:
data.describe()

Unnamed: 0,I_W,I_Vis,I_MRP_JD,I_MRP_US,O_Establ_Y,I_O_Sales
count,8523.0,8523.0,8523.0,8523.0,8523.0,8523.0
mean,12.869378,0.066249,148.384875,208.992782,2007.831867,2361.288914
std,4.653372,0.051608,44.215297,62.275067,8.37176,1706.499616
min,0.0,0.0,70.4959,99.29,1995.0,213.29
25%,8.785,0.027114,114.896815,161.8265,1997.0,1014.2474
50%,12.6,0.054056,149.819088,211.0128,2009.0,1974.331
75%,16.85,0.09471,180.087027,253.6437,2014.0,3281.2964
max,21.35,0.328516,237.770764,334.8884,2019.0,13266.9648


In [13]:
# visibility cant be 0 so ill replace the seros with null then fill them with the mean of I_Vis
data['I_Vis'] = data['I_Vis'].replace(0, np.nan)
data['I_Vis'].fillna(data['I_Vis'].mean(), inplace=True)

# feature engineering

In [14]:
# create a correlation matrix to find out which columns to drop
corr_matrix = data.corr()
corr_matrix

Unnamed: 0,I_W,I_Vis,I_MRP_JD,I_MRP_US,O_Establ_Y,I_O_Sales
I_W,1.0,-0.01896,0.026691,0.026691,-0.011404,0.012979
I_Vis,-0.01896,1.0,-0.005515,-0.005515,-0.078316,-0.134035
I_MRP_JD,0.026691,-0.005515,1.0,1.0,0.00502,0.567574
I_MRP_US,0.026691,-0.005515,1.0,1.0,0.00502,0.567574
O_Establ_Y,-0.011404,-0.078316,0.00502,0.00502,1.0,-0.049135
I_O_Sales,0.012979,-0.134035,0.567574,0.567574,-0.049135,1.0


In [15]:
# create a variance calculation to find out if there are any columns with low varience 
data_varience = data.var()
data_varience

  data_varience = data.var()


I_W           2.165387e+01
I_Vis         2.374640e-03
I_MRP_JD      1.954993e+03
I_MRP_US      3.878184e+03
O_Establ_Y    7.008637e+01
I_O_Sales     2.912141e+06
dtype: float64

In [16]:
# based on the corolation matrix and varience calculation i will drop I_MRP_JD
# moreoever logically we have its equivalant in US currency which is
# the same currency as the label
data.drop("I_MRP_JD", axis=1, inplace = True)

In [17]:
#data.to_csv('data.CSV',index=False)

# model

### **linear regression**

In [18]:
# hot encode the features/columns: I_Id (1559), I_category (16), O_Id (10) , O_T (4) plus the 9 normal columns
hot_encoded_data = pd.get_dummies(data, columns=['I_Id','I_category','O_Id','O_T'])

In [19]:
# # max-min normalize values

# #create a for loop for each column 

# min_value_I_Vis = min(hot_encoded_data['I_Vis'])
# max_value_I_Vis = max(hot_encoded_data['I_Vis'])

# min_value_I_MRP_US = min(hot_encoded_data['I_MRP_US'])
# max_value_I_MRP_US = max(hot_encoded_data['I_MRP_US']) 

# min_value_O_Establ_Y = min(hot_encoded_data['O_Establ_Y']) 
# max_value_O_Establ_Y = max(hot_encoded_data['O_Establ_Y']) 

# min_value_I_W = min(hot_encoded_data['I_W']) 
# max_value_I_W = max(hot_encoded_data['I_W']) 

# for x in range(len(data)):
  
#   scaled_x_I_Vis = (hot_encoded_data['I_Vis'][x] - min_value_I_Vis) / (max_value_I_Vis - min_value_I_Vis)
#   hot_encoded_data['I_Vis'][x] = scaled_x_I_Vis

#   scaled_x_I_MRP_US = (hot_encoded_data['I_MRP_US'][x] - min_value_I_MRP_US) / (max_value_I_MRP_US - min_value_I_MRP_US)
#   hot_encoded_data['I_MRP_US'][x] = scaled_x_I_MRP_US
  
#   scaled_x_O_Establ_Y = (hot_encoded_data['O_Establ_Y'][x] - min_value_O_Establ_Y) / (max_value_O_Establ_Y - min_value_O_Establ_Y)
#   hot_encoded_data['O_Establ_Y'][x] = scaled_x_O_Establ_Y

#   scaled_x_I_W = (hot_encoded_data['I_W'][x] - min_value_I_W) / (max_value_I_W - min_value_I_W)
#   hot_encoded_data['I_W'][x] = scaled_x_I_W


In [20]:
# z score normalized values

mean_value_I_Vis = np.mean(hot_encoded_data['I_Vis'])
std_dev_value_I_Vis = np.std(hot_encoded_data['I_Vis'])

mean_value_I_MRP_US = np.mean(hot_encoded_data['I_MRP_US'])
std_dev_value_I_MRP_US = np.std(hot_encoded_data['I_MRP_US'])

mean_value_O_Establ_Y = np.mean(hot_encoded_data['O_Establ_Y'])
std_dev_value_O_Establ_Y = np.std(hot_encoded_data['O_Establ_Y'])

mean_value_I_W = np.mean(hot_encoded_data['I_W'])
std_dev_value_I_W = np.std(hot_encoded_data['I_W'])

# normalize data


for x in range(len(data)):
  
  scaled_x_I_Vis = (hot_encoded_data['I_Vis'][x] - mean_value_I_Vis) / std_dev_value_I_Vis 
  hot_encoded_data['I_Vis'][x] = scaled_x_I_Vis

  scaled_x_I_MRP_US = (hot_encoded_data['I_MRP_US'][x] - mean_value_I_MRP_US) / std_dev_value_I_MRP_US 
  hot_encoded_data['I_MRP_US'][x] = scaled_x_I_MRP_US
  
  scaled_x_O_Establ_Y = (hot_encoded_data['O_Establ_Y'][x] - mean_value_O_Establ_Y) / std_dev_value_O_Establ_Y 
  hot_encoded_data['O_Establ_Y'][x] = scaled_x_O_Establ_Y

  scaled_x_I_W = (hot_encoded_data['I_Vis'][x] - mean_value_I_W) / std_dev_value_I_W 
  hot_encoded_data['I_W'][x] = scaled_x_I_W

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hot_encoded_data['I_Vis'][x] = scaled_x_I_Vis
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hot_encoded_data['I_MRP_US'][x] = scaled_x_I_MRP_US
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hot_encoded_data['O_Establ_Y'][x] = scaled_x_O_Establ_Y
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hot_encoded_da

In [21]:
le = preprocessing.LabelEncoder()
# to turn all these values into integers hot_encoded_data

hot_encoded_data['O_Loc_T'] = le.fit_transform(hot_encoded_data['O_Loc_T'])
hot_encoded_data['I_Fat_C'] = le.fit_transform(hot_encoded_data['I_Fat_C'])
hot_encoded_data['O_Size'] = le.fit_transform(hot_encoded_data['O_Size'])
hot_encoded_data['I_Recalled'] = le.fit_transform(hot_encoded_data['I_Recalled'])


In [22]:
x = hot_encoded_data.drop(['I_O_Sales'],axis=1)

y=hot_encoded_data['I_O_Sales']

In [23]:
x_train , x_test, y_train, y_test =train_test_split(x,y,test_size=0.33,random_state=0)

In [24]:
# to add a column of ones for the beta 
x_b = np.c_[np.ones((x_train.shape[0], 1)), x_train]  
x_test = np.c_[np.ones((x_test.shape[0], 1)), x_test] 

In [25]:
# turn the y_train into an array so that it can be used for the gradient
y_train = np.array(y_train)
x=y_train.shape[0]
y_train = y_train.reshape(x,1)

In [26]:
learning_rate = 0.015  # learning rate
n_iterations = 2000  #number of iteration
m = x_train.shape[0]              #number of rows in the dataset

beta = np.full((x_train.shape[1]+1,1),1)  # random initialization for beta

print(f'inital beta {beta}')
for iteration in range(n_iterations):
  #Here we have to compute the gradient 
  gradients = 2/m * x_b.T.dot(x_b.dot(beta) - y_train)

  #Here we have to update the values of beta 
  beta = beta - (learning_rate * gradients)

print(f'beta in iteration {iteration+1} is {beta}')

inital beta [[1]
 [1]
 [1]
 ...
 [1]
 [1]
 [1]]
beta in iteration 2000 is [[ 231.90775689]
 [-610.6993845 ]
 [  34.09864907]
 ...
 [ 415.38827029]
 [ 803.63876992]
 [-973.45264833]]


In [27]:
# for predicted y
y_pred=np.dot(x_test,beta) 

# to create an appropriate shaped y_test
y_test = np.array(y_test)
a = y_test.shape[0]
y_test=y_test.reshape(a,1)

In [28]:
#evaluate using r squared

x1 = (y_test - y_pred)
x2 = x1**2
rss = x2.sum()

xx1 = (y_test - y_test.mean())
xx2 = xx1**2
tss = xx2.sum()

r2 = 1 - (rss / tss)
print('R squared = ',r2)
MSE = np.square(np.subtract(y_test,y_pred)).mean()   
rsme = sqrt(MSE)
MAE=np.mean(np.abs(y_test - y_pred))

print("MSE=",MSE)  
print("RSME=",rsme)
print("MAE=",MAE)

R squared =  0.565978995422862
MSE= 1298894.1663203791
RSME= 1139.6903817793582
MAE= 849.6175956858469


### **KNN**

In [29]:
# hot encode the features/columns: I_Id (1559), I_category (16), O_Id (10) , plus the 10 normal columns
hot_encoded_KNNdata = pd.get_dummies(data, columns=['I_Id','I_category','O_Id'])

In [30]:
# normalize values

# create a for loop for each column 

min_value_I_Vis = min(hot_encoded_KNNdata['I_Vis'])
max_value_I_Vis = max(hot_encoded_KNNdata['I_Vis'])

min_value_I_MRP_US = min(hot_encoded_KNNdata['I_MRP_US'])
max_value_I_MRP_US = max(hot_encoded_KNNdata['I_MRP_US']) 

min_value_O_Establ_Y = min(hot_encoded_KNNdata['O_Establ_Y']) 
max_value_O_Establ_Y = max(hot_encoded_KNNdata['O_Establ_Y']) 

min_value_I_W = min(hot_encoded_KNNdata['I_W']) 
max_value_I_W = max(hot_encoded_KNNdata['I_W']) 

min_value_I_O_Sales = min(hot_encoded_KNNdata['I_O_Sales']) 
max_value_I_O_Sales = max(hot_encoded_KNNdata['I_O_Sales'])

for x in range(len(data)):
  
  scaled_x_I_Vis = (hot_encoded_KNNdata['I_Vis'][x] - min_value_I_Vis) / (max_value_I_Vis - min_value_I_Vis)
  hot_encoded_KNNdata['I_Vis'][x] = scaled_x_I_Vis

  scaled_x_I_MRP_US = (hot_encoded_KNNdata['I_MRP_US'][x] - min_value_I_MRP_US) / (max_value_I_MRP_US - min_value_I_MRP_US)
  hot_encoded_KNNdata['I_MRP_US'][x] = scaled_x_I_MRP_US
  
  scaled_x_O_Establ_Y = (hot_encoded_KNNdata['O_Establ_Y'][x] - min_value_O_Establ_Y) / (max_value_O_Establ_Y - min_value_O_Establ_Y)
  hot_encoded_KNNdata['O_Establ_Y'][x] = scaled_x_O_Establ_Y

  scaled_x_I_W = (hot_encoded_KNNdata['I_W'][x] - min_value_I_W) / (max_value_I_W - min_value_I_W)
  hot_encoded_KNNdata['I_W'][x] = scaled_x_I_W

  scaled_x_I_O_Sales = (hot_encoded_KNNdata['I_O_Sales'][x] - min_value_I_O_Sales) / (max_value_I_O_Sales - min_value_I_O_Sales)
  hot_encoded_KNNdata['I_O_Sales'][x] = scaled_x_I_O_Sales

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hot_encoded_KNNdata['I_Vis'][x] = scaled_x_I_Vis
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hot_encoded_KNNdata['I_MRP_US'][x] = scaled_x_I_MRP_US
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hot_encoded_KNNdata['O_Establ_Y'][x] = scaled_x_O_Establ_Y
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hot_e

In [31]:
# z score normalized values

# mean_value_I_Vis = np.mean(hot_encoded_KNNdata['I_Vis'])
# std_dev_value_I_Vis = np.std(hot_encoded_KNNdata['I_Vis'])

# mean_value_I_MRP_US = np.mean(hot_encoded_KNNdata['I_MRP_US'])
# std_dev_value_I_MRP_US = np.std(hot_encoded_KNNdata['I_MRP_US'])

# mean_value_O_Establ_Y = np.mean(hot_encoded_KNNdata['O_Establ_Y'])
# std_dev_value_O_Establ_Y = np.std(hot_encoded_KNNdata['O_Establ_Y'])

# mean_value_I_W = np.mean(hot_encoded_KNNdata['I_W'])
# std_dev_value_I_W = np.std(hot_encoded_KNNdata['I_W'])

# mean_value_I_O_Sales = np.mean(hot_encoded_KNNdata['I_O_Sales'])
# std_dev_value_I_O_Sales = np.std(hot_encoded_KNNdata['I_O_Sales'])

# # normalize data


# for x in range(len(data)):
  
#   scaled_x_I_Vis = (hot_encoded_KNNdata['I_Vis'][x] - mean_value_I_Vis) / std_dev_value_I_Vis 
#   hot_encoded_KNNdata['I_Vis'][x] = scaled_x_I_Vis

#   scaled_x_I_MRP_US = (hot_encoded_KNNdata['I_MRP_US'][x] - mean_value_I_MRP_US) / std_dev_value_I_MRP_US 
#   hot_encoded_KNNdata['I_MRP_US'][x] = scaled_x_I_MRP_US
  
#   scaled_x_O_Establ_Y = (hot_encoded_KNNdata['O_Establ_Y'][x] - mean_value_O_Establ_Y) / std_dev_value_O_Establ_Y 
#   hot_encoded_KNNdata['O_Establ_Y'][x] = scaled_x_O_Establ_Y

#   scaled_x_I_W = (hot_encoded_KNNdata['I_Vis'][x] - mean_value_I_W) / std_dev_value_I_W 
#   hot_encoded_KNNdata['I_W'][x] = scaled_x_I_W

#   scaled_x_I_O_Sales = (hot_encoded_KNNdata['I_O_Sales'][x] -  mean_value_I_O_Sales) / std_dev_value_I_O_Sales 
#   hot_encoded_KNNdata['I_O_Sales'][x] = scaled_x_I_O_Sales


In [32]:
# to turn all these values into integers hot_encoded_data

hot_encoded_KNNdata['O_Loc_T'] = le.fit_transform(hot_encoded_KNNdata['O_Loc_T'])
hot_encoded_KNNdata['I_Fat_C'] = le.fit_transform(hot_encoded_KNNdata['I_Fat_C'])
hot_encoded_KNNdata['O_Size'] = le.fit_transform(hot_encoded_KNNdata['O_Size'])
hot_encoded_KNNdata['I_Recalled'] = le.fit_transform(hot_encoded_KNNdata['I_Recalled'])
#hot_encoded_KNNdata['O_T'] = le.fit_transform(hot_encoded_KNNdata['O_T'])

In [33]:
#Importing the required modules
import numpy as np
from scipy.stats import mode
 
#Euclidean Distance
def eucledian(p1,p2):
    dist = np.sqrt(np.sum((p1-p2)**2))
    return dist
 
#Function to calculate KNN
def predict(x_train, y , x_input, k):
    op_labels = []
     
    #Loop through the Datapoints to be classified
    for item in x_input: 
         
        point_dist = []
         
        #Loop through each training Data
        for j in range(len(x_train)): 
            distances = eucledian(np.array(x_train[j,:]) , item) 
            #Calculating the distance
            point_dist.append(distances) 
        point_dist = np.array(point_dist) 
         
        #Sorting the array while preserving the index
        #Keeping the first K datapoints
        dist = np.argsort(point_dist)[:k] 
         
        #Labels of the K datapoints from above
        labels = y[dist]
         
        #Majority voting
        lab = mode(labels) 
        lab = lab.mode[0]
        op_labels.append(lab)
 
    return op_labels

In [34]:
kx = hot_encoded_KNNdata.drop(['O_T'],axis=1)
ky = hot_encoded_KNNdata['O_T']

In [35]:
# Import necessary modules
from sklearn.model_selection import train_test_split
 
# Split into training and test set
kx_train, kx_test, ky_train, ky_test = train_test_split(kx, ky, test_size = 0.2, random_state=42)

In [36]:
#convert all dataframes to numpy array (to use our function)
ky_train = ky_train.to_numpy()
kx_train = kx_train.to_numpy()
kx_test = kx_test.to_numpy()

In [37]:
#Applying our function 
ky_pred = predict(kx_train,ky_train,kx_test ,3)

from sklearn.metrics import accuracy_score
#Checking the accuracy
accuracy_score(ky_test, ky_pred)

0.9988269794721407

In [38]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(ky_test, ky_pred)
print(confusion_matrix)

[[ 176    0    0    0]
 [   0 1131    0    0]
 [   0    0  174    0]
 [   0    2    0  222]]


In [39]:
from sklearn.metrics import classification_report
print(classification_report(ky_test, ky_pred))

               precision    recall  f1-score   support

       C-Town       1.00      1.00      1.00       176
    Carrefour       1.00      1.00      1.00      1131
        Cozmo       1.00      1.00      1.00       174
Family Basket       1.00      0.99      1.00       224

     accuracy                           1.00      1705
    macro avg       1.00      1.00      1.00      1705
 weighted avg       1.00      1.00      1.00      1705



In [40]:
# A correlation matrix is a table showing the correlation coefficients between a set of variables. 
# Each cell in the table represents the correlation between two variables. 
# A correlation matrix is used to summarize data and find patterns among multiple variables.

# The matrix you provided seems to be a 4x4 matrix, where each entry (i,j) is
# the correlation coefficient between the ith and jth variables.

#Since the diagonal elements are all positive and large, it suggests that the
# variables are positively correlated with themselves 
# (i.e., they are perfectly correlated with themselves). 
# And the off-diagonal elements (i.e., non-diagonal elements) are all zero,
# it suggest that variables are not correlated with each other.