In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from colorama import Fore

In [None]:
# Load the 'Original_Imputed' sheet of the final BigMart workbook.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this file exists at exactly this location.
DATASET_PATH = '/Users/omarhassan/Documents/GitHub/BigMart-Analysis/Dataset/final_dataset.xlsx'
data = pd.read_excel(DATASET_PATH, sheet_name='Original_Imputed')
data.head()

In [None]:
# Number of missing values in each column of the loaded frame.
data.isna().sum()

### Exploratory Data Analysis

In [None]:
def plots(col):
    """Show a horizontal bar chart of total Item_Outlet_Sales per category of `col`.

    Parameters
    ----------
    col : str
        Name of a column of the module-level `data` DataFrame to group by.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`; nothing is returned.
    """
    # A single grouped sum replaces one get_group() lookup per category.
    # sort=False keeps categories in order of first appearance, which is the
    # order data[col].unique() produced. Missing values in `col` are dropped
    # by groupby rather than failing inside get_group().
    totals = data.groupby(col, sort=False)['Item_Outlet_Sales'].sum()

    fig, ax = plt.subplots(figsize=(16, 9))
    ax.barh(list(totals.index), list(totals.values))

    # Strip the frame and tick marks so only the bars and the grid remain.
    for side in ['top', 'bottom', 'left', 'right']:
        ax.spines[side].set_visible(False)
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_tick_params(pad=5)
    ax.yaxis.set_tick_params(pad=10)
    ax.grid(color='grey', linestyle='-.', linewidth=0.5, alpha=0.2)

    # barh draws the first category at the bottom; invert so it is on top.
    ax.invert_yaxis()

    ax.set_xlabel('Total Item_Outlet_Sales')
    ax.set_ylabel(col)
    ax.set_title(col+' vs Item_Outlet_Sales', loc='left')
    plt.show()

In [None]:
# Categorical columns to chart against total outlet sales, one figure each.
columns = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Location_Type',
    'Outlet_Type',
]
for col in columns:
    plots(col)

It is seen that Low Fat items make more sales than regular.  
Fruits and Vegetables type items make the most sales, followed by Snack Foods and then Household.  
OUT027 is seen to have the most sales, followed by OUT035.
Location Type of Tier 3 makes the most sales out of all the Tiers.
Outlet Type of Supermarket Type1 makes the most sales by a huge margin compared to other outlet types.  

On the other side,  
Seafood makes the least sales. Followed by item type of Breakfast and Others.  
OUT019 and OUT010 made the least sales among all outlets.  
Tier 1 makes the least sales among all Tiers.  
Grocery Store and Supermarket Type2 make the least sales among all Outlet Types.

Let us analyse why that is.

In [None]:
#Item_Fat_Content
# Print the max/min of price, weight and visibility for each fat-content group.
df1 = data.groupby('Item_Fat_Content')
stat_columns = (
    ('Price', 'Item_MRP'),
    ('Item Weight', 'Item_Weight'),
    ('Item Visibility', 'Item_Visibility'),
)
for heading, fat_code in (('Low Fat', 'LF'), ('Regular', 'REGULAR')):
    fat_group = df1.get_group(fat_code)
    print(heading)
    for label, column in stat_columns:
        print(f"Max {label}: ", fat_group[column].max())
        print(f"Min {label}: ", fat_group[column].min())

Other than some minute differences in Item Visibility (LF>REGULAR), the two differ very little in terms of weight and MRP. It can be concluded that customers prefer the lower fat content over the regular fat content because of the fat content of these items itself, rather than because of the other variables in the dataset.

In [None]:
# Compare the best-selling item type with the worst-selling one on the
# max/min of price, weight and visibility.
item_type_groups = data.groupby('Item_Type')
stat_columns = (
    ('Price', 'Item_MRP'),
    ('Item Weight', 'Item_Weight'),
    ('Item Visibility', 'Item_Visibility'),
)
for item_type in ('Fruits and Vegetables', 'Seafood'):
    df1 = item_type_groups.get_group(item_type)
    print(item_type)
    for label, column in stat_columns:
        print(f"Max {label}: ", df1[column].max())
        print(f"Min {label}: ", df1[column].min())

Other than differences in Item Visibility (Fruits and Vegetables>Seafood), the two differ very little in terms of weight and MRP. It can be concluded that customers purchase Fruits and Vegetables more than Seafood because of a stronger personal preference, or because of factors that cannot be explained via the variables present in the dataset.

In [None]:
# Item-type mix and total sales for the best-selling outlet (OUT027) and the
# two worst-selling outlets (OUT019, OUT010).
outlet_groups = data.groupby('Outlet_Identifier')
for position, outlet_id in enumerate(['OUT027', 'OUT019', 'OUT010']):
    if position:
        # Blank separator between outlets, none after the last one.
        print('\n')
    df1 = outlet_groups.get_group(outlet_id)
    print(outlet_id)
    print(df1['Item_Type'].value_counts())
    # The total is taken from the same group as the header and counts above.
    # Previously the frame was swapped to OUT010 before OUT019's total was
    # printed, so OUT019 was reported with OUT010's sales.
    print('Sales: ', df1['Item_Outlet_Sales'].sum())

OUT027 made the most sales with Fruits and Vegetables, which is the most preferred item type we found in the previous analysis. This led them to make the most sales out of all the outlets.

In [None]:
# Item-type mix and total sales for every location tier.
tier_groups = data.groupby('Outlet_Location_Type')
for tier in ('Tier 3', 'Tier 2', 'Tier 1'):
    df1 = tier_groups.get_group(tier)
    print(tier)
    print(df1['Item_Type'].value_counts())
    print('Sales: ', df1['Item_Outlet_Sales'].sum())
    print('\n')

The same pattern is noted here in the Tier comparisons: the tiers that sold more Fruits and Vegetables have the higher sales.

In [None]:
# Item-type mix and total sales for three of the outlet types.
outlet_type_groups = data.groupby('Outlet_Type')
for outlet_type in ('Supermarket Type1', 'Supermarket Type2', 'Grocery Store'):
    df1 = outlet_type_groups.get_group(outlet_type)
    print(outlet_type)
    print(df1['Item_Type'].value_counts())
    print('Sales: ', df1['Item_Outlet_Sales'].sum())
    print('\n')

The same pattern is again noticed here, further proving the analysis where Fruits and Vegetables play an important part in making sales.

In [None]:
# Row count for every (outlet type, location tier) combination in the data.
data.value_counts(subset=['Outlet_Type', 'Outlet_Location_Type'])

Supermarket Type1 has the most sales and two location tiers, but then Grocery Store has the least sales and also has two location tiers. One of the tiers available in Grocery Store is a higher tier than the ones available in Supermarket Type1, yet the sales made by each of them differ hugely.  
We will analyse why such a difference exists. 

In [None]:
# Item-type mix and total sales for each (outlet type, location tier) pair,
# printed in the order listed here.
type_tier_groups = data.groupby(['Outlet_Type', 'Outlet_Location_Type'])
type_tier_pairs = (
    ('Supermarket Type1', 'Tier 2'),
    ('Supermarket Type1', 'Tier 1'),
    ('Supermarket Type3', 'Tier 3'),
    ('Supermarket Type1', 'Tier 3'),
    ('Supermarket Type2', 'Tier 3'),
    ('Grocery Store', 'Tier 3'),
    ('Grocery Store', 'Tier 1'),
)
for outlet_type, tier in type_tier_pairs:
    df1 = type_tier_groups.get_group((outlet_type, tier))
    print(f'{outlet_type} and {tier}')
    print(df1['Item_Type'].value_counts())
    print('Sales: ', df1['Item_Outlet_Sales'].sum())
    print('\n')

Supermarket Type1 and Tier 2 makes the most sales, as it sells the most Fruits and Vegetables and Snack Foods, two of the item types that have been analysed to greatly influence sales. If other outlet type and location type pairs focus more on the sales of these types of items, they can make more sales. It is seen that the Grocery Stores have much lower sales when compared to Supermarket Type1; if they manage to increase the number of sales in the influential item types like Fruits and Vegetables, Snack Foods, Household, etc., they can improve their sales.

## Conclusion

The Item Types of Fruits and Vegetables, Snack Foods & Household greatly influence the Outlet Sales. Although Tier 3 outlets are plentiful in number, Tier 2 seems to make more sales than Tier 3 and Tier 1. Factors that aren't present in the dataset might drive greater sales in this tier than in the other tiers. Preferably, opening Tier 1 location types in other Outlet types might greatly improve their sales. It is also seen that Tier 1 and Tier 2 location types make more sales than Tier 3, so preferably replacing Tier 3 location types with Tier 2 or Tier 1 might greatly increase the sales made by the outlet due to factors not present in this dataset.

## Model Selection

In [None]:
# Pairwise plots across the columns of `data`, used to judge the
# relationships between the variables before choosing a model.
sns.pairplot(data=data)

The variables in the dataset follow a varied set of distributions. Moreover, no linear relationship is visible between any of these variables and the dependent variable, and the data does not follow a normal distribution, so we cannot use a Multiple Linear Regression model for this dataset. We are first going to attempt using the Random Forest Regression model on this dataset.

In [None]:
# List the column labels of `data`; the next cell picks its categorical and
# numeric column names by hand.
data.columns

In [None]:
# Columns that are one-hot encoded before modelling.
cat = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Location_Type',
    'Outlet_Type',
]
# Columns that are passed through unchanged; the last one is the target.
metric = [
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
    'Outlet_Establishment_Year',
    'Item_Outlet_Sales',
]

In [None]:
# Build the modelling frame: the `metric` columns as they are, side by side
# with dummy variables for the `cat` columns (first level of each dropped).
# NOTE(review): Item_Identifier is one-hot encoded as well; if it has many
# distinct values this makes the frame very wide - confirm this is intended.
metric_part = data[metric]
dummy_part = pd.get_dummies(data[cat], drop_first=True)
data_conv = pd.concat([metric_part, dummy_part], axis=1).copy()
data_conv.head()

In [None]:
# Separate the target (Item_Outlet_Sales) from the predictor columns.
y = data_conv['Item_Outlet_Sales'].copy()
x = data_conv.drop(columns=['Item_Outlet_Sales']).copy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=10,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

# Random forest with out-of-bag scoring enabled and a fixed seed.
regressor = RandomForestRegressor(oob_score=True, random_state=10)

# Hyper-parameter grid: 6 x 3 x 4 combinations.
parameters = dict(
    max_features=['sqrt', 'log2', 0.3, 0.5, 0.7, 0.9],
    n_estimators=[100, 300, 500],
    max_depth=[3, 5, 7, 9],
)

# Cross-validation and scoring are left at GridSearchCV's defaults.
gridsearch = GridSearchCV(estimator=regressor, param_grid=parameters)
gridsearch.fit(x_train, y_train)

In [None]:
# GridSearchCV fits clones of the estimator it is given, so `regressor` itself
# is never fitted: reading `regressor.oob_score_` or calling
# `regressor.predict` fails. Evaluate the model that the search refitted on
# the full training set with the best parameter combination instead.
best_regressor = gridsearch.best_estimator_
print('best params: ', gridsearch.best_params_)
print('oob score: ', best_regressor.oob_score_)

# R^2 on the held-out test set and on the training set, to gauge overfitting.
y_pred = best_regressor.predict(x_test)
print('Test Set: ', r2_score(y_test, y_pred))
y_train_pred = best_regressor.predict(x_train)
print('Train Set: ', r2_score(y_train, y_train_pred))