In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Read dataset

In [3]:
# Load the stores dataset from the Kaggle input directory and preview its first rows.
stores_csv_path = '../input/stores-area-and-sales-data/Stores.csv'
store_df = pd.read_csv(stores_csv_path)

store_df.head()

In [4]:
# Inspect the column labels of the loaded frame.
store_df.columns

#### Store ID is only an identifier and has no impact on Store_Sales, so it is dropped

In [5]:
# Remove the identifier column (its label, as used here, ends with a space).
# The frame is reassigned instead of mutated in place, and a missing column is
# ignored, so re-running this cell no longer raises a KeyError.
store_df = store_df.drop(columns=['Store ID '], errors='ignore')

#### Checking for missing values in store_df

In [6]:
# Count the null entries in each column.
store_df.isnull().sum()

In [7]:
### no missing values in store_df

#### Exploratory Data Analysis [EDA]

In [8]:
# Work on a copy for the exploratory analysis; store_df itself is used later for modelling.
store_eda_df= store_df.copy()

In [9]:
### information about the dataset [data types and number of non-null observations per column]
store_eda_df.info()

In [10]:
#### studying the overall summary statistics of the dataset

store_eda_df.describe()

In [11]:
#### plotting the distribution of store area
# sb.distplot is deprecated in seaborn; kdeplot draws the density curve that
# distplot(..., hist=False) used to produce.
plt.figure(figsize=(12,8))
sb.kdeplot(data=store_eda_df,x='Store_Area')
plt.show()

##### Observation:

1. Store_Area has an approximately Gaussian distribution


In [12]:
#### plotting the distribution of Items_Available
# sb.distplot is deprecated in seaborn; kdeplot draws the density curve that
# distplot(..., hist=False) used to produce.
plt.figure(figsize=(12,8))
sb.kdeplot(data=store_eda_df,x='Items_Available')
plt.show()

##### Observations:

Items_Available is normally distributed

In [13]:
#### plotting the distribution of Daily_Customer_Count
# sb.distplot is deprecated in seaborn; kdeplot draws the density curve that
# distplot(..., hist=False) used to produce.
plt.figure(figsize=(12,8))
sb.kdeplot(data=store_eda_df,x='Daily_Customer_Count')
plt.show()

##### Observations:

Daily_Customer_Count is normally distributed

In [14]:
#### correlation matrix of all columns, shown as an annotated heatmap
correlation_matrix = store_eda_df.corr()
_, heatmap_ax = plt.subplots(figsize=(12, 8))
sb.heatmap(correlation_matrix, annot=True, ax=heatmap_ax)
plt.show()

##### Observations:

1. Store_Area & Items_Available have a perfect positive correlation (1)
2. Store_Area & Daily_Customer_Count have a weak negative correlation
3. Store_Area & Store_Sales have a weak positive correlation
4. Items_Available & Daily_Customer_Count have a weak negative correlation
5. Items_Available & Store_Sales have a weak positive correlation.
6. Daily_Customer_Count & Store_Sales have a weak positive correlation.

In [15]:
#### helper that draws a scatter plot of one column against another
def scatter_plot(data,x,y):
    """Show a scatter plot of ``data[y]`` against ``data[x]``.

    Parameters
    ----------
    data : object indexable by column label (the notebook passes a DataFrame)
    x : label of the column plotted on the horizontal axis
    y : label of the column plotted on the vertical axis

    A banner line is printed before the figure and two blank lines after it.
    """
    print(f'----------Drawing a scatter plot between {x} & {y}----------')
    _, axes = plt.subplots(figsize=(12,8))
    axes.scatter(data[x], data[y])
    axes.set_xlabel(x)
    axes.set_ylabel(y)
    axes.set_title(f'scatter plot between {x} & {y}')
    plt.show()
    # A single print of a newline emits the same two blank lines as two empty prints.
    print('\n')

In [16]:
# Draw one scatter plot for every ordered pair of distinct columns.
column_pairs = [
    (x_col, y_col)
    for x_col in store_eda_df.columns
    for y_col in store_eda_df.columns
    if x_col != y_col
]
for x_col, y_col in column_pairs:
    scatter_plot(data=store_eda_df, x=x_col, y=y_col)
            

#### Observations:

1. As items_available increases, the store area increases (perfect correlation=1)

2. As store area increases, the daily customer count shows only a slight downward trend (the two are nearly independent)

3. As store area increases, store sales also increases

4. As items available increases, it is independent of daily customer count

5. As items_available increases, store sales increases (not much, but a little +ve correlation)

#### Boxplot

In [17]:
def plot_boxplot(data,x):
    """Show a horizontal boxplot of column ``x`` of ``data``.

    Parameters
    ----------
    data : dataset handed to seaborn (the notebook passes a DataFrame)
    x : label of the column to summarise

    An empty line is printed after the figure.
    """
    _, axes = plt.subplots(figsize=(12,8))
    axes.set_title(f'Boxplot for {x}')
    sb.boxplot(data=data, x=x, ax=axes)
    plt.show()
    print()
    

In [18]:
# Boxplot of the Store_Area column.
plot_boxplot(data=store_eda_df,x='Store_Area')

#### Observations:

1. 25% of values lie between 800 & 1300

2. 25% of values lie between 1300 & 1500

3. 25% of values lie between 1500 & 1700

4. 25% of values lie between 1700 & 2100



In [19]:
# Boxplot of the Items_Available column.
plot_boxplot(data=store_eda_df,x='Items_Available')

##### Observations:
1. 25% of values lie between 1000 & 1550

2. 25% of values lie between 1550 & 1750

3. 25% of values lie between 1750 & 2000

4. 25% of values lie between 2000 & 2500

In [20]:
# Re-check the column labels available for the remaining plots.
store_eda_df.columns

In [21]:
# Boxplot of the Daily_Customer_Count column.
plot_boxplot(data=store_eda_df,x='Daily_Customer_Count')

##### Observations:
1. 25% of values lie between 0 & 610

2. 25% of values lie between 610 & 776

3. 25% of values lie between 776 & 1000

4. 25% of values lie between 1000 & 1500

In [22]:
# Boxplot of the Store_Sales column (the prediction target used later).
plot_boxplot(data=store_eda_df,x='Store_Sales')

##### Observations:
1. 25% of values lie between 2,00,000 & 4,20,000

2. 25% of values lie between 4,20,000 & 5,80,000

3. 25% of values lie between 5,80,000 & 7,00,000

4. 25% of values lie between 7,00,000 & 11,00,000

#### Boxenplot

In [23]:
def plot_boxenplot(data,x):
    """Show a horizontal boxenplot of column ``x`` of ``data``.

    Parameters
    ----------
    data : dataset handed to seaborn (the notebook passes a DataFrame)
    x : label of the column to summarise

    An empty line is printed after the figure.
    """
    _, axes = plt.subplots(figsize=(12,8))
    axes.set_title(f'Boxenplot for {x}')
    sb.boxenplot(data=data, x=x, ax=axes)
    plt.show()
    print()

In [24]:
# Boxenplot of the Store_Area column.
plot_boxenplot(data=store_eda_df,x='Store_Area')

In [25]:
# Boxenplot of the Items_Available column.
plot_boxenplot(data=store_eda_df,x='Items_Available')

In [26]:
# Boxenplot of the Daily_Customer_Count column.
plot_boxenplot(data=store_eda_df,x='Daily_Customer_Count')

In [27]:
# Boxenplot of the Store_Sales column.
plot_boxenplot(data=store_eda_df,x='Store_Sales')

In [37]:
# Separate the predictors (every column except Store_Sales) from the target column.
X = store_df.drop(columns='Store_Sales')
y = store_df.loc[:, 'Store_Sales']

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)


#### Scaling : Normalization

In [30]:
from sklearn.preprocessing import MinMaxScaler
scaler= MinMaxScaler()
# The scaler is fitted on the training rows only and then applied to the test rows.
X_train_scaled= scaler.fit_transform(X_train)
X_test_scaled= scaler.transform(X_test)
# The scaler returns plain arrays. Rebuild the frames with the original row
# index as well as the column labels, so the features stay aligned with
# y_train / y_test (which still carry the index produced by the split).
X_train= pd.DataFrame(X_train_scaled,columns=X.columns,index=X_train.index)
X_test= pd.DataFrame(X_test_scaled,columns=X.columns,index=X_test.index)


In [31]:
# Display the scaled training features.
X_train

### Model Study

#### a.   Linear Regression

In [32]:
# Display the training target values.
y_train

In [33]:
import statsmodels.api as sm
# Add the constant (intercept) column to the training features before fitting OLS.
X_train= sm.add_constant(X_train)
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same
# 1-D array of target values.
lr_model= sm.OLS(y_train.to_numpy(),X_train).fit()
lr_model.summary()

Removing the store_area feature to study the impact & applying the linear regression

In [34]:
# Refit OLS without Store_Area; the column is removed from both splits.
X_train= X_train.drop(['Store_Area'],axis=1)
X_test= X_test.drop(['Store_Area'],axis=1)
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same
# 1-D array of target values.
lr_model= sm.OLS(y_train.to_numpy(),X_train).fit()
lr_model.summary()

#### Observations from the above two linear models

1.  Model 1 was fitted with all the input features; the results show an R² value of 0.011 and an adjusted R² of 0.007. Also, due to the high correlation between Store_Area and Items_Available, the coefficients are unstable, which is reflected in model 2, where only 2 input features are used (dropping the Store_Area feature).

2. In model 2, Items_Available is found to be significant and Daily_Customer_Count is less significant.

### KNN Regressor

In [39]:
from sklearn.neighbors import KNeighborsRegressor
# The OLS cells above add a 'const' column to X_train but not to X_test, so on a
# top-to-bottom run the two frames carry different columns. Restrict both to the
# columns they share so the model is fitted and scored on the same features.
knn_features= [col for col in X_test.columns if col in X_train.columns]
knn_model= KNeighborsRegressor()
knn_model.fit(X_train[knn_features],y_train)
y_pred= knn_model.predict(X_test[knn_features])
print(f'KNN Model train score: {knn_model.score(X_train[knn_features],y_train)}')
print(f'KNN Model test score: {knn_model.score(X_test[knn_features],y_test)}')

#### Decision Tree Regressor

In [41]:
from sklearn.model_selection import train_test_split

# Rebuild the unscaled predictors/target from store_df and split them again
# (10% test, fixed seed) for the models below.
X = store_df.drop(columns='Store_Sales')
y = store_df.loc[:, 'Store_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [42]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree with default settings and report its score on both splits.
dt_regressor = DecisionTreeRegressor(random_state=0)
dt_regressor.fit(X_train, y_train)
dt_train_score = dt_regressor.score(X_train, y_train)
dt_test_score = dt_regressor.score(X_test, y_test)
print(f'Decision Tree train score: {dt_train_score}')
print(f'Decision Tree test score: {dt_test_score}')

### Random Forest Regressor

In [43]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest with default settings and report its score on both splits.
rf_reg = RandomForestRegressor(random_state=1)
rf_reg.fit(X_train, y_train)
rf_train_score = rf_reg.score(X_train, y_train)
rf_test_score = rf_reg.score(X_test, y_test)
print(f'Random Forest train score: {rf_train_score}')
print(f'Random Forest test score: {rf_test_score}')

### Adaboost Regressor

In [44]:
from sklearn.ensemble import AdaBoostRegressor

# Fit AdaBoost with default settings and report its score on both splits.
adaboost_reg = AdaBoostRegressor(random_state=1)
adaboost_reg.fit(X_train, y_train)
adaboost_train_score = adaboost_reg.score(X_train, y_train)
adaboost_test_score = adaboost_reg.score(X_test, y_test)
print(f'Adaboost Regressor train score: {adaboost_train_score}')
print(f'Adaboost Regressor test score: {adaboost_test_score}')


#### Xgboost Regressor

In [45]:
import xgboost
# Import the one name the cells below use explicitly instead of `from xgboost import *`,
# which floods the notebook namespace and hides where names come from.
from xgboost import XGBRegressor

In [46]:
# Fit XGBoost with default settings and report its score on both splits.
xgb_regressor = XGBRegressor(random_state=1)
xgb_regressor.fit(X_train, y_train)
xgb_train_score = xgb_regressor.score(X_train, y_train)
xgb_test_score = xgb_regressor.score(X_test, y_test)
print(f'XGBoost Regressor train score: {xgb_train_score}')
print(f'XGBoost Regressor test score: {xgb_test_score}')


#### Randomized Search Cross Validation 

##### Random Forest Regressor

In [47]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search over the random forest.
params = {
    'n_estimators': list(range(100, 1000, 100)),
    'max_depth': list(range(3, 15)),
    'min_samples_leaf': list(range(30, 50, 5)),
}
# 50 sampled configurations, each evaluated with 5-fold cross-validation on r2.
random_rf_model = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=1),
    param_distributions=params,
    cv=5,
    scoring='r2',
    n_iter=50,
    return_train_score=True,
    verbose=2,
    random_state=0,
)
random_rf_model.fit(X_train, y_train)
print(f'The best model params: {random_rf_model.best_params_}')
print(f'The best model score: {random_rf_model.best_score_}')

In [48]:
# Refit a random forest with the best parameters found by the search and score both splits.
best_rf_params = random_rf_model.best_params_
rf_reg_2 = RandomForestRegressor(**best_rf_params, random_state=1)
rf_reg_2.fit(X_train, y_train)
rf2_train_score = rf_reg_2.score(X_train, y_train)
rf2_test_score = rf_reg_2.score(X_test, y_test)
print(f'Random Forest train score: {rf2_train_score}')
print(f'Random Forest test score: {rf2_test_score}')

##### Adaboost Regressor

In [49]:
# scikit-learn 1.2 renamed AdaBoost's `base_estimator` parameter to `estimator`
# and 1.4 removed the old name. Use whichever name the installed version exposes
# so the search runs on both old and new releases.
weak_learner_param= 'estimator' if 'estimator' in AdaBoostRegressor().get_params(deep=False) else 'base_estimator'
# Search over the number of boosting rounds and the depth of the weak learner.
params= {'n_estimators':[i for i in range(100,1000,100)],weak_learner_param:[DecisionTreeRegressor(max_depth=i) for i in range(3,12)]}
# 50 sampled configurations, each evaluated with 5-fold cross-validation on r2.
adaboost_model= RandomizedSearchCV(estimator=AdaBoostRegressor(random_state=1),param_distributions=params,cv=5,scoring='r2',n_iter=50,return_train_score=True,verbose=2,random_state=0)
adaboost_model.fit(X_train,y_train)
print(f'The best model params: {adaboost_model.best_params_}')
print(f'The best model score: {adaboost_model.best_score_}')

In [50]:
# Refit AdaBoost with the best parameters found by the search and score both splits.
best_adaboost_params = adaboost_model.best_params_
adaboost_reg_2 = AdaBoostRegressor(**best_adaboost_params, random_state=1)
adaboost_reg_2.fit(X_train, y_train)
adaboost2_train_score = adaboost_reg_2.score(X_train, y_train)
adaboost2_test_score = adaboost_reg_2.score(X_test, y_test)
print(f'Adaboost Regressor train score: {adaboost2_train_score}')
print(f'Adaboost Regressor test score: {adaboost2_test_score}')

In [51]:
# Candidate values sampled by the randomized search over XGBoost.
params = {
    'n_estimators': list(range(100, 1000, 100)),
    'learning_rate': [0.2, 0.6],
    'subsample': [0.3, 0.6, 0.9],
}
# 50 sampled configurations, each evaluated with 5-fold cross-validation on r2.
XGBOOST_model = RandomizedSearchCV(
    estimator=XGBRegressor(random_state=1),
    param_distributions=params,
    cv=5,
    scoring='r2',
    n_iter=50,
    return_train_score=True,
    verbose=2,
    random_state=0,
)
XGBOOST_model.fit(X_train, y_train)
print(f'The best model params: {XGBOOST_model.best_params_}')
print(f'The best model score: {XGBOOST_model.best_score_}')

In [52]:
# Refit XGBoost with the best parameters found by the search and score both splits.
best_xgb_params = XGBOOST_model.best_params_
xgboost_reg_2 = XGBRegressor(**best_xgb_params, random_state=1)
xgboost_reg_2.fit(X_train, y_train)
xgb2_train_score = xgboost_reg_2.score(X_train, y_train)
xgb2_test_score = xgboost_reg_2.score(X_test, y_test)
print(f'XGBoost Regressor train score: {xgb2_train_score}')
print(f'XGBoost Regressor test score: {xgb2_test_score}')

#### Observations:

1. Applying these algorithms to the dataset does not give good results, even after tuning the hyperparameters.

2. From the heatmap, I could observe that the correlation between the independent variables and the dependent variable is very low, i.e. near zero, which is likely the reason why the models are not performing well.