## Import The Modules

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported in this cell, so API removals will surface only as hard errors.
warnings.filterwarnings('ignore')

## Load the Dataset

In [2]:
# Read the training data from a path relative to the current working directory.
# The recorded run of this cell failed with FileNotFoundError, so 'Train.csv'
# has to be present in that directory before any later cell can execute.
df = pd.read_csv('Train.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Train.csv'

In [None]:
# Summary statistics of the frame, as produced by DataFrame.describe().
df.describe()

In [None]:
# Column names, dtypes and non-null counts, as printed by DataFrame.info().
df.info()

In [None]:
# Number of distinct values in each column (a missing value counts as one value).
df.nunique(dropna=False)

## Preprocessing the Data

In [None]:
# Count the missing entries in each column.
df.isna().sum()

In [None]:
# Collect the names of all object-dtype columns (the categorical/text ones).
cat_col = [name for name, dtype in df.dtypes.items() if dtype == 'object']
cat_col

In [None]:
# Leave the two identifier columns out of the categorical list.
# Filtering (instead of list.remove) keeps this cell safe to re-run: a second
# remove() call raises ValueError once the names are already gone.
cat_col = [col for col in cat_col if col not in ('Item_Identifier', 'Outlet_Identifier')]
cat_col

In [None]:
# Show the frequency of every category, one column at a time,
# with a blank line after each table.
for col in cat_col:
    print(col, df[col].value_counts(), sep='\n', end='\n\n')

In [None]:
# Fill the missing values.
# First step: the average recorded weight of each item, keyed by Item_Identifier,
# which is later used to impute missing Item_Weight entries.
item_weight_mean = df.pivot_table(
    index='Item_Identifier',
    values='Item_Weight',
    aggfunc='mean',
)
item_weight_mean

In [None]:
# Boolean mask marking the rows whose Item_Weight is missing.
miss_bool = df['Item_Weight'].isna()
miss_bool

In [None]:
# Impute missing Item_Weight values.
# Each missing row gets the mean weight recorded for the same Item_Identifier;
# items with no recorded weight at all fall back to the overall column mean.
#
# The lookup goes through the pivot table's index: `item in item_weight_mean`
# tests the table's column labels, so the per-item branch never ran. Writing
# through df.loc also avoids chained assignment (df[col][i] = ...), which can
# modify a temporary copy instead of df.
item_means = item_weight_mean['Item_Weight']
overall_mean = df['Item_Weight'].mean()
df.loc[miss_bool, 'Item_Weight'] = (
    df.loc[miss_bool, 'Item_Identifier'].map(item_means).fillna(overall_mean)
)

In [None]:
# Number of Item_Weight entries that are still missing.
df['Item_Weight'].isna().sum()

In [None]:
# Most frequent Outlet_Size for each Outlet_Type; x.mode()[0] picks the first
# modal value when several sizes are tied.
# NOTE(review): presumably yields a single 'Outlet_Size' row with one column per
# Outlet_Type — confirm against the table displayed below.
outlet_size_mode = df.pivot_table(values='Outlet_Size',columns='Outlet_Type',aggfunc=(lambda x: x.mode()[0]))
outlet_size_mode

In [None]:
# Impute missing Outlet_Size with the most common size for the row's Outlet_Type.
# outlet_size_mode holds the modes in its first row, with one column per outlet
# type, so that row is taken as a Series and each type is mapped straight to a
# size. Indexing the table inside apply() returned a one-element Series per row,
# which made the right-hand side a DataFrame rather than a column of sizes.
miss_bool = df['Outlet_Size'].isnull()
df.loc[miss_bool, 'Outlet_Size'] = df.loc[miss_bool, 'Outlet_Type'].map(outlet_size_mode.iloc[0])

In [None]:
# Number of Outlet_Size entries that are still missing.
df['Outlet_Size'].isna().sum()

In [None]:
# How many rows report an Item_Visibility of exactly zero.
(df['Item_Visibility'] == 0).sum()

In [None]:
# Replace zero visibility with the column mean.
# The result is assigned back to the column: replace(..., inplace=True) on a
# df.loc[...] selection may operate on a temporary copy and leave df unchanged.
df['Item_Visibility'] = df['Item_Visibility'].replace(0, df['Item_Visibility'].mean())

In [None]:
# Re-count the rows whose Item_Visibility is exactly zero.
(df['Item_Visibility'] == 0).sum()

In [None]:
# Merge the alternative spellings of the fat-content labels into two values.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(
    to_replace=['LF', 'low fat', 'reg'],
    value=['Low Fat', 'Low Fat', 'Regular'],
)
df['Item_Fat_Content'].value_counts()

## Creating New Attributes

In [None]:
# The first two characters of Item_Identifier become the new feature.
# The vectorised .str slice replaces the per-row Python lambda.
df['New_Item_Type'] = df['Item_Identifier'].str[:2]
df['New_Item_Type']

In [None]:
# Translate the two-letter identifier prefix into a readable category.
# The prefix is read from Item_Identifier rather than from New_Item_Type itself,
# so re-running this cell does not map the already-translated labels to NaN.
df['New_Item_Type'] = df['Item_Identifier'].str[:2].map({'FD':'Food','NC':'Non-Consumable','DR':'Drinks'})
df['New_Item_Type'].value_counts()

In [None]:
# Rows whose New_Item_Type is 'Non-Consumable' get the fat-content label 'Non-Edible'.
df['Item_Fat_Content'] = df['Item_Fat_Content'].mask(
    df['New_Item_Type'].eq('Non-Consumable'),
    'Non-Edible',
)
df['Item_Fat_Content'].value_counts()

In [None]:
# Replace the establishment year with a smaller number: 2013 minus that year.
df['Outlet_Years'] = df['Outlet_Establishment_Year'].rsub(2013)
df['Outlet_Years']

## EDA

In [None]:
# Distribution of Item_Weight. distplot is deprecated in seaborn; a density-scaled
# histogram with a KDE overlay is the replacement its deprecation notice recommends.
sns.histplot(df['Item_Weight'], kde=True, stat='density')

In [None]:
# Distribution of Item_Visibility. distplot is deprecated in seaborn; a density-scaled
# histogram with a KDE overlay is the replacement its deprecation notice recommends.
sns.histplot(df['Item_Visibility'], kde=True, stat='density')

In [None]:
# Distribution of Item_MRP. distplot is deprecated in seaborn; a density-scaled
# histogram with a KDE overlay is the replacement its deprecation notice recommends.
sns.histplot(df['Item_MRP'], kde=True, stat='density')

In [None]:
# Distribution of the target, Item_Outlet_Sales. distplot is deprecated in seaborn;
# a density-scaled histogram with a KDE overlay is the recommended replacement.
sns.histplot(df['Item_Outlet_Sales'], kde=True, stat='density')

In [None]:
# Log-transform the target column and plot the result.
# np.log1p(x) computes log(1 + x) and stays accurate for values close to zero.
# NOTE: the column is overwritten, so running this cell twice applies the
# transform twice; reload the data before re-running.
df['Item_Outlet_Sales'] = np.log1p(df['Item_Outlet_Sales'])
# distplot is deprecated in seaborn; histplot with a KDE overlay replaces it.
sns.histplot(df['Item_Outlet_Sales'], kde=True, stat='density')

In [None]:
# Count of rows per Item_Fat_Content value. The column is passed as x= explicitly:
# on seaborn 0.12+ the first positional argument is `data`, not `x`.
sns.countplot(x=df['Item_Fat_Content'])

In [None]:
# Count of rows per Item_Type. The column is passed as x= explicitly because on
# seaborn 0.12+ the first positional argument is `data`, not `x`.
chart = sns.countplot(x=df['Item_Type'])
# Rotate the tick labels seaborn already placed on the axis, rather than
# overwriting them with a separately built list that has to match the bar order.
chart.tick_params(axis='x', labelrotation=90)

In [None]:
# Count of rows per Outlet_Establishment_Year. The column is passed as x= explicitly:
# on seaborn 0.12+ the first positional argument is `data`, not `x`.
sns.countplot(x=df['Outlet_Establishment_Year'])

In [None]:
# Count of rows per Outlet_Size. The column is passed as x= explicitly:
# on seaborn 0.12+ the first positional argument is `data`, not `x`.
sns.countplot(x=df['Outlet_Size'])

In [None]:
# Count of rows per Outlet_Location_Type. The column is passed as x= explicitly:
# on seaborn 0.12+ the first positional argument is `data`, not `x`.
sns.countplot(x=df['Outlet_Location_Type'])

In [None]:
# Count of rows per Outlet_Type, with the x tick labels rotated to vertical.
# x= is explicit because seaborn 0.12+ reads the first positional argument as `data`;
# the labels seaborn placed are rotated in place instead of being overwritten
# with a separately built list.
sns.countplot(x=df['Outlet_Type']).tick_params(axis='x', labelrotation=90)

## Correlation Matrix

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# The frame still holds text columns at this point, and DataFrame.corr() raises
# on those in pandas 2.x, so restrict to numeric dtypes first.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr,annot=True,cmap='coolwarm')

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance is reused for every column; it is re-fitted each time,
# so after this cell it only remembers the classes of the last column encoded.
le = LabelEncoder()

# Integer code for each outlet, stored in a new 'Outlet' column.
df['Outlet'] = le.fit(df['Outlet_Identifier']).transform(df['Outlet_Identifier'])

# Replace each listed categorical column with its integer codes.
cat_col = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'New_Item_Type',
]
for col in cat_col:
    df[col] = le.fit(df[col]).transform(df[col])

In [None]:
# Preview the first rows of the frame.
df.head()

## One Hot Encoding

In [None]:
# Expand the listed columns into one indicator column per distinct value.
# The source columns are dropped by get_dummies, so this cell cannot be re-run
# on its own output.
df = pd.get_dummies(
    data=df,
    columns=[
        'Item_Fat_Content',
        'Outlet_Size',
        'Outlet_Location_Type',
        'New_Item_Type',
        'Outlet_Type',
    ],
)
df.head()

## Input Split

In [None]:
# Target: the Item_Outlet_Sales column.
y = df['Item_Outlet_Sales']
# Features: every column except the target, the raw identifiers and the
# establishment year.
X = df.drop(
    ['Outlet_Establishment_Year', 'Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'],
    axis=1,
)

## Model Training

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_squared_error

def train(model, X, y, cv=5):
    """Fit ``model`` on all of ``X``/``y`` and print in-sample and cross-validated MSE.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on the full data before the scores are computed.
    X : feature matrix passed to the estimator.
    y : target values passed to the estimator.
    cv : int, default 5
        Number of folds handed to ``cross_val_score``.

    Returns
    -------
    None. The report is printed.

    Notes
    -----
    The "MSE" line is measured on the same rows the model was trained on, so it
    is an in-sample figure. "CV Score" is the mean squared error averaged over
    the cross-validation folds.
    """
    model.fit(X, y)
    pred = model.predict(X)

    # The scorer returns negated MSE values; report the magnitude of their mean.
    fold_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv)
    cv_score = np.abs(np.mean(fold_scores))

    print("Model Report")
    print("MSE:", mean_squared_error(y, pred))
    print("CV Score", cv_score)

In [None]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
# Ordinary least squares baseline, followed by a bar chart of its sorted coefficients.
# The `normalize` argument is not passed: scikit-learn 1.2 removed it, so
# LinearRegression(normalize=True) raises TypeError there. For an unpenalised
# least-squares fit, rescaling the features does not change the predictions.
model = LinearRegression()
train(model, X, y)
coef = pd.Series(model.coef_,X.columns).sort_values()
coef.plot(kind='bar',title='Model Coefficients')

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Ridge regression on scaled features, followed by a bar chart of its coefficients.
# Ridge(normalize=True) raises TypeError on scikit-learn 1.2+, where the argument
# was removed. The substitution given in scikit-learn's deprecation notice is a
# pipeline that scales to unit variance, with alpha multiplied by the number of
# training rows.
# NOTE(review): alpha is fixed from len(X), so inside the cross-validation folds
# (which train on fewer rows) the penalty is slightly stronger than before.
model = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=1.0 * len(X)))
train(model, X, y)

# The Ridge step sees scaled features; divide by the scaler's per-column scale so
# the plotted coefficients are expressed in the original feature units.
ridge = model.named_steps['ridge']
scaler = model.named_steps['standardscaler']
coef = pd.Series(ridge.coef_ / scaler.scale_, X.columns).sort_values()
coef.plot(kind='bar',title='Model Coefficients')

In [None]:
# Lasso regression with default settings, then a bar chart of its sorted coefficients.
model = Lasso()
train(model, X, y)
coef = pd.Series(data=model.coef_, index=X.columns).sort_values()
coef.plot.bar(title='Model Coefficients')

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree regressor, followed by a bar chart of its feature importances.
# A fixed random_state makes the fitted tree, and the scores printed by train(),
# repeatable from one run of the notebook to the next.
model = DecisionTreeRegressor(random_state=42)
train(model, X, y)
coef = pd.Series(model.feature_importances_,X.columns).sort_values()
coef.plot(kind='bar',title='Feature Importance')

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor, followed by a bar chart of its feature importances.
# A fixed random_state pins the bootstrap samples and feature draws so the
# printed scores are repeatable from one run of the notebook to the next.
model = RandomForestRegressor(random_state=42)
train(model, X, y)
coef = pd.Series(model.feature_importances_,X.columns).sort_values()
coef.plot(kind='bar',title='Feature Importance')

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# Extra-trees regressor, followed by a bar chart of its feature importances.
# A fixed random_state pins the randomised splits so the printed scores are
# repeatable from one run of the notebook to the next.
model = ExtraTreesRegressor(random_state=42)
train(model, X, y)
coef = pd.Series(model.feature_importances_,X.columns).sort_values()
coef.plot(kind='bar',title='Feature Importance')