In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import r2_score, mean_squared_error, classification_report
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import OneHotEncoder
from scipy.stats import skew

from sklearn.pipeline import make_pipeline
import pickle

#### Data Preparation and Data Cleaning

In [2]:
sales_df = pd.read_csv('Supermart Grocery Sales - Retail Analytics Dataset.csv')

In [3]:
len(sales_df)

9994

In [4]:
sales_df.head()

Unnamed: 0,Order ID,Customer Name,Category,Sub Category,City,Order Date,Region,Sales,Discount,Profit,State
0,OD1,Harish,Oil & Masala,Masalas,Vellore,11-08-2017,North,1254,0.12,401.28,Tamil Nadu
1,OD2,Sudha,Beverages,Health Drinks,Krishnagiri,11-08-2017,South,749,0.18,149.8,Tamil Nadu
2,OD3,Hussain,Food Grains,Atta & Flour,Perambalur,06-12-2017,West,2360,0.21,165.2,Tamil Nadu
3,OD4,Jackson,Fruits & Veggies,Fresh Vegetables,Dharmapuri,10-11-2016,South,896,0.25,89.6,Tamil Nadu
4,OD5,Ridhesh,Food Grains,Organic Staples,Ooty,10-11-2016,South,2355,0.26,918.45,Tamil Nadu


In [5]:
sales_df.dtypes

Order ID          object
Customer Name     object
Category          object
Sub Category      object
City              object
Order Date        object
Region            object
Sales              int64
Discount         float64
Profit           float64
State             object
dtype: object

In [6]:
sales_df['Profit'] = sales_df['Profit'].astype('int')
sales_df['Discount'] = sales_df['Discount'].astype('int')
sales_df['Profit'].dtypes

dtype('int64')

In [7]:
sales_df['Discount'].dtypes

dtype('int64')

In [8]:
y_log= np.log1p(sales_df['Profit'].values)

#### Linear Regression

In [34]:
categorical = ["Customer Name","Sub Category","City"]
numerical = ["Sales","Discount"]

train_dict = sales_df[categorical + numerical].to_dict(orient="records")
y = y_log

In [35]:
train_dict, X_test, y_train, y_test = train_test_split(train_dict,y, test_size=0.2, random_state = 42)

In [36]:
dv = DictVectorizer(sparse= False)

X_train = dv.fit_transform(train_dict)
X_test = dv.transform(X_test)

In [37]:
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [24]:
y_pred_log = model.predict(X_test)

In [25]:
print("RSME", mean_squared_error(y_test, y_pred_log))
print("r2", r2_score(y_test, y_pred_log))

RSME 0.3771059581501356
r2 0.32458857169442745


#### random forest regressor

In [26]:
rf = RandomForestRegressor(n_estimators=30,      # fewer trees
                             max_depth=10,  
                              max_features="sqrt",
                               random_state=42)

In [27]:
rf.fit(X_train, y_train)

0,1,2
,n_estimators,30
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
prediction=rf.predict(X_test)

In [29]:
print("R² Score:", r2_score(y_test, prediction))
print("RMSE:", np.sqrt(mean_squared_error(y_test,prediction)))

R² Score: 0.23755048609435137
RMSE: 0.652458721239965


In [None]:
# categorical = ["Customer Name","Sub Category","City"]
# numerical = ["Sales","Discount"]

In [30]:
pipeline = make_pipeline(
    DictVectorizer(),
    RandomForestRegressor()
)

In [48]:
pipeline.fit(train_dict, y_train)

0,1,2
,steps,"[('dictvectorizer', ...), ('randomforestregressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,True
,sort,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [49]:
new_item = [{
    "Category": "Fruit",
    "Store": "Downtown",
    "Discount": 0.1,
    "Sales": 200,
    "Quantity": 5
}]

In [50]:
pipeline.predict(new_item)

array([4.68582909])

In [51]:
with open('model.bin', 'wb') as f_out:
    pickle.dump(pipeline, f_out)

In [52]:
with open('model.bin', 'rb') as f_in:
    pipeline = pickle.load(f_in)

In [53]:
!ls -lh

total 70M
-rw-rw-rw- 1 codespace codespace  601 Nov 18 09:36  README.md
-rw-rw-rw- 1 codespace codespace 885K Nov 18 06:44 'Supermart Grocery Sales - Retail Analytics Dataset.csv'
-rw-rw-rw- 1 codespace codespace  69M Nov 18 12:38  model.bin
-rw-rw-rw- 1 codespace codespace 114K Nov 18 10:13  model.ipynb
-rw-rw-rw- 1 codespace codespace  92K Nov 18 12:37  notebook.ipynb
-rw-rw-rw- 1 codespace codespace 204K Nov 18 10:12 'preprocessing and EDA.ipynb'
