In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import r2_score, mean_squared_error, classification_report
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import OneHotEncoder
from scipy.stats import skew

from sklearn.pipeline import make_pipeline

#### Data Preparation and Data Cleaning

In [2]:
sales_df = pd.read_csv('Supermart Grocery Sales - Retail Analytics Dataset.csv')

In [3]:
len(sales_df)

9994

In [4]:
sales_df.head()

Unnamed: 0,Order ID,Customer Name,Category,Sub Category,City,Order Date,Region,Sales,Discount,Profit,State
0,OD1,Harish,Oil & Masala,Masalas,Vellore,11-08-2017,North,1254,0.12,401.28,Tamil Nadu
1,OD2,Sudha,Beverages,Health Drinks,Krishnagiri,11-08-2017,South,749,0.18,149.8,Tamil Nadu
2,OD3,Hussain,Food Grains,Atta & Flour,Perambalur,06-12-2017,West,2360,0.21,165.2,Tamil Nadu
3,OD4,Jackson,Fruits & Veggies,Fresh Vegetables,Dharmapuri,10-11-2016,South,896,0.25,89.6,Tamil Nadu
4,OD5,Ridhesh,Food Grains,Organic Staples,Ooty,10-11-2016,South,2355,0.26,918.45,Tamil Nadu


In [26]:
y_log= np.log1p(sales_df['Profit'])

#### Linear Regression

In [100]:
# predict profit (feature as sales, and profit as y)
#X =sales_df[['Sales','Discount']]
#y = y_log

categorical = ["Customer Name","Sub Category","City"]
numerical = ["Sales","Discount"]

train_dict = sales_df[categorical + numerical].to_dict(orient="records")
dv = DictVectorizer(sparse= False)
X = dv.fit_transform(train_dict)
y = y_log

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state = 42)

In [96]:
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [97]:
y_pred_log = model.predict(X_test)

In [98]:
print("RSME", mean_squared_error(y_test, y_pred_log))
print("r2", r2_score(y_test, y_pred_log))

RSME 0.37695550274346434
r2 0.32485804317563827


In [116]:
rf = RandomForestRegressor(n_estimators=50,      # fewer trees
                             max_depth=20,          # limit depth
                             max_features='sqrt',   # fewer features per split
                             random_state=42)

In [117]:
# train = sales_df[["Sales"]]
# target = sales_df["Profit"]

In [120]:
clf.fit(X_train, y_train)

0,1,2
,n_estimators,50
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [122]:
prediction=rf.predict(X_test)

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [76]:
print("R² Score:", r2_score(y_train, prediction))
print("RMSE:", np.sqrt(mean_squared_error(y_train,prediction)))

R² Score: 0.5312551736869073
RMSE: 0.5024890413860865


In [77]:
# categorical = ["Customer Name","Sub Category","City"]
# numerical = ["Sales","Discount"]

In [87]:
pipeline = make_pipeline(
    DictVectorizer(),
    RandomForestRegressor()
)

In [91]:
train_dict = sales_df[categorical + numerical].to_dict(orient='records')
pipeline.fit(train_dict, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [9994, 7995]

In [79]:
with open('model.bin', 'wb') as f_out:
    pickle.dump(pipeline, f_out)

In [80]:
!ls -lh

total 8.0M
-rw-rw-rw- 1 codespace codespace   78 Nov 18 06:45  README.md
-rw-rw-rw- 1 codespace codespace 885K Nov 18 06:44 'Supermart Grocery Sales - Retail Analytics Dataset.csv'
-rw-rw-rw- 1 codespace codespace 6.8M Nov 18 09:18  model.bin
-rw-rw-rw- 1 codespace codespace 265K Nov 18 09:18  notebook.ipynb


In [123]:
with open('model.bin', 'rb') as f_in:
    pipeline = pickle.load(f_in)

0,1,2
,n_estimators,50
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True
