In [1]:
# imports
import json
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import sklearn
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor 
from statistics import mean

In [2]:
city = 'Delhi'
month = 'Dec'
PATH = f'/media/root/data/analysis/data/Item_wise_data/{city}/'
t100 = time.time()

In [3]:
# exclude non baking items
df = pd.read_csv(PATH+f'{month}.csv')
df = df[pd.notna(df['Oven_Time'])]
df = df.sort_values(by =['Kitchen_Display_Time', 'Location_Code', 'Order_Number'])
# df

In [4]:
print(df.shape)
df = df.drop(['storedesc', 'city', 'Customer_Code', 'Customer_Name', 'Actual_Order_Date', 'Order_Status_Code', 'Order_Type_Code', 'Order_Saved', 'Order_Time', 'Driver_ID', 'Driver_Shift', 'Route_Time', 'Return_Time', 'Delayed_Order', 'Order_Taker_ID','Order_Taker_Shift', 'Closed_Order_Time', 'Customer_Address_Id', 'Original_Location_Code', 'Order_Id',], axis=1)
print(df.shape)
df['Kitchen_Display_Time'] = pd.to_datetime(df['Kitchen_Display_Time'])
df['Oven_Time'] = pd.to_datetime(df['Oven_Time'])
df['prep_time'] = df['Oven_Time'] - df['Kitchen_Display_Time']
df['prep_time'] = round(df['prep_time']/np.timedelta64(1, 's')).astype(int) + 7*60
print(df.shape)
df.head()

(1266282, 29)
(1266282, 9)
(1266282, 10)


Unnamed: 0,Location_Code,Order_Date,Order_Number,Kitchen_Display_Time,Oven_Time,Menu_Code,Item Description,Quantity,Delivery_Time,prep_time
5295,DPI63889,2021-12-17,1,2021-12-17 07:28:22.537,2021-12-17 07:28:50.773,PIZ0132,_PIZ55-Pizza Mania Paneer _On,1,2021-12-17 07:37:29.117,448
5297,DPI63889,2021-12-17,1,2021-12-17 07:28:22.537,2021-12-17 07:28:50.837,NVPARCEL,Chicken Parcel,1,2021-12-17 07:37:29.117,448
5298,DPI63889,2021-12-17,2,2021-12-17 07:33:46.873,2021-12-17 07:33:52.233,PIZ0134,Pizza Mania Tomato,1,2021-12-17 07:42:34.047,425
5302,DPI63889,2021-12-17,2,2021-12-17 07:33:46.873,2021-12-17 07:33:52.563,PIZ0117,_SV-Margherita,1,2021-12-17 07:42:34.047,426
5303,DPI63889,2021-12-17,3,2021-12-17 09:24:05.420,2021-12-17 09:24:14.843,PIZ5109,PM Chicken Sausage,1,2021-12-17 09:39:10.430,429


In [5]:
allMenuCodes = df['Menu_Code'].unique()
allStoreIds = df['Location_Code'].unique()
# allMenuCodes, allStoreIds

In [6]:
sampleItem=None
for index, item in df.iterrows():
    sampleItem = item
    break

In [7]:
sampleItem

Location_Code                                DPI63889
Order_Date                                 2021-12-17
Order_Number                                        1
Kitchen_Display_Time       2021-12-17 07:28:22.537000
Oven_Time                  2021-12-17 07:28:50.773000
Menu_Code                                     PIZ0132
Item Description        _PIZ55-Pizza Mania Paneer _On
Quantity                                            1
Delivery_Time                 2021-12-17 07:37:29.117
prep_time                                         448
Name: 5295, dtype: object

In [8]:
# Feature 1: store id
def getOneHotEncodingStore(storeId, stores=allStoreIds):
    oneHotEncodingList = []
    for i in range(0, len(stores)):
        if(stores[i] == storeId):
            oneHotEncodingList.append(1)
        else:
            oneHotEncodingList.append(0)
    return oneHotEncodingList

# returns the feature store id for an order
def getStoreId(item, isOneHotEncodingRequired=False):
    if isOneHotEncodingRequired:
        return getOneHotEncodingStore(item['Location_Code'], allStoreIds)
    else:
        return [item['Location_Code']]

In [9]:
print(getStoreId(sampleItem))
print(getStoreId(sampleItem, True))

['DPI63889']
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [10]:
# Feature 2: Item Type
def getOneHotEncodingItem(item, items=allMenuCodes, clubCategoryWise=False):
    if clubCategoryWise:
        newItemsList = []
        newItemsList = [i[0:3] for i in items]
        unique_items = list(dict.fromkeys(newItemsList))
        return getOneHotEncodingItem(item[0:3], unique_items, False)
    
    oneHotEncodingList=[]
    for i in range(0, len(items)):
        if (items[i] == item):
            oneHotEncodingList.append(1)
        else:
            oneHotEncodingList.append(0)
    return oneHotEncodingList

def getItemType(item, isOneHotEncodingRequired=False, clubCategoryWise=False):
    if (not isOneHotEncodingRequired):
        if clubCategoryWise:
            return [item['Menu_Code'][0:3]]
        return [item['Menu_Code']]
    else:
        return getOneHotEncodingItem(item['Menu_Code'], allMenuCodes, clubCategoryWise)

In [11]:
print(getItemType(sampleItem))
print(getItemType(sampleItem, True))
print(getItemType(sampleItem, clubCategoryWise=True))
print(getItemType(sampleItem, True, True))

['PIZ0132']
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['PIZ']
[1, 0, 0, 0, 0, 0, 0]


In [12]:
# Feature 3: TimeSlot of kitchen display time
total_slots = 24
def getTimeslot(dt, slots=total_slots):
	current = datetime(2011, 1, 1)
	t1 = current.time()
	t2 = dt.time()
	for i in range(0, slots):
		current = current + timedelta(minutes = 60*24/slots)
		if(current.time() > t2):
			return i
	return slots-1

def getOneHotEncodingTimeSlot(timeslot):
    oneHotEncodingList = []
    for i in range(0, 24):
        oneHotEncodingList.append(0)
    oneHotEncodingList[timeslot] = 1
    return oneHotEncodingList

def getTimeSlotOfKitchenDisplayTime(item, isOneHotEncodingRequired=False):
    timeslot = getTimeslot(item['Kitchen_Display_Time'], 24)
    if isOneHotEncodingRequired:
        return getOneHotEncodingTimeSlot(timeslot)
    else:
        return [timeslot]

In [13]:
def get_preparation_time(item):
    return item['prep_time']
# 	diff = item['Oven_Time'] - item['Kitchen_Display_Time']
# 	return diff.total_seconds() + 7*60

In [14]:
#DON"T RUN IT AGAIN AND AGAIN, it will take much more time
# Feature 4 : Count of all baking items of all orders received by the store in the last 20 minutes.

# Pre-preparation of data
df['pre_prep_time'] = 0

StoreWiseItems = {}
for index, item in df.iterrows():
    store = item['Location_Code']
    if store in StoreWiseItems:
        dtKitchenDisplay = item['Kitchen_Display_Time']
        lastTime = dtKitchenDisplay - timedelta(minutes=60)
        count = 0
        pt = 0
        for i in reversed(range(len(StoreWiseItems[store]))):
            if(StoreWiseItems[store][i]['Kitchen_Display_Time'] >= lastTime and StoreWiseItems[store][i]['Kitchen_Display_Time'] < dtKitchenDisplay):
                if(StoreWiseItems[store][i]['Menu_Code'] == item['Menu_Code']):
                    pt += get_preparation_time(item)
                    count+=1
            else:
                break
        if count != 0:
            df['pre_prep_time'][index] = pt/count
        else:
            df['pre_prep_time'][index] = 10*60
        StoreWiseItems[store].append(item)
    else:
        StoreWiseItems[store] = [item]
        df['pre_prep_time'][index] = 10*60

        
storeWiseCumulativeItems = {}
StoreWiseDFs = {}
for store in StoreWiseItems.keys():
    StoreWiseDFs[store] = pd.DataFrame.from_dict(StoreWiseItems[store])
    storeWiseCumulativeItems[store] = {}

# item count per slot
for store in StoreWiseItems.keys():
    for item in StoreWiseItems[store]:
        date = item['Order_Date']
        minuteSlot = getTimeslot(item['Kitchen_Display_Time'], 24*60)
        if date in storeWiseCumulativeItems[store]:
            storeWiseCumulativeItems[store][date][minuteSlot]+=1
        else:
            t = []
            for i in range(0, 24*60):
                t.append(0)
            storeWiseCumulativeItems[store][date] = t
            storeWiseCumulativeItems[store][date][minuteSlot] = 1
            
#prefix sum            
for store in storeWiseCumulativeItems.keys():
    for date in storeWiseCumulativeItems[store].keys():
        for i in range(1, 24*60):
            storeWiseCumulativeItems[store][date][i] += storeWiseCumulativeItems[store][date][i-1]
            
            
def countPastOrders(item, storeWiseCumulativeItems=storeWiseCumulativeItems, slotTime=30):
    minuteSlot = getTimeslot(item['Kitchen_Display_Time'], 24*60)
    total = storeWiseCumulativeItems[item['Location_Code']][item['Order_Date']][minuteSlot]
    prev = storeWiseCumulativeItems[item['Location_Code']][item['Order_Date']][max(minuteSlot-slotTime, 0)]
    return [(total-prev)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pre_prep_time'][index] = 10*60
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pre_prep_time'][index] = 10*60
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pre_prep_time'][index] = pt/count


In [15]:
print("time = StoreWiseItems and countPastOrders", time.time()-t100)

# print(countPastOrders(sampleItem))
# print(countPastOrders(sampleItem, slotTime=10))

d = datetime.strptime('2021-12-20 00:00:00', '%Y-%m-%d %H:%M:%S')
# storeWiseCumulativeItems[sampleItem['Location_Code']][d]

time = StoreWiseItems and countPastOrders 15606.38202881813


In [16]:
# type(StoreWiseItems[sampleItem['Location_Code']])

In [17]:
# Feature 5 : Avg prep time of the same item in the same store in last 60 minutes.

def getPreviousPrepTime(item, StoreWiseItems=StoreWiseItems, clubCategoryWise=False, timeslot = 60):
    dtKitchenDisplay = item['Kitchen_Display_Time']
    lastTime = dtKitchenDisplay - timedelta(minutes=timeslot)
    
    orderedItem = item['Menu_Code']
    prepTime = []
    if clubCategoryWise:
        orderedItem = orderedItem[0:3]
    d = StoreWiseDFs[item['Location_Code']]
    tdf = d[(d['Menu_Code'].str[0:len(orderedItem)]==orderedItem) \
             & (d['Kitchen_Display_Time'] >= lastTime) & (d['Kitchen_Display_Time'] <= dtKitchenDisplay)]

    if(len(tdf)>0):
        return [tdf['prep_time'].mean()]
    else:
        return [10*60]


# def getPreviousPrepTime(item, StoreWiseItems=StoreWiseItems, clubCategoryWise=False, timeslot = 60):
#     dtKitchenDisplay = item['Kitchen_Display_Time']
#     lastTime = dtKitchenDisplay - timedelta(minutes=timeslot)
    
#     orderedItem = item['Menu_Code']
#     prepTime = []
#     if clubCategoryWise:
#         orderedItem = orderedItem[0:3]
#     tdf = df[(df['Location_Code']==item['Location_Code']) & (df['Menu_Code'].str[0:len(orderedItem)]==orderedItem) \
#              & (df['Kitchen_Display_Time'] >= lastTime) & (df['Kitchen_Display_Time'] <= dtKitchenDisplay)]

#     if(len(tdf)>0):
#         return [tdf['prep_time'].mean()]
#     else:
#         return [10*60]

    
# Feature 6 : Length of the queue in the store at that time
def lengthQueue(item):
    dtKitchenDisplay = item['Kitchen_Display_Time']
    d = StoreWiseDFs[item['Location_Code']]
    tdf = d[(d['Oven_Time'] >= dtKitchenDisplay) & (d['Kitchen_Display_Time'] <= dtKitchenDisplay)]
    
#     prepTime = []
#     tdf = df[(df['Location_Code']==item['Location_Code']) & \
#              (df['Oven_Time'] >= dtKitchenDisplay) & (df['Kitchen_Display_Time'] <= dtKitchenDisplay)]
    return len(tdf.index)
#     for order in StoreWiseItems[item['Location_Code']]:
#         if(order['Kitchen_Display_Time'] >= lastTime and order['Kitchen_Display_Time'] <= dtKitchenDisplay):
#             if (order['Menu_Code'][0:len(orderedItem)] == orderedItem):
#                 prepTime.append(get_preparation_time(order))
#     if(len(prepTime) > 0):
#         return [mean(prepTime)]
#     else:
#         return [10*60] # POINT OF DISCUSSION, For now taking 10 minutes

In [18]:
getPreviousPrepTime(sampleItem, StoreWiseItems, False, 60)
lengthQueue(sampleItem)

2

In [19]:
item = sampleItem
orderedItem = item['Menu_Code']
orderedItem = orderedItem[0:3]
dtKitchenDisplay = item['Kitchen_Display_Time']
lastTime = dtKitchenDisplay - timedelta(minutes=60)
tdf = df[(df['Location_Code']==item['Location_Code']) & (df['Menu_Code'].str[0:len(orderedItem)]==orderedItem) \
         & (df['Kitchen_Display_Time'] >= lastTime) & (df['Kitchen_Display_Time'] <= dtKitchenDisplay)]
# print(item, len(tdf)) 
# if(len(tdf)>0):
#     print([round(tdf['prep_time'].mean())])
# tdf

In [20]:
# global variables

isThresholdForPreparationTime = True
thresholdForPreparationTime = 20 # minutes for preparation time (threshold)

clubItemsCategoryWise = False

splitMethod = 'random'
slotTimeForPastOrders = 20 # minutes

model = 'RF'
isOneHotEncodingRequired = True
isNormalizationRequired = True

print("time preprocess start", time.time()-t100)

time preprocess start 15607.80916929245


In [21]:
X_train, X_test, y_train, y_test = [], [], [], []
X = []
y = []
import time
t1 = t2 = t3 = t4 = t5 = t6 = t7 = 0
i = 0
for index, item in df.iterrows():
    i+=1
    if(i%5000==0): print(t1, t2, t3, t4, t5, t6, t7)
    if(isThresholdForPreparationTime and (get_preparation_time(item) > thresholdForPreparationTime*60)): #removing inputs 
        continue
        
    features = []
    t = time.time()
    features = features + getStoreId(item, isOneHotEncodingRequired)
    t1 += time.time() - t
    
    t = time.time()
    features = features + getItemType(item, isOneHotEncodingRequired=isOneHotEncodingRequired, clubCategoryWise=clubItemsCategoryWise)
    t2 += time.time() - t

    t = time.time()    
    features = features + getTimeSlotOfKitchenDisplayTime(item, isOneHotEncodingRequired=isOneHotEncodingRequired)
    t3 += time.time() - t

    t = time.time()
    features = features + countPastOrders(item, storeWiseCumulativeItems, slotTimeForPastOrders) 
    t4 += time.time() - t

    t = time.time()
    features = features + getPreviousPrepTime(item, clubCategoryWise=clubItemsCategoryWise, timeslot = 60)  
    t5 += time.time() - t

    t = time.time()        
    features = features + [lengthQueue(item)]
    t6 += time.time() - t
    
    t = time.time()
    X.append(features)
    y.append(get_preparation_time(item))
    t7 += time.time() - t

#     if((order['ORDER_DATE'] >='2021-12-20' and order['ORDER_DATE'] <= '2021-12-24') or (order['ORDER_DATE'] >='2021-12-27' and order['ORDER_DATE'] <= '2021-12-29')):
#         X_train.append(features)
#         y_train.append(420+round(get_preparation_time(order)))
#     elif(order['ORDER_DATE'] =='2021-12-30'):
#         X_test.append(features)
#         y_test.append(420+round(get_preparation_time(order)))
print(t1, t2, t3, t4, t5, t6, t7)
print("time preprocess Done", time.time()-t100)

0.2047264575958252 0.18013548851013184 0.24625158309936523 5.047973155975342 30.015671253204346 6.722490310668945 0.13335919380187988
0.41031360626220703 0.3626539707183838 0.5028293132781982 10.633415937423706 59.551711559295654 13.385382890701294 0.266495943069458
0.6163129806518555 0.545161247253418 0.7689633369445801 16.672192096710205 89.07452082633972 20.020427703857422 0.3977823257446289
0.8231809139251709 0.7284576892852783 1.0445342063903809 23.129173040390015 118.6014096736908 26.77162528038025 0.5323376655578613
1.028364658355713 0.9100174903869629 1.3269708156585693 30.011600732803345 148.04674100875854 33.497461795806885 0.6672699451446533
1.250720500946045 1.1078708171844482 1.6441833972930908 37.68711996078491 178.9095561504364 40.64654183387756 0.8075778484344482
1.4608335494995117 1.2946383953094482 1.951289176940918 45.4698326587677 209.24460172653198 47.497910022735596 0.9430341720581055
1.6711995601654053 1.48095703125 2.261143445968628 53.470489501953125 239.409550

13.607009887695312 11.975837469100952 18.881610870361328 461.0521876811981 1941.6840560436249 440.23459219932556 8.800320625305176
13.810053586959839 12.15424656867981 19.182921409606934 469.1006979942322 1970.8483452796936 446.6644971370697 8.928684949874878
14.01712703704834 12.335224628448486 19.49090814590454 477.43150091171265 2000.3337652683258 453.184161901474 9.057698726654053
14.22521162033081 12.518814325332642 19.809054851531982 485.96694111824036 2030.249239206314 459.8623597621918 9.203245162963867
14.430192232131958 12.699909210205078 20.125728130340576 494.71831798553467 2059.9007568359375 466.4427909851074 9.33591914176941
14.635522603988647 12.879386186599731 20.42499041557312 502.5843904018402 2090.1568927764893 473.00886058807373 9.465275526046753
14.842459201812744 13.059361457824707 20.651137113571167 506.5126895904541 2120.5132853984833 479.6577696800232 9.597611904144287
15.053351879119873 13.24380612373352 20.915096521377563 512.1157627105713 2150.2009682655334 

26.912302494049072 23.6758029460907 37.32870125770569 910.5248000621796 3849.39612698555 871.8753409385681 17.53817844390869
27.10466456413269 23.8441104888916 37.61041450500488 917.964955329895 3876.8965244293213 878.1067216396332 17.661381721496582
27.30060577392578 24.017521381378174 37.903383016586304 925.6040000915527 3905.01828789711 884.4667363166809 17.78761076927185
27.491300344467163 24.185433387756348 38.18977975845337 933.128411769867 3932.3486790657043 890.6289761066437 17.909756422042847
27.67689037322998 24.34944438934326 38.470142126083374 940.5429887771606 3959.052516222 896.6595833301544 18.02909779548645
27.85597538948059 24.506678581237793 38.73831820487976 947.7066748142242 3984.7837538719177 902.4825196266174 18.143840312957764
28.039466381072998 24.668047428131104 39.01762509346008 955.1247084140778 4011.099020957947 908.4424364566803 18.261507272720337
28.22249126434326 24.82986354827881 39.294084787368774 962.4360473155975 4037.045594215393 914.3768889904022 18

39.95866417884827 35.14659833908081 55.81741547584534 1366.0018119812012 5724.471216201782 1298.1310002803802 26.114062070846558
40.165008783340454 35.32779288291931 56.10893416404724 1373.268233537674 5754.02644443512 1304.8968768119812 26.250593423843384
40.373929023742676 35.51003408432007 56.40850901603699 1380.8274548053741 5783.70640039444 1311.7005875110626 26.386711835861206
40.58353805541992 35.69434857368469 56.717185735702515 1388.6926732063293 5813.733207702637 1318.5596134662628 26.5692777633667
40.79954409599304 35.88321375846863 57.03849983215332 1396.8455414772034 5844.110497951508 1325.591298341751 26.708234786987305
41.008278369903564 36.064560651779175 57.35075616836548 1405.1011202335358 5874.36544585228 1332.4477880001068 26.844594717025757
41.229917764663696 36.25830912590027 57.68701934814453 1413.846834897995 5905.6172747612 1339.5870959758759 26.985066890716553
41.44353747367859 36.44536352157593 58.019787073135376 1422.7708535194397 5936.27245092392 1346.58053

In [22]:
print("Done with the long run....")

Done with the long run....


In [23]:
bakeup_X1 = X
bakeup_y1 = y

In [24]:
if splitMethod == 'random':
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [25]:
def getRMSE(actual, predictions):
    mse = sklearn.metrics.mean_squared_error(actual, predictions)
    return math.sqrt(mse)

In [39]:
len(X)

36

In [None]:
# random forest
if (model=='RF'):
    print("Random Forest Model")
    regressor = RandomForestRegressor()
    if(isNormalizationRequired):
        scaler = StandardScaler()
        scaler.fit(X_train)

        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_train)
    print("RMSE score on train set: ", getRMSE(y_train, predictions))
        
    predictions = regressor.predict(X_test)
    print("RMSE score on test set: ", getRMSE(y_test, predictions))

print("time trainin done", time.time()-t100)

Random Forest Model


In [None]:
X_t = []
Y_t = []
for j in range(0, len(allStoreIds)):
    xx = []
    yy = []
    for i in range(0, len(X_test)):
        if(X_test[i][j] > 0):
            xx.append(X_test[i])
            yy.append(y_test[i])
    X_t.append(j)
    Y_t.append(getRMSE(yy, regressor.predict(xx)))
    
plt.bar(X_t, Y_t)
plt.xticks(X_t)
plt.xlabel("Store Id")
plt.ylabel("RMSE value of test set")
plt.title("Preparation time prediction")
plt.savefig(f'plots/{city}_{month}_1.png')
# plt.show()

In [None]:
X_t = []
Y_t = []
XXX = [*X_train, *X_test]
yyy = [*y_train, *y_test]
for j in range(0, len(allStoreIds)):
    xx = []
    yy = []
    for i in range(0, len(XXX)):
        if(XXX[i][j] > 0):
            xx.append(XXX[i])
            yy.append(yyy[i])
    X_t.append(j)
    Y_t.append(getRMSE(yy, regressor.predict(xx)))
    
plt.bar(X_t, Y_t)
plt.xticks(X_t)
plt.xlabel("Store Id")
plt.ylabel("RMSE value of test set")
plt.title("Preparation time prediction")
plt.savefig(f'plots/{city}_{month}_1.png')
# plt.show()

In [None]:
# Store id 0, 1, 6 and 14 are not good.

In [None]:
X_t = []
Y_t = []
XXX = [*X_test]
yyy = [*y_test]


for j in {0, 1, 6, 14}:
    xx = []
    yy = []
    for i in range(0, len(XXX)):
        if(XXX[i][j] > 0):
            xx.append(XXX[i])
            yy.append(yyy[i])
    X_t.append(j)
    a = regressor.predict(xx)
    dict = {}
    for i in range(0, len(xx)):
        k = 10*(int(abs(yy[i]-a[i]))/10)
        if k not in dict:
            dict[k] = 0
        dict[k] += 1
    x1 = []
    y1 = []
    keys = sorted(dict.keys())
    print(f"Maximum key is for store-{j} is : "+str(keys[len(keys)-1]))
    for key in keys[0:15]:
        x1.append(key)
        y1.append(dict[key])
    plt.plot(x1, y1, label=f'store-{j}')

plt.legend()
plt.xlabel("Difference of predicted and real time")
plt.ylabel("Quantity of orders")
plt.title("Discrepency in prediction model")

In [None]:
X_Axis = [i for i in range(10, 24)]
RMSETimeSlotWise = {}
total_items = len(allMenuCodes)+len(allStoreIds)
for x in X_Axis:
    x1 = []
    y1 = []
    for i in range(0, len(X_test)):
        if(X_test[i][total_items+x] > 0):
            x1.append(X_test[i])
            y1.append(y_test[i])
    RMSETimeSlotWise[x] = getRMSE(y1, regressor.predict(x1))
    
X_axis = list(RMSETimeSlotWise.keys())
Y_axis = list(RMSETimeSlotWise.values())

plt.bar(X_axis, Y_axis)
plt.xticks(X_axis, rotation='vertical')
plt.xlabel("Time slot")
plt.ylabel("RMSE value of test set")
plt.title("Preparation time prediction over different time slots")
plt.savefig(f'plots/{city}_{month}_2.png')
# plt.show()

In [None]:
Y_t_all = Y_t
# Y_t_all

In [None]:
bakeup_X = X
bakeup_y = y

In [None]:
n = len(allStoreIds)
store_wise_X = [[] for _ in range(n)]
store_wise_y = [[] for _ in range(n)]

for j in range(n):
    for i in range(len(X)):
        if X[i][j] > 0:
            store_wise_X[j].append(X[i][n:])
            store_wise_y[j].append(y[i])

# store_wise_X = [[x for x in X] for j in range(n)]
# store_wise_X = [x for j in range(n) for _,y in zip(X,y) if x[j]>0]

In [None]:
myModels = {}
X_testModels = {}
RMSETimeSlotWise = {}

In [None]:
l = []
cnt=0
for i in range(n):
    X = store_wise_X[i]
    y = store_wise_y[i]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
    
#     print("Random Forest Model", i, len(y_train), len(y_test))
    regressor = RandomForestRegressor()
    if(isNormalizationRequired):
        scaler = StandardScaler()
        scaler.fit(X_train)

        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    
    regressor.fit(X_train, y_train)
    myModels[i] = regressor
    predictions = regressor.predict(X_train)
    a = getRMSE(y_train, predictions)
#     print("RMSE score on train set: ", a)
    
    predictions = regressor.predict(X_test)
    b = getRMSE(y_test, predictions)
    if(b > 120):
        continue
    cnt+=1
#     print("RMSE score on test set: ", b)
    l.append((a, b, len(y_train), len(y_test)))
# l

In [None]:
l = []
cnt=0
RMSETimeSlotWise = {}
for i in range(n):
    X = store_wise_X[i]
    y = store_wise_y[i]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
    
#     print("Random Forest Model", i, len(y_train), len(y_test))
    regressor = RandomForestRegressor()
    if(isNormalizationRequired):
        scaler = StandardScaler()
        scaler.fit(X_train)

        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    
    regressor.fit(X_train, y_train)
    myModels[i] = regressor
    predictions = regressor.predict(X_train)
    a = getRMSE(y_train, predictions)
#     print("RMSE score on train set: ", a)
    
    predictions = regressor.predict(X_test)
    b = getRMSE(y_test, predictions)
    if(b > 120):
        continue
    cnt+=1
#     print("RMSE score on test set: ", b)
    l.append((a, b, len(y_train), len(y_test)))
# l

In [None]:
len(invalidStores)

In [None]:
X_t = [i for i in range(cnt)]
Y_t = [x[1] for x in l]
Y_t, X_t
plt.bar(X_t, Y_t)
plt.xticks(X_t)
plt.xlabel("Store Id")
plt.ylabel("RMSE value of test set")
plt.title("Preparation time prediction")
plt.savefig(f'plots/{city}_{month}_3.png')
# plt.show()
# Y_t

In [None]:
l2 = [(a**2)*b for _,a,_,b in l]
l3 = [b for _,_,_,b in l]
combine_rmse = (sum(l2)/sum(l3))**(1/2)
print("restwise rmse:", combine_rmse)

In [None]:
y = []
for i in X_t:
    y.append((Y_t[i], Y_t_all[i]))
#     print(i, Y_t[i], Y_t_all[i])

In [None]:
# plt.plot(X_t, Y_t, label = 'Single Model Prediction')
# plt.plot(X_t, Y_t_all, label = 'Restaurent Wise Prediction')

plt.plot(X_t, y)
plt.legend(['Restaurent Model Prediction', 'Single Wise Prediction'])

plt.xticks(X_t, rotation='vertical')
plt.xlabel("Store Id")
plt.ylabel("RMSE value of test set")
plt.title("Preparation time prediction")
plt.savefig(f'plots/{city}_{month}_4.png')
# plt.show()

In [None]:
# dumping to file
import numpy as np
np.savetxt(f'{city}_Dec_X.txt', bakeup_X)
np.savetxt(f'{city}_Dec_y.txt', bakeup_y)