In [None]:
import os
import time

import joblib
import pandas as pd

In [2]:
# Read the three raw CSV inputs: the order history (train.csv), the meal
# lookup table and the fulfilment-center lookup table.
df = pd.read_csv('DATA/train.csv')
meals = pd.read_csv('DATA/meal_info.csv')
centers = pd.read_csv('DATA/fulfilment_center_info.csv')

#### Understand the dataset

In [3]:
# Preview the first five rows of the fulfilment-center table.
centers.head(n=5)

Unnamed: 0,center_id,city_code,region_code,center_type,op_area
0,11,679,56,TYPE_A,3.7
1,13,590,56,TYPE_B,6.7
2,124,590,56,TYPE_C,4.0
3,66,648,34,TYPE_A,4.1
4,94,632,34,TYPE_C,3.6


In [4]:
# Preview the first five rows of the meal table.
meals.head(n=5)

Unnamed: 0,meal_id,category,cuisine
0,1885,Beverages,Thai
1,1993,Beverages,Thai
2,2539,Beverages,Thai
3,1248,Beverages,Indian
4,2631,Beverages,Indian


In [5]:
# Preview the first five rows of the raw order history.
df.head(n=5)

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders
0,1379560,1,55,1885,136.83,152.29,0,0,177
1,1466964,1,55,1993,136.83,135.83,0,0,270
2,1346989,1,55,2539,134.86,135.86,0,0,189
3,1338232,1,55,2139,339.5,437.53,0,0,54
4,1448490,1,55,2631,243.5,242.5,0,0,40


In [6]:
# List the distinct center types present in the center table.
centers.center_type.unique()

array(['TYPE_A', 'TYPE_B', 'TYPE_C'], dtype=object)

In [7]:
# List the distinct meal categories.
meals['category'].unique()

array(['Beverages', 'Extras', 'Soup', 'Other Snacks', 'Salad',
       'Rice Bowl', 'Starters', 'Sandwich', 'Pasta', 'Desert', 'Biryani',
       'Pizza', 'Fish', 'Seafood'], dtype=object)

In [8]:
# List the distinct cuisines.
meals['cuisine'].unique()

array(['Thai', 'Indian', 'Italian', 'Continental'], dtype=object)

In [9]:
# Number of distinct weeks covered by the order history (145 in this dataset).
# dropna=False keeps the count identical to len(unique()), which also counts NaN.
df['week'].nunique(dropna=False)

145

### merge the data set

In [10]:
# Attach the center attributes (joined on center_id) and then the meal
# attributes (joined on meal_id) to every order row; both joins use the
# default inner join.
df = df.merge(centers, on='center_id').merge(meals, on='meal_id')

In [11]:
# Preview the merged frame: order columns plus center and meal attributes.
df.head(n=5)

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,category,cuisine
0,1379560,1,55,1885,136.83,152.29,0,0,177,647,56,TYPE_C,2.0,Beverages,Thai
1,1018704,2,55,1885,135.83,152.29,0,0,323,647,56,TYPE_C,2.0,Beverages,Thai
2,1196273,3,55,1885,132.92,133.92,0,0,96,647,56,TYPE_C,2.0,Beverages,Thai
3,1116527,4,55,1885,135.86,134.86,0,0,163,647,56,TYPE_C,2.0,Beverages,Thai
4,1343872,5,55,1885,146.5,147.5,0,0,215,647,56,TYPE_C,2.0,Beverages,Thai


In [12]:
# (rows, columns) of the merged frame.
df.shape

(456548, 15)

In [13]:
# Number of distinct fulfilment centers that appear in the merged data.
df['center_id'].nunique(dropna=False)

77

In [14]:
# Number of distinct meals that appear in the merged data.
df['meal_id'].nunique(dropna=False)

51

In [15]:
# Count the rows whose cuisine is Thai by summing the boolean mask.
int((df['cuisine'] == 'Thai').sum())

118216

In [16]:
# Row count per (cuisine, category) pair, listed cuisine by cuisine with the
# most frequent category first.
(
    df.groupby(['cuisine', 'category'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['cuisine', 'count'], ascending=[True, False])
)

Unnamed: 0,cuisine,category,count
2,Continental,Pizza,33138
0,Continental,Beverages,32554
3,Continental,Seafood,26916
1,Continental,Fish,10187
7,Indian,Rice Bowl,33408
4,Indian,Beverages,29296
6,Indian,Desert,29294
5,Indian,Biryani,20614
8,Italian,Beverages,33381
11,Italian,Sandwich,33291


#### This shows which meal category has the most order records within each cuisine (e.g. Rice Bowl is the most frequent category for Indian cuisine)

In [17]:
# Number of distinct meal_id values within each (cuisine, category) pair.
(
    df.groupby(['cuisine', 'category'])['meal_id']
    .nunique()
    .reset_index(name='unique_meal_count')
)


Unnamed: 0,cuisine,category,unique_meal_count
0,Continental,Beverages,3
1,Continental,Fish,3
2,Continental,Pizza,3
3,Continental,Seafood,3
4,Indian,Beverages,3
5,Indian,Biryani,3
6,Indian,Desert,3
7,Indian,Rice Bowl,3
8,Italian,Beverages,3
9,Italian,Pasta,3


##### Here we can see that every cuisine–category combination shown has 3 distinct meals

### DATA PREPROCESSING

In [18]:


# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

id                       0
week                     0
center_id                0
meal_id                  0
checkout_price           0
base_price               0
emailer_for_promotion    0
homepage_featured        0
num_orders               0
city_code                0
region_code              0
center_type              0
op_area                  0
category                 0
cuisine                  0
dtype: int64

In [19]:
# Print the column dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456548 entries, 0 to 456547
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     456548 non-null  int64  
 1   week                   456548 non-null  int64  
 2   center_id              456548 non-null  int64  
 3   meal_id                456548 non-null  int64  
 4   checkout_price         456548 non-null  float64
 5   base_price             456548 non-null  float64
 6   emailer_for_promotion  456548 non-null  int64  
 7   homepage_featured      456548 non-null  int64  
 8   num_orders             456548 non-null  int64  
 9   city_code              456548 non-null  int64  
 10  region_code            456548 non-null  int64  
 11  center_type            456548 non-null  object 
 12  op_area                456548 non-null  float64
 13  category               456548 non-null  object 
 14  cuisine                456548 non-nu

In [20]:
# Select rows that are exact duplicates of an earlier row; the empty result
# means the merged data contains no duplicated rows.
df.loc[df.duplicated()]

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,category,cuisine


In [21]:
# Persist the merged frame (before columns are dropped or encoded) without
# writing the row index.
df.to_csv(path_or_buf='DATA/preprocessed_data.csv', index=False)

### Feature Engineering

In [22]:
# Remove the identifier columns from df in place; they are not used as
# model features below.
df.drop(columns=['meal_id', 'id', 'center_id'], inplace=True)

In [23]:
# Preview the frame after the identifier columns were dropped.
df.head(n=5)

Unnamed: 0,week,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,category,cuisine
0,1,136.83,152.29,0,0,177,647,56,TYPE_C,2.0,Beverages,Thai
1,2,135.83,152.29,0,0,323,647,56,TYPE_C,2.0,Beverages,Thai
2,3,132.92,133.92,0,0,96,647,56,TYPE_C,2.0,Beverages,Thai
3,4,135.86,134.86,0,0,163,647,56,TYPE_C,2.0,Beverages,Thai
4,5,146.5,147.5,0,0,215,647,56,TYPE_C,2.0,Beverages,Thai


### Label Encoding

In [24]:
from sklearn.preprocessing import LabelEncoder

In [25]:
# Fit a separate LabelEncoder per categorical column and keep each fitted
# encoder, so the string -> integer mapping can be reused on new data or
# inverted later. Refitting one shared encoder (as before) left only the
# mapping of the last column available.
# The encoded values are unchanged; `le` still ends up bound to the
# center_type encoder, exactly as in the previous version.
encoders = {}
for col in ['category', 'cuisine', 'center_type']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [26]:
# Preview the frame after the categorical columns were integer-encoded.
df.head(n=5)

Unnamed: 0,week,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,category,cuisine
0,1,136.83,152.29,0,0,177,647,56,2,2.0,0,3
1,2,135.83,152.29,0,0,323,647,56,2,2.0,0,3
2,3,132.92,133.92,0,0,96,647,56,2,2.0,0,3
3,4,135.86,134.86,0,0,163,647,56,2,2.0,0,3
4,5,146.5,147.5,0,0,215,647,56,2,2.0,0,3


### Splitting train and test data

In [27]:
# Target: weekly order count. Features: every remaining column.
y = df['num_orders']
X = df.drop(columns=['num_orders'])

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): rows are weekly observations, so a random split mixes earlier
# and later weeks in both sets — confirm this is intended rather than a
# time-based hold-out.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### creating model

In [None]:
from sklearn.linear_model import LinearRegression
# sklearn.metrics has no `adjusted_r2` function; importing it raised
# ImportError, which left none of the names on this line defined.
# Adjusted R^2 must be derived from r2_score plus the sample and feature counts.
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
    r2_score,
)

### Linear Regression

In [31]:
# Ordinary least-squares baseline trained on the training split.
ln_model = LinearRegression()
ln_model.fit(X=x_train, y=y_train)

In [32]:
# Predict order counts for the held-out rows with the linear baseline.
pred = ln_model.predict(X=x_test)

In [33]:
# scikit-learn metrics take (y_true, y_pred) in that order. MAE and MSE are
# symmetric, so the printed values are unchanged, but the documented order
# matches the r2_score call used for the SVR model and stays correct if an
# asymmetric metric is added here later.
print('mean_absolute_error:', mean_absolute_error(y_test, pred))
print('mean_squared_error:', mean_squared_error(y_test, pred))


mean_absolute_error: 193.5930424295895
mean_squared_error: 115336.51517872344


### SVR 

In [34]:
from sklearn.svm import SVR

In [35]:
# An RBF-kernel SVR is not scale invariant, and the features here span very
# different magnitudes (e.g. city_code in the hundreds next to 0/1 promotion
# flags). Standardising inside a pipeline fixes that, and the scaler is then
# fitted on the training split only and saved together with the model.
# `svr` keeps the same fit/predict interface as the bare estimator.
# NOTE(review): kernel SVR fit time grows faster than quadratically with the
# number of samples; the training split has roughly 365k rows, so consider a
# subsample or a linear SVR if fitting does not finish.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svr = make_pipeline(StandardScaler(), SVR(kernel='rbf'))

In [None]:
# Train the SVR model on the training split.
svr.fit(X=x_train, y=y_train)

In [None]:
# Predict order counts for the held-out rows with the SVR model.
y_pred = svr.predict(X=x_test)


In [None]:
# scikit-learn has no adjusted-R^2 metric, so derive it from R^2:
#   adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
# where n is the number of evaluated rows and p the number of features.
n_samples, n_features = x_test.shape
r2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)

# Metrics are called as (y_true, y_pred), the order scikit-learn documents.
print('mean_absolute_error:', mean_absolute_error(y_test, y_pred))
print('mean_squared_error:', mean_squared_error(y_test, y_pred))
print('r2_score:', r2)
print('adjusted_r2:', adj_r2)

In [None]:
# joblib.dump takes the object to persist first and the target path second;
# the previous call passed only the path and raised TypeError.
os.makedirs('Model', exist_ok=True)  # dump fails if the folder is missing
joblib.dump(svr, 'Model/svm.pkl')