In [2]:
import pandas as pd
from jdatetime import datetime, date
from IPython.display import display
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
import arabic_reshaper
from bidi.algorithm import get_display
import scipy.stats as st

In [3]:
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error
from copy import deepcopy
class Evaluator:
    """Fit, store and score a collection of named models.

    Models live in ``self.models`` as a ``name -> model`` mapping. They can be
    registered directly (``add_or_change_module``) or fitted from prototypes on
    the stored training data (``fit_models``), and are scored on the stored
    test data by ``calculate_evaluators``.
    """

    def __init__(self, models:dict, test_data, test_label, train_data, train_label):
        """Store the model registry and the train/test data.

        Args:
            models: mapping of model name to model; each model must offer
                ``predict`` to be scored. The dict is stored as-is (not
                copied) and is updated in place when models are added.
            test_data: features passed to ``predict`` when scoring.
            test_label: true values the predictions are compared with.
            train_data: features passed to ``fit`` in ``fit_models``.
            train_label: either a mapping / DataFrame of
                ``label name -> label vector`` (one fit per entry), or a
                single label vector such as a pandas Series.
        """
        self.models = models
        self.test_data = test_data
        self.test_label = test_label
        self.train_label = train_label
        self.train_data = train_data

    def calculate_evaluators(self):
        """Score every stored model on the test data.

        Returns:
            A DataFrame with one row per model name and the columns
            ``'r2'`` and ``'RMSE'``.
        """
        result = dict()
        for name, model in self.models.items():
            predicted = model.predict(self.test_data)
            result[name] = {
                'r2': r2_score(self.test_label, predicted),
                'RMSE': root_mean_squared_error(self.test_label, predicted),
            }

        return pd.DataFrame(result).transpose()

    def add_or_change_module(self, model:dict):
        """Register the given ``name -> model`` entries, replacing existing names."""
        self.models.update(model)

    def _named_label_sets(self, name):
        """Return ``(registry key, label vector)`` pairs for the model called ``name``.

        A mapping / DataFrame of labels produces one pair per entry, keyed
        ``'<name> <label name>'``. A single label vector (a pandas Series, or
        any object without ``items``) produces exactly one pair; iterating
        ``Series.items()`` would instead walk the individual rows.
        """
        labels = self.train_label
        if isinstance(labels, pd.Series) or not hasattr(labels, 'items'):
            label_name = getattr(labels, 'name', None)
            key = name if label_name is None else f'{name} {label_name}'
            return [(key, labels)]
        return [(f'{name} {method}', label) for method, label in labels.items()]

    def fit_models(self, models:dict):
        """Fit a fresh copy of every prototype model on each training label set.

        Args:
            models: mapping of model name to an unfitted prototype. The
                prototypes themselves are left untouched; a deep copy is
                fitted for every label set.

        The fitted copies are registered under the keys produced by
        ``_named_label_sets``.
        """
        for name, model in models.items():
            for key, label in self._named_label_sets(name):
                candidate = deepcopy(model)
                fitted = candidate.fit(self.train_data, label)
                # Keep whatever fit() returned, but fall back to the fitted
                # copy itself if fit() returned None.
                self.add_or_change_module({key: candidate if fitted is None else fitted})

In [4]:
from sklearn.model_selection import train_test_split

class DataSplitter:
    """Separate a frame into features/target and cut it into train, validation and test parts."""

    def __init__(self, df, target_var, random_state=4):
        """Keep every column except ``target_var`` as features and ``target_var`` as the target."""
        self.random_state = random_state
        self.X = df.drop(target_var, axis=1)
        self.y = df[target_var]

    def split(self, test_size=0.1, validation_size=0.1):
        """Split the stored data in two steps using the current random state.

        First ``test_size + validation_size`` of the rows are held out; that
        hold-out is then divided into a validation and a test part.

        Returns:
            A dict with the keys ``'X'`` and ``'y'``; each maps ``'train'``,
            ``'val'``, ``'test'`` and ``'cross_train'`` (train followed by
            val) to the corresponding part.
        """
        holdout_fraction = test_size + validation_size
        features_train, features_holdout, target_train, target_holdout = train_test_split(
            self.X,
            self.y,
            test_size=holdout_fraction,
            random_state=self.random_state,
        )

        # Share of the hold-out that becomes the test part.
        features_val, features_test, target_val, target_test = train_test_split(
            features_holdout,
            target_holdout,
            test_size=test_size / holdout_fraction,
            random_state=self.random_state,
        )

        feature_parts = {
            'train': features_train,
            'val': features_val,
            'test': features_test,
            'cross_train': pd.concat([features_train, features_val]),
        }
        target_parts = {
            'train': target_train,
            'val': target_val,
            'test': target_test,
            'cross_train': pd.concat([target_train, target_val]),
        }
        return {'X': feature_parts, 'y': target_parts}

    def get_random_state(self):
        """Return the random state used by ``split``."""
        return self.random_state

    def change_random_state(self):
        """Increase the random state by one and return the new value."""
        self.random_state = self.random_state + 1
        return self.random_state

In [21]:
# NOTE(review): absolute, machine-specific path; point this at a location
# relative to the project (or a configurable data directory) before sharing.
DATA_PATH = '/home/mvajhi/code/Introduction-to-Data-Science/final/out.csv'

df = pd.read_csv(DATA_PATH)
df = df.drop_duplicates()
# Drop every restaurant whose name contains 'حوزه'. na=True also drops rows with
# a missing name, which is what the earlier `== False` comparison did.
df = df[~df['RestaurantName'].str.contains('حوزه', na=True)]
display(df)
# DataFrame.info() prints its report and returns None, so wrapping it in
# display() only appended a stray "None" to the cell output.
df.info()

Unnamed: 0,DateReserve,RestaurantName,RestaurantType,Meal,FoodName,FoodType,Gender,ReceiveWithCard,ReceiveWithCode,DontReceive,Reservation,DayOfWeek,HolidayInWeekCount,HolidayInPrevWeekCount,HolidayInNextWeekCount,NextHoliday_1,NextHoliday_2,PreviousHoliday_1,PreviousHoliday_2,DateReserveGregorian
0,1402-10-10,ابوریحان,daneshgah,dinner,خوراک نودلیت,khorak,man,14,0,0,14,1,0,0,0,42,56,14,89,2023-12-31
1,1402-10-10,ابوریحان,daneshgah,dinner,خوراک نودلیت,khorak,woman,30,3,3,36,1,0,0,0,42,56,14,89,2023-12-31
2,1402-10-10,ابوریحان,daneshgah,dinner,چلوكباب كوبیده,berenji,man,139,0,0,139,1,0,0,0,42,56,14,89,2023-12-31
3,1402-10-10,ابوریحان,daneshgah,dinner,چلوكباب كوبیده,berenji,woman,68,17,4,89,1,0,0,0,42,56,14,89,2023-12-31
4,1402-10-10,ابوریحان,daneshgah,lunch,خوراک فلافل,khorak,man,46,0,0,46,1,0,0,0,42,56,14,89,2023-12-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15963,1402-12-09,کشاورزی,khabgah,dinner,خوراک سالاد ماكاروني,khorak,woman,73,7,8,88,4,1,0,0,20,21,3,17,2024-02-28
15964,1402-12-09,کوی,khabgah,dinner,خوراک كشك بادمجان,khorak,man,158,10,23,191,4,1,0,0,20,21,3,17,2024-02-28
15965,1402-12-09,کوی,khabgah,dinner,چلو خورش قیمه سیب زمینی,berenji,man,859,47,91,997,4,1,0,0,20,21,3,17,2024-02-28
15966,1402-12-09,کوی,khabgah,lunch,خوراک کوکو سبزی,khorak,man,80,7,10,97,4,1,0,0,20,21,3,17,2024-02-28


<class 'pandas.core.frame.DataFrame'>
Int64Index: 13062 entries, 0 to 15967
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   DateReserve             13062 non-null  object
 1   RestaurantName          13062 non-null  object
 2   RestaurantType          13062 non-null  object
 3   Meal                    13062 non-null  object
 4   FoodName                13062 non-null  object
 5   FoodType                13062 non-null  object
 6   Gender                  13062 non-null  object
 7   ReceiveWithCard         13062 non-null  int64 
 8   ReceiveWithCode         13062 non-null  int64 
 9   DontReceive             13062 non-null  int64 
 10  Reservation             13062 non-null  int64 
 11  DayOfWeek               13062 non-null  int64 
 12  HolidayInWeekCount      13062 non-null  int64 
 13  HolidayInPrevWeekCount  13062 non-null  int64 
 14  HolidayInNextWeekCount  13062 non-null  int64 
 15  Ne

None

# Feature Engineering and Selection

In [22]:
# Remove the two 'ReceiveWith…' count columns from the working frame.
df = df.drop(
    columns=['ReceiveWithCard', 'ReceiveWithCode'],
)

In [23]:
# Each listed column becomes a 0/1 flag: 1 where the cell equals the given
# value, 0 otherwise.
# NOTE: re-running this cell on the already encoded frame turns every flag into
# 0, because the integers no longer equal the original strings.
flag_values = {
    'Gender': 'man',
    'FoodType': 'khorak',
    'Meal': 'dinner',
    'RestaurantType': 'khabgah',
}
for column, positive in flag_values.items():
    df[column] = df[column].apply(lambda x, positive=positive: int(x == positive))
df[:3]

Unnamed: 0,DateReserve,RestaurantName,RestaurantType,Meal,FoodName,FoodType,Gender,DontReceive,Reservation,DayOfWeek,HolidayInWeekCount,HolidayInPrevWeekCount,HolidayInNextWeekCount,NextHoliday_1,NextHoliday_2,PreviousHoliday_1,PreviousHoliday_2,DateReserveGregorian
0,1402-10-10,ابوریحان,0,1,خوراک نودلیت,1,1,0,14,1,0,0,0,42,56,14,89,2023-12-31
1,1402-10-10,ابوریحان,0,1,خوراک نودلیت,1,0,3,36,1,0,0,0,42,56,14,89,2023-12-31
2,1402-10-10,ابوریحان,0,1,چلوكباب كوبیده,0,1,0,139,1,0,0,0,42,56,14,89,2023-12-31


In [12]:
# 'DontReceive' is the target; both date columns are left out of the features.
# NOTE(review): RestaurantName and FoodName still appear to hold text at this
# point — confirm they are encoded before fitting estimators that need numbers.
feature_frame = df.drop(['DateReserve', 'DateReserveGregorian'], axis=1)
data_splitter = DataSplitter(feature_frame, 'DontReceive')
splitted_data = data_splitter.split()

# Row counts of the train, validation and test parts, in that order.
for part in ('train', 'val', 'test'):
    print(len(splitted_data['X'][part]))

10449
1306
1307


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Evaluator.fit_models walks train_label.items() and expects
# (label name, label vector) pairs. A bare Series would yield (row index, value)
# pairs instead, so the training target is passed as a one-entry mapping.
evaluator_DT = Evaluator(
    {},
    splitted_data['X']['test'],
    splitted_data['y']['test'],
    splitted_data['X']['train'],
    {'DontReceive': splitted_data['y']['train']},
)

# A tuple keeps the fitting order, and with it the row order of the result
# table, identical on every run; a set of strings has no guaranteed order.
# NOTE(review): a classifier is scored here with r2/RMSE on a count target —
# confirm a classifier (rather than a regressor) is intended.
for criterion in ('gini', 'entropy'):
    evaluator_DT.fit_models({f'DT {criterion}': DecisionTreeClassifier(criterion=criterion, min_samples_split=10)})
evaluator_DT.calculate_evaluators()