In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
job_start_time = time.time()
df_train = pd.read_csv('../../dataset/meal_service/train.csv')
df_test = pd.read_csv('../../dataset/meal_service/test.csv')
df_all = pd.concat([df_train, df_test])
NUM_VECTORS = 40

In [3]:
def get_is_holiday_lists(df):
    # pre-processing    
    is_yesterday_holiday_list = []
    is_tomorrow_holiday_list = []

    for i in range (0, len(df)):
        if df.iloc[i]['요일'] == '월':
            is_yesterday_holiday_list.append(1)
            if i < len(df)-1:
                is_tomorrow_holiday_list.append(0) if df.iloc[i+1]['요일'] == '화' else is_tomorrow_holiday_list.append(1)
            else:
                is_tomorrow_holiday_list.append(0)
        elif df.iloc[i]['요일'] == '화':
            if i < len(df)-1:
                is_tomorrow_holiday_list.append(0) if df.iloc[i+1]['요일'] == '수' else is_tomorrow_holiday_list.append(1)
            else:
                is_tomorrow_holiday_list.append(0)
            if i > 0:
                is_yesterday_holiday_list.append(0) if df.iloc[i-1]['요일'] == '월' else is_yesterday_holiday_list.append(1)
            else:
                is_yesterday_holiday_list.append(0)
        elif df.iloc[i]['요일'] == '수':
            if i < len(df)-1:
                is_tomorrow_holiday_list.append(0) if df.iloc[i+1]['요일'] == '목' else is_tomorrow_holiday_list.append(1)
            else:
                is_tomorrow_holiday_list.append(0)
            if i > 0:
                is_yesterday_holiday_list.append(0) if df.iloc[i-1]['요일'] == '화' else is_yesterday_holiday_list.append(1)
            else:
                is_yesterday_holiday_list.append(0)
        elif df.iloc[i]['요일'] == '목':
            if i < len(df)-1:
                is_tomorrow_holiday_list.append(0) if df.iloc[i+1]['요일'] == '금' else is_tomorrow_holiday_list.append(1)
            else:
                is_tomorrow_holiday_list.append(0)
            if i > 0:
                is_yesterday_holiday_list.append(0) if df.iloc[i-1]['요일'] == '수' else is_yesterday_holiday_list.append(1)
            else:
                is_yesterday_holiday_list.append(0)
        elif df.iloc[i]['요일'] == '금':
            is_tomorrow_holiday_list.append(1)
            if i > 0:
                is_yesterday_holiday_list.append(0) if df.iloc[i-1]['요일'] == '목' else is_yesterday_holiday_list.append(1)
            else:
                is_yesterday_holiday_list.append(0)
                
    return is_yesterday_holiday_list, is_tomorrow_holiday_list

In [4]:
import arrow

def get_is_corona_list(df):
    corona_list = []
    corona_start = arrow.get('2020-03-01')
    for i in range(0, len(df)):
        date = arrow.get(df.iloc[i]['일자'])
        if date >= corona_start:
            corona_list.append(1)
        else:
            corona_list.append(0)
    
    return corona_list

In [5]:
def is_last_wed_of_month(date):
    curr = arrow.get(date)
    if curr.weekday() != 2:
        # not wednesday
        return False
    else:
        curr_month = curr.month
        if curr_month != curr.shift(days=7).month:
            # last wednesday of the month
            return True
        else:
            return False
        
def get_is_last_wed_list(df):
    is_last_wed_list = []
    for i in range(0, len(df)):
        date = df.iloc[i]['일자']
        if is_last_wed_of_month(date):
            is_last_wed_list.append(1)
        else:
            is_last_wed_list.append(0)
    
    return is_last_wed_list

In [6]:
def get_month_list(df):
    month_list = []
    for i in range(0, len(df)):
        month = arrow.get(df.iloc[i]['일자']).month
        month_list.append(month)
    
    return month_list

In [7]:
def get_day_list(df):
    day_list = []
    for i in range(0, len(df)):
        day = arrow.get(df.iloc[i]['일자']).day
        day_list.append(day)
    
    return day_list

In [8]:
def seperate_and_processing_menu_str(menu_row):
    # split by space
    splits = menu_row.split(' ')
    menu = []
    is_new_menu = False
    for dish in splits:
        if len(dish) > 1:
            # find (New)
            if '(New)' in dish:
                menu.append(dish.split('(New)')[0] + dish.split('(New)')[1])
                is_new_menu = True
            elif '(' not in dish and ')' not in dish:
                menu.append(dish)
    if len(menu) <= 3:
        # no menu today
        menu = ['-']
    
    return menu, is_new_menu

In [9]:
def get_menu_info(df):
    is_new_lunch_menu = []
    is_new_dinner_menu = []
    lunch_menus = []
    dinner_menus = []
    for i in range(0, len(df)):
        row_lunch = df.iloc[i]['중식메뉴']
        row_dinner = df.iloc[i]['석식메뉴']
        
        lunch_menu, new_lunch_menu = seperate_and_processing_menu_str(row_lunch)
        is_new_lunch_menu.append(1) if new_lunch_menu else is_new_lunch_menu.append(0)
        
        dinner_menu, new_dinner_menu = seperate_and_processing_menu_str(row_dinner)
        is_new_dinner_menu.append(1) if new_dinner_menu else is_new_dinner_menu.append(0)

        lunch_menus.append(lunch_menu)
        dinner_menus.append(dinner_menu)
    
    return is_new_lunch_menu, is_new_dinner_menu, lunch_menus, dinner_menus

In [10]:
def get_is_no_dinner_list(df):
    is_no_dinner_list = []
    for i in range(0, len(df)):
        is_no_dinner_list.append(1) if len(df.iloc[i]['석식메뉴']) <= 20 else is_no_dinner_list.append(0)
    
    return is_no_dinner_list

In [11]:
def menu_to_vectors(model, menus):
    soups = []
    mains = []
    sides1 = []
    sides2 = []
    for menu in menus:
        if len(menu) > 4:
            soup = menu[1]
            main = menu[2]
            side1 = menu[3]
            side2 = menu[4]
            
            vector1 = model.wv.get_vector(soup)
            vector2 = model.wv.get_vector(main)
            vector3 = model.wv.get_vector(side1)
            vector4 = model.wv.get_vector(side2)
            
            soups.append(vector1)
            mains.append(vector2)
            sides1.append(vector3)
            sides2.append(vector4)
        else:
            zeros1 = np.zeros(NUM_VECTORS)
            zeros2 = np.zeros(NUM_VECTORS)
            zeros3 = np.zeros(NUM_VECTORS)
            zeros4 = np.zeros(NUM_VECTORS)

            soups.append(zeros1)
            mains.append(zeros2)
            sides1.append(zeros3)
            sides2.append(zeros4)

    
    return soups, mains, sides1, sides2

In [12]:
def get_menus(df):
    menus = []
    for i in range(0, len(df)):
        row_lunch = df.iloc[i]['중식메뉴']
        row_dinner = df.iloc[i]['석식메뉴']
        
        lunch_menu, new_lunch_menu = seperate_and_processing_menu_str(row_lunch)
        dinner_menu, new_dinner_menu = seperate_and_processing_menu_str(row_dinner)
        
        if len(lunch_menu) > 1:
            menus.append(lunch_menu)
        if len(dinner_menu) > 1:
            menus.append(dinner_menu) 
            
    return list(menus)

In [13]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

try:
    model = Word2Vec.load('food_embedding_%s.model' % NUM_VECTORS)
    print('Model loaded')
except:
    print('Training word2vector')
    menus = get_menus(df_all)
    model = Word2Vec(sentences=menus, vector_size=NUM_VECTORS, window=7, min_count=0, workers=4, sg=0, epochs=5000)
    model.save('food_embedding_%s.model' % NUM_VECTORS)

Training word2vector


In [14]:
def pre_processing(df, model):
    yesterday_holiday_list, tommorow_holiday_list = get_is_holiday_lists(df)
    is_corona_list = get_is_corona_list(df)
    month_list = get_month_list(df)
    day_list = get_day_list(df)
    is_no_dinner_list = get_is_no_dinner_list(df)
    is_new_lunch_list, is_new_dinner_list, lunch_menus, dinner_menus = get_menu_info(df)
    is_last_wed_list = get_is_last_wed_list(df)

    is_yesterday_holiday = pd.DataFrame({'is_yesterday_holiday': yesterday_holiday_list})
    is_tomorrow_holiday = pd.DataFrame({'is_tomorrow_holiday': tommorow_holiday_list})
    is_corona = pd.DataFrame({'is_corona': is_corona_list})
    month = pd.DataFrame({'month': month_list})
    day = pd.DataFrame({'day': day_list})
    no_dinner = pd.DataFrame({'no_dinner': is_no_dinner_list})
    is_new_lunch = pd.DataFrame({'is_new_lunch': is_new_lunch_list})
    is_new_dinner = pd.DataFrame({'is_new_dinner': is_new_dinner_list})
    # is_last_wed = pd.DataFrame({'is_last_wed': is_last_wed_list})
    
    # using word2vec
    lunch_soups_list, lunch_mains_list, lunch_sides1_list, lunch_sides2_list = menu_to_vectors(model, lunch_menus)
    dinner_soups_list, dinner_mains_list, dinner_sides1_list, dinner_sides2_list = menu_to_vectors(model, dinner_menus)
    
    lunch_soups = pd.DataFrame({'lunch_soups': lunch_soups_list})
    lunch_mains = pd.DataFrame({'lunch_mains': lunch_mains_list})
    lunch_sides1 = pd.DataFrame({'lunch_sides1': lunch_sides1_list})
    lunch_sides2 = pd.DataFrame({'lunch_sides2': lunch_sides2_list})
    dinner_soups = pd.DataFrame({'dinner_soups': dinner_soups_list})
    dinner_mains = pd.DataFrame({'dinner_mains': dinner_mains_list})
    dinner_sides1 = pd.DataFrame({'dinner_sides1': dinner_sides1_list})
    dinner_sides2 = pd.DataFrame({'dinner_sides2': dinner_sides2_list})

    df = df.join(is_yesterday_holiday)
    df = df.join(is_tomorrow_holiday)
    df = df.join(is_corona)
    df = df.join(month)
    df = df.join(day)
    df = df.join(no_dinner)
    df = df.join(is_new_lunch)
    df = df.join(is_new_dinner) 

    # 원핫인코딩
    df = pd.get_dummies(df, columns=['요일'])
    
    dropped_df = df.loc[:, [col for col in df.columns if col not in ['일자', '조식메뉴', '중식메뉴', '석식메뉴', '석식계', '중식계']]]
    arr = dropped_df.to_numpy()

    arr = np.concatenate((arr, np.array(lunch_soups.to_numpy().tolist()).reshape(len(df),NUM_VECTORS)),axis=1)
    arr = np.concatenate((arr, np.array(lunch_mains.to_numpy().tolist()).reshape(len(df),NUM_VECTORS)),axis=1)
    arr = np.concatenate((arr, np.array(lunch_sides1.to_numpy().tolist()).reshape(len(df),NUM_VECTORS)),axis=1)
    # arr = np.concatenate((arr, np.array(lunch_sides2.to_numpy().tolist()).reshape(len(df),NUM_VECTORS)),axis=1)
    arr = np.concatenate((arr, np.array(dinner_soups.to_numpy().tolist()).reshape(len(df),NUM_VECTORS)),axis=1)
    arr = np.concatenate((arr, np.array(dinner_mains.to_numpy().tolist()).reshape(len(df),NUM_VECTORS)),axis=1)
    arr = np.concatenate((arr, np.array(dinner_sides1.to_numpy().tolist()).reshape(len(df),NUM_VECTORS)),axis=1)
    # arr = np.concatenate((arr, np.array(dinner_sides2.to_numpy().tolist()).reshape(len(df),NUM_VECTORS)),axis=1)
    
    return arr

In [15]:
from sklearn.model_selection import train_test_split
import random

lunch_people = df_train['중식계']
dinner_people = df_train['석식계']

arr_train = pre_processing(df_train, model)

x1_train, x1_test, y1_train, y1_test = train_test_split(arr_train, lunch_people,
                                                        random_state=random.randrange(1, 100))
x2_train, x2_test, y2_train, y2_test = train_test_split(arr_train, dinner_people,
                                                        random_state=random.randrange(1, 100))


In [16]:

# AutoML을 이용한 ML 구현
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold, GridSearchCV
# suboptmal problem의 iteration 문제는 무시할 것! (suboptimal의 계산횟수를 한정하여 수렴시키겠다는 얘기)
# 궁금하면 max_iter라는 값을 인자로 전달받는 ML 알고리즘을 살펴보세요!
import warnings
warnings.filterwarnings('ignore')

# 필요한 알고리즘을 불러와야 합니다!
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
# Ridge와 Lasso의 hyperparmeter: alpha
# Logistic Regressio의 hyperparameter: penalty와 regularization strength인 C값
from sklearn.svm import SVR
# SVR의 hyperparameter: epsilon, regularization C, gamma, kernel = 'rbf', 'poly', 'sigmoid'
from sklearn.neural_network import MLPRegressor
# hidden_layer_sizes = (100,) , (10, 10, ) 정도만 activation = 'relu', 'logistic'까지만, alpha =0.0001, solver = 'lbfgs', 'adam'까지만

In [17]:
pipe = Pipeline([('preprocessing', None), ('regressor', LinearRegression())])
pre_list = [StandardScaler(), MinMaxScaler(), None]
hyperparam_grid = [
    {"regressor": [LinearRegression()], 'preprocessing': pre_list},
    {"regressor": [Ridge()], 'preprocessing': pre_list,
    "regressor__alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10]},
    {"regressor": [Lasso()], 'preprocessing': pre_list,
    "regressor__alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10]},
    {"regressor": [LogisticRegression()], 'preprocessing': pre_list,
    "regressor__C": [0.0001, 0.001, 0.01, 0.1, 1, 10]},
    {"regressor": [SVR()], 'preprocessing': pre_list,
    "regressor__epsilon": [0.001, 0.01, 0.1, 1, 10],
    "regressor__C": [0.0001, 0.001, 0.01, 0.1, 1, 10]},
    {"regressor": [MLPRegressor()], 'preprocessing': pre_list,
    "regressor__hidden_layer_sizes": [(100,) , (10, 10, )],
    "regressor__activation": ["relu", "logistic"],
    "regressor__solver": ["lbfgs", "adam"],
    "regressor__alpha": [0.0001, 0.01, 1]}

]
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

grid1 = GridSearchCV(pipe, hyperparam_grid, scoring='neg_mean_absolute_error',
                     refit=True, cv=kfold)
grid2 = GridSearchCV(pipe, hyperparam_grid, scoring='neg_mean_absolute_error',
                     refit=True, cv=kfold)

In [18]:
grid1_start_time = time.time()
grid1.fit(x1_train, y1_train)

print('grid1 fitting time: %f' % (time.time() - grid1_start_time))
print(grid1.best_estimator_)
print(grid1.best_params_)
print(-grid1.best_score_)
print(-grid1.score(x1_test, y1_test))

grid1 fitting time: 419.774486
Pipeline(steps=[('preprocessing', MinMaxScaler()),
                ('regressor', Lasso(alpha=1))])
{'preprocessing': MinMaxScaler(), 'regressor': Lasso(alpha=1), 'regressor__alpha': 1}
78.69324981198938
71.7003295010078


In [19]:
grid2_start_time = time.time()
grid2.fit(x2_train, y2_train)
print('grid2 fitting time: %f' % (time.time() - grid2_start_time))
print(grid2.best_estimator_)
print(grid2.best_params_)
print(-grid2.best_score_)
print(-grid2.score(x2_test, y2_test))

grid2 fitting time: 370.673415
Pipeline(steps=[('preprocessing', MinMaxScaler()),
                ('regressor', Lasso(alpha=1))])
{'preprocessing': MinMaxScaler(), 'regressor': Lasso(alpha=1), 'regressor__alpha': 1}
58.04685450331542
54.36248685217784


In [20]:
from sklearn.metrics import mean_absolute_error
print('중식 최종 결과: ', mean_absolute_error(y1_test, grid1.best_estimator_.predict(x1_test)))
print('석식 최종 결과: ', mean_absolute_error(y2_test, grid2.best_estimator_.predict(x2_test)))

중식 최종 결과:  71.7003295010078
석식 최종 결과:  54.36248685217784


In [21]:
df_result_date = pd.DataFrame(df_test['일자'])

In [22]:
arr_test = pre_processing(df_test, model)

lunch_cnt_list = grid1.best_estimator_.predict(arr_test)
dinner_cnt_list = grid2.best_estimator_.predict(arr_test)

In [23]:
df_result_lunch = pd.DataFrame({'중식계': lunch_cnt_list})
df_result_dinner = pd.DataFrame({'석식계': dinner_cnt_list})
df_result = df_result_date.join(df_result_lunch).join(df_result_dinner)

In [24]:
df_result.to_csv('../../dataset/meal_service/result.csv', index=None)

In [25]:
df_result

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,959.225335,388.112314
1,2021-01-28,886.329593,459.041941
2,2021-01-29,645.181284,350.182151
3,2021-02-01,1241.257655,531.750247
4,2021-02-02,1002.9331,501.302491
5,2021-02-03,965.89224,402.458448
6,2021-02-04,929.058657,485.056995
7,2021-02-05,689.072942,376.250962
8,2021-02-08,1249.137932,570.992061
9,2021-02-09,1004.839363,516.881701


In [26]:
print('job running time %f sec' % (time.time() - job_start_time ))

job running time 839.014033 sec
