In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv('../../dataset/meal_service/train.csv')

In [3]:
def get_is_holiday_lists(df):
    # pre-processing    
    is_yesterday_holiday_list = []
    is_tomorrow_holiday_list = []

    for i in range (0, len(df)):
        if df.iloc[i]['요일'] == '월':
            is_yesterday_holiday_list.append(1)
            if i < len(df)-1:
                is_tomorrow_holiday_list.append(0) if df.iloc[i+1]['요일'] == '화' else is_tomorrow_holiday_list.append(1)
            else:
                is_tomorrow_holiday_list.append(0)
        elif df.iloc[i]['요일'] == '화':
            if i < len(df)-1:
                is_tomorrow_holiday_list.append(0) if df.iloc[i+1]['요일'] == '수' else is_tomorrow_holiday_list.append(1)
            else:
                is_tomorrow_holiday_list.append(0)
            if i > 0:
                is_yesterday_holiday_list.append(0) if df.iloc[i-1]['요일'] == '월' else is_yesterday_holiday_list.append(1)
            else:
                is_yesterday_holiday_list.append(0)
        elif df.iloc[i]['요일'] == '수':
            if i < len(df)-1:
                is_tomorrow_holiday_list.append(0) if df.iloc[i+1]['요일'] == '목' else is_tomorrow_holiday_list.append(1)
            else:
                is_tomorrow_holiday_list.append(0)
            if i > 0:
                is_yesterday_holiday_list.append(0) if df.iloc[i-1]['요일'] == '화' else is_yesterday_holiday_list.append(1)
            else:
                is_yesterday_holiday_list.append(0)
        elif df.iloc[i]['요일'] == '목':
            if i < len(df)-1:
                is_tomorrow_holiday_list.append(0) if df.iloc[i+1]['요일'] == '금' else is_tomorrow_holiday_list.append(1)
            else:
                is_tomorrow_holiday_list.append(0)
            if i > 0:
                is_yesterday_holiday_list.append(0) if df.iloc[i-1]['요일'] == '수' else is_yesterday_holiday_list.append(1)
            else:
                is_yesterday_holiday_list.append(0)
        elif df.iloc[i]['요일'] == '금':
            is_tomorrow_holiday_list.append(1)
            if i > 0:
                is_yesterday_holiday_list.append(0) if df.iloc[i-1]['요일'] == '목' else is_yesterday_holiday_list.append(1)
            else:
                is_yesterday_holiday_list.append(0)
                
    return is_yesterday_holiday_list, is_tomorrow_holiday_list

In [4]:
import arrow

def get_is_corona_list(df):
    corona_list = []
    corona_start = arrow.get('2020-03-01')
    for i in range(0, len(df)):
        date = arrow.get(df.iloc[i]['일자'])
        if date >= corona_start:
            corona_list.append(1)
        else:
            corona_list.append(0)
    
    return corona_list

In [5]:
def is_last_wed_of_month(date):
    curr = arrow.get(date)
    if curr.weekday() != 2:
        # not wednesday
        return False
    else:
        curr_month = curr.month
        if curr_month != curr.shift(days=7).month:
            # last wednesday of the month
            return True
        else:
            return False
        
def get_is_last_wed_list(df):
    is_last_wed_list = []
    for i in range(0, len(df)):
        date = df.iloc[i]['일자']
        if is_last_wed_of_month(date):
            is_last_wed_list.append(1)
        else:
            is_last_wed_list.append(0)
    
    return is_last_wed_list

In [6]:
def get_month_list(df):
    month_list = []
    for i in range(0, len(df)):
        month = arrow.get(df.iloc[i]['일자']).month
        month_list.append(month)
    
    return month_list

In [7]:
def get_day_list(df):
    day_list = []
    for i in range(0, len(df)):
        day = arrow.get(df.iloc[i]['일자']).day
        day_list.append(day)
    
    return day_list

In [8]:
def seperate_and_processing_menu_str(menu_row):
    # split by space
    splits = menu_row.split(' ')
    menu = []
    is_new_menu = False
    for dish in splits:
        if len(dish) > 1:
            # find (New)
            if '(New)' in dish:
                menu.append(dish.split('(New)')[0] + dish.split('(New)')[1])
                is_new_menu = True
            elif '(' not in dish and ')' not in dish:
                menu.append(dish)
    if len(menu) <= 3:
        # no menu today
        menu = ['-']
    
    return menu, is_new_menu

In [9]:
def get_menu_info(df):
    is_new_lunch_menu = []
    is_new_dinner_menu = []
    for i in range(0, len(df)):
        row_lunch = df.iloc[i]['중식메뉴']
        row_dinner = df.iloc[i]['석식메뉴']
        
        lunch_menu, new_lunch_menu = seperate_and_processing_menu_str(row_lunch)
        is_new_lunch_menu.append(1) if new_lunch_menu else is_new_lunch_menu.append(0)
        
        dinner_menu, new_dinner_menu = seperate_and_processing_menu_str(row_dinner)
        is_new_dinner_menu.append(1) if new_dinner_menu else is_new_dinner_menu.append(0)
        
        # to do 
        # menu classification
    
    return is_new_lunch_menu, is_new_dinner_menu

In [10]:
def get_is_no_dinner_list(df):
    is_no_dinner_list = []
    for i in range(0, len(df)):
        is_no_dinner_list.append(1) if len(df.iloc[i]['석식메뉴']) <= 20 else is_no_dinner_list.append(0)
    
    return is_no_dinner_list

In [11]:
def pre_processing(df):
    yesterday_holiday_list, tommorow_holiday_list = get_is_holiday_lists(df)
    is_corona_list = get_is_corona_list(df)
    month_list = get_month_list(df)
    day_list = get_day_list(df)
    is_no_dinner_list = get_is_no_dinner_list(df)
    is_new_lunch_list, is_new_dinner_list = get_menu_info(df)
    is_last_wed_list = get_is_last_wed_list(df)

    is_yesterday_holiday = pd.DataFrame({'is_yesterday_holiday': yesterday_holiday_list})
    is_tomorrow_holiday = pd.DataFrame({'is_tomorrow_holiday': tommorow_holiday_list})
    is_corona = pd.DataFrame({'is_corona': is_corona_list})
    month = pd.DataFrame({'month': month_list})
    day = pd.DataFrame({'day': day_list})
    no_dinner = pd.DataFrame({'no_dinner': is_no_dinner_list})
    is_new_lunch = pd.DataFrame({'is_new_lunch': is_new_lunch_list})
    is_new_dinner = pd.DataFrame({'is_new_dinner': is_new_dinner_list})
    #is_last_wed = pd.DataFrame({'is_last_wed': is_last_wed_list})

    df = df.join(is_yesterday_holiday)
    df = df.join(is_tomorrow_holiday)
    df = df.join(is_corona)
    df = df.join(month)
    df = df.join(day)
    df = df.join(no_dinner)
    df = df.join(is_new_lunch)
    df = df.join(is_new_dinner)
    #df = df.join(is_last_wed)

    df = df.drop(['일자', '조식메뉴', '중식메뉴', '석식메뉴'],axis='columns')

    # 원핫인코딩
    df = pd.get_dummies(df, columns=['요일'])
    
    return df

In [12]:
df_train = pre_processing(df_train)

In [13]:
df_train.head()

Unnamed: 0,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,중식계,석식계,is_yesterday_holiday,is_tomorrow_holiday,is_corona,month,day,no_dinner,is_new_lunch,is_new_dinner,요일_금,요일_목,요일_수,요일_월,요일_화
0,2601,50,150,238,0.0,1039.0,331.0,1,0,0,2,1,0,0,0,0,0,0,1,0
1,2601,50,173,319,0.0,867.0,560.0,0,0,0,2,2,0,0,0,0,0,0,0,1
2,2601,56,180,111,0.0,1017.0,573.0,0,0,0,2,3,0,0,0,0,0,1,0,0
3,2601,104,220,355,0.0,978.0,525.0,0,0,0,2,4,0,0,0,0,1,0,0,0
4,2601,278,181,34,0.0,925.0,330.0,0,1,0,2,5,0,0,0,1,0,0,0,0


In [14]:
from sklearn.model_selection import train_test_split

x1_train, x1_test, y1_train, y1_test = train_test_split(df_train.loc[:, [col for col in df_train.columns if col != '중식계' and col != '석식계']], df_train['중식계'])
x2_train, x2_test, y2_train, y2_test = train_test_split(df_train.loc[:, [col for col in df_train.columns if col != '중식계' and col != '석식계']], df_train['석식계'])

In [15]:

# AutoML을 이용한 ML 구현
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold, GridSearchCV
# suboptmal problem의 iteration 문제는 무시할 것! (suboptimal의 계산횟수를 한정하여 수렴시키겠다는 얘기)
# 궁금하면 max_iter라는 값을 인자로 전달받는 ML 알고리즘을 살펴보세요!
import warnings
warnings.filterwarnings('ignore')

# 필요한 알고리즘을 불러와야 합니다!
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
# Ridge와 Lasso의 hyperparmeter: alpha
# Logistic Regressio의 hyperparameter: penalty와 regularization strength인 C값
from sklearn.svm import SVR
# SVR의 hyperparameter: epsilon, regularization C, gamma, kernel = 'rbf', 'poly', 'sigmoid'
from sklearn.neural_network import MLPRegressor
# hidden_layer_sizes = (100,) , (10, 10, ) 정도만 activation = 'relu', 'logistic'까지만, alpha =0.0001, solver = 'lbfgs', 'adam'까지만

In [16]:
pipe = Pipeline([('preprocessing', None), ('regressor', LinearRegression())])
pre_list = [StandardScaler(), MinMaxScaler(), None]
hyperparam_grid = [
    {"regressor": [LinearRegression()], 'preprocessing': pre_list},
    {"regressor": [Ridge()], 'preprocessing': pre_list,
    "regressor__alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10]},
    {"regressor": [Lasso()], 'preprocessing': pre_list,
    "regressor__alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10]},
    {"regressor": [LogisticRegression()], 'preprocessing': pre_list,
    "regressor__C": [0.0001, 0.001, 0.01, 0.1, 1, 10]},
    {"regressor": [SVR()], 'preprocessing': pre_list,
    "regressor__epsilon": [0.001, 0.01, 0.1, 1, 10],
    "regressor__C": [0.0001, 0.001, 0.01, 0.1, 1, 10]},
    {"regressor": [MLPRegressor()], 'preprocessing': pre_list,
    "regressor__hidden_layer_sizes": [(100,) , (10, 10, )],
    "regressor__activation": ["relu", "logistic"],
    "regressor__solver": ["lbfgs", "adam"],
    "regressor__alpha": [0.0001, 0.01, 1]}

]
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

grid1 = GridSearchCV(pipe, hyperparam_grid, scoring="neg_mean_absolute_error",
                   refit=True, cv=kfold)
grid2 = GridSearchCV(pipe, hyperparam_grid, scoring="neg_mean_absolute_error",
                   refit=True, cv=kfold)

In [17]:
grid1.fit(x1_train, y1_train)
print(grid1.best_estimator_)
print(grid1.best_params_)
print(-grid1.best_score_)
print(-grid1.score(x1_test, y1_test))

Pipeline(steps=[('preprocessing', MinMaxScaler()),
                ('regressor',
                 MLPRegressor(alpha=0.01, hidden_layer_sizes=(10, 10),
                              solver='lbfgs'))])
{'preprocessing': MinMaxScaler(), 'regressor': MLPRegressor(alpha=0.01, hidden_layer_sizes=(10, 10), solver='lbfgs'), 'regressor__activation': 'relu', 'regressor__alpha': 0.01, 'regressor__hidden_layer_sizes': (10, 10), 'regressor__solver': 'lbfgs'}
75.251806897367
71.67563674048641


In [18]:
grid2.fit(x2_train, y2_train)
print(grid2.best_estimator_)
print(grid2.best_params_)
print(-grid2.best_score_)
print(-grid2.score(x2_test, y2_test))

Pipeline(steps=[('preprocessing', MinMaxScaler()),
                ('regressor',
                 MLPRegressor(alpha=0.01, hidden_layer_sizes=(10, 10),
                              solver='lbfgs'))])
{'preprocessing': MinMaxScaler(), 'regressor': MLPRegressor(alpha=0.01, hidden_layer_sizes=(10, 10), solver='lbfgs'), 'regressor__activation': 'relu', 'regressor__alpha': 0.01, 'regressor__hidden_layer_sizes': (10, 10), 'regressor__solver': 'lbfgs'}
54.332599583015494
52.734105234645455


In [19]:
from sklearn.metrics import mean_absolute_error
print("중식 최종 결과: ", mean_absolute_error(y1_test, grid1.best_estimator_.predict(x1_test)))
print("석식 최종 결과: ", mean_absolute_error(y2_test, grid2.best_estimator_.predict(x2_test)))

중식 최종 결과:  71.67563674048641
석식 최종 결과:  52.734105234645455


In [20]:
df_test = pd.read_csv('../../dataset/meal_service/test.csv')
df_result_date = pd.DataFrame(df_test['일자'])

In [21]:
df_test = pre_processing(df_test)

lunch_cnt_list = grid1.best_estimator_.predict(df_test)
dinner_cnt_list = grid2.best_estimator_.predict(df_test)

In [22]:
df_result_lunch = pd.DataFrame({'중식계': lunch_cnt_list})
df_result_dinner = pd.DataFrame({'석식계': dinner_cnt_list})

In [23]:
df_result = df_result_date.join(df_result_lunch).join(df_result_dinner)

# for i in range(0, len(df_result)):
#     if is_last_wed_of_month(df_result.iloc[i]['일자']):
#         df_result.at[i, '석식계'] = 0
#         print(df_result.iloc[i]['석식계'])


# df_result

In [27]:
df_result.to_csv('../../dataset/meal_service/result.csv', index=None)

In [25]:
df_result

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,998.752478,360.783167
1,2021-01-28,823.549161,442.568615
2,2021-01-29,627.713703,289.748117
3,2021-02-01,1206.236852,554.796339
4,2021-02-02,938.084708,487.041933
5,2021-02-03,942.901559,449.280794
6,2021-02-04,884.27739,476.935018
7,2021-02-05,676.375684,336.850954
8,2021-02-08,1220.827464,590.287403
9,2021-02-09,931.527305,507.012452
