In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [2]:
# HYPER PARAMETERS

class CFG:
    emb_dim = 200

args = CFG

In [3]:
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

# Word Embedding

In [4]:
df_all = pd.concat([df_train, df_test])

In [5]:
# Basic text preprocessing

def split_process(x, q):
    x_ = []
    x = x.split(' ')
    for i in x:
        if '(' in i and ':' in i and ')' in i:
            continue
        if '/' in i:
            x_.extend(i.split('/'))
        else:
            x_.append(i)
    x_ = list(set(x_))
    x_.remove('')
    return x_

In [6]:
# Get all combinations for training w2v (train + test)

food_combinations = []
for i in ['조식메뉴', '중식메뉴', '석식메뉴']:
    food_combinations += df_all[i].apply(lambda x: split_process(x, i)).to_list()

In [7]:
# Train or load w2v model

TRAIN_W2V = True
try:
    model = Word2Vec.load('food_embedding.model')
    print("Model loaded")
except:
    if TRAIN_W2V:
        print("Training w2v")
        model = Word2Vec(sentences=food_combinations, vector_size=args.emb_dim, window=7, min_count=0, workers=4, sg=0, epochs=5000)
        model.save('food_embedding.model')
    else:
        print("Model loading failed. Do not train.")

Model loaded


In [8]:
# w2v demo
model.wv.most_similar('된장찌개')

[('차돌박이찌개', 0.5788873434066772),
 ('오징어국', 0.5403120517730713),
 ('고추장찌개', 0.5396184921264648),
 ('감자국', 0.5392025113105774),
 ('민물새우찌개', 0.5303929448127747),
 ('얼갈이국', 0.5229366421699524),
 ('대구매운탕', 0.522230327129364),
 ('김치두부국', 0.5169321894645691),
 ('소고기무국', 0.5046370029449463),
 ('북어해장국', 0.5005111694335938)]

# Preprocess

In [9]:
def get_food_embedding(x):
    x_ = []
    x = x.split(' ')
    for i in x:
        if '(' in i and ':' in i and ')' in i:
            continue
        if '/' in i:
            x_.extend(i.split('/'))
        else:
            x_.append(i)
    x_ = list(set(x_))
    x_.remove('')
    vec_ = np.zeros(args.emb_dim)
    for i in x_:
        vec = model.wv.get_vector(i)
        vec_ += vec
    vec_ /= len(x_)
    return vec_

def preprocess_date(df, is_train):
    if is_train:
        df = df[df['석식계'] > 0]
        
    df['일자'] = pd.to_datetime(df['일자'], format="%Y-%m-%d")
    df['year'] = df['일자'].dt.year
    df['month'] = df['일자'].dt.month
    df['day'] = df['일자'].dt.day
    
    day_encoder = LabelEncoder()
    df['요일'] = day_encoder.fit_transform(df['요일'])
    df['remains'] = df['본사정원수'] - df['본사휴가자수'] - df['본사출장자수'] - df['현본사소속재택근무자수']
    df['dinner'] = df['석식메뉴'].apply(lambda x: get_food_embedding(x))
    
    if is_train:
        y = df['석식계']
    else:
        y = 0
    
    df.drop(['일자','조식메뉴', '중식메뉴', '석식메뉴', '본사정원수', '본사휴가자수', '본사출장자수', '현본사소속재택근무자수', '본사시간외근무명령서승인건수'], axis=1, inplace=True)
    if is_train:
        df.drop(['중식계', '석식계'], axis=1, inplace=True)
    
    common = df.iloc[:, :4].to_numpy()
    dinner = np.expand_dims(np.array(df.iloc[:, 4].to_numpy().tolist()), axis=1)
    common = np.concatenate((df.iloc[:, :4].to_numpy(), dinner), axis=1)
    
    return common, y


In [10]:
x_train, y_train = preprocess_date(df_train, True)
x_test, _ = preprocess_date(df_test, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['일자'] = pd.to_datetime(df['일자'], format="%Y-%m-%d")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = df['일자'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month'] = df['일자'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

In [11]:
input_train, input_val, target_train, target_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)

# Modeling

In [12]:
# Simple LGBM Regressor w/o tuning

model = LGBMRegressor()
model.fit(input_train, target_train)
pred = model.predict(input_val)
print("dinner mae: ", mean_absolute_error(pred, target_val))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000042 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 312
[LightGBM] [Info] Number of data points in the train set: 1045, number of used features: 5
[LightGBM] [Info] Start training from score 480.212440
dinner mae:  46.671820573700835


# Inference

In [13]:
# Inference
test_pred_dinner = model.predict(x_test)
submission_df = pd.read_csv('sample_submission.csv')
submission_df['석식계'] = test_pred_dinner

In [14]:
# Save
submission_df.to_csv('submit.csv', index=False)