# 結合　Join

## Load Data

In [None]:
# Attach dplyr: the cells below rely on its pipe, join and grouping verbs.
library(dplyr)
# Run the project's loader script.
# NOTE(review): the path is relative, so this presumably expects the working
# directory to be the repository root -- confirm.
source('preprocess/load_data/data_loader.R')
# NOTE(review): presumably creates reserve_tb, hotel_tb and customer_tb (the
# cells below use them without defining them) -- confirm in data_loader.R.
load_hotel_reserve()

## Inner join

In [None]:
# Join single-guest reservations to business hotels on hotel_id.
# Both sides are filtered before the join, so only rows that can end up in the
# result are combined; an inner join drops rows without a partner.
# head() is used for the preview because `[1:10, ]` pads the output with
# all-NA rows whenever fewer than 10 rows survive the join.
reserve_tb %>%
  filter(people_num == 1) %>%
  inner_join(hotel_tb %>% filter(is_business), by = 'hotel_id') %>%
  head(10)

## Recommend

In [None]:
# Lookup table that decides, per small_area_name, which area level is used as
# the join key when searching for recommendation candidates.
small_area_mst <- hotel_tb %>%
  group_by(big_area_name, small_area_name) %>%
  # Number of *other* hotels in the small area: the -1 removes the hotel itself.
  summarise(hotel_cnt = n() - 1) %>%
  # Drop the grouping left over from the aggregation.
  ungroup() %>%
  # With 20 or more other hotels the small area is used as the key; otherwise
  # fall back to the big area. Only the two columns listed here are kept.
  transmute(
    small_area_name,
    join_area_id = if_else(hotel_cnt >= 20, small_area_name, big_area_name)
  )

# Attach join_area_id to every hotel that acts as a recommendation source,
# keeping only the hotel key and the area key needed for the later join.
base_hotel_mst <- hotel_tb %>%
  inner_join(small_area_mst, by = 'small_area_name') %>%
  select(hotel_id, join_area_id)

# The lookup table is no longer needed. Removing it is optional and only
# matters when memory is tight.
rm(small_area_mst)

# Recommendation candidate table: every hotel appears twice, once keyed by its
# big area and once keyed by its small area, so that either kind of
# join_area_id can be matched against it.
recommend_hotel_mst <-
  bind_rows(
    # Candidates keyed by big_area_name.
    hotel_tb %>%
      select(join_area_id = big_area_name, rec_hotel_id = hotel_id),

    # Candidates keyed by small_area_name.
    hotel_tb %>%
      select(join_area_id = small_area_name, rec_hotel_id = hotel_id)
  )

# Pair every source hotel with the candidates that share its join_area_id,
# then drop the pairs in which a hotel would be recommended to itself.
result <- base_hotel_mst %>%
  inner_join(recommend_hotel_mst, by = 'join_area_id') %>%
  filter(hotel_id != rec_hotel_id)

# Preview the first rows. head() never pads with all-NA rows, unlike
# `[1:10, ]` when the table has fewer than 10 rows.
head(result, 10)

## Combining past data

Using [lag()](https://www.rdocumentation.org/packages/dplyr/versions/0.7.8/topics/lead-lag).

In [None]:
# For each reservation, attach the total_price of the same customer's previous
# reservation (ordered by reserve_datetime). The first reservation of every
# customer has no predecessor and gets NA.
result <- reserve_tb %>%
  group_by(customer_id) %>%
  mutate(before_price = lag(total_price, n = 1,
                            order_by = reserve_datetime, default = NA)) %>%
  # Drop the grouping so that later operations on `result` are not silently
  # evaluated per customer.
  ungroup()

# Preview the first rows. head() never pads with all-NA rows, unlike
# `[1:10, ]` when the table has fewer than 10 rows.
head(result, 10)

## Summing past data

Using [roll_sum()](https://www.rdocumentation.org/packages/RcppRoll/versions/0.3.0/topics/RcppRoll-exports).

In [None]:
library(RcppRoll)

# Rolling sum of total_price over each customer's current and two preceding
# reservations (window of 3, right-aligned). Positions without a full window
# are filled with NA.
result <- reserve_tb %>%
  group_by(customer_id) %>%
  # Sort by reservation time so that the window inside each customer group
  # runs in chronological order.
  arrange(reserve_datetime) %>%
  mutate(price_sum = roll_sum(total_price, n = 3, align = 'right', fill = NA)) %>%
  # Drop the grouping so that later operations on `result` are not silently
  # evaluated per customer.
  ungroup()

# Preview the first rows. head() never pads with all-NA rows, unlike
# `[1:10, ]` when the table has fewer than 10 rows.
head(result, 10)

## Direct product (cross join)

Using [merge()](https://www.rdocumentation.org/packages/base/versions/3.6.0/topics/merge).

In [None]:
library(tidyverse)

# One-column data frame holding the year-months covered by the calculation.
# The first day of each month from 2017-01 to 2017-03 is generated and then
# formatted as 'YYYYMM', giving '201701', '201702' and '201703'.
month_mst <- data.frame(
  year_month = format(
    seq(from = as.Date('2017-01-01'), to = as.Date('2017-03-01'), by = 'months'),
    '%Y%m'
  )
)

# Cross join: one row for every (customer_id, year_month) combination.
# The two inputs share no column, so merge() returns their Cartesian product;
# `by = NULL` states that intent explicitly.
customer_mst <- customer_tb %>%
  select(customer_id) %>%
  merge(month_mst, by = NULL) %>%
  # The columns may come back from merge() as factors (older R versions build
  # factors in data.frame()); convert them to character so they can be used
  # as join keys below.
  mutate(
    customer_id = as.character(customer_id),
    year_month = as.character(year_month)
  )

# Total amount spent per customer and month, including months in which the
# customer made no reservation.
result <- customer_mst %>%
  left_join(
    # Derive the 'YYYYMM' join key from the check-in date.
    reserve_tb %>%
      mutate(checkin_month = format(as.Date(checkin_date), format = '%Y%m')),

    # Match on the customer and on year_month == checkin_month.
    by = c('customer_id' = 'customer_id', 'year_month' = 'checkin_month')
  ) %>%
  group_by(customer_id, year_month) %>%
  # Customer-months without any reservation carry a single NA total_price from
  # the left join, so their sum is NA at this point.
  summarise(price_sum = sum(total_price)) %>%
  # summarise() only peels off the last grouping variable; drop the remaining
  # customer_id grouping so `result` is a plain, ungrouped table.
  ungroup() %>%
  # Turn the NA sums of customer-months without reservations into 0.
  replace_na(list(price_sum = 0))

# Preview the first rows. head() never pads with all-NA rows, unlike
# `[1:10, ]` when the table has fewer than 10 rows.
head(result, 10)