<a href="https://colab.research.google.com/github/pass4/atmacup-16/blob/main/%5B%E6%8F%90%E5%87%BA%E7%94%A8%5Drule_based_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import polars as pl
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

main_dir = ''

input_dir = main_dir + 'input/atmaCup16_Dataset/'
output_dir = main_dir + 'output/'

## データ準備

In [14]:
from collections import Counter

# ログから同一セッションで共起した回数を記録した辞書を作成する関数
def count_covisit_items(df_log_yado_list):
    covisit_items = {}
    for items in df_log_yado_list:
        items = np.unique(items)
        for item in items:
            if item not in covisit_items:
                covisit_items[item] = list()
            covisit_items[item].extend(items)

    for item in covisit_items:
        item_list = [e for e in covisit_items[item] if e != item]
        element_counts = Counter(item_list)
        covisit_items[item] = dict(element_counts)

    return covisit_items

In [15]:
# データセットの読み込み
df_train_log = pd.read_csv(os.path.join(input_dir, 'train_log.csv'))
df_test_log = pd.read_csv(os.path.join(input_dir, 'test_log.csv'))
df_train_label = pd.read_csv(os.path.join(input_dir, 'train_label.csv'))
df_yado = pd.read_csv(os.path.join(input_dir, 'yado.csv'))
df_test_session = pd.read_csv(os.path.join(input_dir, 'test_session.csv'))
sample_submission = pd.read_csv(os.path.join(input_dir, 'sample_submission.csv'))

# ログとyado.csvの結合
df_train_log_yado = df_train_log.merge(df_yado, how='left', on='yad_no')
df_test_log_yado = df_test_log.merge(df_yado, how='left', on='yad_no')
df_train_label_yado = df_train_label.merge(df_yado, how='left', on='yad_no')

# train+testおよびtrain+test+labelを結合したログの用意
df_all_log_yado = pd.concat([df_train_log_yado, df_test_log_yado], axis=0, sort=False).reset_index(drop=True)
df_all_log_label_yado = pd.concat([df_all_log_yado, df_train_label], axis=0, sort=False).reset_index(drop=True)

# セッションごとに訪問した宿のリスト
df_train_log_yado_list = df_train_log.groupby('session_id')['yad_no'].apply(list)
df_test_log_yado_list = df_test_log.groupby('session_id')['yad_no'].apply(list)
df_all_log_label_yado_list = pd.concat([df_train_log, df_test_log, df_train_label], axis=0, sort=False).reset_index(drop=True).groupby('session_id')['yad_no'].apply(list)

# ログから宿が訪問されたセッション数をカウントした辞書を作成
train_count_items = df_train_log.groupby(['yad_no'])['session_id'].count().to_dict()
test_count_items = df_test_log.groupby(['yad_no'])['session_id'].count().to_dict()
all_label_count_items = df_all_log_label_yado.groupby(['yad_no'])['session_id'].count().to_dict()

# ログから同一セッションで共起した回数を記録した辞書を作成
train_count_covisit_items = count_covisit_items(df_train_log_yado_list)
test_count_covisit_items = count_covisit_items(df_test_log_yado_list)
all_label_count_covisit_items = count_covisit_items(df_all_log_label_yado_list)

## 予測候補を作成する関数

優先度1: sessionにおいて訪問した宿を抽出する関数

In [16]:
def history_based_predict(df_log_yado_list):
    result = df_log_yado_list.apply(lambda x: [e for e in x if e != x[-1]])
    return result.index, result.values

優先度2: 同一セッションで共起した回数に重みづけを行って並び替える関数

In [20]:
def order_by_weighted_sum_of_covisit(dicts, weights, second_order):
    result =None
    for dict_i, weight_i in zip(dicts, weights):
        if result is None:
            result = {key: value * weight_i for key, value in dict_i.items()}
        else:
            for key, value in dict_i.items():
                if key in result:
                    result[key] += value * weight_i # 同じキーの値を加算
                else:
                    result[key] = value * weight_i   # 新しいキーを追加

    item_list = result.keys()
    second_order_item ={}
    for item in item_list:
        second_order_item[item] = second_order[item] if item in second_order.keys() else 0
    sorted_items = sorted(item_list, key=lambda x: (result[x], second_order_item[x]), reverse=True)
    return list(dict.fromkeys(sorted_items))

優先度3: セッション内で調べている地域で人気の宿を抽出する関数

In [18]:
# sessin_idに対して割り当てる地域を決定
def get_search_region_by_session(df_log):
    region_cds = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']
    search_region = df_log.groupby('session_id')[region_cds].first() # sessionの最初に閲覧したyadoの地域を採用
    return search_region

# 指定された地域で人気の宿上位N件を抽出
def get_popluar_candidate_by_region(df_log, region_cd, n=10):
    yado_count = df_log.groupby(['yad_no', region_cd])['session_id'].nunique().rename('count').reset_index()
    yado_count['rank'] = yado_count.groupby(region_cd)['count'].rank(method='first', ascending=False)
    return yado_count[yado_count['rank']<=n+1].sort_values('rank').groupby(region_cd)['yad_no'].apply(list).rename('popular_yado')

# ログに対して指定された地域で人気の宿上位N件を予測として割り当て
def popular_based_predict(df_log, region_cd, n=10):
    popular_candiate = get_popluar_candidate_by_region(df_log, region_cd, n)
    search_region = get_search_region_by_session(df_log)[region_cd]

    # 最後のyadoを候補から除外するために抽出
    last_yado = df_log.groupby('session_id')['yad_no'].last().rename('last_yado')

    result = last_yado.reset_index().merge(search_region, how='left', on='session_id')
    result = result.merge(popular_candiate, how='left', on=region_cd)

    pred = []
    for last_yado_i, popular_yado_i in zip(result['last_yado'], result['popular_yado']):
        if last_yado_i in popular_yado_i:
            pred.append([e for e in popular_yado_i if e!=last_yado_i])
        else:
            pred.append(popular_yado_i[:-1])

    return result['session_id'], pred

## 評価関数

In [19]:
# 評価指標
def apk(actual, predicted, k=10):
    if actual in predicted[:k]:
        return 1.0 / (predicted[:k].index(actual) + 1)
    return 0.0

def mapk(actual, predicted, k=10):
    return sum(apk(a, p, k) for a, p in zip(actual, predicted)) / len(actual)

## Trainデータでの精度検証

In [26]:
# sessionにおいて訪問した宿を抽出
h_pred_index, h_pred = history_based_predict(df_train_log_yado_list)

# 共起した宿を抽出
train_train_count_related_items_by_session = df_train_log_yado_list.apply(lambda x: train_count_covisit_items[x[-1]] if x[-1] in train_count_covisit_items.keys() else {}) # 最後の宿に対する同じlogでカウントした共起
train_train_count_related_items_by_session_first = df_train_log_yado_list.apply(lambda x: train_count_covisit_items[x[0]] if x[0] in train_count_covisit_items.keys() else {}) # 最初の宿に対する同じlogでカウントした共起
train_test_count_related_items_by_session = df_train_log_yado_list.apply(lambda x: test_count_covisit_items[x[-1]] if x[-1] in test_count_covisit_items.keys() else {}) # 最後の宿に対する異なるlogでカウントした共起
train_test_count_related_items_by_session_first = df_train_log_yado_list.apply(lambda x: test_count_covisit_items[x[0]] if x[0] in test_count_covisit_items.keys() else {}) # 最初の宿に対する異なるlogでカウントした共起

# セッション内で調べている地域で人気の宿を抽出
sml_p_pred_index, sml_p_pred = popular_based_predict(df_train_log_yado, 'sml_cd', 10)
lrg_p_pred_index, lrg_p_pred = popular_based_predict(df_train_log_yado, 'lrg_cd', 10)
ken_p_pred_index, ken_p_pred = popular_based_predict(df_train_log_yado, 'ken_cd', 10)


# 上記で抽出した予測候補を統合
merged_pred = []
for sml_p_i, sml_p_pred_i, lrg_p_i, lrg_p_pred_i, ken_p_i, ken_p_pred_i in zip(sml_p_pred_index, sml_p_pred, lrg_p_pred_index, lrg_p_pred, ken_p_pred_index, ken_p_pred):
    h_i = h_pred_index.get_loc(sml_p_i)
    h_pred_i = h_pred[h_i]

    train_rel_i = train_train_count_related_items_by_session.index.get_loc(sml_p_i)
    train_rel_items_i = train_train_count_related_items_by_session.iloc[train_rel_i]

    train_rel_first_i = train_train_count_related_items_by_session_first.index.get_loc(sml_p_i)
    train_rel_items_first_i = train_train_count_related_items_by_session_first.iloc[train_rel_first_i]

    test_rel_i = train_test_count_related_items_by_session.index.get_loc(sml_p_i)
    test_rel_items_i = train_test_count_related_items_by_session.iloc[test_rel_i]

    test_rel_fisrt_i = train_test_count_related_items_by_session_first.index.get_loc(sml_p_i)
    test_rel_items_first_i = train_test_count_related_items_by_session_first.iloc[test_rel_i]

    # 共起した回数に重みづけを行って並び替え
    rel_items_i = order_by_weighted_sum_of_covisit([train_rel_items_i, train_rel_items_first_i, test_rel_items_i, test_rel_items_first_i], [1.0, 0.1, 0.01, 0.01], second_order=train_count_items)

    # 1番目に訪問した宿のlogにおける出現回数が2番目の宿の0.3倍以下であれば順序を入れ替え
    if len(h_pred_i) == 2 and train_count_items[h_pred_i[0]] /  train_count_items[h_pred_i[1]] < 0.3:
        h_pred_i = sorted(h_pred_i, key=lambda x: (train_count_items[x]), reverse=True)

    merged_pred_i = h_pred_i + rel_items_i + sml_p_pred_i + lrg_p_pred_i + ken_p_pred_i
    # 順序を保って重複除去
    merged_pred_i = list(dict.fromkeys(merged_pred_i))
    merged_pred.append(merged_pred_i[:10])

true = df_train_label[df_train_label['session_id']==sml_p_pred_index]['yad_no'].values

k = 10
print(f"MAP@{k}:", mapk(true, merged_pred, k))

MAP@10: 0.40636129400658677


## Testデータに対する予測実行

In [28]:
# sessionにおいて訪問した宿を抽出
h_pred_index, h_pred = history_based_predict(df_test_log_yado_list)

# 共起した宿を抽出
sml_p_pred_index, sml_p_pred = popular_based_predict(df_test_log_yado, 'sml_cd', 10)
lrg_p_pred_index, lrg_p_pred = popular_based_predict(df_test_log_yado, 'lrg_cd', 10)
ken_p_pred_index, ken_p_pred = popular_based_predict(df_test_log_yado, 'ken_cd', 10)

# セッション内で調べている地域で人気の宿を抽出
test_train_count_related_items_by_session = df_test_log_yado_list.apply(lambda x: all_label_count_covisit_items[x[-1]] if x[-1] in all_label_count_covisit_items.keys() else {}) # 最後の宿に対する同じlogでカウントした共起
test_test_count_related_items_by_session = df_test_log_yado_list.apply(lambda x: test_count_covisit_items[x[-1]] if x[-1] in test_count_covisit_items.keys() else {}) # 最初の宿に対する同じlogでカウントした共起
test_test_count_related_items_by_session_first = df_test_log_yado_list.apply(lambda x: test_count_covisit_items[x[0]] if x[0] in test_count_covisit_items.keys() else {}) # 最後の宿に対する異なるlogでカウントした共起
test_train_count_related_items_by_session_first = df_test_log_yado_list.apply(lambda x: all_label_count_covisit_items[x[0]] if x[0] in all_label_count_covisit_items.keys() else {}) # 最初の宿に対する異なるlogでカウントした共起

# Testのみ、後処理での削除用にtrain logに5回以上出現しているがtest logに出現していない宿を抽出
only_train_yado_list = {}
for yado in train_count_items:
    if yado not in test_count_items:
        only_train_yado_list[yado] = train_count_items[yado]
rm_yado_list = set([k for k,v in only_train_yado_list.items() if only_train_yado_list[k]>=5])


# 上記で抽出した予測候補を統合
merged_pred = []
for sml_p_i, sml_p_pred_i, lrg_p_i, lrg_p_pred_i, ken_p_i, ken_p_pred_i in zip(sml_p_pred_index, sml_p_pred, lrg_p_pred_index, lrg_p_pred, ken_p_pred_index, ken_p_pred):
    h_i = h_pred_index.get_loc(sml_p_i)
    h_pred_i = h_pred[h_i]

    train_rel_i = test_train_count_related_items_by_session.index.get_loc(sml_p_i)
    train_rel_items_i = test_train_count_related_items_by_session.iloc[train_rel_i]

    train_rel_first_i = test_train_count_related_items_by_session_first.index.get_loc(sml_p_i)
    train_rel_items_first_i = test_train_count_related_items_by_session_first.iloc[train_rel_first_i]

    test_rel_i = test_test_count_related_items_by_session.index.get_loc(sml_p_i)
    test_rel_items_i = test_test_count_related_items_by_session.iloc[test_rel_i]

    test_rel_first_i = test_test_count_related_items_by_session_first.index.get_loc(sml_p_i)
    test_rel_items_first_i = test_test_count_related_items_by_session_first.iloc[test_rel_first_i]

    # 共起した回数に重みづけを行って並び替え
    rel_items_i = order_by_weighted_sum_of_covisit([test_rel_items_i, test_rel_items_first_i, train_rel_items_i, train_rel_items_first_i], [1.0, 0.1, 0.01, 0.01], second_order=test_count_items)

    # 1番目に訪問した宿のlogにおける出現回数が2番目の宿の0.3倍以下であれば順序を入れ替え
    if len(h_pred_i) == 2 and test_count_items[h_pred_i[0]] /  test_count_items[h_pred_i[1]] < 0.3:
            h_pred_i = sorted(h_pred_i, key=lambda x: (test_count_items[x]), reverse=True)

    merged_pred_i = h_pred_i + rel_items_i + sml_p_pred_i + lrg_p_pred_i + ken_p_pred_i
    # 順序を保って重複除去
    merged_pred_i = list(dict.fromkeys(merged_pred_i))
    # 後処理としてtrain logに5回以上出現しているがtest logに出現していない宿を削除
    merged_pred_i = [e for e in merged_pred_i if e not in rm_yado_list]
    merged_pred.append(merged_pred_i[:10])

test_prediction = pd.DataFrame(merged_pred, index=sml_p_pred_index, columns=[f'predict_{i}' for i in range(10)])
test_prediction

Unnamed: 0_level_0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
00001149e9c73985425197104712478c,3560,4420,9534,11561,5466,2680,6488,4714,6563,10233
0000e02747d749a52b7736dfa751e258,143,11923,8108,613,4066,6129,7014,11237,10095,5055
0000f17ae2628237d78d3a38b009d3be,757,7710,9190,9910,1774,13570,6721,410,10485,8922
000174a6f7a569b84c5575760d2e9664,12341,6991,3359,13521,1542,10861,4180,10746,9319,5657
00017e2a527901c9c41b1acef525d016,2862,9020,763,10826,13235,1448,5650,11480,607,3854
...,...,...,...,...,...,...,...,...,...,...
fffee3199ef94b92283239cd5e3534fa,1997,5744,7888,12942,1885,7062,10997,9743,11123,8771
ffff62c6bb49bc9c0fbcf08494a4869c,4014,1227,12432,899,3802,3644,2232,13220,2164,4962
ffff9a7dcc892875c7a8b821fa436228,13241,11037,13797,2087,13719,8143,7308,12939,3955,844
ffffb1d30300fe17f661941fd085b04b,3100,3002,2373,13672,4976,1687,5513,12281,6034,5515
