参考URL: https://dse-souken.com/2021/03/25/ai-20/

In [None]:
# Required when running on Google Colab: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [None]:
# Reference: https://developers.microad.co.jp/entry/2019/05/10/180000
# Build a pivot table directly as a sparse matrix.
def make_sparse_pivot(df, index, col, val, index_list=None, col_list=None):
  """Pivot ``df`` into a scipy CSR matrix without building a dense table.

  Args:
    df: DataFrame with one record per (row key, column key) observation.
    index: Name of the column whose values become the matrix rows.
    col: Name of the column whose values become the matrix columns.
    val: Name of the column holding the cell values.
    index_list: Optional collection of row keys fixing the row axis. When
      omitted or empty, the unique values of ``df[index]`` are used.
    col_list: Optional collection of column keys fixing the column axis. When
      omitted or empty, the unique values of ``df[col]`` are used.

  Returns:
    A ``csr_matrix`` of shape ``(number of row keys, number of column keys)``.
    Row ``r`` / column ``c`` stand for the r-th / c-th key in ascending sorted
    order. Records sharing the same (row, column) pair are summed. Records
    whose key is missing from ``index_list`` / ``col_list`` are left out.
  """
  # None instead of a mutable ``[]`` default; an explicitly passed empty
  # list still means "derive the keys from the data", as before.
  if index_list is None or len(index_list) == 0:
    index_list = df[index].unique()
  if col_list is None or len(col_list) == 0:
    col_list = df[col].unique()

  index_categorical = pd.api.types.CategoricalDtype(categories=sorted(index_list), ordered=True)
  col_categorical = pd.api.types.CategoricalDtype(categories=sorted(col_list), ordered=True)

  # Category codes double as matrix coordinates; a key that is not among the
  # categories gets code -1.
  row_num = df[index].astype(index_categorical).cat.codes.values
  col_num = df[col].astype(col_categorical).cat.codes.values
  values = df[val].values

  # Keep only records with valid coordinates: csr_matrix rejects negative
  # indices with an error that does not point at the offending key.
  known = (row_num >= 0) & (col_num >= 0)

  sparse_matrix = csr_matrix(
      (values[known], (row_num[known], col_num[known])),
      shape=(index_categorical.categories.size, col_categorical.categories.size),
  )
  return sparse_matrix

In [None]:
# Directory on the mounted Drive that holds the competition CSV files.
folder_path = '/content/drive/My Drive/dressipi_recsys2022/'
file_name_list = [
    "candidate_items.csv",
    "item_features.csv",
    "test_final_sessions.csv",
    "test_leaderboard_sessions.csv",
    "train_purchases.csv",
    "train_sessions.csv",
]

# Read the files in list order and bind each resulting frame to its own name.
(
    df_candidate_items,
    df_item_features,
    df_test_final_sessions,
    df_test_leaderboard_sessions,
    df_train_purchases,
    df_train_sessions,
) = (pd.read_csv(folder_path + csv_name) for csv_name in file_name_list)

In [None]:
# Add the constant column that is used as the cell value when the sessions are pivoted.
df_train_sessions['count'] = df_test_leaderboard_sessions['count'] = 1.0

In [None]:
# With this approach, sessions in which no candidate item was purchased are of
# no use in the end, so they are dropped up front.
candidate_items = df_candidate_items['item_id'].unique()

# Session ids whose purchased item is one of the candidate items.
candidate_session_list = df_train_purchases.loc[
    df_train_purchases['item_id'].isin(candidate_items), 'session_id'
].values

# Restrict both the purchases and the session views to those sessions.
df_train_purchases, df_train_sessions = (
    frame[frame['session_id'].isin(candidate_session_list)]
    for frame in (df_train_purchases, df_train_sessions)
)

In [None]:
# Use all (remaining) train sessions as the training data.
# NOTE(review): df_train is not referenced by any later cell visible here; the
# sparse matrix below is built from df_train_sessions directly.
df_train = df_train_sessions

In [None]:
# Convert each set of sessions into a sparse session x item matrix.
index, col, val = 'session_id', 'item_id', 'count'
# Both matrices share one column axis: every item id of the item-feature table.
col_list = df_item_features['item_id'].unique()

train_sparse, lb_session_sparse = (
    make_sparse_pivot(sessions, index, col, val, col_list=col_list)
    for sessions in (df_train_sessions, df_test_leaderboard_sessions)
)
# Pivoting with pandas crashes:
#df_piv = df_train_sessions.pivot_table(index= "session_id", columns="item_id" ,values="date", aggfunc = "count").fillna(0)

In [None]:
# Nearest-neighbour search over the train sessions: cosine metric, with the
# "brute" algorithm (exhaustive comparison).
rec = NearestNeighbors(
    metric="cosine",
    algorithm="brute",
    n_neighbors=10,
)
# Fit the neighbour search on the sparse train-session matrix.
rec_model = rec.fit(train_sparse)

In [None]:
# make_sparse_pivot sorts its row categories, so row r of lb_session_sparse /
# train_sparse is the r-th session id in ascending order. The lookup arrays are
# sorted the same way so that positional row indices map to the right session
# even when the CSV files are not ordered by session_id.
unique_session_lb = np.sort(df_test_leaderboard_sessions['session_id'].unique())
unique_item_id_list = df_item_features['item_id'].unique()

# Index the purchases by session id for label-based lookup. The guard keeps the
# cell re-runnable (a second set_index would raise KeyError).
if 'session_id' in df_train_purchases.columns:
  df_train_purchases = df_train_purchases.set_index('session_id')

# Session id of every train_sparse row, as a plain ndarray so that indexing it
# with the 2-D neighbour-index array returned by kneighbors is well defined.
session_id_list = np.sort(df_train_sessions['session_id'].unique())

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session.
warnings.simplefilter('ignore')

# Only the num_n_neighbors nearest train sessions contribute to the final score.
num_n_neighbors = 10000

# Prediction.
df_sub_lb = pd.DataFrame(columns=['session_id', 'item_id', 'rank'])

# ---- loop-invariant lookups -------------------------------------------------
# make_sparse_pivot orders matrix rows by ascending session id, so the same
# ordering is rebuilt here instead of relying on the order of rows in the files.
lb_row_session_ids = np.sort(df_test_leaderboard_sessions['session_id'].unique())
lb_row_of_session = {sid: row for row, sid in enumerate(lb_row_session_ids)}
train_row_session_ids = np.sort(df_train_sessions['session_id'].unique())

# Purchased item of every train_sparse row (df_train_purchases is indexed by
# session_id at this point).
train_row_item_ids = df_train_purchases.loc[train_row_session_ids, 'item_id'].values
if len(train_row_item_ids) != len(train_row_session_ids):
  raise ValueError('expected exactly one purchase per train session')

# Items viewed in each leaderboard session, grouped once instead of scanning
# the whole frame for every session.
lb_items_by_session = df_test_leaderboard_sessions.groupby('session_id')['item_id'].unique()

# kneighbors cannot return more neighbours than there are fitted train sessions.
k_neighbors = min(num_n_neighbors, train_sparse.shape[0])

# Per-session results are collected in lists and concatenated once at the end;
# concatenating inside the loop copies the growing arrays on every iteration.
item_id_parts = []
session_id_parts = []
rank_parts = []
count = 0

for s in unique_session_lb:

  # Progress counter, printed every 1000 sessions.
  if(count%1000 == 0):
    print(count)
  count += 1

  # Distances and row indices of the nearest train sessions for this
  # leaderboard session.
  query = lb_session_sparse.getrow(lb_row_of_session[s]).toarray()
  distance, indice = rec_model.kneighbors(query, n_neighbors=k_neighbors)

  # One row per neighbour: the item it purchased and its cosine similarity
  # (1 - cosine distance).
  df_similarity = pd.DataFrame({
      'candidate_item_id': train_row_item_ids[indice[0]],
      'similarity': 1 - distance[0],
  })

  # Exclude items that already appeared in this session.
  session_item_list = lb_items_by_session.loc[s]
  df_similarity = df_similarity[~df_similarity['candidate_item_id'].isin(session_item_list)]

  # Sum the similarities per item and keep the 100 items with the highest total.
  prob_top100_item_id = (
      df_similarity.groupby('candidate_item_id')['similarity'].sum()
      .sort_values(ascending=False)
      .index.astype('int').values[0:100]
  )

  # Store the results: item ids, the session id repeated, and ranks 1..n.
  n_items = len(prob_top100_item_id)
  item_id_parts.append(prob_top100_item_id)
  session_id_parts.append(np.full(n_items, s))
  rank_parts.append(np.arange(1, n_items + 1))

# np.concatenate rejects an empty list, so fall back to empty arrays.
item_id = np.concatenate(item_id_parts) if item_id_parts else np.array([])
session_id = np.concatenate(session_id_parts) if session_id_parts else np.array([])
rank = np.concatenate(rank_parts) if rank_parts else np.array([])


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000


In [None]:
# Assemble the submission table from the three parallel result arrays.
df_sub_lb = pd.DataFrame(
    {'session_id': session_id, 'item_id': item_id, 'rank': rank}
).astype('int')  # explicit int cast; per the original note it "does not pass" otherwise

In [None]:
# Save the submission as "<file_name>.csv" in the data folder, without the index column.
file_name = "submit20220425_cos_dist3"
df_sub_lb.to_csv(f"{folder_path}{file_name}.csv", index=False)

以下は動作確認用のスクラッチセルです（提出ファイルの作成には不要）。

In [None]:

# Spot check: show the leaderboard rows that belong to session 1402710.
df_test_leaderboard_sessions[df_test_leaderboard_sessions['session_id']==1402710]

Unnamed: 0,session_id,item_id,date,count
72746,1402710,21668,2021-06-22 12:36:15.701,1.0
72747,1402710,25809,2021-06-22 12:38:16.523,1.0
72748,1402710,13656,2021-06-22 12:38:34.615,1.0
72749,1402710,955,2021-06-22 12:39:28.532,1.0
72750,1402710,6736,2021-06-22 12:39:32.15,1.0


In [None]:
# For each submission column, report whether all of its values are null.
df_sub_lb.isnull().all()

session_id    False
item_id       False
rank          False
dtype: bool

In [None]:
# Set pandas' display.max_rows option to 200.
pd.set_option('display.max_rows', 200)
# Print the submission rows produced for session 1402710.
print(df_sub_lb[df_sub_lb['session_id']==1402710])

         session_id  item_id  rank
1570400     1402710    26853     1
1570401     1402710     8577     2
1570402     1402710    15249     3
1570403     1402710    21616     4
1570404     1402710    21890     5
1570405     1402710    18289     6
1570406     1402710    26542     7
1570407     1402710    24736     8
1570408     1402710    12662     9
1570409     1402710    11998    10
1570410     1402710     8861    11
1570411     1402710      447    12
1570412     1402710     6627    13
1570413     1402710    14648    14
1570414     1402710    23428    15
1570415     1402710    20798    16
1570416     1402710     1933    17
1570417     1402710    21965    18
1570418     1402710    15777    19
1570419     1402710    18947    20
1570420     1402710      340    21
1570421     1402710     3857    22
1570422     1402710    21100    23
1570423     1402710     6256    24
1570424     1402710    24286    25
1570425     1402710     2691    26
1570426     1402710     4193    27
1570427     1402710 

In [None]:
# Number of rows in the submission table.
len(df_sub_lb)

5000000

In [None]:
# Unique item ids of the item-feature table.
# NOTE(review): `test` is not referenced by any later cell visible here.
test= df_item_features['item_id'].unique()

In [None]:
# Unique item ids of the candidate-item table.
candidate_items = df_candidate_items['item_id'].unique()

In [None]:
# Submission rows whose recommended item is one of the candidate items.
df_sub_lb[df_sub_lb['item_id'].isin(candidate_items)]

Unnamed: 0,session_id,item_id,rank
0,26,3260,1
1,26,26538,2
2,26,20541,3
3,26,5383,4
4,26,27416,5
...,...,...,...
4999995,4439757,4564,96
4999996,4439757,17750,97
4999997,4439757,21886,98
4999998,4439757,10004,99


In [None]:
# First 100 submission rows whose recommended item is one of the candidate items.
df_sub_lb[df_sub_lb['item_id'].isin(candidate_items)].head(100)

Unnamed: 0,session_id,item_id,rank
0,26,3260,1
1,26,26538,2
2,26,20541,3
3,26,5383,4
4,26,27416,5
5,26,6603,6
6,26,21035,7
7,26,3425,8
8,26,12939,9
9,26,2213,10


以下は過去バージョンのコードです（参考用）。

In [None]:
# Earlier version of the prediction loop.
# NOTE(review): `lb_item_id_list` is not defined in any cell visible here, and
# the recorded output of this cell is an IndexError, so it does not run as-is.
import warnings
warnings.simplefilter('ignore')

# Prediction: takes about 60 minutes because of the rough implementation.
df_sub_lb = pd.DataFrame(columns=['session_id', 'item_id', 'rank'])
item_id = []
session_id = []
rank = []
count = 0
for i,s  in enumerate(unique_session_lb):
  # Loop counter, printed every 1000 iterations; the original note says it ends at 50000.
  if(count%1000 == 0):
    print(count)
  count += 1

  # Take the purchases of the 300 nearest train sessions and keep up to 100
  # distinct purchased items that are contained in lb_item_id_list.
  distance, indice = rec_model.kneighbors(lb_session_sparse.getrow(i).toarray().flatten().reshape(1,-1),n_neighbors=300)
  candidate_df = df_train_purchases.loc[session_id_list[indice].flatten()]
  prob_top100_item_id = candidate_df[candidate_df['item_id'].isin(lb_item_id_list)]['item_id'].unique()[0:100]
  # If that did not yield exactly 100 items, retry once with 1000 neighbours.
  if(len(prob_top100_item_id) != 100):
    distance, indice = rec_model.kneighbors(lb_session_sparse.getrow(i).toarray().flatten().reshape(1,-1),n_neighbors=1000)
    candidate_df = df_train_purchases.loc[session_id_list[indice].flatten()]
    prob_top100_item_id = candidate_df[candidate_df['item_id'].isin(lb_item_id_list)]['item_id'].unique()[0:100]
  # Append this session's items to the running result array.
  item_id = np.concatenate([item_id,prob_top100_item_id])

  # Session id repeated once per recommended item.
  temp = np.array([s for i in range(len(prob_top100_item_id))])
  session_id = np.concatenate([session_id,temp])

  # Ranks 1..n for the recommended items.
  temp = np.array([i + 1 for i in range(len(prob_top100_item_id))])
  rank = np.concatenate([rank,temp])

映画ID100を見た人におすすめの映画IDは以下です。


IndexError: ignored

In [None]:
def get_similairty(person1, person2):
  """Score how alike two people are from the items both of them rated.

  Reads the module-level ``dataset`` mapping (person -> {item: score}).

  Returns:
    0 when the two people share no rated item; otherwise
    ``1 / (1 + d)``, where ``d`` is the Euclidean distance between their
    scores on the shared items (a larger value means more alike).
  """
  ratings1 = dataset[person1]
  ratings2 = dataset[person2]

  # Items rated by both people.
  shared_items = set(ratings1.keys()).intersection(set(ratings2.keys()))

  # Nothing in common: define the similarity as 0.
  if not shared_items:
    return 0

  # Squared score difference per shared item; the larger it is, the less the
  # two people agree on that item.
  squared_gaps = [pow(ratings1[item] - ratings2[item], 2) for item in shared_items]

  # Inverse-style measure of the accumulated disagreement.
  return 1/(1+np.sqrt(sum(squared_gaps)))

In [None]:
# Similarity between the dataset entries 'Lisa Rose' and 'Jack Matthews'.
get_similairty('Lisa Rose','Jack Matthews')

0.3405424265831667

In [None]:
def get_recommend(person, top_N):
  """Recommend items that ``person`` has not rated yet.

  Reads the module-level ``dataset`` mapping (person -> {item: score}) and
  weights every other person's scores by ``get_similairty(person, other)``.

  Args:
    person: Key in ``dataset`` of the person to recommend for.
    top_N: Maximum number of items to return.

  Returns:
    Up to ``top_N`` items, ordered by descending similarity-weighted mean
    score. Items rated only by people with zero similarity are not returned.

  Raises:
    KeyError: If ``person`` is not a key of ``dataset``.
  """
  totals = {}   # item -> sum of (similarity * score)
  simSums = {}  # item -> sum of similarities, used to normalise totals

  set_person = set(dataset[person])

  # Everyone except the person. dict.keys() returns a view without .remove()
  # on Python 3, so the list is built explicitly; this also leaves ``dataset``
  # itself untouched.
  list_others = [name for name in dataset if name != person]

  for other in list_others:
    # Similarity between this user and the person.
    sim = get_similairty(person, other)

    # A zero similarity adds nothing to the weighted score and would leave
    # simSums[item] at 0, i.e. a division by zero below.
    if sim <= 0:
      continue

    # Items the other user rated that the person has not.
    set_new_movie = set(dataset[other]).difference(set_person)

    for item in set_new_movie:
      # Accumulate "similarity x score" over all users ...
      totals[item] = totals.get(item, 0) + dataset[other][item]*sim
      # ... and the similarities themselves, to divide the score by later.
      simSums[item] = simSums.get(item, 0) + sim

  rankings = sorted(
      ((total/simSums[item], item) for item, total in totals.items()),
      reverse=True,
  )

  return [item for _, item in rankings][:top_N]

In [None]:
# Top-2 recommendations for the dataset entry 'Toby'.
# NOTE(review): the recorded output of this cell is an AttributeError.
get_recommend('Toby',2)

AttributeError: ignored

In [None]:
# Clone the recommend_cf repository (shell command run from the notebook).
!git clone https://github.com/yolo-kiyoshi/recommend_cf.git


Cloning into 'recommend_cf'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 28 (delta 5), reused 25 (delta 2), pack-reused 0[K
Unpacking objects: 100% (28/28), done.


In [None]:
import os
# Directory the working directory is switched to below.
path = '/content/recommend_cf'

# Change the working directory to path.
os.chdir(path)

# List the files directly under the working directory.
!ls

data		    Dockerfile	poetry.lock	README.md  tests
docker-compose.yml  notebook	pyproject.toml	src
