<a href="https://colab.research.google.com/github/parkyolo/recommand_algorithm/blob/main/buydata_preprocessed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import time
import random
import numpy as np
import pandas as pd
import datetime as dt

In [2]:
# Mount Google Drive at /content/gdrive so the dataset stored there can be read.
# This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
## load data
# Only the first three columns of the buys file are read; they are renamed
# from their positional labels right after parsing.
raw_data = pd.read_csv(
    '/content/gdrive/MyDrive/yoochoose/yoochoose-buys.dat',
    sep=',',
    header=None,
    usecols=[0,1,2],
    dtype={0:np.int32, 1:str, 2:np.int64},
).rename(columns={0: 'SessionId', 1: 'TimeStamp', 2: 'ItemId'})
raw_data.shape

(1150753, 3)

In [3]:
# Discard the time stamp column ...
dropped_timestamp = raw_data.drop(columns='TimeStamp')
# ... then remove repeated (session id, item id) pairs.
dropped_duplicates = dropped_timestamp.drop_duplicates()

In [11]:
# Number of rows left after removing duplicate (SessionId, ItemId) pairs.
len(dropped_duplicates)

1049817

In [4]:
def filter_session(data,length):
    """Keep only the sessions that have more than ``length`` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``SessionId`` column. Intended to be the de-duplicated
        (SessionId, ItemId) pairs, so a session's row count is the number of
        distinct items in that session.
    length : int
        Exclusive lower bound: a session is kept when its row count is
        strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that belong to the kept sessions, with their
        original index labels. Sessions appear in ``value_counts()`` order
        (largest first) and rows inside a session keep their input order.
        When no session qualifies the result is empty but still carries the
        columns of ``data``.
    """
    session_sizes = data['SessionId'].value_counts()
    kept_sessions = session_sizes.index[session_sizes.to_numpy() > length]

    # A single vectorised membership test replaces the per-session scan and
    # the repeated pd.concat, which was quadratic in the number of sessions.
    kept_rows = data[data['SessionId'].isin(kept_sessions)]

    # Group the rows session by session in value_counts() order; the stable
    # sort leaves the rows of each session in their original relative order.
    session_rank = pd.Series(np.arange(len(kept_sessions)), index=kept_sessions)
    row_rank = kept_rows['SessionId'].map(session_rank).to_numpy()
    return kept_rows.iloc[np.argsort(row_rank, kind='stable')]

In [16]:
# Keep only the sessions that have more than 10 rows (distinct items) in the de-duplicated data.
filtered_df = filter_session(dropped_duplicates,10)

In [17]:
# Show the filtered (SessionId, ItemId) pairs.
filtered_df

Unnamed: 0,SessionId,ItemId
576452,6149111,214848373
576453,6149111,214848384
576454,6149111,214835167
576455,6149111,214829765
576456,6149111,214848926
...,...,...
640546,6273507,214848688
640547,6273507,214848410
640548,6273507,214848337
640549,6273507,214826990


In [18]:
# Copy the two ID columns out as plain NumPy arrays.
session_id, item_id = (
    np.array(filtered_df[column]) for column in ('SessionId', 'ItemId')
)

In [19]:
from scipy.sparse import csr_matrix

# Session x item interaction matrix with a 1.0 for every (session, item) pair.
# The raw SessionId / ItemId values are used directly as row / column indices.
session_item_spm = csr_matrix(
    (np.ones(len(session_id)), (session_id, item_id))
)
num_users, num_items = session_item_spm.shape

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(session_item_spm):
    """Return the sparse item-item cosine similarity matrix.

    The session x item matrix is transposed so that items become the rows
    being compared; ``dense_output=False`` asks for a sparse result.
    """
    items_by_session = session_item_spm.transpose()
    return cosine_similarity(items_by_session, dense_output=False)

In [20]:
# Item-item cosine similarity computed from the session x item matrix.
item_sim = cosine_sim(session_item_spm)

In [21]:
# Shape of the item x item similarity matrix.
print(item_sim.shape)

(214981103, 214981103)


In [22]:
# Print the stored entries of the sparse matrix as "(row, col) value" lines.
print(item_sim)

  (214507331, 214832557)	0.5773502691896258
  (214507331, 214820458)	0.5
  (214507331, 214800262)	0.35355339059327373
  (214507331, 214783352)	1.0
  (214507331, 214748334)	0.4472135954999579
  (214507331, 214743821)	0.5773502691896258
  (214507331, 214716120)	1.0
  (214507331, 214648250)	0.4472135954999579
  (214507331, 214648247)	0.5
  (214507331, 214603138)	0.5
  (214507331, 214575665)	0.5773502691896258
  (214507331, 214517880)	1.0
  (214507331, 214507331)	1.0
  (214507365, 214850402)	0.16012815380508713
  (214507365, 214849768)	1.0
  (214507365, 214849766)	0.7071067811865475
  (214507365, 214849080)	0.2773500981126146
  (214507365, 214849001)	0.21320071635561041
  (214507365, 214845131)	0.15617376188860607
  (214507365, 214839973)	0.12126781251816648
  (214507365, 214837286)	0.21320071635561041
  (214507365, 214587317)	0.20851441405707477
  (214507365, 214586694)	0.30151134457776363
  (214507365, 214573350)	0.3333333333333333
  (214507365, 214507365)	1.0
  :	:
  (214865212, 2148324

In [35]:
# Print every stored item pair whose similarity is greater than n and less than m.
n = 0
m = 0.015
first_ptr = 214507331 # first itemID shown when item_sim is printed; lower rows are skipped

# In CSR format the entries of row r live in data[indptr[r]:indptr[r+1]].
# The slice bounds are read from indptr for each row instead of advancing a
# counter that starts at 0, so the printed row always matches the entry even
# when first_ptr is not the first non-empty row.
row_offsets = item_sim.indptr
for item in range(first_ptr, len(row_offsets) - 1):
  for pos in range(row_offsets[item], row_offsets[item + 1]):
    similarity = item_sim.data[pos]
    if n < similarity < m:
      print("(%d,%d) %f"%(item, item_sim.indices[pos], similarity))

(214716671,214853767) 0.013376
(214716671,214853702) 0.014982
(214716671,214844400) 0.014465
(214716671,214839973) 0.013474
(214716714,214748336) 0.013878
(214716746,214839973) 0.014097
(214748336,214854300) 0.013649
(214748336,214850739) 0.014506
(214748336,214853242) 0.014927
(214748336,214851080) 0.013649
(214748336,214716714) 0.013878
(214839973,214853767) 0.014599
(214839973,214716746) 0.014097
(214839973,214716671) 0.013474
(214844396,214853767) 0.014707
(214844398,214853767) 0.013901
(214844400,214716671) 0.014465
(214850739,214748336) 0.014506
(214851080,214748336) 0.013649
(214853242,214748336) 0.014927
(214853702,214716671) 0.014982
(214853767,214716671) 0.013376
(214853767,214844396) 0.014707
(214853767,214844398) 0.013901
(214853767,214839973) 0.014599
(214854300,214748336) 0.013649


In [28]:
def duplicate(Item_i, Item_j):
  """Count the session IDs shared by two item frames.

  Parameters
  ----------
  Item_i, Item_j : objects whose ``['SessionId']`` entry is iterable
      In this notebook, the rows of the filtered frame for one item each.

  Returns
  -------
  int
      Number of (row of Item_i, row of Item_j) pairs with equal SessionId.
      A session that occurs a times in Item_i and b times in Item_j
      contributes a * b, exactly like comparing every pair of rows.
  """
  from collections import Counter

  # Tally each side once instead of comparing every pair of rows.
  sessions_i = Counter(Item_i['SessionId'])
  sessions_j = Counter(Item_j['SessionId'])
  return sum(
    times * sessions_j[session]
    for session, times in sessions_i.items()
    if session in sessions_j
  )

In [29]:
# (214832315, 214832317) 0.894427
# Count the sessions the two items have in common.
Item_i = filtered_df[filtered_df['ItemId'] == 214832315]
Item_j = filtered_df[filtered_df['ItemId'] == 214832317]
duplicate(Item_i, Item_j)

4

In [36]:
# (214716671,214853767) 0.013376
# Count the sessions the two items have in common.
Item_i = filtered_df[filtered_df['ItemId'] == 214716671]
Item_j = filtered_df[filtered_df['ItemId'] == 214853767]
duplicate(Item_i, Item_j)

1