## Amazon data load

In [55]:
import gzip
import json
import pandas as pd
from collections import defaultdict

user_reviews= defaultdict(list)

def parse_jsonl_gz_file(file_path):
    user_id_list = []
    asin_list = []
    timestamp_list = []
    
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line.strip())
            user_id_list.append(data['reviewerID'])
            asin_list.append(data['asin'])
            timestamp_list.append(data['unixReviewTime'])
    
    df = pd.DataFrame({
        'user_id': user_id_list,
        'asin': asin_list,
        'timestamp': timestamp_list
    })
    
    return df

file_path = 'data/Electronics_5.json.gz'

df = parse_jsonl_gz_file(file_path)

print(df.head()) 


          user_id        asin   timestamp
0   AAP7PPBU72QFM  0151004714   937612800
1  A2E168DTVGE6SV  0151004714  1382486400
2  A1ER5AYS3FQ9O3  0151004714  1220313600
3  A1T17LMQABMBN5  0151004714   968025600
4  A3QHJ0FXK33OBE  0151004714   949622400


In [56]:
duplicate_rows = df.duplicated(keep=False)
df_unique = df[~duplicate_rows]

In [57]:
# df_unique['user_id'],_ = pd.factorize(df['user_id'])
# df['asin'],_ = pd.factorize(df['asin'])  
# df = df.drop_duplicates(subset=['user_id', 'asin', 'timestamp']) # 중복 데이터 제거

gropuby_df= df_unique.groupby('user_id')['asin'].apply(list).reset_index()
gropuby_df



Unnamed: 0,user_id,asin
0,A0001528BGUBOEVR6T5U,"[B0055Q2VS8, B005XU7OL0, B007DJTBVK, B008FKDK2M]"
1,A0003214FKMKJE0PCW3D,"[B000P1O73A, B004XJ6R0Q, B004XVN1V2, B00BFO14W..."
2,A0008898NGEXICE0F146,"[B00005LEOO, B00009XVCZ, B009NVXGAI, B00KOUIC0..."
3,A00101847G3FJTWYGNQA,"[B0002JFNP6, B00453R808, B004WJRBUW, B005F778J..."
4,A0014476IZYE2N3XTK4K,"[B00A66XB54, B00DIF2BO2, B00L9K3MY6]"
...,...,...
728479,AZZYW4YOE1B6E,"[B000CKVOOY, B000EH0NLK, B000OOYECC, B001AK049..."
728480,AZZZ3LGTCGUZF,"[B004M8SBCK, B00EI4OIGQ, B017ANBP9Y, B01CG4C7F..."
728481,AZZZF5PSCPILV,"[B00JH23QMQ, B00RL9U878, B0123UHBBU, B01DXVKQE..."
728482,AZZZRS1YZ8HVP,"[B001CJOLBW, B00EZ9XKCM, B01CVOLKKQ]"


In [58]:
# 중복된 항목을 추출하여 새로운 컬럼에 저장하는 함수 정의
def extract_duplicates(items):
    duplicate_items = []
    for item in items:
        if items.count(item) > 1 and item not in duplicate_items:
            duplicate_items.append(item)
    return duplicate_items

# 새로운 컬럼에 중복된 항목 저장
gropuby_df['duplicated_items'] = gropuby_df['asin'].apply(extract_duplicates)

print(gropuby_df)

                     user_id  \
0       A0001528BGUBOEVR6T5U   
1       A0003214FKMKJE0PCW3D   
2       A0008898NGEXICE0F146   
3       A00101847G3FJTWYGNQA   
4       A0014476IZYE2N3XTK4K   
...                      ...   
728479         AZZYW4YOE1B6E   
728480         AZZZ3LGTCGUZF   
728481         AZZZF5PSCPILV   
728482         AZZZRS1YZ8HVP   
728483         AZZZU3P1UQZ0C   

                                                     asin duplicated_items  
0        [B0055Q2VS8, B005XU7OL0, B007DJTBVK, B008FKDK2M]               []  
1       [B000P1O73A, B004XJ6R0Q, B004XVN1V2, B00BFO14W...               []  
2       [B00005LEOO, B00009XVCZ, B009NVXGAI, B00KOUIC0...               []  
3       [B0002JFNP6, B00453R808, B004WJRBUW, B005F778J...               []  
4                    [B00A66XB54, B00DIF2BO2, B00L9K3MY6]               []  
...                                                   ...              ...  
728479  [B000CKVOOY, B000EH0NLK, B000OOYECC, B001AK049...               []  

In [59]:
len(gropuby_df[gropuby_df['duplicated_items'].apply(lambda x:len(x))>=1])  # 재구매 user 수

7780

In [60]:
7780/728484*100

1.0679712938101593

In [61]:
len(df['asin'].unique())

160052

In [62]:
# 중복을 제거할 집합(set) 생성
unique_items = set()

# 데이터프레임의 duplicated_items 컬럼을 순회하면서 중복 없이 원소 추가
for row in gropuby_df['duplicated_items']:
    unique_items.update(row)

# 중복 없이 추가된 원소들을 리스트로 변환
unique_items_list = list(unique_items)

In [63]:
len(unique_items_list)

3645

In [64]:
3645/160052*100

2.2773848499237745

## 범위 확장 (동일 item-> 브랜드 or 카테고리)
- 둘 중에 더 구별이 잘 되는 범주로 선택

In [143]:
import gzip
import json
import pandas as pd

def parse(path):
    with gzip.open(path, 'rb') as f:
        for line in f:
            yield json.loads(line)

def get_items_meta(meta_path):
    item2price = {}
    item2category = {}
    item2brand = {}


    for l in parse(meta_path):
        asin = l['asin']
        item2category[asin] = l['category']
        item2price[asin] = l['price'][1:] if 'price' in l else 0.0
        item2brand[asin] = l['brand'] if 'brand' in l else ''

    items_meta = {
        'item2price': item2price,
        'item2category': item2category,
        'item2brand': item2brand
    }

    return items_meta


file_path = 'data/Grocery_and_Gourmet_Food/meta_Grocery_and_Gourmet_Food.json.gz'
items_meta=get_items_meta(file_path)
items_meta

{'item2price': {'0681727810': '41.91',
  '0853347867': '',
  '1888861118': '29.95',
  '1888861517': '11.45',
  '1888861614': '15.00',
  '1888861533': '10.99',
  '1888861339': '12.91',
  '1888861355': '12.99',
  '188886155X': '10.99',
  '1888861207': '11.00',
  '4639725043': '12.46',
  '4639725183': '12.98',
  '4858582000': '25.93',
  '5236363640': '24.99',
  '541255556X': '18.99',
  '5463213682': '29.90',
  '5478541265': '',
  '5643467860': '',
  '6040343958': '',
  '6040763906': '',
  '6163058011': '49.98',
  '618205610X': '9.28',
  '6182055979': '',
  '6182056010': '15.20',
  '7107272160': '107.99',
  '7531848279': '52.99',
  '7656543546': '20.30',
  '7621000880': '',
  '7656453768': '',
  '7656453458': '',
  '7621915358': '',
  '7800648702': '',
  '8166679507': '',
  '8557758987': '10.99',
  '9177121805': 'a-box-inner{background-color:#fff}#alohaBuyBoxWidget .selected{background-color:#fffbf3;border-color:#e77600;box-shadow:0 0 3px rgba(228,121,17,.5)}#alohaBuyBoxWidget .contract-no

### brand 매핑

In [177]:
itemid_to_brand = {}
for itemid, brand in items_meta['item2brand'].items():
    itemid_to_brand[itemid] = brand

df['brand']=df['asin'].map(itemid_to_brand)
df

Unnamed: 0,user_id,asin,timestamp,brand
0,A1QVBUH9E1V6I8,4639725183,1416355200,Lipton
1,A3GEOILWLK86XM,4639725183,1476316800,Lipton
2,A32RD6L701BIGP,4639725183,1448064000,Lipton
3,A2UY1O1FBGKIE6,4639725183,1439337600,Lipton
4,A3QHVBQYDV7Z6U,4639725183,1432771200,Lipton
...,...,...,...,...
1143855,A223YRQH2Z5T1D,B01HJF6FRA,1504828800,Hampton Creek
1143856,A38GDA4TB9EILT,B01HJF6FRA,1501804800,Hampton Creek
1143857,A2025PN7HDC5BO,B01HJF6FRA,1499126400,Hampton Creek
1143858,A1NY7XWC7EPQOA,B01HJF6FRA,1496793600,Hampton Creek


### category 매핑

In [178]:
itemid_to_category = {}
for itemid, category in items_meta['item2category'].items():
    itemid_to_category[itemid] = category

df['category']=df['asin'].map(itemid_to_category)
df

Unnamed: 0,user_id,asin,timestamp,brand,category
0,A1QVBUH9E1V6I8,4639725183,1416355200,Lipton,"[Grocery & Gourmet Food, Beverages, Coffee, Te..."
1,A3GEOILWLK86XM,4639725183,1476316800,Lipton,"[Grocery & Gourmet Food, Beverages, Coffee, Te..."
2,A32RD6L701BIGP,4639725183,1448064000,Lipton,"[Grocery & Gourmet Food, Beverages, Coffee, Te..."
3,A2UY1O1FBGKIE6,4639725183,1439337600,Lipton,"[Grocery & Gourmet Food, Beverages, Coffee, Te..."
4,A3QHVBQYDV7Z6U,4639725183,1432771200,Lipton,"[Grocery & Gourmet Food, Beverages, Coffee, Te..."
...,...,...,...,...,...
1143855,A223YRQH2Z5T1D,B01HJF6FRA,1504828800,Hampton Creek,"[Grocery & Gourmet Food, Condiments & Salad Dr..."
1143856,A38GDA4TB9EILT,B01HJF6FRA,1501804800,Hampton Creek,"[Grocery & Gourmet Food, Condiments & Salad Dr..."
1143857,A2025PN7HDC5BO,B01HJF6FRA,1499126400,Hampton Creek,"[Grocery & Gourmet Food, Condiments & Salad Dr..."
1143858,A1NY7XWC7EPQOA,B01HJF6FRA,1496793600,Hampton Creek,"[Grocery & Gourmet Food, Condiments & Salad Dr..."


### userid, timestamp 순으로 정렬

In [179]:
df= df.sort_values(by=['user_id', 'timestamp'])

In [182]:
df

Unnamed: 0,user_id,asin,timestamp,brand,category
402831,A0096681Y127OL1H8W3U,B002UGMH9Y,1425513600,Big Tree Farms,"[Grocery & Gourmet Food, Cooking & Baking, Sug..."
209596,A0096681Y127OL1H8W3U,B0013OX8II,1435104000,Jarrow,"[Grocery & Gourmet Food, Cooking & Baking, Coo..."
263297,A0096681Y127OL1H8W3U,B0013OX8II,1435104000,Jarrow,"[Grocery & Gourmet Food, Cooking & Baking, Coo..."
570023,A0096681Y127OL1H8W3U,B006RNQ7YW,1435104000,Dr. Bronner's,"[Grocery & Gourmet Food, Cooking & Baking, Coo..."
111001,A0096681Y127OL1H8W3U,B000HDI5O8,1493769600,Farmer's Market Foods,"[Grocery & Gourmet Food, Cooking & Baking, Pie..."
...,...,...,...,...,...
588636,AZZZ5UJWUVCYZ,B007IS64J8,1505865600,Ice Breakers,"[Grocery & Gourmet Food, Candy & Chocolate, Mi..."
848720,AZZZ5UJWUVCYZ,B00LPF63DW,1505865600,Atkins,"[Grocery & Gourmet Food, Candy & Chocolate]"
1135281,AZZZ5UJWUVCYZ,B0147QHDKM,1505865600,Halo Top,"[Grocery & Gourmet Food, Frozen, Ice Cream & N..."
1143594,AZZZ5UJWUVCYZ,B01H2R5IT6,1505865600,Wang Derm,"[Grocery & Gourmet Food, Beverages, Coffee, Te..."


### user 별 같은 brand의 item을 구매한 주기 추출

- user_reviews에 brand info mapping

In [190]:
user_reviews_brand = defaultdict(list)

# user_reviews 딕셔너리를 데이터프레임으로 변환
user_reviews_df = pd.DataFrame([(user, itemid, timestamp) for user, interactions in user_reviews.items() for itemid, timestamp in interactions], columns=['user_id', 'asin', 'timestamp'])

# user_id, asin, timestamp를 기준으로 df와 조인
merged_df = pd.merge(user_reviews_df, df, left_on=['user_id', 'asin', 'timestamp'], right_on=['user_id', 'asin', 'timestamp'], how='inner')

# user_reviews_brand 딕셔너리에 정보 추가
for index, row in merged_df.iterrows():
    user_reviews_brand[row['user_id']].append([row['asin'], row['timestamp'], row['brand']])


user_reviews_brand = defaultdict(list)

# user_reviews 딕셔너리를 순회하면서 조건에 맞는 데이터프레임 행을 필터링하여 정보 추가
for user, interactions in user_reviews.items():
    for itemid, timestamp in interactions:
        filtered_df = df[(df['user_id'] == user) & (df['asin'] == itemid) & (df['timestamp'] == timestamp)]
        for index, row in filtered_df.iterrows():
            user_reviews_brand[user].append([itemid, timestamp, row['brand']])

user_reviews_brand

KeyboardInterrupt: 

In [171]:
from collections import defaultdict
import numpy as np

def calculate_purchase_intervals(interactions):
    item_period=defaultdict(list)

    for user, interactions_list in interactions.items():
        interactions_list.sort(key=lambda x: x[1])# timestamp sort

        item_count = defaultdict(int)
        item_timestamps = defaultdict(list)
    
        for item, timestamp in interactions_list:
            item_count[item] += 1
            item_timestamps[item].append(timestamp)

        
        for item, count in item_count.items():
            if count == 1:
                item_period[user].append((item, 0))

            elif count == 2:
                period = item_timestamps[item][1] - item_timestamps[item][0]
                item_period[user].append((item, period))
            else: # count>=3
                periods = []
                timestamps = item_timestamps[item]
                for i in range(len(timestamps) - 1):
                    periods.append(abs(timestamps[i + 1] - timestamps[i]))
                if periods:
                    avg_period = np.mean(periods)
                    item_period[user].append((item, avg_period))

    return item_period


calculate_purchase_intervals(user_reviews)

defaultdict(list,
            {'A1QVBUH9E1V6I8': [('4639725183', 0),
              ('B000E6LBXK', 0),
              ('B00KC6C8M0', 0),
              ('B004CPAP5E', 0),
              ('B00863B5JI', 0)],
             'A3GEOILWLK86XM': [('B001XSMANI', 0),
              ('B009PARMMA', 0),
              ('B000H24066', 0),
              ('B001G7QL72', 0),
              ('B00845B7NW', 0),
              ('B000PWWQP0', 0),
              ('B001M074O2', 0),
              ('B004SKOGZI', 0),
              ('B00374XTQI', 0),
              ('B0091YGEJ2', 0),
              ('B0007QMT7O', 0),
              ('B0028667PG', 0),
              ('B00E3A1SQS', 0),
              ('B00006FWVX', 0),
              ('B0000VM9EE', 0),
              ('B004Z0K2N6', 0),
              ('B000GFYRHQ', 0),
              ('B01C4N8ICC', 0),
              ('4639725183', 0)],
             'A32RD6L701BIGP': [('B000ES5GMK', 0),
              ('B005S8XXW6', 0),
              ('B0074LM402', 0),
              ('B0093NRWPG', 0),
  

In [60]:
from collections import Counter
counter=Counter(df['asin'].tolist())

In [61]:
# Counter 객체에서 값(value)이 1 이상인 키(key)들을 추출
filtered_keys = [key for key, value in counter.items() if value > 1]

# 값이 1 이상인 키들에 해당하는 값(value)들을 데이터프레임의 열(column)로 변환
values_list = [counter[key] for key in filtered_keys]
df = pd.DataFrame(values_list, columns=['count'])

# 데이터프레임의 describe() 메서드를 사용하여 통계 정보 출력
description = df.describe()
description

Unnamed: 0,count
count,41306.0
mean,27.692006
std,95.937762
min,2.0
25%,6.0
50%,10.0
75%,22.0
max,7387.0


# 데이터 확인

In [2]:
import gzip
import pickle as pkl
from collections import defaultdict
import numpy as np
import pandas as pd
import json
import random

def parse(path):
    with gzip.open(path, 'rb') as f:
        for line in f:
            yield json.loads(line)

def get_items_meta(meta_path, categories_used='all'):
    # item2price = {}
    item2category = {}
    item2brand = {}

    if categories_used == 'all':
        for l in parse(meta_path):

            asin = l['asin']
            item2category[asin] = l['category']
            # item2price[asin] = l['price'][1:] if 'price' in l else 0.0
            item2brand[asin] = l['brand'] if 'brand' in l else ''
    else:
        for l in parse(meta_path):
            asin = l['asin']
            item2category[asin] = l['category'][0] if l['category'] else ''  # 첫 번째 카테고리만 사용
            # item2price[asin] = l['price'][1:] if 'price' in l else 0.0
            item2brand[asin] = l['brand'] if 'brand' in l else ''

    items_meta = {
        # 'item2price': item2price,
        'item2category': item2category,
        'item2brand': item2brand
    }
    return items_meta

In [5]:
meta_path='data/Grocery_and_Gourmet_Food/meta_Grocery_and_Gourmet_Food.json.gz'
reviews_path='data/Grocery_and_Gourmet_Food/Grocery_and_Gourmet_Food_5.json.gz'
categories_used='all'
user2id = {'[PAD]': 0}
item2id = {'[PAD]': 0}
items_map = {
    # 'item2price': {},
    'item2category': {},
    'item2brand': {}
}
user_reviews = defaultdict(list)
action_times = []
items_meta = get_items_meta(meta_path, categories_used)

for l in parse(reviews_path):
    if l['reviewerID'] not in user2id:
        user2id[l['reviewerID']] = len(user2id)
    action_times.append(l['unixReviewTime'])
    user_reviews[l['reviewerID']].append([l['asin'], l['unixReviewTime']])

for u in user_reviews:
    user_reviews[u].sort(key=lambda x: x[1])
    for item, time in user_reviews[u]:
        if item not in item2id:
            item2id[item] = len(item2id)
item2_id_list=[] 
for item in item2id.keys():
    item2_id_list.append(item)  # review item 41321

items_meta_itemid_list=[]
not_match_item_id=[]
items_meta_itemid_list=set(list(items_meta['item2brand'].keys())) # meta data itemid

for item in item2_id_list:
    if item not in items_meta_itemid_list:
            not_match_item_id.append(item)# meta item
not_match_item_id.remove('[PAD]')  # 40

#change itemid: 41321->41281

remove_unmatch_item_item2id={}
for itemid in item2id:
    if itemid not in not_match_item_id: 
        if itemid not in remove_unmatch_item_item2id.keys(): # remove duplicate & first appear
            remove_unmatch_item_item2id[itemid] = len(remove_unmatch_item_item2id)

item2id=remove_unmatch_item_item2id

In [7]:
len(item2id)


41281

### 5번 이상 구매된 item 필터링

In [11]:
item_purchase_counts = defaultdict(int)

# 각 유저의 구매 기록을 사용하여 아이템별 구매 횟수를 계산
for user, interactions_list in user_reviews.items():
    for item, action_time in interactions_list:
        if item in item2id:  # item2id에 존재하는 아이템만 고려
            item_purchase_counts[item] += 1

# # 5번 이상 구매된 아이템 필터링
# items_over_5_purchases = {item: count for item, count in item_purchase_counts.items() if count >= 5}


In [13]:
len(item_purchase_counts)

41280

In [15]:
user_over_5_item_purchases = {}

for user, interactions_list in user_reviews.items():
    item_count = sum(1 for item, action_time in interactions_list if item in item_purchase_counts)
    if item_count >= 5:
        user_over_5_item_purchases[user] = interactions_list


In [17]:
len(user_over_5_item_purchases)

127364