In [1]:
import json
import zipfile
from collections import Counter, defaultdict

import pandas as pd

def parse_jsonl_gz_file(file_path):
    """Load review records from a ZIP archive of JSON Lines files.

    Despite the ``jsonl_gz`` in its name, this function opens ``file_path``
    with ``zipfile.ZipFile``; it does not read gzip streams. Every member of
    the archive is read line by line, each non-blank line is parsed as one
    JSON object, and only three of its fields are kept.

    Parameters
    ----------
    file_path : str, path-like or binary file object
        Passed straight to ``zipfile.ZipFile``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``business_id`` and ``date``, one row per parsed
        line, in archive-member order and then line order. Values are stored
        exactly as they appear in the JSON (``date`` is not parsed).

    Raises
    ------
    KeyError
        If a record lacks one of the three fields.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    columns = ('user_id', 'business_id', 'date')
    collected = {name: [] for name in columns}

    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            with zip_ref.open(file_name) as json_file:
                for line in json_file:
                    # Blank lines (for example an empty line at the end of a
                    # member) hold no record, and json.loads raises on them.
                    if not line.strip():
                        continue
                    data = json.loads(line)
                    for name in columns:
                        collected[name].append(data[name])

    return pd.DataFrame(collected)

# Zipped Yelp review dump, resolved relative to the notebook's working directory.
file_path = 'yelp_academic_dataset_review.json.zip'

# Keep only the (user_id, business_id, date) triple of every review.
df = parse_jsonl_gz_file(file_path=file_path)

# Preview the first five rows as plain text.
print(df.head(5))


                  user_id             business_id                 date
0  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw  2018-07-07 22:09:11
1  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ  2012-01-03 15:28:18
2  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A  2014-02-05 20:30:30
3  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA  2015-01-04 00:01:03
4  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ  2017-01-14 20:54:15


### 5-core

- 5번 미만 나타난 가게(business_id)와 사용자(user_id) 삭제

In [2]:
# Keep only businesses that appear in at least 5 review rows.
business_id_counts = df['business_id'].value_counts()
business_id_to_keep = [
    business for business, n_reviews in business_id_counts.items() if n_reviews >= 5
]
df = df.loc[df['business_id'].isin(business_id_to_keep)]

# Then, on the rows that remain, keep only users that appear in at least 5 rows.
# NOTE(review): each filter runs exactly once, so a business can drop back below
# 5 rows after the user filter - confirm whether a converged 5-core is wanted.
user_id_counts = df['user_id'].value_counts()
user_id_to_keep = [
    user for user, n_reviews in user_id_counts.items() if n_reviews >= 5
]
df = df.loc[df['user_id'].isin(user_id_to_keep)]
df

Unnamed: 0,user_id,business_id,date
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,2018-07-07 22:09:11
1,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,2012-01-03 15:28:18
2,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,2014-02-05 20:30:30
3,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,2015-01-04 00:01:03
4,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,2017-01-14 20:54:15
...,...,...,...
6990275,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,2014-12-17 21:45:20
6990276,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,2021-03-31 16:55:10
6990277,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,2019-12-30 03:56:30
6990278,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,2022-01-19 18:59:27


In [3]:
# Replace the string ids with dense integer codes (0, 1, 2, ... in order of first
# appearance). `assign` builds a new frame instead of writing columns into `df`,
# which at this point is a boolean-mask selection of the loaded frame; assigning
# columns on such a selection raises pandas' SettingWithCopyWarning. Indexing the
# factorize result with [0] also avoids rebinding `_`, IPython's last-output name.
df = df.assign(
    user_id=pd.factorize(df['user_id'])[0],
    business_id=pd.factorize(df['business_id'])[0],
)

# Drop exact repeats of the same (user, business, timestamp) row.
df = df.drop_duplicates(subset=['user_id', 'business_id', 'date'])

# One row per user, holding that user's business codes as a list in row order
# (the rows are not sorted by date here).
gropuby_df = df.groupby('user_id')['business_id'].apply(list).reset_index()
gropuby_df


Unnamed: 0,user_id,business_id
0,0,"[0, 671, 2189, 18981, 18303, 15907, 15363, 305..."
1,1,"[1, 595, 577, 55009, 46113, 118965, 125986, 13..."
2,2,"[2, 164, 2182, 2182, 12606, 2182, 1212, 15366,..."
3,3,"[3, 16043, 77338, 75914, 90555]"
4,4,"[4, 6333, 4, 3175, 1191, 1567, 1051, 1338, 101..."
...,...,...
287111,287111,"[135358, 135358, 135358, 135358, 135358]"
287112,287112,"[134083, 134083, 134083, 134083, 134083, 134083]"
287113,287113,"[145040, 145040, 145040, 145040, 145040, 145040]"
287114,287114,"[135603, 135603, 135603, 135603, 135603]"


In [4]:
# Helper that pulls out the items occurring more than once in a list, so they
# can be stored in a new column.
def extract_duplicates(items):
    """Return the distinct items that occur more than once in ``items``.

    Parameters
    ----------
    items : iterable of hashable
        The values to inspect (here: one user's list of business codes).

    Returns
    -------
    list
        Each repeated value exactly once, ordered by where the value first
        appears in ``items``. Empty when nothing repeats.
    """
    # A single counting pass replaces calling items.count() for every element,
    # which was quadratic in the list length. Counter keeps keys in
    # first-insertion order, so the result order matches the original loop.
    occurrences = Counter(items)
    return [item for item, seen in occurrences.items() if seen > 1]

# Store, for every user, the businesses that appear more than once in their list.
repeated_per_user = gropuby_df['business_id'].map(extract_duplicates)
gropuby_df['duplicated_items'] = repeated_per_user
del repeated_per_user  # temporary only; keep the notebook namespace unchanged

print(gropuby_df)

        user_id                                        business_id  \
0             0  [0, 671, 2189, 18981, 18303, 15907, 15363, 305...   
1             1  [1, 595, 577, 55009, 46113, 118965, 125986, 13...   
2             2  [2, 164, 2182, 2182, 12606, 2182, 1212, 15366,...   
3             3                    [3, 16043, 77338, 75914, 90555]   
4             4  [4, 6333, 4, 3175, 1191, 1567, 1051, 1338, 101...   
...         ...                                                ...   
287111   287111           [135358, 135358, 135358, 135358, 135358]   
287112   287112   [134083, 134083, 134083, 134083, 134083, 134083]   
287113   287113   [145040, 145040, 145040, 145040, 145040, 145040]   
287114   287114           [135603, 135603, 135603, 135603, 135603]   
287115   287115           [133741, 134043, 134043, 134043, 134043]   

                                         duplicated_items  
0                                                      []  
1                                      

In [5]:
# Distinct values of "number of businesses a user reviewed more than once".
gropuby_df['duplicated_items'].map(len).unique()

array([  0,   6,  14,   2,   1,  17,   3,  11, 141,  10,   5,  26,   7,
         8,   4,  19,   9, 236,  43,  15,  59,  12,  29,  20,  28,  16,
        47,  13,  82,  39, 538,  22,  18, 112,  48,  23,  25, 114,  35,
        36,  27,  75,  50,  31,  24,  74,  34,  41,  49,  40,  77,  67,
        71,  37,  42,  58,  65,  38,  89,  32,  21,  57, 187,  30,  45,
       302,  93,  68,  51,  56,  97,  33,  84,  53,  44, 104,  46, 128,
        83,  99,  70, 142,  81, 173,  76,  66,  63, 121,  86, 172,  88,
       170,  94,  54,  80,  62, 140, 103,  55,  92,  61,  60,  52, 109,
        90,  64, 100, 107, 247])

In [6]:
# Number of users who reviewed at least one business more than once.
len(gropuby_df.loc[gropuby_df['duplicated_items'].map(len).ge(1)])

80019

In [7]:
# Share (in percent) of users who reviewed at least one business more than once.
# Computed from the frame instead of pasting in the totals printed by earlier
# cells, so the figure cannot go stale when the data or the filtering changes.
users_with_repeats = int((gropuby_df['duplicated_items'].map(len) >= 1).sum())
users_with_repeats / len(gropuby_df) * 100

27.869920171637947

In [8]:
# Number of distinct business codes left in the filtered review frame.
df['business_id'].nunique(dropna=False)

148523

In [9]:
# Pool every user's repeated businesses into one set, so each business is kept
# once no matter how many users reviewed it repeatedly.
unique_items = {
    business
    for repeated in gropuby_df['duplicated_items']
    for business in repeated
}

# List form of the pooled set; its length is the number of distinct businesses
# that received repeat reviews from at least one user.
unique_items_list = list(unique_items)
len(unique_items_list)

59734

In [10]:
# Share (in percent) of businesses that at least one user reviewed more than once.
# Computed from the live objects rather than from totals copied out of earlier
# cell outputs, so it stays correct if the upstream data or filtering changes.
len(unique_items_list) / df['business_id'].nunique(dropna=False) * 100

40.21868666805815