# ml-100k는 u.data
# ml-latest-small는 ratings.csv

In [1]:
import pandas as pd
import numpy as np
import os
from os.path import join

## 데이터 로딩

In [42]:
DATA_NAME = "Movielens"
# Relative locations of the two raw rating files. They are built with join()
# so the separator matches the host OS instead of a hard-coded backslash.
FILE_NAME1 = join("ml-100k", "u.data")
FILE_NAME2 = join("ml-latest-small", "ratings.csv")
COL_NAMES = ['user', 'item', 'rating', 'time']  # initial columns: user id | item id | rating | timestamp
BASE_DATA_RATIO = 6  # N/10: tenths of the rows that form the base block
INC_RATIO = 4
INC_STEP = 5  # number of incremental blocks written after the base block

# config
ROOT_PATH = os.path.abspath("")  # absolute path of the current working directory
DATA_PATH = join(ROOT_PATH, "dataset", DATA_NAME)
SAVE_PATH = DATA_PATH  # outputs are written next to the raw data
FILE_PATH1 = join(DATA_PATH, FILE_NAME1)
FILE_PATH2 = join(DATA_PATH, FILE_NAME2)
FILT_THRESHOLD = 10  # minimum number of interactions to keep a user/item
cols = ["user", "item", "time"]  # columns to use

print(f"""
ROOT_PATH: {ROOT_PATH}
DATA_PATH: {DATA_PATH}
SAVE_PATH: {SAVE_PATH}
FILE_PATH1: {FILE_PATH1}
FILE_PATH2: {FILE_PATH2}
""")


ROOT_PATH: C:\tech\Study\KCC
DATA_PATH: C:\tech\Study\KCC\dataset\Movielens
SAVE_PATH: C:\tech\Study\KCC\dataset\Movielens
FILE_PATH1: C:\tech\Study\KCC\dataset\Movielens\ml-100k\u.data
FILE_PATH2: C:\tech\Study\KCC\dataset\Movielens\ml-latest-small\ratings.csv



In [43]:
def print_info(df):
    """Print summary statistics of a user-item interaction table.

    One statistic is printed per line, with no trailing padding: the number
    of rows (edges), the number of distinct users and items, the smallest
    and largest user/item ids, and the min/max/mean number of rows per user
    and per item.

    Args:
        df: DataFrame with at least ``user`` and ``item`` columns.
    """
    # Compute each intermediate once instead of once per printed statistic.
    user_ids = df['user'].unique()
    item_ids = df['item'].unique()
    per_user = df['user'].value_counts()
    per_item = df['item'].value_counts()

    lines = [
        f"Total Edges : {len(df)}",
        f"Total User : {len(user_ids)}",
        f"Total item : {len(item_ids)}",
        f"Smallest user id : {user_ids.min()}",
        f"biggest user id : {user_ids.max()}",
        f"Smallest item id : {item_ids.min()}",
        f"biggest item id : {item_ids.max()}",
        f"Min Interaction Per user : {per_user.min()}",
        f"Max Interaction Per user : {per_user.max()}",
        f"Avg Interaction Per user : {per_user.mean()}",
        f"Min Interaction Per item : {per_item.min()}",
        f"Max Interaction Per item : {per_item.max()}",
        f"Avg Interaction Per item : {per_item.mean()}",
    ]
    # Joining explicit lines avoids the indentation whitespace that a
    # backslash-continued string literal would embed in the output.
    print("\n".join(lines))

In [44]:
# Load both raw rating files, labelling the columns with COL_NAMES.
# read_table is called without an explicit separator, so pandas' default
# for read_table applies.
ml_100k_df = pd.read_table(FILE_PATH1, names=COL_NAMES)
# skiprows=[0] skips the first line of ratings.csv
# (presumably its header row -- TODO confirm against the file).
ml_small_df = pd.read_csv(FILE_PATH2, names=COL_NAMES, skiprows=[0])

In [45]:
print_info(ml_100k_df)

Total Edges : 100000
Total User : 943
Total item : 1682                 
Smallest user id : 1                 
biggest user id : 943                 
Smallest item id : 1                 
biggest item id : 1682                 
Min Interaction Per user : 20                 
Max Interaction Per user : 737                 
Avg Interaction Per user : 106.04453870625663                
Min Interaction Per item : 1                 
Max Interaction Per item : 583                 
Avg Interaction Per item : 59.45303210463734


In [46]:
print_info(ml_small_df)

Total Edges : 100836
Total User : 610
Total item : 9724                 
Smallest user id : 1                 
biggest user id : 610                 
Smallest item id : 1                 
biggest item id : 193609                 
Min Interaction Per user : 20                 
Max Interaction Per user : 2698                 
Avg Interaction Per user : 165.30491803278687                
Min Interaction Per item : 1                 
Max Interaction Per item : 329                 
Avg Interaction Per item : 10.369806663924312


## 데이터 전처리

* 중복 제거: 동일 유저, 동일 아이템에 대한 평점은 가장 최근 것만
* 긍정 데이터: rating 5점 데이터
* 유효 데이터: interaction 10개 이상

1. 중복 제거 (중복이 없다.)

In [47]:
ml_small_df[ml_small_df.duplicated(subset=['user', 'item'])]

Unnamed: 0,user,item,rating,time


2. 평점 5점 이상은 1, 그 외는 0으로 이진화

In [48]:
def GetPositiveDf(df):
    """Return a copy of ``df`` whose ``rating`` column is binarized.

    Ratings greater than or equal to 5 become 1; every other rating becomes 0.
    The input frame is left untouched so the raw ratings remain available
    to the caller.

    Args:
        df: DataFrame with a numeric ``rating`` column.

    Returns:
        A new DataFrame with the same rows and columns and an int64
        ``rating`` column holding 0/1.
    """
    result = df.copy()
    # Vectorized comparison instead of a per-row Python lambda.
    result['rating'] = (result['rating'] >= 5).astype('int64')
    return result


ml_100k_positive_df = GetPositiveDf(ml_100k_df)
ml_small_positive_df = GetPositiveDf(ml_small_df)

In [49]:
ml_100k_positive_df

Unnamed: 0,user,item,rating,time
0,196,242,0,881250949
1,186,302,0,891717742
2,22,377,0,878887116
3,244,51,0,880606923
4,166,346,0,886397596
...,...,...,...,...
99995,880,476,0,880175444
99996,716,204,1,879795543
99997,276,1090,0,874795795
99998,13,225,0,882399156


In [50]:
print_info(ml_100k_positive_df)

Total Edges : 100000
Total User : 943
Total item : 1682                 
Smallest user id : 1                 
biggest user id : 943                 
Smallest item id : 1                 
biggest item id : 1682                 
Min Interaction Per user : 20                 
Max Interaction Per user : 737                 
Avg Interaction Per user : 106.04453870625663                
Min Interaction Per item : 1                 
Max Interaction Per item : 583                 
Avg Interaction Per item : 59.45303210463734


In [51]:
print_info(ml_small_positive_df)

Total Edges : 100836
Total User : 610
Total item : 9724                 
Smallest user id : 1                 
biggest user id : 610                 
Smallest item id : 1                 
biggest item id : 193609                 
Min Interaction Per user : 20                 
Max Interaction Per user : 2698                 
Avg Interaction Per user : 165.30491803278687                
Min Interaction Per item : 1                 
Max Interaction Per item : 329                 
Avg Interaction Per item : 10.369806663924312


3. 10개 미만 interaction 삭제

In [52]:
def GetVailidDf(fdf, threshold=None):
    """Drop users and items that have fewer than ``threshold`` interactions.

    Removing sparse items can push users below the threshold and vice versa,
    so the two filters are applied alternately (items first, then users)
    until every remaining user and item meets the threshold.

    Args:
        fdf: DataFrame with ``user`` and ``item`` columns, one row per
            interaction.
        threshold: minimum number of rows a user/item needs to be kept.
            Defaults to the module-level ``FILT_THRESHOLD``.

    Returns:
        A new DataFrame with the surviving rows and a fresh 0..n-1 index.
        The input frame is not modified.
    """
    if threshold is None:
        threshold = FILT_THRESHOLD

    # On an empty frame both minima are NaN, the comparisons are False and
    # the loop ends.
    while (fdf['user'].value_counts().min() < threshold
           or fdf['item'].value_counts().min() < threshold):
        item_counts = fdf.groupby('item')['user'].count()
        sparse_items = item_counts.index[item_counts < threshold]
        # Boolean masking selects rows by position; dropping by index label
        # would also remove unrelated rows that share a label.
        fdf = fdf[~fdf['item'].isin(sparse_items)]

        # User counts are taken after the item filter, on the reduced frame.
        user_counts = fdf.groupby('user')['item'].count()
        sparse_users = user_counts.index[user_counts < threshold]
        fdf = fdf[~fdf['user'].isin(sparse_users)]

    return fdf.reset_index(drop=True)


ml_100k_vailid_df = GetVailidDf(ml_100k_positive_df)
ml_small_vailid_df = GetVailidDf(ml_small_positive_df)

In [53]:
print_info(ml_100k_vailid_df)

Total Edges : 97953
Total User : 943
Total item : 1152                 
Smallest user id : 1                 
biggest user id : 943                 
Smallest item id : 1                 
biggest item id : 1615                 
Min Interaction Per user : 18                 
Max Interaction Per user : 589                 
Avg Interaction Per user : 103.87380699893956                
Min Interaction Per item : 10                 
Max Interaction Per item : 583                 
Avg Interaction Per item : 85.02864583333333


In [54]:
print_info(ml_small_vailid_df)

Total Edges : 81109
Total User : 609
Total item : 2269                 
Smallest user id : 1                 
biggest user id : 610                 
Smallest item id : 1                 
biggest item id : 187593                 
Min Interaction Per user : 11                 
Max Interaction Per user : 1634                 
Avg Interaction Per user : 133.183908045977                
Min Interaction Per item : 10                 
Max Interaction Per item : 329                 
Avg Interaction Per item : 35.7465843984134


4. 랜덤 섞기

In [55]:
def GetShuffledDf(df):
    """Return every row of ``df`` in a reproducible random order.

    Sampling the full fraction of rows with a fixed seed yields the same
    permutation on every run; the original index labels are kept.
    """
    shuffled = df.sample(frac=1, random_state=42)
    return shuffled

pre_ml_100k_df = GetShuffledDf(ml_100k_vailid_df)
pre_ml_small_df = GetShuffledDf(ml_small_vailid_df)

In [56]:
pre_ml_small_df

Unnamed: 0,user,item,rating,time
35476,288,1194,0,976120462
22818,186,748,0,1031080101
59851,472,1693,0,1345842979
63943,489,1393,0,1333101553
51944,414,647,0,961512495
...,...,...,...,...
6265,51,1033,0,1230930537
54886,429,225,0,828124615
76820,599,6888,0,1498505002
860,8,153,0,839463451


In [57]:
SAVE_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens'

In [58]:
# Persist the shuffled, filtered datasets. os.path.join is used (as in the
# cells that read these files back) instead of a hard-coded backslash.
pre_ml_100k_df.to_csv(os.path.join(SAVE_PATH, "preprocessed_ml_100k.csv"), index=False)
pre_ml_small_df.to_csv(os.path.join(SAVE_PATH, "preprocessed_ml_small.csv"), index=False)

## 시나리오1, 유저와 아이템 모두 증가. (increase)

In [59]:
pre_ml_100k_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_100k.csv"))
pre_ml_small_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_small.csv"))

In [60]:
def GetTimeSequenceDf(df):
    """Return a copy of ``df`` ordered chronologically by its ``time`` column.

    Numeric ``time`` values are interpreted as Unix timestamps in seconds.
    Without an explicit unit pandas reads integers as nanoseconds, which
    maps every MovieLens timestamp into the first seconds of 1970.
    Non-numeric values (e.g. already-parsed datetimes or date strings) are
    converted with the default parser.

    Args:
        df: DataFrame with a ``time`` column.

    Returns:
        A new DataFrame with ``time`` as datetime64 values, sorted ascending
        by ``time``. The input frame is not modified.
    """
    result = df.copy()
    if pd.api.types.is_numeric_dtype(result['time']):
        result['time'] = pd.to_datetime(result['time'], unit='s')
    else:
        result['time'] = pd.to_datetime(result['time'])
    # A stable sort keeps rows with identical timestamps in their input order.
    return result.sort_values(by=['time'], kind='mergesort')

In [61]:
pre_ml_100k_df = GetTimeSequenceDf(pre_ml_100k_df)
pre_ml_small_df = GetTimeSequenceDf(pre_ml_small_df)

In [62]:
# Output directory for scenario 1 (increase). The empty last component makes
# os.path.join end the path with the host OS separator, so file names can be
# appended to it directly.
INCREASE_PATH = os.path.join(SAVE_PATH, "increase", "")
INCREASE_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens\\increase\\'

In [63]:
# Write the base block
def SetBaseBlock(df, path, category):
    """Save the leading ``BASE_DATA_RATIO``/10 share of ``df`` as the base block.

    The block size is printed and the rows are written, without the index
    column, to ``path + category + "_inc0.csv"``. The target directory is
    created first if it does not exist.

    Args:
        df: DataFrame whose first rows form the base block.
        path: output location prefix; it is concatenated with the file name
            as-is, so it must already end with a path separator.
        category: dataset tag used as the file-name prefix.
    """
    pivot = len(df) * BASE_DATA_RATIO // 10
    print(f"base block size: {pivot}")
    out_file = path + category + "_inc0.csv"
    out_dir = os.path.dirname(out_file)
    # dirname is empty when the file goes to the working directory.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df[:pivot].to_csv(out_file, index=False)


SetBaseBlock(pre_ml_100k_df, INCREASE_PATH, "ml_100k")
SetBaseBlock(pre_ml_small_df, INCREASE_PATH, "ml_small")

base block size: 58771
base block size: 48665


In [64]:
def SetIncreaseBlocks(df, category):
    """Split the rows after the base block into ``INC_STEP`` CSV files.

    The first ``len(df) * BASE_DATA_RATIO // 10`` rows are skipped as the
    base block. The remaining rows are cut, in order, into ``INC_STEP``
    blocks of equal length, with the last block also taking any leftover
    rows. Each block size is printed and block ``k`` is written without its
    index to ``INCREASE_PATH + category + "_inc{k}.csv"``.

    Args:
        df: full interaction DataFrame, already in the desired row order.
        category: dataset tag used as the file-name prefix.
    """
    base_size = len(df) * BASE_DATA_RATIO // 10
    tail = df[base_size:]
    block_len = tail.shape[0] // INC_STEP
    final_step = INC_STEP - 1

    for i in range(INC_STEP):
        lower = i * block_len
        # An open upper bound lets the final block absorb the remainder.
        upper = None if i == final_step else lower + block_len
        block = tail[lower:upper]
        print(f"Inc{i + 1} Block Size : {len(block)}")
        block.to_csv(INCREASE_PATH + category + f"_inc{i + 1}.csv", index=False)


SetIncreaseBlocks(pre_ml_100k_df, "ml_100k")
SetIncreaseBlocks(pre_ml_small_df, "ml_small")

Inc1 Block Size : 7836
Inc2 Block Size : 7836
Inc3 Block Size : 7836
Inc4 Block Size : 7836
Inc5 Block Size : 7838
Inc1 Block Size : 6488
Inc2 Block Size : 6488
Inc3 Block Size : 6488
Inc4 Block Size : 6488
Inc5 Block Size : 6492


## 시나리오2, 유저와 아이템 모두 고정. (fixed)

In [65]:
pre_ml_100k_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_100k.csv"))
pre_ml_small_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_small.csv"))

In [66]:
def GetTimeSequenceDf(df):
    """Return a copy of ``df`` ordered chronologically by its ``time`` column.

    Numeric ``time`` values are interpreted as Unix timestamps in seconds.
    Without an explicit unit pandas reads integers as nanoseconds, which
    maps every MovieLens timestamp into the first seconds of 1970.
    Non-numeric values (e.g. already-parsed datetimes or date strings) are
    converted with the default parser.

    Args:
        df: DataFrame with a ``time`` column.

    Returns:
        A new DataFrame with ``time`` as datetime64 values, sorted ascending
        by ``time``. The input frame is not modified.
    """
    result = df.copy()
    if pd.api.types.is_numeric_dtype(result['time']):
        result['time'] = pd.to_datetime(result['time'], unit='s')
    else:
        result['time'] = pd.to_datetime(result['time'])
    # A stable sort keeps rows with identical timestamps in their input order.
    return result.sort_values(by=['time'], kind='mergesort')

In [67]:
pre_ml_100k_df = GetTimeSequenceDf(pre_ml_100k_df)
pre_ml_small_df = GetTimeSequenceDf(pre_ml_small_df)

In [68]:
# Output directory for scenario 2 (fixed). The empty last component makes
# os.path.join end the path with the host OS separator, so file names can be
# appended to it directly.
FIXED_PATH = os.path.join(SAVE_PATH, "fixed", "")
FIXED_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens\\fixed\\'

In [69]:
# base block 설정
SetBaseBlock(pre_ml_100k_df, FIXED_PATH, "ml_100k")
SetBaseBlock(pre_ml_small_df, FIXED_PATH, "ml_small")

base block size: 58771
base block size: 48665


In [70]:
def SetFixedBlocks(df, category):
    """Write incremental blocks restricted to users and items of the base block.

    The first ``len(df) * BASE_DATA_RATIO // 10`` rows form the base block.
    Of the rows after it, only those whose ``user`` and ``item`` both occur
    in the base block are kept. The kept rows are cut, in order, into
    ``INC_STEP`` blocks of equal length, with the last block also taking any
    leftover rows. Each block size is printed and block ``k`` is written
    without its index to ``FIXED_PATH + category + "_inc{k}.csv"``.

    Args:
        df: full interaction DataFrame with ``user`` and ``item`` columns.
        category: dataset tag used as the file-name prefix.
    """
    base_size = len(df) * BASE_DATA_RATIO // 10
    base_rows = df[:base_size]
    tail = df[base_size:]

    # Users and items already seen in the base block.
    seen_users = base_rows['user'].unique()
    seen_items = base_rows['item'].unique()

    # Keep only later rows whose user and item are both already known.
    user_known = tail['user'].isin(seen_users)
    item_known = tail['item'].isin(seen_items)
    kept = tail[user_known & item_known]

    block_len = kept.shape[0] // INC_STEP
    final_step = INC_STEP - 1
    for i in range(INC_STEP):
        lower = i * block_len
        # An open upper bound lets the final block absorb the remainder.
        upper = None if i == final_step else lower + block_len
        block = kept[lower:upper]
        print(f"Inc{i + 1} Block Size : {len(block)}")
        block.to_csv(FIXED_PATH + category + f"_inc{i + 1}.csv", index=False)


SetFixedBlocks(pre_ml_100k_df, "ml_100k")
SetFixedBlocks(pre_ml_small_df, "ml_small")

Inc1 Block Size : 1040
Inc2 Block Size : 1040
Inc3 Block Size : 1040
Inc4 Block Size : 1040
Inc5 Block Size : 1041
Inc1 Block Size : 319
Inc2 Block Size : 319
Inc3 Block Size : 319
Inc4 Block Size : 319
Inc5 Block Size : 322


## 시나리오3, 유저만 증가 (user)

In [46]:
pre_ml_100k_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_100k.csv"))
pre_ml_small_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_small.csv"))

In [47]:
# Output directory for scenario 3 (user). The empty last component makes
# os.path.join end the path with the host OS separator, so file names can be
# appended to it directly.
USER_PATH = os.path.join(SAVE_PATH, "user", "")
USER_PATH

'C:\\Users\\PC\\Desktop\\Study\\KCC-지원-dataset\\dataset\\Movielens\\user\\'

In [48]:
# base block 설정
SetBaseBlock(pre_ml_100k_df, USER_PATH, "ml_100k")
SetBaseBlock(pre_ml_small_df, USER_PATH, "ml_small")

base block size: 56275
base block size: 45332


In [49]:
def SetUserBlocks(df, category):
    """Write incremental blocks that contain only users absent from the base block.

    The first ``len(df) * BASE_DATA_RATIO // 10`` rows form the base block.
    Rows after it whose ``user`` occurs in the base block are discarded; the
    rest are cut, in order, into ``INC_STEP`` blocks of equal length, with
    the last block also taking any leftover rows. Each block size is printed
    and block ``k`` is written to ``USER_PATH + category + "_inc{k}.csv"``.

    Args:
        df: full interaction DataFrame with a ``user`` column.
        category: dataset tag used as the file-name prefix.
    """
    pivot = len(df) * BASE_DATA_RATIO // 10
    # Users that already appear in the base block.
    unique_user = df[:pivot]['user'].unique()
    # Build the remainder from rows whose user is not in the base block.
    remain = df[pivot:]
    new_user_remain_df = remain[~remain['user'].isin(unique_user)]
    len_per_block = new_user_remain_df.shape[0] // INC_STEP
    start = 0
    for i in range(INC_STEP):
        if i != INC_STEP - 1:
            block = new_user_remain_df[start:start + len_per_block]
        else:
            # The last block takes everything that is left.
            block = new_user_remain_df[start:]
        start += len_per_block
        print(f"Inc{i + 1} Block Size : {len(block)}")
        # index=False keeps the columns identical to the "_inc0" file that
        # SetBaseBlock writes; otherwise an extra unnamed index column appears.
        block.to_csv(USER_PATH + category + f"_inc{i + 1}.csv", index=False)


SetUserBlocks(pre_ml_100k_df, "ml_100k")
SetUserBlocks(pre_ml_small_df, "ml_small")

Inc1 Block Size : 4173
Inc2 Block Size : 4173
Inc3 Block Size : 4173
Inc4 Block Size : 4173
Inc5 Block Size : 4174
Inc1 Block Size : 3613
Inc2 Block Size : 3613
Inc3 Block Size : 3613
Inc4 Block Size : 3613
Inc5 Block Size : 3615


## 시나리오4, 아이템만 증가. (item)

In [50]:
pre_ml_100k_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_100k.csv"))
pre_ml_small_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_small.csv"))

In [51]:
# Output directory for scenario 4 (item). The empty last component makes
# os.path.join end the path with the host OS separator, so file names can be
# appended to it directly.
ITEM_PATH = os.path.join(SAVE_PATH, "item", "")
ITEM_PATH

'C:\\Users\\PC\\Desktop\\Study\\KCC-지원-dataset\\dataset\\Movielens\\item\\'

In [52]:
# base block 설정
SetBaseBlock(pre_ml_100k_df, ITEM_PATH, "ml_100k")
SetBaseBlock(pre_ml_small_df, ITEM_PATH, "ml_small")

base block size: 56275
base block size: 45332


In [53]:
def SetItemBlocks(df, category):
    """Write incremental blocks that contain only items absent from the base block.

    The first ``len(df) * BASE_DATA_RATIO // 10`` rows form the base block.
    Rows after it whose ``item`` occurs in the base block are discarded; the
    rest are cut, in order, into ``INC_STEP`` blocks of equal length, with
    the last block also taking any leftover rows. Each block size is printed
    and block ``k`` is written to ``ITEM_PATH + category + "_inc{k}.csv"``.

    Args:
        df: full interaction DataFrame with an ``item`` column.
        category: dataset tag used as the file-name prefix.
    """
    pivot = len(df) * BASE_DATA_RATIO // 10
    # Items that already appear in the base block.
    unique_item = df[:pivot]['item'].unique()
    # Build the remainder from rows whose item is not in the base block.
    remain = df[pivot:]
    new_item_remain_df = remain[~remain['item'].isin(unique_item)]
    len_per_block = new_item_remain_df.shape[0] // INC_STEP
    start = 0
    for i in range(INC_STEP):
        if i != INC_STEP - 1:
            block = new_item_remain_df[start:start + len_per_block]
        else:
            # The last block takes everything that is left.
            block = new_item_remain_df[start:]
        start += len_per_block
        print(f"Inc{i + 1} Block Size : {len(block)}")
        # index=False keeps the columns identical to the "_inc0" file that
        # SetBaseBlock writes; otherwise an extra unnamed index column appears.
        block.to_csv(ITEM_PATH + category + f"_inc{i + 1}.csv", index=False)

SetItemBlocks(pre_ml_100k_df, "ml_100k")
SetItemBlocks(pre_ml_small_df, "ml_small")

Inc1 Block Size : 68
Inc2 Block Size : 68
Inc3 Block Size : 68
Inc4 Block Size : 68
Inc5 Block Size : 69
Inc1 Block Size : 512
Inc2 Block Size : 512
Inc3 Block Size : 512
Inc4 Block Size : 512
Inc5 Block Size : 516
