In [207]:
import pandas as pd
import numpy as np
import os
from os.path import join

## 데이터 로딩

In [208]:
DATA_NAME = "Movielens"
FILE_NAME1 = "ml-1m\\ratings.dat"
COL_NAMES = ['user', 'item', 'rating', 'time']  # 초기 컬럼  user id | item id | rating | timestamp
BASE_DATA_RATIO = 6  # N/10
INC_RATIO = 4
INC_STEP = 5

# config
ROOT_PATH = os.path.abspath("")
DATA_PATH = join(ROOT_PATH + "\\dataset", DATA_NAME)
SAVE_PATH = DATA_PATH
FILE_PATH1 = join(DATA_PATH, FILE_NAME1)
FILT_THRESHOLD = 10  # 최소 데이터 갯수
cols = ["user", "item", "time"]  # 사용할 컬럼

print(f"""
ROOT_PATH: {ROOT_PATH}
DATA_PATH: {DATA_PATH}
SAVE_PATH: {SAVE_PATH}
FILE_PATH1: {FILE_PATH1}
""")


ROOT_PATH: C:\tech\Study\KCC
DATA_PATH: C:\tech\Study\KCC\dataset\Movielens
SAVE_PATH: C:\tech\Study\KCC\dataset\Movielens
FILE_PATH1: C:\tech\Study\KCC\dataset\Movielens\ml-1m\ratings.dat



In [209]:
def print_info(df):
    """
    그래프 정보 출력
    """
    print(f"Total Edges : {len(df)}\nTotal User : {len(df['user'].unique())}\nTotal item : {len(df['item'].unique())} \
                \nSmallest user id : {df['user'].unique().min()} \
                \nbiggest user id : {df['user'].unique().max()} \
                \nSmallest item id : {df['item'].unique().min()} \
                \nbiggest item id : {df['item'].unique().max()} \
                \nMin Interaction Per user : {df.user.value_counts().min()} \
                \nMax Interaction Per user : {df.user.value_counts().max()} \
                \nAvg Interaction Per user : {df.user.value_counts().mean()}\
                \nMin Interaction Per item : {df.item.value_counts().min()} \
                \nMax Interaction Per item : {df.item.value_counts().max()} \
                \nAvg Interaction Per item : {df.item.value_counts().mean()}")

In [210]:
# Load File
ml_1m_df = pd.read_table(FILE_PATH1, names=COL_NAMES, sep="::")

  ml_1m_df = pd.read_table(FILE_PATH1, names=COL_NAMES, sep="::")


In [211]:
print_info(ml_1m_df)

Total Edges : 1000209
Total User : 6040
Total item : 3706                 
Smallest user id : 1                 
biggest user id : 6040                 
Smallest item id : 1                 
biggest item id : 3952                 
Min Interaction Per user : 20                 
Max Interaction Per user : 2314                 
Avg Interaction Per user : 165.5975165562914                
Min Interaction Per item : 1                 
Max Interaction Per item : 3428                 
Avg Interaction Per item : 269.88909875876953


## 데이터 전처리

* 중복 제거: 동일 유저, 동일 아이템에 대한 평점은 가장 최근 것만
* 긍정 데이터: rating 5점 데이터
* 유효 데이터: interaction 10개 이상

1. 중복 제거 (중복이 없다.)

In [212]:
ml_1m_df[ml_1m_df.duplicated(subset=['user', 'item'])]

Unnamed: 0,user,item,rating,time


2. 평점 5점 데이터는 1

In [213]:
def GetPositiveDf(df):
    df['rating'] = df['rating'].apply(lambda x: 1 if x >= 5 else 0)
    return df


ml_1m_positive_df = GetPositiveDf(ml_1m_df)

In [214]:
ml_1m_positive_df

Unnamed: 0,user,item,rating,time
0,1,1193,1,978300760
1,1,661,0,978302109
2,1,914,0,978301968
3,1,3408,0,978300275
4,1,2355,1,978824291
...,...,...,...,...
1000204,6040,1091,0,956716541
1000205,6040,1094,1,956704887
1000206,6040,562,1,956704746
1000207,6040,1096,0,956715648


In [215]:
print_info(ml_1m_positive_df)

Total Edges : 1000209
Total User : 6040
Total item : 3706                 
Smallest user id : 1                 
biggest user id : 6040                 
Smallest item id : 1                 
biggest item id : 3952                 
Min Interaction Per user : 20                 
Max Interaction Per user : 2314                 
Avg Interaction Per user : 165.5975165562914                
Min Interaction Per item : 1                 
Max Interaction Per item : 3428                 
Avg Interaction Per item : 269.88909875876953


3. 10개 미만 interaction 삭제

In [216]:
def GetVailidDf(fdf):
    while fdf.user.value_counts().min() < FILT_THRESHOLD or fdf.item.value_counts().min() < FILT_THRESHOLD:
        df_item = fdf.groupby('item').count()
        df_item = df_item[df_item.user < FILT_THRESHOLD]
        li = df_item.index.to_list()
        fdf = fdf.drop(fdf.loc[fdf.item.isin(li)].index)
        # print_info(fdf)
        df_usr = fdf.groupby('user').count()
        df_usr = df_usr[df_usr.item < FILT_THRESHOLD]
        li = df_usr.index.to_list()
        fdf = fdf.drop(fdf.loc[fdf.user.isin(li)].index)
    fdf = fdf.reset_index().drop(columns=['index'])
    return fdf


ml_1m_vailid_df = GetVailidDf(ml_1m_df)

In [217]:
print_info(ml_1m_vailid_df)

Total Edges : 998539
Total User : 6040
Total item : 3260                 
Smallest user id : 1                 
biggest user id : 6040                 
Smallest item id : 1                 
biggest item id : 3952                 
Min Interaction Per user : 17                 
Max Interaction Per user : 2233                 
Avg Interaction Per user : 165.32102649006623                
Min Interaction Per item : 10                 
Max Interaction Per item : 3428                 
Avg Interaction Per item : 306.30030674846626


4. 랜덤 섞기

In [218]:
def GetShuffledDf(df):
    return df.sample(frac=1, random_state=42)


pre_ml_1m_df = GetShuffledDf(ml_1m_vailid_df)

In [219]:
pre_ml_1m_df

Unnamed: 0,user,item,rating,time
485191,2993,2606,0,970740610
696630,4169,1067,0,965693112
983347,5952,265,0,957144461
435115,2664,2378,0,973455388
620077,3762,2661,0,966093685
...,...,...,...,...
259178,1587,1985,0,1010336240
365838,2136,736,0,974643667
131932,854,45,0,975352099
671155,4041,1440,0,965714059


In [220]:
SAVE_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens'

In [221]:
pre_ml_1m_df.to_csv(SAVE_PATH + "\\preprocessed_ml_1m.csv", index=False)

## 시나리오1, 유저와 아이템 모두 증가. (increase)

In [222]:
pre_ml_1m_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_1m.csv"))

In [223]:
def GetTimeSequenceDf(df):
    df['time'] = pd.to_datetime(df['time'])
    return df.sort_values(by=['time'])

In [224]:
pre_ml_1m_df = GetTimeSequenceDf(pre_ml_1m_df)

In [225]:
# 경로 설정
INCREASE_PATH = os.path.join(SAVE_PATH, "increase\\")
INCREASE_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens\\increase\\'

In [226]:
# base block 설정
def SetBaseBlock(df, path, category):
    pivot = len(df) * BASE_DATA_RATIO // 10
    print(f"base block size: {pivot}")
    df[:pivot].to_csv(path + category + "_inc0.csv", index=False)


SetBaseBlock(pre_ml_1m_df, INCREASE_PATH, "ml_1m")

base block size: 599123


In [227]:
def SetIncreaseBlocks(df, category):
    pivot = len(df) * BASE_DATA_RATIO // 10
    remain = df[pivot:]
    len_per_block = remain.shape[0] // INC_STEP
    start = 0
    inc_block = []
    for i in range(INC_STEP):
        if i != INC_STEP - 1:
            inc_block.append(remain[start:start + len_per_block])
        else:
            inc_block.append(remain[start:])
        start += len_per_block
        print(f"Inc{i + 1} Block Size : {len(inc_block[i])}")
        inc_block[i].to_csv(INCREASE_PATH + category + f"_inc{i + 1}.csv", index=False)


SetIncreaseBlocks(pre_ml_1m_df, "ml_1m")

Inc1 Block Size : 79883
Inc2 Block Size : 79883
Inc3 Block Size : 79883
Inc4 Block Size : 79883
Inc5 Block Size : 79884


## 시나리오2, 유저와 아이템 모두 고정. (fixed)

In [228]:
pre_ml_1m_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_1m.csv"))

In [229]:
def GetTimeSequenceDf(df):
    df['time'] = pd.to_datetime(df['time'])
    return df.sort_values(by=['time'])

In [230]:
pre_ml_1m_df = GetTimeSequenceDf(pre_ml_1m_df)

In [231]:
FIXED_PATH = os.path.join(SAVE_PATH, "fixed\\")
FIXED_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens\\fixed\\'

In [232]:
# base block 설정
SetBaseBlock(pre_ml_1m_df, FIXED_PATH, "ml_1m")

base block size: 599123


In [233]:
def SetFixedBlocks(df, category):
    pivot = len(df) * BASE_DATA_RATIO // 10
    # base block에 있는 unique한 user, item index 추출
    unique_user = df[:pivot]['user'].unique()
    unique_item = df[:pivot]['item'].unique()
    # remain block에서 unique user/item에 해당하는 데이터만 추출
    remain = df[pivot:]
    fixed_df = remain[remain['user'].isin(unique_user) & remain['item'].isin(unique_item)]
    len_per_block = fixed_df.shape[0] // INC_STEP
    start = 0
    inc_block = []
    for i in range(INC_STEP):
        if i != INC_STEP - 1:
            inc_block.append(fixed_df[start:start + len_per_block])
        else:
            inc_block.append(fixed_df[start:])
        start += len_per_block
        print(f"Inc{i + 1} Block Size : {len(inc_block[i])}")
        inc_block[i].to_csv(FIXED_PATH + category + f"_inc{i + 1}.csv", index=False)


SetFixedBlocks(pre_ml_1m_df, "ml_1m")

Inc1 Block Size : 13649
Inc2 Block Size : 13649
Inc3 Block Size : 13649
Inc4 Block Size : 13649
Inc5 Block Size : 13650


## 시나리오3, 유저만 증가 (user)

In [234]:
pre_ml_1m_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_1m.csv"))

In [235]:
USER_PATH = os.path.join(SAVE_PATH, "user\\")
USER_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens\\user\\'

In [236]:
ucdf = pre_ml_1m_df.groupby("user").count()
ucdf = ucdf.sample(frac=1, random_state=42).reset_index()

In [237]:
inc_user = [[] for i in range(INC_STEP+1)]
base_size = len(pre_ml_1m_df) * BASE_DATA_RATIO // 10
inc_size = len(pre_ml_1m_df)*INC_RATIO// 10 // INC_STEP

In [238]:
i = 0
cur = 0

while i < INC_STEP+1:
    count = 0

    if i==0:
        # Base block
        while count < base_size:
            row = ucdf.iloc[cur]
            inc_user[i].append(row['user'])
            count += row['item']
            cur += 1

        print(f"Base Block Size : {count}")
        print('Base Done')

    elif i == INC_STEP:
        inc_user[i] = ucdf.iloc[cur:]['user'].values.tolist()
        count = ucdf[cur:]['item'].values.sum()

        print(f'Inc{i} Done')
        print(f"Inc Block{i} Size : {count}")

    else:
        # Incremental block
        while count < inc_size:
            row = ucdf.iloc[cur]
            inc_user[i].append(row['user'])
            count += row['item']
            cur += 1

        print(f'Inc{i} Done')
        print(f"Inc Block{i} Size : {count}")



    result = pre_ml_1m_df[pre_ml_1m_df['user'].isin(inc_user[i])]
    print(result.shape)
    result.to_csv(USER_PATH+"ml_1m" + f"_inc{i}.csv", index=False)

    i += 1

Base Block Size : 599313
Base Done
(599313, 4)
Inc1 Done
Inc Block1 Size : 79962
(79962, 4)
Inc2 Done
Inc Block2 Size : 80090
(80090, 4)
Inc3 Done
Inc Block3 Size : 80175
(80175, 4)
Inc4 Done
Inc Block4 Size : 80206
(80206, 4)
Inc5 Done
Inc Block5 Size : 78793
(78793, 4)


## 시나리오4, 아이템만 증가. (item)

In [239]:
pre_ml_1m_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_1m.csv"))

In [240]:
ITEM_PATH = os.path.join(SAVE_PATH, "item\\")
ITEM_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens\\item\\'

In [241]:
icdf = pre_ml_1m_df.groupby("item").count()
icdf = icdf.sample(frac=1, random_state=42).reset_index()
icdf

Unnamed: 0,item,user,rating,time
0,1979,112,112,112
1,158,259,259,259
2,1046,90,90,90
3,2748,137,137,137
4,3696,65,65,65
...,...,...,...,...
3255,1351,35,35,35
3256,1392,219,219,219
3257,1614,428,428,428
3258,1066,175,175,175


In [242]:
inc_item = [[] for i in range(INC_STEP+1)]
base_size = len(pre_ml_1m_df) * BASE_DATA_RATIO // 10
inc_size = len(pre_ml_1m_df)*INC_RATIO// 10 // INC_STEP

In [243]:
i = 0
cur = 0

while i < INC_STEP+1:
    count = 0

    if i==0:
        # Base block
        while count < base_size:
            row = icdf.iloc[cur]
            inc_item[i].append(row['item'])
            count += row['user']
            cur += 1

        print(f"Base Block Size : {count}")
        print('Base Done')

    elif i == INC_STEP:
        inc_item[i] = icdf.iloc[cur:]['item'].values.tolist()
        count = icdf[cur:]['user'].values.sum()

        print(f'Inc{i} Done')
        print(f"Inc Block{i} Size : {count}")

    else:
        # Incremental block
        while count < inc_size:
            row = icdf.iloc[cur]
            inc_item[i].append(row['item'])
            count += row['user']
            cur += 1

        print(f'Inc{i} Done')
        print(f"Inc Block{i} Size : {count}")



    result = pre_ml_1m_df[pre_ml_1m_df['item'].isin(inc_item[i])]
    print(result.shape)
    result.to_csv(ITEM_PATH+"ml_1m" + f"_inc{i}.csv", index=False)

    i += 1

Base Block Size : 599135
Base Done
(599135, 4)
Inc1 Done
Inc Block1 Size : 80283
(80283, 4)
Inc2 Done
Inc Block2 Size : 80107
(80107, 4)
Inc3 Done
Inc Block3 Size : 79884
(79884, 4)
Inc4 Done
Inc Block4 Size : 80042
(80042, 4)
Inc5 Done
Inc Block5 Size : 79088
(79088, 4)


## user / item 시나리오

모든 블록 8:2 random split

In [244]:
USER_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens\\user\\'

In [245]:
ITEM_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens\\item\\'

In [246]:
from sklearn.model_selection import train_test_split

user_dfs = [pd.read_csv(USER_PATH+"ml_1m" + f"_inc{i}.csv") for i in range(6)]
item_dfs = [pd.read_csv(ITEM_PATH+"ml_1m" + f"_inc{i}.csv") for i in range(6)]

In [252]:
for i,user_df in enumerate(user_dfs):
    train, test = train_test_split(user_df, test_size=0.2, random_state=42)
    train.to_csv(USER_PATH+"train_ml_1m" + f"_inc{i}.csv", index=False)
    test.to_csv(USER_PATH+"test_ml_1m" + f"_inc{i}.csv", index=False)

In [253]:
for i,item_df in enumerate(item_dfs):
    train, test = train_test_split(item_df, test_size=0.2, random_state=42)
    train.to_csv(ITEM_PATH+"train_ml_1m" + f"_inc{i}.csv", index=False)
    test.to_csv(ITEM_PATH+"test_ml_1m" + f"_inc{i}.csv", index=False)