# Create atomic files 
- User-item interactions
    - Raw data (.csv) -> Atomic files (.inter)
- Item features
    - Raw data (.csv) -> Atomic files (.itememb)
- dtype (printf-style format codes used when writing values)
    - s: string
    - d: integer
    - f: float


In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import glob

## .inter

In [2]:
# Collect the raw per-collection transaction CSVs in sorted (deterministic) order.
files = sorted(glob.glob('./dataset/transactions/*.csv'))
files

['./dataset/transactions/azuki.csv',
 './dataset/transactions/bayc.csv',
 './dataset/transactions/coolcats.csv',
 './dataset/transactions/doodles.csv',
 './dataset/transactions/meebits.csv']

In [3]:
# Collection name = file name without its extension, e.g. 'azuki.csv' -> 'azuki'.
# The list is built in the same order as `files` (already sorted) and is NOT
# re-sorted on its own: it is zipped with `files` when the .inter files are
# written, and sorting names and paths independently can order them differently
# (e.g. 'a-b.csv' sorts before 'a.csv', but 'a' sorts before 'a-b'), which
# would pair a name with the wrong file.
# os.path.splitext strips only the final extension, so dotted file names are
# not truncated at their first '.'.
names = [os.path.splitext(os.path.basename(path))[0] for path in files]
names

['azuki', 'bayc', 'coolcats', 'doodles', 'meebits']

In [4]:
# Convert each collection's raw transaction CSV into an atomic interaction file
# ./dataset/collections/<name>/<name>.inter with one "<buyer>\t<token id>" line
# per transaction row.
for name, file in zip(tqdm(names), files):

    transactions = pd.read_csv(file)
    # NOTE: repeated (Buyer, Token ID) pairs are written as separate
    # interactions; no de-duplication is applied here.
    user = transactions['Buyer'].values
    item = transactions['Token ID'].values
    print('collection: ', name)
    print('num of interactions: ', len(user))
    print('')

    # Create the per-collection output directory if it is not there yet.
    save_path = './dataset/collections/' + name
    os.makedirs(save_path, exist_ok=True)

    # The context manager closes the file even if a write fails part-way.
    with open(save_path + f"/{name}.inter", 'w') as f:
        f.write("user_id:token\titem_id:token\n")
        f.writelines(
            "%s\t%d\n" % (buyer, token_id) for buyer, token_id in zip(user, item)
        )


100%|██████████| 5/5 [00:00<00:00, 33.97it/s]

collection:  azuki
num of interactions:  15708

collection:  bayc
num of interactions:  13737

collection:  coolcats
num of interactions:  14890

collection:  doodles
num of interactions:  7250

collection:  meebits
num of interactions:  21104






## .itememb
Item feature types, one `.itememb_<attribute>` file each: img, txt, prices, txns

In [33]:
# For every collection and every item attribute, write an atomic embedding file
# ./dataset/collections/<name>/<name>.itememb_<attribute> containing one line
# "<token_id>\t<v1> <v2> ... <vk>" per item, with values cast to float32.
for name in tqdm(names):
    print('Collection name: ', name)

    # Items that appear in this collection's interactions. This depends only on
    # the collection, so it is loaded once instead of once per attribute.
    inter = pd.read_csv(f'./dataset/collections/{name}/{name}.inter', sep='\t')
    token_ids = inter['item_id:token'].unique()

    for attribute in ['img', 'txt', 'prices', 'txns']:
        print('--- ', attribute)

        # Get raw data file
        file = pd.read_csv(f'./dataset/features_item/{name}_{attribute}.csv')
        print('before: ', file.shape)

        # Keep only the items that appear in the interactions.
        file = file[file['token_id'].isin(token_ids)].reset_index(drop=True)
        print('after: ', file.shape)

        # Save file
        save_path = './dataset/collections/' + name
        os.makedirs(save_path, exist_ok=True)

        # Every column after the first is treated as a feature value. Converting
        # the whole table to one float32 array avoids per-cell pandas lookups
        # and does not rely on integer keys being interpreted as positions on a
        # label-indexed Series (deprecated pandas behaviour).
        ids = file['token_id'].to_numpy()
        embeddings = file.iloc[:, 1:].to_numpy(dtype=np.float32)

        # The context manager closes the file even if a write fails part-way.
        with open(save_path + f"/{name}.itememb_{attribute}", 'w') as f:
            f.write(f"iid_{attribute}:token" + '\t' + f'item_emb_{attribute}:float_seq' + '\n')
            for token_id, row in zip(ids, embeddings):
                # Feature values are space-separated, with no trailing space.
                f.write(str(token_id) + '\t' + ' '.join(str(value) for value in row) + '\n')

  0%|          | 0/5 [00:00<?, ?it/s]

Collection name:  azuki
---  img
before:  (10000, 1025)
after:  (6659, 1025)
---  txt
before:  (10000, 1801)
after:  (6659, 1801)
---  prices
before:  (8908, 65)
after:  (6659, 65)
---  txns
before:  (8908, 65)
after:  (6659, 65)


 20%|██        | 1/5 [00:53<03:32, 53.06s/it]

Collection name:  bayc
---  img
before:  (9983, 1025)
after:  (6726, 1025)
---  txt
before:  (10000, 1801)
after:  (6726, 1801)
---  prices
before:  (8884, 65)
after:  (6726, 65)
---  txns
before:  (8884, 65)
after:  (6726, 65)


 40%|████      | 2/5 [01:47<02:40, 53.58s/it]

Collection name:  coolcats
---  img
before:  (9952, 1025)
after:  (6824, 1025)
---  txt
before:  (9941, 1501)
after:  (6824, 1501)
---  prices
before:  (8384, 65)
after:  (6824, 65)
---  txns
before:  (8384, 65)
after:  (6824, 65)


 60%|██████    | 3/5 [02:35<01:42, 51.48s/it]

Collection name:  doodles
---  img
before:  (9999, 1025)
after:  (4771, 1025)
---  txt
before:  (10000, 1501)
after:  (4771, 1501)
---  prices
before:  (8285, 65)
after:  (4771, 65)
---  txns
before:  (8285, 65)
after:  (4771, 65)


 80%|████████  | 4/5 [03:11<00:45, 45.04s/it]

Collection name:  meebits
---  img
before:  (20000, 1025)
after:  (6693, 1025)
---  txt
before:  (20000, 1801)
after:  (6693, 1801)
---  prices
before:  (9682, 65)
after:  (6693, 65)
---  txns
before:  (9682, 65)
after:  (6693, 65)


100%|██████████| 5/5 [04:06<00:00, 49.34s/it]


## .item

In [5]:
# Settings for building the combined `.item` files.
collection_names = [
    'azuki',
    'bayc',
    'coolcats',
    'doodles',
    'meebits',
]
# Raw per-collection item feature CSVs (<collection>_<attribute>.csv) are read from here.
features_path = './dataset/features_item/'
# Atomic files are written to <collection_path>/<collection>/.
collection_path = './dataset/collections/'

In [6]:
"""
combined
"""
# Build one combined `.item` atomic file per collection. Each line holds
# "<token_id>\t<image values>\t<text values>\t<price>\t<txn>", where the image
# and text values are space-separated float sequences.
for COLLECTION in tqdm(collection_names):

    # read data
    image = pd.read_csv(os.path.join(features_path, f'{COLLECTION}_img.csv'), encoding='utf-8')
    text = pd.read_csv(os.path.join(features_path, f'{COLLECTION}_txt.csv'), encoding='utf-8')
    price = pd.read_csv(os.path.join(features_path, f'{COLLECTION}_prices.csv'), encoding='utf-8')
    txns = pd.read_csv(os.path.join(features_path, f'{COLLECTION}_txns.csv'), encoding='utf-8')
    # The first column of every table is used as the join key: name it 'token_id'.
    for table in (image, text, price, txns):
        table.columns = ['token_id'] + list(table.columns[1:])
    # Keep only the key and the first feature column of price / txns.
    price = price.iloc[:, :2]
    txns = txns.iloc[:, :2]

    # In `combined`, column 0 is token_id and columns [1, img_end) are the image
    # features. The boundary is derived from the image table instead of being
    # hard-coded, so a different image width cannot spill into the text block.
    img_end = image.shape[1]

    # Combine image, text, price, txns into one dataframe (inner joins on token_id).
    combined = pd.merge(image, text, on='token_id')
    combined = pd.merge(combined, price, on='token_id')
    combined = pd.merge(combined, txns, on='token_id')
    print('before: ', combined.shape)

    # Keep only the items that appear in the interactions.
    inter = pd.read_csv(os.path.join(collection_path, f'{COLLECTION}/{COLLECTION}.inter'), sep='\t')
    token_ids = inter['item_id:token'].unique()
    combined = combined[combined['token_id'].isin(token_ids)].reset_index(drop=True)
    print('after: ', combined.shape)

    # Save the .item file. The context manager closes it even if a write fails.
    with open(os.path.join(collection_path, f'{COLLECTION}/{COLLECTION}.item'), 'w') as f:
        f.write("item_id:token\timg:float_seq\ttxt:float_seq\tprice:float\ttxn:float\n")

        # Plain tuples, one per row, in column order: no per-row Series is built
        # and no loop variable shadows another.
        rows = combined.itertuples(index=False, name=None)
        for row in tqdm(rows, total=len(combined)):
            fields = (
                "%d" % row[0],                                    # 1. token_id
                ' '.join("%f" % v for v in row[1:img_end]),       # 2. image features
                ' '.join("%f" % v for v in row[img_end:-2]),      # 3. text features
                "%f" % row[-2],                                   # 4. price feature
                "%f" % row[-1],                                   # 5. transaction feature
            )
            f.write('\t'.join(fields) + '\n')


  0%|          | 0/5 [00:00<?, ?it/s]

before:  (8908, 2827)
after:  (6659, 2827)


100%|██████████| 6659/6659 [00:17<00:00, 383.73it/s]
 20%|██        | 1/5 [00:20<01:20, 20.10s/it]

before:  (8867, 2827)
after:  (6726, 2827)


100%|██████████| 6726/6726 [00:17<00:00, 385.55it/s]
 40%|████      | 2/5 [00:40<01:00, 20.12s/it]

before:  (8373, 2527)
after:  (6824, 2527)


100%|██████████| 6824/6824 [00:17<00:00, 393.33it/s]
 60%|██████    | 3/5 [00:59<00:39, 19.87s/it]

before:  (8284, 2527)
after:  (4771, 2527)


100%|██████████| 4771/4771 [00:11<00:00, 426.74it/s]
 80%|████████  | 4/5 [01:13<00:17, 17.35s/it]

before:  (9682, 2827)
after:  (6693, 2827)


100%|██████████| 6693/6693 [00:17<00:00, 381.79it/s]
100%|██████████| 5/5 [01:35<00:00, 19.13s/it]


## .user

In [22]:
# Settings for building the `.user` files.
collection_names = [
    'azuki',
    'bayc',
    'coolcats',
    'doodles',
    'meebits',
]
# The user feature table (user_features.csv) is read from here.
features_path = './dataset/features_user/'
# Atomic files are written to <collection_path>/<collection>/.
collection_path = './dataset/collections/'

In [23]:
"""
combined
"""
# Build one `.user` atomic file per collection. Each line holds
# "<buyer>\t<num_txn>\t<avg_price>\t<hold_period>" for a buyer that appears in
# that collection's interactions.

# The user feature table is the same for every collection, so it is read once.
user_features = pd.read_csv(os.path.join(features_path, 'user_features.csv'), encoding='utf-8')

for COLLECTION in tqdm(collection_names):

    # Keep only the users that appear in the interactions.
    inter = pd.read_csv(os.path.join(collection_path, f'{COLLECTION}/{COLLECTION}.inter'), sep='\t')
    user_ids = inter['user_id:token'].unique()
    combined = user_features[user_features['Buyer'].isin(user_ids)].reset_index(drop=True)
    print('after: ', combined.shape)

    # Save the .user file. The context manager closes it even if a write fails.
    with open(os.path.join(collection_path, f'{COLLECTION}/{COLLECTION}.user'), 'w') as f:
        # NOTE(review): num_txn is declared as a token field but written as an
        # integer count; confirm that this is the intended field type.
        f.write("user_id:token\tnum_txn:token\tavg_price:float\thold_period:float\n")

        columns = (
            combined['Buyer'],                  # 1. user id
            combined['# of transactions'],      # 2. num_txn
            combined['Avg transaction price'],  # 3. avg_price
            combined['holding period'],         # 4. hold_period
        )
        for buyer, num_txn, avg_price, hold_period in tqdm(zip(*columns), total=len(combined)):
            f.write("%s\t%d\t%f\t%f\n" % (buyer, num_txn, avg_price, hold_period))


  0%|          | 0/5 [00:00<?, ?it/s]

after:  (1647, 5)


100%|██████████| 1647/1647 [00:00<00:00, 46640.19it/s]


after:  (1230, 5)


100%|██████████| 1230/1230 [00:00<00:00, 45337.85it/s]
 40%|████      | 2/5 [00:00<00:00, 13.75it/s]

after:  (1357, 5)


100%|██████████| 1357/1357 [00:00<00:00, 44341.47it/s]


after:  (804, 5)


100%|██████████| 804/804 [00:00<00:00, 44296.13it/s]
 80%|████████  | 4/5 [00:00<00:00, 16.38it/s]

after:  (1184, 5)


100%|██████████| 1184/1184 [00:00<00:00, 44231.97it/s]
100%|██████████| 5/5 [00:00<00:00, 16.16it/s]
