In [1]:
import pandas as pd
import numpy as np
import os
import json
import pickle

In [2]:
# Path to the raw ratings CSV, relative to the notebook's working directory.
rating_path = './Fashion/AMAZON_FASHION.csv'

In [3]:
# header=None makes pandas read the first line of the file as data, so the
# column names are supplied explicitly via `names`.
custom_header = ['item', 'user', 'rating', 'timestamp']
ratings_df = pd.read_csv(
    rating_path,
    names=custom_header,
    header=None,
)
print(ratings_df.shape)
ratings_df.head()

(883636, 4)


Unnamed: 0,item,user,rating,timestamp
0,7106116521,A1D4G1SNUZWQOT,5.0,1413763200
1,7106116521,A3DDWDH9PX2YX2,2.0,1411862400
2,7106116521,A2MWC41EW7XL15,4.0,1408924800
3,7106116521,A2UH2QQ275NV45,2.0,1408838400
4,7106116521,A89F3LQADZBS5,3.0,1406419200


In [4]:
fashion_5core_path = './Fashion/AMAZON_FASHION_5.json'
fashion_5core = []
# The file is parsed as one JSON document per line; json.loads accepts the
# raw bytes produced by iterating a file opened in 'rb' mode.
with open(fashion_5core_path, 'rb') as reviews_file:
    fashion_5core.extend(map(json.loads, reviews_file))

In [5]:
# Sanity check: number of records parsed from the 5-core review file.
print(len(fashion_5core))

3176


In [6]:
# Order the ratings by user and, within each user, by timestamp, then
# renumber the rows from 0 so the index matches the sorted order.
rating_df = (
    ratings_df
    .sort_values(by=['user', 'timestamp'])
    .reset_index(drop=True)
)
print(rating_df.shape)
rating_df.head()

(883636, 4)


Unnamed: 0,item,user,rating,timestamp
0,B00L8J2RF8,A0007604Q2582KFW7N4B,5.0,1426377600
1,B0121M1AJE,A0010606488RW7ZH6EP7,1.0,1507852800
2,B00IGAOE3U,A001152055E9KVRHH96L,1.0,1434585600
3,B01GK6XQFI,A00181966XZUX7KEGYLH,3.0,1468022400
4,B00RLSCLJM,A00205926S885DTOYMX6,5.0,1441497600


In [8]:
# Keep only the ratings of users who have at least 7 ratings.
# Note: only users are filtered here; items are NOT required to have 7 ratings.
#
# transform('size') broadcasts each user's row count back onto that user's rows,
# so the filter is a single vectorised mask instead of a Python lambda call per
# user group. Row order and the original index labels are preserved.
ratings_per_user = rating_df.groupby('user')['user'].transform('size')
# .copy() makes the result an independent frame, so assigning to its columns
# later does not trigger pandas' SettingWithCopyWarning.
ratings_7core = rating_df[ratings_per_user >= 7].copy()
print(ratings_7core.shape)
ratings_7core.head()

(11136, 4)


Unnamed: 0,item,user,rating,timestamp
3238,B00HFKN2IU,A10G10SU7SAUG1,4.0,1452816000
3239,B009PMILQE,A10G10SU7SAUG1,4.0,1453766400
3240,B00D83TNRM,A10G10SU7SAUG1,3.0,1453766400
3241,B00E1L9QOU,A10G10SU7SAUG1,4.0,1453766400
3242,B00AW83HM8,A10G10SU7SAUG1,4.0,1453766400


In [9]:
# Report how many distinct items and users remain after the user filter.
for label, column in (("Unique items:", 'item'), ("Unique users:", 'user')):
    print(label, ratings_7core[column].nunique())

Unique items: 6089
Unique users: 1273


In [10]:
# Distribution of rating values in the filtered data.
ratings_7core['rating'].value_counts()

rating
5.0    6427
4.0    2158
3.0    1293
1.0     655
2.0     603
Name: count, dtype: int64

In [11]:
meta_fashion_path = "./Fashion/meta_AMAZON_FASHION.json"
# The file is parsed as one JSON object per line.
# An explicit UTF-8 encoding keeps the result independent of the platform's
# locale default (text-mode open() otherwise uses the locale encoding).
# Whitespace-only lines are skipped because json.loads would raise on them.
with open(meta_fashion_path, 'r', encoding='utf-8') as f:
    meta_fashion = [json.loads(line) for line in f if line.strip()]

In [12]:
# Sanity check: number of metadata records parsed from the file.
print(len(meta_fashion))

186637


In [13]:
# Inspect the first metadata record to see which fields are available.
print(meta_fashion[0])

{'title': 'Slime Time Fall Fest [With CDROM and Collector Cards and Neutron Balls, Incredi-Ball and Glow Stick Necklace, Paper Fram', 'brand': 'Group Publishing (CO)', 'feature': ['Product Dimensions:\n                    \n8.7 x 3.6 x 11.4 inches', 'Shipping Weight:\n                    \n2.4 pounds'], 'rank': '13,052,976inClothing,Shoesamp;Jewelry(', 'date': '8.70 inches', 'asin': '0764443682', 'imageURL': ['https://images-na.ssl-images-amazon.com/images/I/51bSrINiWpL._US40_.jpg'], 'imageURLHighRes': ['https://images-na.ssl-images-amazon.com/images/I/51bSrINiWpL.jpg']}


In [15]:
# Inspect the first 5-core review record to see which fields are available.
fashion_5core[0]

{'overall': 5.0,
 'verified': True,
 'reviewTime': '09 4, 2015',
 'reviewerID': 'ALJ66O1Y6SLHA',
 'asin': 'B000K2PJ4K',
 'style': {'Size:': ' Big Boys', 'Color:': ' Blue/Orange'},
 'reviewerName': 'Tonya B.',
 'reviewText': 'Great product and price!',
 'summary': 'Five Stars',
 'unixReviewTime': 1441324800}

### Creating user and item index mappings

In [16]:
# Preview the filtered ratings before the identifiers are replaced by indices.
ratings_7core.head()

Unnamed: 0,item,user,rating,timestamp
3238,B00HFKN2IU,A10G10SU7SAUG1,4.0,1452816000
3239,B009PMILQE,A10G10SU7SAUG1,4.0,1453766400
3240,B00D83TNRM,A10G10SU7SAUG1,3.0,1453766400
3241,B00E1L9QOU,A10G10SU7SAUG1,4.0,1453766400
3242,B00AW83HM8,A10G10SU7SAUG1,4.0,1453766400


In [18]:
# Assign each distinct item / user a consecutive integer index, numbered in
# order of first appearance in ratings_7core.
unique_items = ratings_7core['item'].unique()
unique_users = ratings_7core['user'].unique()
item_mapping = dict(zip(unique_items, range(len(unique_items))))
user_mapping = dict(zip(unique_users, range(len(unique_users))))

In [19]:
# Sizes of the item and user mappings (should match the unique counts above).
print(*map(len, (item_mapping, user_mapping)))

6089 1273


In [20]:
# Inverse lookups: integer index -> original item / user identifier.
reverse_item_mapping = dict(zip(item_mapping.values(), item_mapping.keys()))
reverse_user_mapping = dict(zip(user_mapping.values(), user_mapping.keys()))

In [21]:
# The inverse mappings must have the same sizes as the forward mappings.
print(*map(len, (reverse_item_mapping, reverse_user_mapping)))

6089 1273


In [22]:
# Spot-check that the user mapping round-trips: index -> identifier -> index.
# Named `user_idx` rather than `id` so the builtin id() is not shadowed for the
# rest of the kernel session.
user_idx = 2
user_key = reverse_user_mapping[user_idx]
assert user_mapping[user_key] == user_idx, "user mapping does not round-trip"
print(user_idx, user_key, user_mapping[user_key])

2 A10RXRZE0TAKPU 2


In [23]:
# Spot-check that the item mapping round-trips: index -> identifier -> index.
# Named `item_idx` rather than `id` so the builtin id() is not shadowed for the
# rest of the kernel session.
item_idx = 374
item_key = reverse_item_mapping[item_idx]
assert item_mapping[item_key] == item_idx, "item mapping does not round-trip"
print(item_idx, item_key, item_mapping[item_key])

374 B01201P2GS 374


In [24]:
# Persist the item identifier -> index mapping.
with open('./Fashion/item_idx_mapping.json', 'w+') as out_file:
    json.dump(item_mapping, out_file)

In [25]:
# Persist the index -> item identifier mapping.
# JSON object keys are always strings, so the integer indices are written as
# strings and need int() applied after loading.
with open('./Fashion/idx_item_mapping.json', 'w+') as out_file:
    json.dump(reverse_item_mapping, out_file)

In [26]:
# Persist the user identifier -> index mapping.
with open('./Fashion/user_idx_mapping.json', 'w+') as out_file:
    json.dump(user_mapping, out_file)

In [27]:
# Persist the index -> user identifier mapping.
# JSON object keys are always strings, so the integer indices are written as
# strings and need int() applied after loading.
with open('./Fashion/idx_user_mapping.json', 'w+') as out_file:
    json.dump(reverse_user_mapping, out_file)

### Creating train, validation, and test datasets

In [28]:
# Replace the raw item / user identifiers with their integer indices.
# .assign builds a new frame instead of writing into the columns of a frame
# derived from a filter, which avoids pandas' SettingWithCopyWarning.
ratings_7core = ratings_7core.assign(
    item=ratings_7core['item'].map(item_mapping),
    user=ratings_7core['user'].map(user_mapping),
)
# Series.map yields NaN for values missing from the mapping (for example if this
# cell is re-run on already-mapped data), so fail loudly instead of letting NaNs
# flow into the files written later.
assert ratings_7core[['item', 'user']].notna().all().all(), \
    "unmapped item/user ids (was this cell run twice?)"
ratings_7core.head()

Unnamed: 0,item,user,rating,timestamp
3238,0,0,4.0,1452816000
3239,1,0,4.0,1453766400
3240,2,0,3.0,1453766400
3241,3,0,4.0,1453766400
3242,4,0,4.0,1453766400


In [30]:
# Drop the index labels inherited from the unfiltered frame and renumber the
# rows from 0; drop=True discards the old index instead of keeping it as a column.
ratings_7core = ratings_7core.reset_index(drop=True)
ratings_7core.head()

Unnamed: 0,item,user,rating,timestamp
0,0,0,4.0,1452816000
1,1,0,4.0,1453766400
2,2,0,3.0,1453766400
3,3,0,4.0,1453766400
4,4,0,4.0,1453766400


In [33]:
# Write the index-mapped ratings to disk; the row index is not written.
ratings_7core.to_csv(
    path_or_buf='./Fashion/user_rating.csv',
    index=False,
)

In [41]:
# Per-user list of item indices, in the row order of ratings_7core.
# One groupby pass replaces a full boolean-mask scan of the frame per user.
# sort=False keeps users in order of first appearance, matching the order the
# original per-user loop over .unique() produced.
user_item_interactions = {
    user: items.tolist()
    for user, items in ratings_7core.groupby('user', sort=False)['item']
}
print(len(user_item_interactions))

1273


In [45]:
# Inspect the item sequence recorded for user index 0.
user_item_interactions[0]

[0, 1, 2, 3, 4, 5, 6, 7]

In [58]:
# Slice every user's item sequence into four views, keyed by the user index
# cast to a plain int:
#   profile : everything before the last 7 items (empty for a 7-item sequence)
#   train   : items [-7:-2]
#   valid   : items [-6:-1]
#   test    : items [-5:]
# train / valid / test are therefore overlapping 5-item windows over the last
# 7 items, each shifted one position later than the previous one.
# NOTE(review): presumably the last item of each window is the prediction
# target -- confirm against the code that consumes these files.
user_profile = {int(uid): seq[:-7] for uid, seq in user_item_interactions.items()}
user_train = {int(uid): seq[-7:-2] for uid, seq in user_item_interactions.items()}
user_valid = {int(uid): seq[-6:-1] for uid, seq in user_item_interactions.items()}
user_test = {int(uid): seq[-5:] for uid, seq in user_item_interactions.items()}

In [59]:
# Spot-check the four views for one user, printed in the order
# profile, train, valid, test.
# Named `user_idx` rather than `id` so the builtin id() is not shadowed for the
# rest of the kernel session.
user_idx = 1200
for split in (user_profile, user_train, user_valid, user_test):
    print(split[user_idx])

[974, 4266]
[5780, 2332, 5781, 5782, 5783]
[2332, 5781, 5782, 5783, 5784]
[5781, 5782, 5783, 5784, 5785]


In [60]:
# Persist the per-user profile sequences; the integer user keys are written as
# JSON strings.
with open('./Fashion/user_profile.json', 'w+') as out_file:
    json.dump(user_profile, out_file)

In [61]:
# Persist the per-user training windows; the integer user keys are written as
# JSON strings.
with open('./Fashion/user_train.json', 'w+') as out_file:
    json.dump(user_train, out_file)

In [62]:
# Persist the per-user validation windows; the integer user keys are written as
# JSON strings.
with open('./Fashion/user_valid.json', 'w+') as out_file:
    json.dump(user_valid, out_file)

In [63]:
# Persist the per-user test windows; the integer user keys are written as
# JSON strings.
with open('./Fashion/user_test.json', 'w+') as out_file:
    json.dump(user_test, out_file)

### Creating item descriptions

In [83]:
# Flatten the field names of every metadata record into one list
# (duplicates included).
all_keys = [key for record in meta_fashion for key in record.keys()]

In [86]:
# Reduce to the distinct field names. sorted() makes the order reproducible;
# plain set iteration order for strings can differ between interpreter runs.
all_keys = sorted(set(all_keys))
print(all_keys)

['title', 'feature', 'rank', 'asin', 'date', 'details', 'also_buy', 'description', 'similar_item', 'imageURL', 'fit', 'price', 'brand', 'also_view', 'imageURLHighRes', 'tech1']


In [105]:
# Metadata fields that are concatenated, in this order, into each item's text
# description.
required_keys = ['title', 'brand', 'description', 'price']

In [122]:
# Build a text description for every rated item:
#   item index -> "title: ... brand: ... description: ... price: ... "
# Only the fields listed in required_keys that are present in the metadata
# record are included, each rendered as "<key>: <value> " (trailing space kept).
# If an asin occurs in more than one metadata record, the last record wins.
item_description = dict()
for item in meta_fashion:
    asin = item['asin']
    if asin not in item_mapping:
        # Metadata for an item that is absent from the filtered ratings.
        continue
    fragments = []
    for key in required_keys:
        if key not in item:
            continue
        value = item[key]
        if key == 'description':
            # Only the first element of 'description' is used. An empty value
            # is skipped so that indexing [0] cannot raise IndexError.
            if not value:
                continue
            value = value[0]
        fragments.append(f"{key}: {value} ")
    item_description[item_mapping[asin]] = "".join(fragments)

In [123]:
# Number of items that received a description; compare with len(item_mapping)
# to see whether every rated item was found in the metadata.
print(len(item_description))

6089


In [124]:
# Persist the item index -> description text mapping; the integer item keys are
# written as JSON strings.
with open('./Fashion/item_description.json', 'w+') as out_file:
    json.dump(item_description, out_file)