In [1]:
import getpass
import json
import os
import zipfile as zf

import pandas as pd

### Install requirements

In [2]:
# Install kaggle
# https://www.kaggle.com/docs/api
!pip install kaggle
!mkdir -p ~/.kaggle

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.0/59.0 KB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting python-slugify
  Downloading python_slugify-6.1.1-py2.py3-none-any.whl (9.1 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.2/78.2 KB[0m [31m166.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=1c11df1621e264ec393a299aa05fee3c030d94a8c163fa9aaf0a624e1b9ed159
  Stored in directory: /tmp/pip-ephem-wheel-cache-fj4aals4/wheels/29/da/11/144cc25aebdaeb4931b231e25fd34b394e6a5725cbb2f50106
Suc

In [3]:
# Configure kaggle username and key.
# Credentials are only requested when ~/.kaggle/kaggle.json does not exist yet,
# so re-running this cell never overwrites an existing configuration.
kaggle_config_path = os.path.expanduser('~/.kaggle/kaggle.json')
if not os.path.exists(kaggle_config_path):
    username = getpass.getpass("Username: ")
    key = getpass.getpass("Key: ")
    # Make sure the target directory exists even if the install cell was skipped.
    os.makedirs(os.path.dirname(kaggle_config_path), exist_ok=True)
    # Create the file with owner-only permissions (0o600) from the start, so the
    # API key is never readable by other users, not even briefly.
    credentials_fd = os.open(
        kaggle_config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600
    )
    # The context manager closes the file even if the write fails.
    with os.fdopen(credentials_fd, "w") as credentials_file:
        # json.dump escapes quotes and backslashes that would corrupt a
        # hand-formatted JSON string.
        json.dump({"username": username, "key": key}, credentials_file)

Username:  ··········
Key:  ································


### Download dataset

In [4]:
# Download the retailrocket/ecommerce-dataset archive with the Kaggle CLI.
# The archive is saved as ecommerce-dataset.zip in the current working directory.
# NOTE(review): --force presumably re-downloads even when a local copy exists —
# confirm against the Kaggle CLI help.
!kaggle datasets download --force -d retailrocket/ecommerce-dataset

Downloading ecommerce-dataset.zip to /dli/task/retailrocket
 99%|████████████████████████████████████████▋| 288M/291M [00:01<00:00, 167MB/s]
100%|█████████████████████████████████████████| 291M/291M [00:01<00:00, 154MB/s]


In [5]:
# Extract dataset into data folder.
# The context manager closes the archive even if extraction fails part-way.
with zf.ZipFile("ecommerce-dataset.zip", "r") as archive:
    archive.extractall("data")

In [6]:
# Delete the downloaded archive now that its contents are extracted
os.unlink("ecommerce-dataset.zip")

### Load dataset

In [7]:
# Read the four extracted CSV files; the targets on the left are assigned in
# the same order as the paths are listed.
events_df, category_tree_df, item_properties_1_df, item_properties_2_df = map(
    pd.read_csv,
    (
        "data/events.csv",
        "data/category_tree.csv",
        "data/item_properties_part1.csv",
        "data/item_properties_part2.csv",
    ),
)

In [8]:
# Preview the first five rows of the events frame
events_df.head(n=5)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [9]:
# Distinct interaction / event types found in the events frame
events_df["event"].unique()

array(['view', 'addtocart', 'transaction'], dtype=object)

In [10]:
# Row count and a five-row preview of the first item-properties file
print(item_properties_1_df.shape[0])
item_properties_1_df.head(n=5)

10999999


Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [11]:
# Row count and a five-row preview of the second item-properties file
print(item_properties_2_df.shape[0])
item_properties_2_df.head(n=5)

9275903


Unnamed: 0,timestamp,itemid,property,value
0,1433041200000,183478,561,769062
1,1439694000000,132256,976,n26.400 1135780
2,1435460400000,420307,921,1149317 1257525
3,1431831600000,403324,917,1204143
4,1435460400000,230701,521,769062


In [12]:
# Stack both item-properties parts into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of part 2 repeat those of part 1, so label-based lookups (.loc) on the
# combined frame would match two rows.
item_properties_df = pd.concat(
    [item_properties_1_df, item_properties_2_df], ignore_index=True
)
print(len(item_properties_df))
item_properties_df.head()

20275902


Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


### Store datasets

In [13]:
# Persist the events frame to disk as a pickle
events_df.to_pickle(path="data/01_events_df.pkl")

In [14]:
# Persist the category tree frame to disk as a pickle
category_tree_df.to_pickle(path="data/01_category_tree_df.pkl")

In [15]:
# Persist the combined item-properties frame to disk as a pickle
item_properties_df.to_pickle(path="data/01_item_properties_df.pkl")