# Data Preparation

## Setup

In [1]:
%%capture
%pip install kaggle

In [2]:
import pandas as pd

In [3]:
# Number of customers to draw at random for the staged data set.
# Set to 0 or less to skip sampling and keep every eligible customer.
SAMPLE_NUM_CUSTOMERS = 2000 #set to 0 or less for no sampling
# When True, articles in the 'Under-, Nightwear' garment group are excluded
# from the list of eligible article ids (intended for demo purposes).
FILTER_ARTICLES = False #whether to filter out certain intimate products for demo purposes (real data problems)
# Passed as random_state when sampling customers so the sample can be reproduced.
RANDOM_SEED = 7474 #seed to use for replicating sampling

In [4]:
def camel_case(s):
    """Convert an underscore-separated name to lowerCamelCase.

    The input is lower-cased and split on ``_``. The first piece is kept
    as-is; every following piece is title-cased and appended, so
    ``'sales_channel_id'`` becomes ``'salesChannelId'``. A name without
    underscores is simply returned lower-cased.
    """
    head, *tail = s.lower().split('_')
    # An empty tail joins to '', which covers the single-segment case.
    return head + ''.join(map(str.title, tail))

def camel_case_dict(name_keys):
    """Build a mapping from each original name to its camelCase form.

    Args:
        name_keys: iterable of underscore-separated names (for example a
            DataFrame's ``columns``).

    Returns:
        dict mapping every name to ``camel_case(name)``. If a name occurs
        more than once, the mapping holds a single entry for it.
    """
    # Single pass over the input: the previous two-pass version (build a list,
    # then zip with the input again) returned an empty dict for one-shot
    # iterables such as generators.
    return {name: camel_case(name) for name in name_keys}

def camel_case_rename_cols(df):
    """Return ``df`` with its column labels renamed to camelCase.

    The rename mapping comes from ``camel_case_dict(df.columns)``; the
    renamed frame returned by ``DataFrame.rename`` is handed back to the caller.
    """
    return df.rename(columns=camel_case_dict(df.columns))

## Get Source Data

In [5]:
# Download the three source files of the H&M personalized fashion
# recommendations competition into the local data/ directory.
# configure authentication per instruction @ https://github.com/Kaggle/kaggle-api/blob/main/README.md
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations -f articles.csv -p data
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations -f customers.csv -p data
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations -f transactions_train.csv -p data
# Extract every downloaded archive inside data/ (-n: do not overwrite files that already exist).
!cd data && unzip -n '*.zip'

Downloading articles.csv.zip to data
 23%|████████▉                             | 1.00M/4.26M [00:00<00:00, 6.12MB/s]
100%|██████████████████████████████████████| 4.26M/4.26M [00:00<00:00, 17.0MB/s]
Downloading customers.csv.zip to data
 94%|███████████████████████████████████▋  | 92.0M/97.9M [00:02<00:00, 52.2MB/s]
100%|██████████████████████████████████████| 97.9M/97.9M [00:02<00:00, 38.4MB/s]
Downloading transactions_train.csv.zip to data
100%|███████████████████████████████████████▊| 582M/584M [00:14<00:00, 47.5MB/s]
100%|████████████████████████████████████████| 584M/584M [00:14<00:00, 42.3MB/s]
Archive:  customers.csv.zip
  inflating: customers.csv           

Archive:  articles.csv.zip
  inflating: articles.csv            

Archive:  transactions_train.csv.zip
  inflating: transactions_train.csv  

3 archives were successfully processed.


## Filter Articles

In [6]:
# Load the raw article catalogue and convert its column names to camelCase.
init_article_df = pd.read_csv('data/articles.csv').pipe(camel_case_rename_cols)
init_article_df.shape

(105542, 25)

In [7]:
# Eligible article ids: the whole catalogue, or (when FILTER_ARTICLES is set)
# everything outside the 'Under-, Nightwear' garment group.
if FILTER_ARTICLES:
    outside_nightwear = init_article_df['garmentGroupName'].ne('Under-, Nightwear')
    filtered_article_ids = init_article_df.loc[outside_nightwear, 'articleId']
else:
    filtered_article_ids = init_article_df['articleId']

In [8]:
# Load the raw transactions and convert their column names to camelCase.
init_transaction_df = pd.read_csv('data/transactions_train.csv').pipe(camel_case_rename_cols)
# The source file carries no transaction identifier, so number the rows
# 0..n-1 to give each transaction its own id.
init_transaction_df['txId'] = range(len(init_transaction_df))
init_transaction_df.shape

(31788324, 6)

In [9]:
# Customers with at least one transaction for an eligible article,
# listed once each in order of first appearance.
bought_eligible_article = init_transaction_df['articleId'].isin(filtered_article_ids)
filtered_customer_ids = init_transaction_df.loc[bought_eligible_article, 'customerId'].drop_duplicates()
filtered_customer_ids

0           000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...
2           00007d2de826758b65a93dd24ce629ed66842531df6699...
7           00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...
12          0008968c0d451dbc5a9968da03196fe20051965edde741...
14          000aa7f0dc06cd7174389e76c9e132a67860c5f65f9706...
                                  ...                        
31788165    fe99a0069d6b3c64c2707d0ce53b9311540917471d82df...
31788202    fecc5f77b5f7ee4570efde9ab05ec94d0de2bf80efb4f6...
31788208    fece2f68864c311a0b5208e2eb735b3dcde7e41461d327...
31788217    fee56cc5315dafb35a4490ccc6f711092cae913550c832...
31788275    ff5b8a8b26bf93a66290e9bd1b73393ac6a58968a78519...
Name: customerId, Length: 1362281, dtype: object

## Sample Customers

In [10]:
# Default to every eligible customer; optionally draw a reproducible random
# sample when SAMPLE_NUM_CUSTOMERS is positive.
customer_ids = filtered_customer_ids
if SAMPLE_NUM_CUSTOMERS > 0:
    # Series.sample (without replacement) raises ValueError when asked for more
    # rows than exist, so cap the request at the number of eligible customers.
    sample_size = min(SAMPLE_NUM_CUSTOMERS, len(filtered_customer_ids))
    customer_ids = (
        filtered_customer_ids
        .sample(n=sample_size, random_state=RANDOM_SEED)
        .reset_index(drop=True)
    )
customer_ids

0       fdbe75e71e134938025dbbb9bc495bd302d578b449ac96...
1       fb9310441b653525f1adad3fbe7ece522ba50e752cca62...
2       f0a8599239eea199f1440af86ab9df78cb5d4e85f532fd...
3       4dd8a1b3175c88f07b123b388a5c9b5dfe16b3ba6fdf62...
4       696093ad8815f16ab92c07eb32d69c2d1e90daef479de7...
                              ...                        
1995    77f2b46bb15fb4c1251d82a37b2ed7d83f5a92bb5ff159...
1996    074148aa72ce41f82ca909e1912a18e2c055cae5b71390...
1997    b96b1cf69098b801738b20c4e922a46f5c713113442689...
1998    dfba337fefbf14b24281a0f931bbaba6e388ef4d1bc0e4...
1999    1abde4c8b89375315feca757811924a377dfe2ec8cc8a1...
Name: customerId, Length: 2000, dtype: object

In [11]:
# Distinct articles bought by the selected customers, restricted to the
# eligible article ids. Without the second condition, articles removed by
# FILTER_ARTICLES would be pulled back in through the customers' other purchases.
by_selected_customer = init_transaction_df['customerId'].isin(customer_ids)
for_eligible_article = init_transaction_df['articleId'].isin(filtered_article_ids)
article_ids = init_transaction_df.loc[by_selected_customer & for_eligible_article, 'articleId'].drop_duplicates()
article_ids

1559        662888002
1560        662888001
1561        651244002
1562        651244001
1588        633152003
              ...    
31775199    706271031
31779118    865926002
31779121    906639004
31779122    684238003
31779125    812530004
Name: articleId, Length: 21596, dtype: int64

## Sample Down Data
Now that we have the lists of customers and articles to include (`customer_ids` and `article_ids`, respectively), we can use them to filter the source data and stage it for loading.

In [12]:
# Transactions of the selected customers, restricted to eligible articles so
# that purchases of articles removed by FILTER_ARTICLES are not exported.
# With FILTER_ARTICLES disabled every catalogued article is eligible.
selected_customer_rows = init_transaction_df['customerId'].isin(customer_ids)
eligible_article_rows = init_transaction_df['articleId'].isin(filtered_article_ids)
transaction_df = init_transaction_df[selected_customer_rows & eligible_article_rows]
transaction_df

Unnamed: 0,tDat,customerId,articleId,price,salesChannelId,txId
1559,2018-09-20,080756754aef493b2b36f592eae744f2b9787dc55b635b...,662888002,0.033881,2,1559
1560,2018-09-20,080756754aef493b2b36f592eae744f2b9787dc55b635b...,662888001,0.033881,2,1560
1561,2018-09-20,080756754aef493b2b36f592eae744f2b9787dc55b635b...,651244002,0.013542,2,1561
1562,2018-09-20,080756754aef493b2b36f592eae744f2b9787dc55b635b...,651244001,0.006763,2,1562
1588,2018-09-20,0843d9fb6e4f3befa53ff3a8447b902b9f75bfa955a0f9...,633152003,0.030492,1,1588
...,...,...,...,...,...,...
31779124,2020-09-22,b6be55f233772b5fc4a1ebedf36542fb3e1b6c15c23c7e...,921266007,0.016932,2,31779124
31779125,2020-09-22,b6be55f233772b5fc4a1ebedf36542fb3e1b6c15c23c7e...,812530004,0.010153,2,31779125
31779126,2020-09-22,b6be55f233772b5fc4a1ebedf36542fb3e1b6c15c23c7e...,942187001,0.016932,2,31779126
31779127,2020-09-22,b6be55f233772b5fc4a1ebedf36542fb3e1b6c15c23c7e...,866731001,0.025407,2,31779127


In [13]:
# Catalogue rows for the articles listed in article_ids, with all source columns.
in_article_ids = init_article_df['articleId'].isin(article_ids)
full_article_df = init_article_df.loc[in_article_ids]
full_article_df

Unnamed: 0,articleId,productCode,prodName,productTypeNo,productTypeName,productGroupName,graphicalAppearanceNo,graphicalAppearanceName,colourGroupCode,colourGroupName,...,departmentName,indexCode,indexName,indexGroupNo,indexGroupName,sectionNo,sectionName,garmentGroupNo,garmentGroupName,detailDesc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
6,111565001,111565,20 den 1p Stockings,304,Underwear Tights,Socks & Tights,1010016,Solid,9,Black,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Semi shiny nylon stockings with a wide, reinfo..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105395,939927001,939927,Dolphin,265,Dress,Garment Full body,1010013,Other pattern,9,Black,...,Dress,A,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1013,Dresses Ladies,Short dress in an airy weave with a small stan...
105444,942187001,942187,ED Sasha tee,255,T-shirt,Garment Upper body,1010016,Solid,9,Black,...,Jersey,A,Ladieswear,1,Ladieswear,2,H&M+,1005,Jersey Fancy,"Oversized, straight-cut T-shirt in a soft moda..."
105493,946282001,946282,Linnea dress,265,Dress,Garment Full body,1010021,Lace,9,Black,...,Dress,A,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1013,Dresses Ladies,Short dress in lace with flounces down the fro...
105520,947599001,947599,ED Duno 2p.,254,Top,Garment Upper body,1010016,Solid,9,Black,...,Jersey,A,Ladieswear,1,Ladieswear,2,H&M+,1005,Jersey Fancy,"Long-sleeved tops in soft, organic cotton jers..."


## Create Product, Department, and Article Dataframes

In [18]:
# Product-level attributes: one row per productCode (first occurrence kept).
product_columns = [
    'productCode', 'prodName',
    'productTypeNo', 'productTypeName',
    'productGroupName', 'garmentGroupNo', 'garmentGroupName',
    'detailDesc',
]
product_df = full_article_df.loc[:, product_columns].drop_duplicates(subset='productCode')

# Stage the product table as CSV without the DataFrame index.
product_df.to_csv('product.csv', index=False)
product_df

Unnamed: 0,productCode,prodName,productTypeNo,productTypeName,productGroupName,garmentGroupNo,garmentGroupName,detailDesc
0,108775,Strap top,253,Vest top,Garment Upper body,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065,OP T-shirt (Idro),306,Bra,Underwear,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
6,111565,20 den 1p Stockings,304,Underwear Tights,Socks & Tights,1021,Socks and Tights,"Semi shiny nylon stockings with a wide, reinfo..."
8,111586,Shape Up 30 den 1p Tights,273,Leggings/Tights,Garment Lower body,1021,Socks and Tights,Tights with built-in support to lift the botto...
9,111593,Support 40 den 1p Tights,304,Underwear Tights,Socks & Tights,1021,Socks and Tights,"Semi shiny tights that shape the tummy, thighs..."
...,...,...,...,...,...,...,...,...
105395,939927,Dolphin,265,Dress,Garment Full body,1013,Dresses Ladies,Short dress in an airy weave with a small stan...
105444,942187,ED Sasha tee,255,T-shirt,Garment Upper body,1005,Jersey Fancy,"Oversized, straight-cut T-shirt in a soft moda..."
105493,946282,Linnea dress,265,Dress,Garment Full body,1013,Dresses Ladies,Short dress in lace with flounces down the fro...
105520,947599,ED Duno 2p.,254,Top,Garment Upper body,1005,Jersey Fancy,"Long-sleeved tops in soft, organic cotton jers..."


In [19]:
# Department-level attributes: one row per departmentNo (first occurrence kept).
department_columns = ['departmentNo', 'departmentName', 'sectionNo', 'sectionName']
department_df = (
    full_article_df
    .loc[:, department_columns]
    .drop_duplicates(subset='departmentNo')
)

# Stage the department table as CSV without the DataFrame index.
department_df.to_csv('department.csv', index=False)
department_df

Unnamed: 0,departmentNo,departmentName,sectionNo,sectionName
0,1676,Jersey Basic,16,Womens Everyday Basics
3,1339,Clean Lingerie,61,Womens Lingerie
6,3608,Tights basic,62,"Womens Nightwear, Socks & Tigh"
17,5883,Jersey Basic,26,Men Underwear
23,2032,Jersey,8,Mama
...,...,...,...,...
89929,7857,Kids Boy Exclusive,46,Kids Boy
92964,7510,Woven,28,Men Edition
97443,3420,Small Accessories Extended,66,Womens Small accessories
101971,8090,Promotion/Other/Offer,29,Men Other


In [20]:
# Article-level attributes, including the productCode and departmentNo keys
# that also appear in the product and department tables.
article_columns = [
    'articleId', 'productCode', 'departmentNo', 'prodName', 'productTypeName',
    'graphicalAppearanceNo', 'graphicalAppearanceName',
    'colourGroupCode', 'colourGroupName',
]
article_df = full_article_df.loc[:, article_columns]
# Stage the article table as CSV without the DataFrame index.
article_df.to_csv('article.csv', index=False)
article_df

Unnamed: 0,articleId,productCode,departmentNo,prodName,productTypeName,graphicalAppearanceNo,graphicalAppearanceName,colourGroupCode,colourGroupName
0,108775015,108775,1676,Strap top,Vest top,1010016,Solid,9,Black
1,108775044,108775,1676,Strap top,Vest top,1010016,Solid,10,White
3,110065001,110065,1339,OP T-shirt (Idro),Bra,1010016,Solid,9,Black
4,110065002,110065,1339,OP T-shirt (Idro),Bra,1010016,Solid,10,White
6,111565001,111565,3608,20 den 1p Stockings,Underwear Tights,1010016,Solid,9,Black
...,...,...,...,...,...,...,...,...,...
105395,939927001,939927,1322,Dolphin,Dress,1010013,Other pattern,9,Black
105444,942187001,942187,1919,ED Sasha tee,T-shirt,1010016,Solid,9,Black
105493,946282001,946282,1322,Linnea dress,Dress,1010021,Lace,9,Black
105520,947599001,947599,1919,ED Duno 2p.,Top,1010016,Solid,9,Black


In [21]:
# Load the customer file, convert column names to camelCase, and keep only
# the rows whose customerId is among the selected customer_ids.
customer_df = (
    pd.read_csv('data/customers.csv')
    .pipe(camel_case_rename_cols)
    .loc[lambda customers: customers['customerId'].isin(customer_ids)]
)
# Stage the customer table as CSV without the DataFrame index.
customer_df.to_csv('customer.csv', index=False)
customer_df

Unnamed: 0,customerId,fn,active,clubMemberStatus,fashionNewsFrequency,age,postalCode
86,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,1.0,1.0,ACTIVE,Regularly,33.0,d647e4ede3d0eb4ce0750440a110350b5f4c758165d89d...
425,00140d87c629b961e410e1d143084146c6fe71df40fe3d...,,,ACTIVE,NONE,24.0,d686e242886674f5bed783e6ceb2c52fe89f2c39996bbf...
810,00264b7d4cd6498292e8a355b699c2d07725d123f04867...,1.0,1.0,ACTIVE,Regularly,53.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...
1947,005c6d3bb66c86aab606814cd9995a12f99b3a44b58c72...,,,PRE-CREATE,NONE,,177b4a2258a85a2247daaa7cdffba96a74c741ea8a6605...
2155,006684ff58368b611db31b1ca782a87cad496e69835e42...,,,ACTIVE,NONE,32.0,4296834187b1ffb908c0aa276b29a4b1af87cad557fb40...
...,...,...,...,...,...,...,...
1364911,feac9822f51efc778acc044776b4b34e8e0a86615bf983...,,,ACTIVE,NONE,48.0,8cecc780f67ff32def9c8e8dff5f454bce26a7cbd4c860...
1366543,fef793ec3a7d62d782824517355d74ded50964dce33009...,,,ACTIVE,NONE,46.0,5799a39cffe701ebdb12181348bf10f9e23abcc3868c43...
1367605,ff2b58ad3e83f2e3499b3eda6ea99993b3bca10d8ceee4...,,,ACTIVE,NONE,35.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...
1370498,ffb925b11e1bb2e375d22a02d67907994eb8cb92ec2e7d...,,,ACTIVE,NONE,34.0,ebdd8c5c893683c3cf52c011d4e35024e46d183c95f0fa...


## Write Transactions Dataframe to csv

In [22]:
# Stage the filtered transactions as CSV; index=False leaves out the
# DataFrame index (the txId column stays as the row identifier).
transaction_df.to_csv('transaction.csv', index=False)

## Upload to Google Cloud

In [24]:
!gsutil cp product.csv gs://neo4j-workshop-data/genai-hm



Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ gcloud components update

Copying file://product.csv [Content-Type=text/csv]...
/ [1 files][  2.5 MiB/  2.5 MiB]                                                
Operation completed over 1 objects/2.5 MiB.                                      


In [25]:
!gsutil cp department.csv gs://neo4j-workshop-data/genai-hm

Copying file://department.csv [Content-Type=text/csv]...
/ [1 files][ 10.8 KiB/ 10.8 KiB]                                                
Operation completed over 1 objects/10.8 KiB.                                     


In [26]:
!gsutil cp article.csv gs://neo4j-workshop-data/genai-hm

Copying file://article.csv [Content-Type=text/csv]...
/ [1 files][  1.5 MiB/  1.5 MiB]                                                
Operation completed over 1 objects/1.5 MiB.                                      


In [27]:
!gsutil cp customer.csv gs://neo4j-workshop-data/genai-hm

Copying file://customer.csv [Content-Type=text/csv]...
/ [1 files][298.7 KiB/298.7 KiB]                                                
Operation completed over 1 objects/298.7 KiB.                                    


In [29]:
!gsutil cp transaction.csv gs://neo4j-workshop-data/genai-hm

Copying file://transaction.csv [Content-Type=text/csv]...
- [1 files][  5.3 MiB/  5.3 MiB]                                                
Operation completed over 1 objects/5.3 MiB.                                      
