# Neo4j Generative AI - Data Preparation Pre-Load
This notebook shows the data preparation and filtering applied to the full Kaggle dataset to produce the data used in the Neo4j GenAI workshop.

## Setup

In [None]:
%%capture
# Install the Kaggle CLI used by the download commands later in this notebook;
# the %%capture cell magic above keeps pip's output out of the notebook.
%pip install kaggle

In [None]:
import pandas as pd

In [None]:
# Number of customers to sample; set to 0 or less for no sampling.
SAMPLE_NUM_CUSTOMERS = 1000
# Whether to filter out certain intimate products for demo purposes (real data problems).
FILTER_ARTICLES = False
# Seed to use for replicating sampling.
RANDOM_SEED = 7474

In [None]:
def camel_case(s):
    """Convert an underscore-separated name to camelCase.

    The input is lower-cased and split on ``_``. The first piece stays
    lower-case; every following piece is passed through ``str.title`` and
    the pieces are concatenated without a separator.
    """
    head, *tail = s.lower().split('_')
    return head + ''.join(piece.title() for piece in tail)

def camel_case_dict(name_keys):
    """Build a mapping from each name in ``name_keys`` to its camelCase form.

    Keys are the original names; values are the result of ``camel_case``
    applied to each of them.
    """
    return {name: camel_case(name) for name in name_keys}

def camel_case_rename_cols(df):
    """Return the result of renaming ``df``'s columns to camelCase.

    The old-name -> new-name mapping comes from ``camel_case_dict`` and is
    applied with ``DataFrame.rename``.
    """
    return df.rename(columns=camel_case_dict(df.columns))

## Get Source Data

To download the data you will need to:
1. Configure authentication per the instructions at https://github.com/Kaggle/kaggle-api/blob/main/README.md
2. Accept the competition rules at https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/rules
3. Run the download commands below

In [None]:
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations -f articles.csv -p data
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations -f customers.csv -p data
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations -f transactions_train.csv -p data
!cd data && unzip -n '*.zip'

## Filter Articles

In [None]:
# Read the articles file and switch its column names to camelCase.
init_article_df = pd.read_csv('data/articles.csv').pipe(camel_case_rename_cols)
init_article_df.shape

In [None]:
# Optionally leave out some intimate products for demo purposes;
# when FILTER_ARTICLES is off every article id is kept.
if FILTER_ARTICLES:
    keep_article = init_article_df.garmentGroupName != 'Under-, Nightwear'
    filtered_article_ids = init_article_df[keep_article].articleId
else:
    filtered_article_ids = init_article_df.articleId

## Load Transactions & Create ID 

In [None]:
init_transaction_df = pd.read_csv('data/transactions_train.csv').pipe(camel_case_rename_cols)
# The source data doesn't include transaction ids, and it is generally a good
# idea to have one, so number the rows 0..n-1 in file order.
init_transaction_df['txId'] = range(len(init_transaction_df))
init_transaction_df.shape

In [None]:
# Unique customers having at least one transaction on an article in filtered_article_ids.
on_filtered_article = init_transaction_df.articleId.isin(filtered_article_ids)
filtered_customer_ids = init_transaction_df.loc[on_filtered_article, 'customerId'].drop_duplicates()
filtered_customer_ids

## Sample Customers

In [None]:
# Pick the customers to keep. Sampling is skipped when SAMPLE_NUM_CUSTOMERS <= 0,
# in which case every filtered customer is used.
customer_ids = filtered_customer_ids
if SAMPLE_NUM_CUSTOMERS > 0:
    # Series.sample (without replacement) raises ValueError when n is larger than
    # the population, so cap the request at the number of available customers.
    sample_size = min(SAMPLE_NUM_CUSTOMERS, len(filtered_customer_ids))
    customer_ids = filtered_customer_ids.sample(n=sample_size, random_state=RANDOM_SEED).reset_index(drop=True)
customer_ids

In [None]:
# Get the articles transacted with by the sampled customers.
# Only ids present in filtered_article_ids are kept: a sampled customer may also have
# bought articles that the FILTER_ARTICLES step removed, and without this restriction
# those articles would be pulled back into the staged article data.
sampled_tx = init_transaction_df[init_transaction_df.customerId.isin(customer_ids)]
article_ids = sampled_tx.loc[sampled_tx.articleId.isin(filtered_article_ids), 'articleId'].drop_duplicates()
article_ids

## Sample Down Data
Now that we have the lists of customers and articles to include (`customer_ids` and `article_ids`, respectively), we can use them to filter the source data and stage it for loading.

In [None]:
# Keep the sampled customers' transactions, dropping any on articles that are not in
# filtered_article_ids so that filtered-out products do not end up in transaction.csv.
# With FILTER_ARTICLES = False, filtered_article_ids holds every article id from the
# articles file, so the second step is expected to keep all rows.
customer_tx = init_transaction_df[init_transaction_df.customerId.isin(customer_ids)]
transaction_df = customer_tx[customer_tx.articleId.isin(filtered_article_ids)]
transaction_df

In [None]:
# Article rows whose articleId is in article_ids.
in_article_sample = init_article_df.articleId.isin(article_ids)
full_article_df = init_article_df.loc[in_article_sample]
full_article_df

## Create Product, Department, and Article Dataframes

In [None]:
# One row per productCode; drop_duplicates keeps the first article row seen for each code.
product_cols = [
    'productCode', 'prodName',
    'productTypeNo', 'productTypeName',
    'productGroupName', 'garmentGroupNo', 'garmentGroupName',
    'detailDesc',
]
product_df = full_article_df[product_cols].drop_duplicates(subset='productCode')

product_df.to_csv('product.csv', index=False)
product_df

In [None]:
# One row per departmentNo; drop_duplicates keeps the first article row seen for each department.
department_cols = ['departmentNo', 'departmentName', 'sectionNo', 'sectionName']
department_df = full_article_df[department_cols].drop_duplicates(subset='departmentNo')

department_df.to_csv('department.csv', index=False)
department_df

In [None]:
# Article-level columns, including the productCode and departmentNo of each article.
article_cols = [
    'articleId', 'productCode', 'departmentNo', 'prodName', 'productTypeName',
    'graphicalAppearanceNo', 'graphicalAppearanceName', 'colourGroupCode', 'colourGroupName',
]
article_df = full_article_df[article_cols]
article_df.to_csv('article.csv', index=False)
article_df

In [None]:
# Load all customers with camelCase columns, then keep only those in customer_ids.
customer_df = (
    pd.read_csv('data/customers.csv')
    .pipe(camel_case_rename_cols)
    .loc[lambda all_customers: all_customers.customerId.isin(customer_ids)]
)
customer_df.to_csv('customer.csv', index=False)
customer_df

## Write Transactions Dataframe to csv

In [None]:
# Write the staged transactions without the DataFrame index column.
transaction_csv_path = 'transaction.csv'
transaction_df.to_csv(transaction_csv_path, index=False)

## Upload to Google Cloud

In [None]:
!gsutil cp product.csv gs://neo4j-workshop-data/genai-hm

In [None]:
!gsutil cp department.csv gs://neo4j-workshop-data/genai-hm

In [None]:
!gsutil cp article.csv gs://neo4j-workshop-data/genai-hm

In [None]:
!gsutil cp customer.csv gs://neo4j-workshop-data/genai-hm

In [None]:
!gsutil cp transaction.csv gs://neo4j-workshop-data/genai-hm