In [1]:
import pandas as pd

# Load the Parquet file
df = pd.read_parquet('./vantage_furniture_tutorial.parquet')


In [2]:
import pandas as pd
import hashlib
from urllib.parse import urlparse, unquote

# Function to create MD5 hash of a string
def create_md5_hash(string):
    return hashlib.md5(string.encode()).hexdigest()

# Function to transform URL slug into a title
def url_to_title(url):
    parsed_url = urlparse(url)
    title = parsed_url.path.split('/')[-1].replace('-', ' ')
    title = unquote(title)
    return title.title()

# Function to bucket ratings
def bucket_rating(rating):
    if rating == 5.0:
        return '5 stars'
    elif rating >= 4.0:
        return '4 stars and up'
    elif rating >= 3.0:
        return '3 stars and up'
    elif rating >= 2.0:
        return '2 stars and up'
    else:
        return 'Less than 2 stars'

# Function to bucket number of ratings
def bucket_numratings(numratings):
    if numratings == 0:
        return 'none'
    elif numratings < 10:
        return 'few'
    elif numratings < 100:
        return 'dozens'
    elif numratings < 1000:
        return 'hundreds'
    else:
        return '1000+'

# prep
df.rename(columns={'id':'noop_url'}, inplace=True)

# 0. new id
df['id'] = df['noop_url'].apply(create_md5_hash)

# 1. Drop specified columns
df.drop(['meta_category_l1', 'meta_category_l2', 'meta_category_l3'], axis=1, inplace=True)

# 2. Rename 'meta_category_l4' to 'meta_category'
df.rename(columns={'meta_category_l4': 'meta_category'}, inplace=True)

# 2a. Rename 'meta_image' to 'noop_image_url'
df.rename(columns={'meta_image': 'noop_image_url'}, inplace=True)

# 3. Rename 'meta_rating' to 'noop_rating'
df.rename(columns={'meta_rating': 'noop_rating'}, inplace=True)

# 4. Create 'meta_rating_bucket' from 'noop_rating'
df['meta_rating_bucket'] = df['noop_rating'].apply(bucket_rating)

# 5. Rename 'meta_numratings' to 'noop_numratings'
df.rename(columns={'meta_numratings': 'noop_numratings'}, inplace=True)

# 6. Create 'meta_numratings_bucket' from 'noop_numratings'
df['meta_numratings_bucket'] = df['noop_numratings'].apply(bucket_numratings)

# 7. Creating a new column 'noop_description' with the first 255 characters of the 'description' column
df['noop_description'] = df['text'].str.slice(0, 255)

# 8. title from url
df['noop_title'] = df['noop_url'].apply(url_to_title)

# Display the modified DataFrame
df.head()


Unnamed: 0,meta_category,noop_url,noop_rating,noop_numratings,noop_image_url,text,id,meta_rating_bucket,meta_numratings_bucket,noop_description,noop_title
0,Console Tables & Cabinets,https://www.etsy.com/listing/1249606275/consol...,5.0,88.0,https://i.etsystatic.com/31205239/r/il/f74372/...,Console table made of old solid wood beams joi...,c76532c4c9f16dfd0d5f4ff630a18e20,5 stars,dozens,Console table made of old solid wood beams joi...,Console Table Made Of Old Solid Wood
1,Console Tables & Cabinets,https://www.etsy.com/listing/1560417295/narrow...,5.0,76.0,https://i.etsystatic.com/25878970/c/1857/1476/...,"Narrow Console Table, 9.8"" Deep Entry Table ♥ ...",545110c7c31fd107f9092c74d44e2aa1,5 stars,dozens,"Narrow Console Table, 9.8"" Deep Entry Table ♥ ...",Narrow Console Table 98 Deep Entry Table
2,,https://www.etsy.com/listing/1591708043/glass-...,,,https://i.etsystatic.com/39362305/r/il/2d0b71/...,glass coffee table Do not settle for less when...,2c4a4b1d9c0738907cd4a94c3738bff7,Less than 2 stars,1000+,glass coffee table Do not settle for less when...,Glass Coffee Table
3,Couches & Loveseats,https://www.etsy.com/listing/1134883686/linen-...,5.0,627.0,https://i.etsystatic.com/23543776/r/il/9b812b/...,"Linen fabric Floor seating sofa,Off white Beig...",9e11e1bc4cc09ae548e870b3c67882d0,5 stars,hundreds,"Linen fabric Floor seating sofa,Off white Beig...",Linen Fabric Floor Seating Sofaoff White
4,Coffee & End Tables,https://www.etsy.com/listing/1028007759/coffee...,5.0,,https://i.etsystatic.com/21203318/r/il/4476de/...,"Coffee Table - South American Walnut, Live Edg...",cd20ee1e96cec7b1c4781538bc7ef625,5 stars,1000+,"Coffee Table - South American Walnut, Live Edg...",Coffee Table South American Walnut Live


In [3]:
import pandas as pd

# Reordering the columns in the DataFrame before saving to a parquet file
meta_columns = [col for col in df.columns if col.startswith('meta_')]
noop_columns = [col for col in df.columns if col.startswith('noop_')]

# Constructing the new column order
new_column_order = ['id', 'text'] + meta_columns + noop_columns

# Reordering the DataFrame
df_reordered = df[new_column_order]

# Saving the DataFrame to a parquet file
output_file_path = './furniture.parquet'  # Specifying the file path
df_reordered.to_parquet(output_file_path)

In [4]:
import os
import json

# Assuming the DataFrame 'df_reordered' is already loaded
# If not, you would need to load the parquet file again
# df_reordered = pd.read_parquet('furniture.parquet')

# Create the directory 'public' if it doesn't exist
output_dir = './public'
os.makedirs(output_dir, exist_ok=True)

# Iterate through the DataFrame and write each row to a separate JSON file
for index, row in df_reordered.iterrows():
    # Constructing the file name
    file_name = os.path.join(output_dir, f"{row['id']}.json")
    
    # Convert the row to a dictionary and write to a JSON file
    with open(file_name, 'w') as file:
        json.dump(row.to_dict(), file)

# Informing that the operation is complete
"JSON files for each row have been created in the 'public' directory."



"JSON files for each row have been created in the 'public' directory."