In [None]:
import pandas as pd

# Read the input Parquet file into the working DataFrame `df`
df = pd.read_parquet(path='./vantage_furniture_tutorial.parquet')


In [None]:
import pandas as pd
import hashlib
from urllib.parse import urlparse, unquote

# Hash helper: hexadecimal MD5 digest of a string
def create_md5_hash(string):
    """Return the hexadecimal MD5 digest of ``string``.

    The text is converted to bytes with ``str.encode()`` (default codec,
    UTF-8) before it is hashed.
    """
    digest = hashlib.md5()
    digest.update(string.encode())
    return digest.hexdigest()

# Function to transform URL slug into a title
def url_to_title(url):
    """Build a title from the last path segment (slug) of ``url``.

    Hyphens in the slug become spaces, percent-escapes are decoded, and the
    result is title-cased with ``str.title()``. Query string and fragment are
    ignored because only the parsed path is used.

    Args:
        url: URL string whose last path segment is the slug.

    Returns:
        The title-cased slug, or an empty string when the path has no segment
        (e.g. ``https://example.com/``).
    """
    path = urlparse(url).path
    # Strip trailing slashes first: for ".../mid-century-sofa/" the segment
    # after the final '/' is empty and the title would come out blank.
    slug = path.rstrip('/').split('/')[-1]
    # Hyphens are replaced before decoding, so an escaped hyphen (%2D) stays
    # a literal hyphen in the title.
    return unquote(slug.replace('-', ' ')).title()

# Function to bucket ratings
def bucket_rating(rating):
    """Map a numeric rating to its bucket label.

    Args:
        rating: Numeric rating, or a missing value (None / NaN).

    Returns:
        '5 stars' for exactly 5.0, '4 stars and up' for other values >= 4.0,
        '3 stars and up' for values >= 3.0, '2 stars and up' for values
        >= 2.0, 'Less than 2 stars' for anything lower, and None when the
        rating is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # rating would fall through to 'Less than 2 stars'.
    if pd.isna(rating):
        return None
    if rating == 5.0:
        return '5 stars'
    elif rating >= 4.0:
        return '4 stars and up'
    elif rating >= 3.0:
        return '3 stars and up'
    elif rating >= 2.0:
        return '2 stars and up'
    else:
        return 'Less than 2 stars'

# Function to bucket number of ratings
def bucket_numratings(numratings):
    """Map a rating count to its bucket label.

    Args:
        numratings: Number of ratings, or a missing value (None / NaN).

    Returns:
        'none' for 0, 'few' for other values below 10, 'dozens' below 100,
        'hundreds' below 1000, '1000+' otherwise, and None when the count is
        missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # count would fall through to '1000+'.
    if pd.isna(numratings):
        return None
    if numratings == 0:
        return 'none'
    elif numratings < 10:
        return 'few'
    elif numratings < 100:
        return 'dozens'
    elif numratings < 1000:
        return 'hundreds'
    else:
        return '1000+'

# Prep: keep the original identifier under 'noop_url' (step 8 parses it as a
# URL). The guard keeps this cell re-runnable: once 'noop_url' exists,
# renaming again would turn the hashed 'id' from step 0 into a second
# 'noop_url' column.
if 'noop_url' not in df.columns:
    df = df.rename(columns={'id': 'noop_url'})

# 0. New id: MD5 hex digest of the original identifier
df['id'] = df['noop_url'].apply(create_md5_hash)

# 1. Drop the l1-l3 category columns. errors='ignore' makes this a no-op when
#    the cell is re-run after the columns are already gone.
df = df.drop(
    columns=['meta_category_l1', 'meta_category_l2', 'meta_category_l3'],
    errors='ignore',
)

# 2, 2a, 3, 5. Rename the remaining source columns in a single pass.
#    rename() keeps each column's position and skips names that are absent,
#    so the resulting column order matches renaming them one at a time.
df = df.rename(columns={
    'meta_category_l4': 'meta_category',
    'meta_image': 'noop_image_url',
    'meta_rating': 'noop_rating',
    'meta_numratings': 'noop_numratings',
})

# 4. Create 'meta_rating_bucket' from 'noop_rating'
df['meta_rating_bucket'] = df['noop_rating'].apply(bucket_rating)

# 6. Create 'meta_numratings_bucket' from 'noop_numratings'
df['meta_numratings_bucket'] = df['noop_numratings'].apply(bucket_numratings)

# 7. Create 'noop_description' from the first 255 characters of the 'text' column
df['noop_description'] = df['text'].str.slice(0, 255)

# 8. Title derived from the URL slug
df['noop_title'] = df['noop_url'].apply(url_to_title)

# Display the modified DataFrame
df.head()


In [None]:
import pandas as pd

# Column layout for the exported parquet file: 'id' and 'text' first, then
# every 'meta_' column, then every 'noop_' column (each group in its current order).
meta_columns = list(filter(lambda name: name.startswith('meta_'), df.columns))
noop_columns = list(filter(lambda name: name.startswith('noop_'), df.columns))
new_column_order = ['id', 'text', *meta_columns, *noop_columns]

# Select the columns in that order
df_reordered = df.loc[:, new_column_order]

# Write the reordered frame to a parquet file
output_file_path = './furniture.parquet'  # destination of the export
df_reordered.to_parquet(output_file_path)

In [None]:
import os
import json
import pandas as pd

# `df_reordered` is expected to exist already; to run this cell on its own,
# load it first:
# df_reordered = pd.read_parquet('furniture.parquet')

# Make sure the output directory 'public' exists
output_dir = './public'
os.makedirs(output_dir, exist_ok=True)

# Write one JSON file per DataFrame row, named after the row's 'id'
for _, record in df_reordered.iterrows():
    # Replace missing values (NaN) with None
    record_cleaned = record.where(record.notna(), None)

    # Target path: <output_dir>/<id>.json
    target_path = os.path.join(output_dir, f"{record_cleaned['id']}.json")

    # Serialize the row as a dictionary and write it out
    payload = json.dumps(record_cleaned.to_dict())
    with open(target_path, 'w') as handle:
        handle.write(payload)


In [None]:

# Build the JSON summary for one column and write it under 'public/'
def create_json_file(df, column_name):
    """Write the distinct values of ``df[column_name]`` with their counts to JSON.

    The output file ``public/<column_name>.json`` holds a list with one entry
    per distinct non-missing value, in order of first appearance. Each entry
    has the keys ``name`` and ``slug`` (both the value itself),
    ``categoryName`` (the column name without ``meta_``, underscores replaced
    by spaces, title-cased), ``categorySlug`` (the column name without
    ``meta_``) and ``count`` (number of rows holding the value).

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Name of the column to summarize.

    Returns:
        The path of the written file, ``"public/" + column_name + ".json"``.
    """
    from collections import Counter

    slug = column_name.replace("meta_", "")
    nice_name = slug.replace("_", " ").title()

    # Tally every value in a single pass instead of filtering the frame once
    # per distinct value. Counter keeps first-appearance order, and .tolist()
    # yields plain Python scalars, which json can serialize (numpy.int64
    # cannot be).
    occurrences = Counter(df[column_name].dropna().tolist())

    json_data = [
        {
            "name": value,
            "slug": value,
            "categoryName": nice_name,
            "categorySlug": slug,
            "count": count,
        }
        for value, count in occurrences.items()
    ]

    file_path = "public/" + column_name + ".json"
    # Do not rely on another cell having created the output directory
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'w') as file:
        json.dump(json_data, file, indent=4)

    return file_path

# Create one JSON file per column with the 'meta_' prefix. Each file is
# written exactly once; the list of paths is derived from the mapping instead
# of generating every file a second time.
json_files_paths = {
    column: create_json_file(df, column)
    for column in df.columns
    if column.startswith('meta_')
}
json_files = list(json_files_paths.values())
json_files_paths