In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, split, explode
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("CSVExample").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/26 22:50:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [112]:
raw_2024 = pd.read_csv('2024_raw.csv')
raw_2023 = pd.read_csv('2023_raw.csv')

In [113]:
raw_2024.head()

Unnamed: 0,Bag_Type,Date,Material_Style,Price,Currency
0,Kelly 25,1/2/24,Togo Retourne,8600.0,EUR
1,Kelly 25,1/2/24,Epsom Sellier,9500.0,EUR
2,Kelly 25,1/2/24,Swift,9000.0,EUR
3,Kelly 25,1/2/24,Mysore Chèvre Sellier,9600.0,EUR
4,Kelly 25,1/2/24,Shiny Crocodile Sellier,42400.0,EUR


In [135]:
def split_and_clean(df):
    # Create a copy of the DataFrame to avoid modifying the original
    processed_df = df.copy()
    
    # 1. Add parentheses to make the patterns into capture groups
    hermes_leather = r'(Togo|Epsom|Teddy|Swift|swift|Mysore Chèvre|Touch|Chèvre|Barenia Faubourg|Chèvre Chamkila|Chamkila|Box|Chevre|Barenia|Clemence|Evercolor|Evergrain|Taurillon Maurice|Taurillon Novillo|Vache Hunter|Veau Madame|Tadelakt|Togo Butler|Sombrero|Grizzly|Doblis|Buffalo|Vachette Crispe|Teddy Shearling)'
    hermes_exotic_skin = r'(Alligator Mississippiensis|Satine Boreal Alligator|Matte Gator|Matte Alligator|Prosus|Porosus|Porosus Crocodile|Alligator Matte|Crocodile Niloticus|Niloticus Croc|Croc|Crocodile Porosus|Crocodile Himalaya|Lizard|Lizard Ombre|Ostrich|Python|Barenia Alligator)'
    hermes_stitching = r'(Retourne|Sellier)'
    hermes_limited_edition = r'(Casaque|Chamikla|Midas|Padded|Studded|Chamika|Étoilée|Touch|Teddy|Himalaya|Tri-Color|in and out|Canvas|Suede|Verso|Picnic|Côte à Côte|Sellier Aizome|Mirror|Cargo|In & Out|Barenia Faubourg|Faubourg|Shadow|Fringe|Club|Clouté|Cavalcadour|Rock|Quadrille|Mosaic|Tressage|Graphite)'
    
    # 2. Split Bag Type and Bag Size
    processed_df[['Bag_Type', 'Bag_Size']] = processed_df['Bag_Type'].str.split(' ', expand=True)
    
    # 3. Extract materials and styles with error handling
    try:
        processed_df['Leather'] = processed_df['Material_Style'].str.extract(hermes_leather)
        processed_df['Exotic_Skin'] = processed_df['Material_Style'].str.extract(hermes_exotic_skin)
        processed_df['Stitching'] = processed_df['Material_Style'].str.extract(hermes_stitching)
        processed_df['Limited_Edition'] = processed_df['Material_Style'].str.extract(hermes_limited_edition)

    #4. Filling Bag "Constance" with Sellier Stitching
        processed_df.loc[processed_df['Bag_Type'].str.contains('Constance', na=False), 'Stitching'] = processed_df \
                    .loc[processed_df['Bag_Type'].str.contains('Constance', na=False), 'Stitching'] \
                    .fillna('Sellier')    
    
    #5. Filling Bag "Lindy" with Retourne Stitching
        processed_df.loc[processed_df['Bag_Type'].str.contains('Lindy', na=False), 'Stitching'] = processed_df \
                    .loc[processed_df['Bag_Type'].str.contains('Lindy', na=False), 'Stitching'] \
                    .fillna('Retourne')          
         
    #6. Filling Bag "Evelyne" with Retourne Stitching
        processed_df.loc[processed_df['Bag_Type'].str.contains('Evelyne', na=False), 'Stitching'] = processed_df \
                    .loc[processed_df['Bag_Type'].str.contains('Evelyne', na=False), 'Stitching'] \
                    .fillna('Retourne')       

    #7. Filling Bag "Mini Kelly 7.5" with Sellier Stitching
        processed_df.loc[processed_df['Bag_Size'].str.contains('7.5', na=False), 'Stitching'] = processed_df \
                    .loc[processed_df['Bag_Size'].str.contains('7.5', na=False), 'Stitching'] \
                    .fillna('Sellier')    
    
    #8. Filling Bag "Kelly" with NaN Stitching with Sellier Stitching
        processed_df.loc[processed_df['Bag_Type'].str.contains('Kelly', na=True), 'Stitching'] = processed_df \
                    .loc[processed_df['Bag_Type'].str.contains('Kelly', na=True), 'Stitching'] \
                    .fillna('Sellier')    

    #8. Filling Bag "Birkin" with NaN Stitching with Retourne Stitching
        processed_df.loc[processed_df['Bag_Type'].str.contains('Birkin', na=True), 'Stitching'] = processed_df \
                    .loc[processed_df['Bag_Type'].str.contains('Birkin', na=True), 'Stitching'] \
                    .fillna('Retourne')    

    #9. Filling Bag "HAC" with NaN Stitching with Retourne Stitching
        processed_df.loc[processed_df['Bag_Type'].str.contains('HAC', na=True), 'Stitching'] = processed_df \
                    .loc[processed_df['Bag_Type'].str.contains('HAC', na=True), 'Stitching'] \
                    .fillna('Retourne')    
              
    # 10. Fill NaN values
        processed_df['Leather'] = processed_df['Leather'].fillna('Exotic')
        processed_df['Exotic_Skin'] = processed_df['Exotic_Skin'].fillna('Regular Leather')
        #processed_df['Stitching'] = processed_df['Stitching'].fillna('')
        processed_df['Limited_Edition'] = processed_df['Limited_Edition'].fillna('Regular')
        
    except Exception as e:
        print(f"Error processing materials: {e}")
    
    return processed_df

In [136]:
# Cleaned Dataset for Experiment and Querying
cleaned_2024 = split_and_clean(raw_2024)
cleaned_2023 = split_and_clean(raw_2023)

In [137]:
cleaned_2024['Bag_Type'].unique().tolist()

['Kelly', 'Birkin', 'HAC', 'Constance', 'Lindy', 'Evelyne']

In [138]:
# For querying and look at leftover
spark_df = spark.createDataFrame(cleaned_2024)
spark_df.show(200)
#spark_df.where(col("Leather") != "NaN") \
 #       .where(col("Exotic_Skin") == "NaN") \
  #      .show(200,truncate=False)


+---------+--------+--------------------+---------+--------+--------+-----------------+--------------------+---------+----------------+
| Bag_Type|    Date|      Material_Style|    Price|Currency|Bag_Size|          Leather|         Exotic_Skin|Stitching| Limited_Edition|
+---------+--------+--------------------+---------+--------+--------+-----------------+--------------------+---------+----------------+
|    Kelly|  1/2/24|       Togo Retourne|   8600.0|     EUR|      25|             Togo|     Regular Leather| Retourne|         Regular|
|    Kelly|  1/2/24|       Epsom Sellier|   9500.0|     EUR|      25|            Epsom|     Regular Leather|  Sellier|         Regular|
|    Kelly|  1/2/24|               Swift|   9000.0|     EUR|      25|            Swift|     Regular Leather|  Sellier|         Regular|
|    Kelly|  1/2/24|Mysore Chèvre Sel...|   9600.0|     EUR|      25|    Mysore Chèvre|     Regular Leather|  Sellier|         Regular|
|    Kelly|  1/2/24|Shiny Crocodile S...|  42400

In [139]:
spark_df = spark.createDataFrame(cleaned_2023)
spark_df.show(450)

+---------+--------+--------------------+-------+--------+--------+----------------+---------------+---------+----------------+
| Bag_Type|    Date|      Material_Style|  Price|Currency|Bag_Size|         Leather|    Exotic_Skin|Stitching| Limited_Edition|
+---------+--------+--------------------+-------+--------+--------+----------------+---------------+---------+----------------+
|    Kelly|  1/2/23|       Epsom Sellier|   8650|     EUR|      25|           Epsom|Regular Leather|  Sellier|         Regular|
|    Kelly|  1/2/23|       Togo Retourné|   8050|     EUR|      25|            Togo|Regular Leather|  Sellier|         Regular|
|    Kelly|  1/6/23|         Box Sellier|  23200|     SGD|      25|             Box|Regular Leather|  Sellier|         Regular|
|    Kelly| 1/10/23|  Swift (in and out)|  10600|     EUR|      25|           Swift|Regular Leather|  Sellier|      in and out|
|    Kelly| 1/29/23|Quadrille Toile/S...|  12870|     USD|      25|           Swift|Regular Leather|  Se