In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, split, explode
from pyspark.sql import SparkSession

# Get the notebook's Spark session (reuses an existing one if present)
spark = (
    SparkSession.builder
    .appName("CSVExample")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/26 22:50:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [176]:
# Read the raw CSV export into a pandas DataFrame
raw_data = pd.read_csv(filepath_or_buffer='2024-2019_raw.csv')

In [185]:
# List every distinct currency code so duplicate spellings of the same currency stand out
raw_data.loc[:, 'Currency'].unique().tolist()

['EUR',
 'RMB',
 'USD',
 'GBP',
 'SGD',
 'CAD',
 'THB',
 'CHF',
 'AUD',
 'HKD',
 'CZK',
 'SEK',
 'AED',
 'KRW',
 'JPY',
 'MOP',
 'SAR',
 'MYR',
 'PHP',
 'CNY',
 'CDN',
 'QAR',
 'MXN']

In [186]:
# Known material/style vocabulary. Each tuple is turned into a single
# capture-group alternation by _alternation_pattern().
_LEATHER_TERMS = (
    'Togo', 'Epsom', 'Hunter', 'Clémence', 'Teddy', 'Swift', 'swift',
    'Mysore Chèvre', 'Touch', 'Chèvre', 'Barenia Faubourg', 'Tressage',
    'Chèvre Chamkila', 'Chamkila', 'Box', 'Chevre', 'Barenia', 'Clemence',
    'Evercolor', 'Evergrain', 'Taurillon Maurice', 'Taurillon Novillo',
    'Vache Hunter', 'Veau Madame', 'Tadelakt', 'Togo Butler', 'Sombrero',
    'Grizzly', 'Doblis', 'Buffalo', 'Vachette Crispe', 'Teddy Shearling',
)
_EXOTIC_SKIN_TERMS = (
    'Alligator Mississippiensis', 'Satine Boreal Alligator', 'Matte Gator',
    'Matte Alligator', 'Prosus', 'Porosus', 'Porosus Crocodile',
    'Alligator Matte', 'Crocodile Niloticus', 'Niloticus Croc', 'Croc',
    'Crocodile Porosus', 'Crocodile Himalaya', 'Lizard', 'Lizard Ombre',
    'Ostrich', 'Python', 'Barenia Alligator',
)
_STITCHING_TERMS = ('Retourne', 'Sellier')
_LIMITED_EDITION_TERMS = (
    'Casaque', 'Diamond', 'Chamikla', 'Midas', 'Padded', 'Studded', 'Chamika',
    'Étoilée', 'Touch', 'Teddy', 'Himalaya', 'Tri-Color', 'in and out',
    'Canvas', 'Suede', 'Verso', 'Picnic', 'Côte à Côte', 'Sellier Aizome',
    'Mirror', 'Cargo', 'In & Out', 'Barenia Faubourg', 'Faubourg', 'Shadow',
    'Fringe', 'Club', 'Clouté', 'Cavalcadour', 'Rock', 'Quadrille', 'Mosaic',
    'Tressage', 'Graphite',
)

# Default stitching per bag, applied in order and only where 'Stitching' is
# still missing: (column to inspect, literal substring, value to fill in).
_STITCHING_DEFAULTS = (
    ('Bag_Type', 'Constance', 'Sellier'),
    ('Bag_Type', 'Lindy', 'Retourne'),
    ('Bag_Type', 'Evelyne', 'Retourne'),
    ('Bag_Size', '7.5', 'Sellier'),
    ('Bag_Type', 'Kelly', 'Sellier'),
    ('Bag_Type', 'Birkin', 'Retourne'),
    ('Bag_Type', 'HAC', 'Retourne'),
)


def _alternation_pattern(terms):
    """Build a one-group regex that matches any of `terms` literally."""
    import re

    # Regex alternation takes the first alternative that matches at a given
    # position, so longer terms must be listed before their own prefixes
    # ("Togo Butler" before "Togo"); otherwise they can never be extracted.
    # sorted() is stable, so equally long terms keep their listed order.
    longest_first = sorted(terms, key=len, reverse=True)
    return '(' + '|'.join(re.escape(term) for term in longest_first) + ')'


def split_and_clean(df):
    """Split bag type/size and derive material columns from 'Material_Style'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'Bag_Type' (e.g. "Kelly 25") and should
        contain a string column 'Material_Style'.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` (the input is not modified) in which
        * 'Bag_Type' holds the text before the last space and the new
          'Bag_Size' column holds the text after it (missing if no space);
        * 'Leather', 'Exotic_Skin', 'Stitching' and 'Limited_Edition' hold the
          first known term found in 'Material_Style';
        * missing 'Stitching' is filled from the per-bag defaults in
          _STITCHING_DEFAULTS and otherwise left missing;
        * missing 'Leather' / 'Exotic_Skin' / 'Limited_Edition' become
          'Exotic' / 'Regular Leather' / 'Regular'.

    Raises
    ------
    KeyError
        If `df` has no 'Bag_Type' column. Errors in the material-extraction
        stage are reported with print() and the partially processed copy is
        returned (best-effort behaviour kept from the original).
    """
    # Work on a copy so the caller's frame stays untouched.
    processed_df = df.copy()

    # 1. Split "<type> <size>" on the LAST space only. A plain two-column
    #    split raises as soon as one value has more than one space
    #    ("Mini Kelly 7.5") or when no value has a space at all.
    #    reindex() guarantees both columns exist; astype(object) keeps the
    #    .str accessor usable even if the size column is entirely missing.
    type_and_size = (
        processed_df['Bag_Type']
        .str.strip()
        .str.rsplit(' ', n=1, expand=True)
        .reindex(columns=[0, 1])
        .astype(object)
    )
    processed_df['Bag_Type'] = type_and_size[0]
    processed_df['Bag_Size'] = type_and_size[1]

    try:
        # 2. Extract the first known term of each vocabulary.
        material_style = processed_df['Material_Style']
        vocabularies = (
            ('Leather', _LEATHER_TERMS),
            ('Exotic_Skin', _EXOTIC_SKIN_TERMS),
            ('Stitching', _STITCHING_TERMS),
            ('Limited_Edition', _LIMITED_EDITION_TERMS),
        )
        for column, terms in vocabularies:
            processed_df[column] = material_style.str.extract(
                _alternation_pattern(terms), expand=False
            )

        # 3. Fill missing stitching from the per-bag defaults.
        #    regex=False: '7.5' must be matched literally (as a regex the dot
        #    would also match e.g. "735").
        #    na=False: rows without a bag type/size must not match any rule.
        for column, needle, default in _STITCHING_DEFAULTS:
            matches = processed_df[column].str.contains(needle, na=False, regex=False)
            processed_df.loc[matches, 'Stitching'] = (
                processed_df.loc[matches, 'Stitching'].fillna(default)
            )

        # 4. Label rows where no term was found. 'Stitching' is deliberately
        #    left missing when no default applies.
        processed_df['Leather'] = processed_df['Leather'].fillna('Exotic')
        processed_df['Exotic_Skin'] = processed_df['Exotic_Skin'].fillna('Regular Leather')
        processed_df['Limited_Edition'] = processed_df['Limited_Edition'].fillna('Regular')

    except Exception as e:
        # NOTE(review): best-effort by design - the caller receives a frame
        # that may lack some derived columns; check the printed message.
        print(f"Error processing materials: {e}")

    return processed_df

In [187]:
# Run the cleaning step on the raw data; `cleaned` feeds the experiments and queries below
cleaned = split_and_clean(df=raw_data)

In [188]:
# Load the cleaned pandas frame into Spark for querying, then print the first 200 rows
spark_df = spark.createDataFrame(cleaned)
spark_df.show(n=200)
# Disabled query for inspecting leftover rows:
#spark_df.where(col("Leather") != "NaN") \
 #       .where(col("Exotic_Skin") == "NaN") \
  #      .show(200,truncate=False)


+---------+--------+--------------------+---------+--------+--------+-----------------+--------------------+---------+----------------+
| Bag_Type|    Date|      Material_Style|    Price|Currency|Bag_Size|          Leather|         Exotic_Skin|Stitching| Limited_Edition|
+---------+--------+--------------------+---------+--------+--------+-----------------+--------------------+---------+----------------+
|    Kelly|  1/2/24|       Togo Retourne|   8600.0|     EUR|      25|             Togo|     Regular Leather| Retourne|         Regular|
|    Kelly|  1/2/24|       Epsom Sellier|   9500.0|     EUR|      25|            Epsom|     Regular Leather|  Sellier|         Regular|
|    Kelly|  1/2/24|               Swift|   9000.0|     EUR|      25|            Swift|     Regular Leather|  Sellier|         Regular|
|    Kelly|  1/2/24|Mysore Chèvre Sel...|   9600.0|     EUR|      25|    Mysore Chèvre|     Regular Leather|  Sellier|         Regular|
|    Kelly|  1/2/24|Shiny Crocodile S...|  42400

In [189]:
# Show up to 450 rows whose Bag_Size equals the string "NaN"
spark_df.filter(col("Bag_Size") == "NaN").show(n=450)

+--------+----+--------------+-----+--------+--------+-------+-----------+---------+---------------+
|Bag_Type|Date|Material_Style|Price|Currency|Bag_Size|Leather|Exotic_Skin|Stitching|Limited_Edition|
+--------+----+--------------+-----+--------+--------+-------+-----------+---------+---------------+
+--------+----+--------------+-----+--------+--------+-------+-----------+---------+---------------+

