In [1]:
import pandas as pd
import re

In [2]:
# Load the raw dataset; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='cazooCarsDataset.csv')

In [3]:
# Patterns are compiled once and shared by the detection and the stripping
# steps so the two can never drift apart.
_LISTING_PREFIX_RE = re.compile(r'^\(\d+\)\s*-\s*')            # "(24) - "
_BADGE_RE = re.compile(r'(\d{3})d', re.IGNORECASE)             # "320d" anywhere in the text
_LEADING_BADGE_RE = re.compile(r'^\d{3}d', re.IGNORECASE)      # "320d" only as the first token
_ENGINE_RE = re.compile(r'(\d\.\d)(?:T\b)?')                   # "2.0" or "3.0T" (but not the T of "2.0TDI")
_DOORS_RE = re.compile(r'(\d)\s*[- ]?(?:dr|doors?)\b', re.IGNORECASE)  # "5dr", "5-door", "5 Doors"
_EMISSION_RE = re.compile(r'Euro\s*\d+(\s*\(s/s\))?', re.IGNORECASE)   # "Euro 6", "Euro 6 (s/s)"


def parse_attributes(attr):
    """Split one raw attribute string into structured fields.

    Parameters
    ----------
    attr : str or missing value
        Free-text attribute description, e.g. ``"(24) - 320d M Sport 4dr Euro 6"``.
        Non-string, non-missing values are converted with ``str()``.

    Returns
    -------
    pd.Series
        Four positional values ``[engine, doors, emission, model_full]``:

        * engine     - float litres, or None when nothing was recognised
        * doors      - int door count, or None
        * emission   - matched emission text (e.g. ``"Euro 6 (s/s)"``), or None
        * model_full - the text left after the recognised tokens are removed;
          None when ``attr`` is missing
    """
    if pd.isna(attr):
        return pd.Series([None, None, None, None])

    # Drop a leading "(24) - " style counter.
    text = _LISTING_PREFIX_RE.sub('', str(attr)).strip()

    # Engine size: an explicit litre figure ("2.0", "3.0T") takes precedence
    # over a figure derived from a three-digit diesel badge.
    engine = None
    litres_match = _ENGINE_RE.search(text)
    if litres_match:
        engine = float(litres_match.group(1))
    else:
        badge_match = _BADGE_RE.search(text)
        if badge_match:
            # 320d = series 3, 2.0 diesel: the LAST two digits are the
            # displacement in tenths of a litre (320 -> 20 -> 2.0).
            tenths = int(badge_match.group(1)) % 100
            if tenths:  # "300d" would give 0.0 l, which is never a real engine
                engine = tenths / 10.0

    # Doors
    doors = None
    doors_match = _DOORS_RE.search(text)
    if doors_match:
        doors = int(doors_match.group(1))

    # Emission standard
    emission = None
    emission_match = _EMISSION_RE.search(text)
    if emission_match:
        emission = emission_match.group(0)

    # Residual text = ModelFull: remove every recognised token, then tidy gaps.
    model_full = _LEADING_BADGE_RE.sub('', text)
    model_full = _ENGINE_RE.sub('', model_full)
    model_full = _DOORS_RE.sub('', model_full)
    model_full = _EMISSION_RE.sub('', model_full)
    model_full = re.sub(r'\s+', ' ', model_full).strip()

    return pd.Series([engine, doors, emission, model_full])


# Parse every row's attribute text; each call yields a 4-value Series, so the
# result is a 4-column frame that is assigned positionally to the new columns.
# NOTE(review): "Attrebutes" is presumably the spelling used in the CSV header - confirm.
parsed_attributes = df["Attrebutes"].apply(parse_attributes)
df[["EngineSize", "Doors", "Emission", "ModelFull"]] = parsed_attributes

# Cast the door counts to the "Int64" dtype.
door_counts = df["Doors"]
df["Doors"] = door_counts.astype("Int64")

# The raw text column is removed in place; re-running this cell afterwards
# raises KeyError because "Attrebutes" no longer exists on df.
df.drop(labels=["Attrebutes"], axis="columns", inplace=True)

# Write the cleaned table without the index column.
df.to_csv(path_or_buf="cazooCarsDataset_clean.csv", index=False)