# Auto Scout Project 
**Magnimind** 

**Author:** Mark    
**Date:** February 2026  
**GitHub:** [link](https://github.com/mjoslin-ai/Magnimind-Project-Auto-Scout)

In [713]:
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Project Phase:
### 1. Data Cleaning:
This phase involves removing broken, irrelevant, or redundant columns, and generating new
columns with meaningful values.

#### Data exploration

In [714]:
df = pd.read_json("scout_car.json", lines=True)
print(df.shape)
print(df.head())
print(df.info())

(15919, 54)
                                                                                                                                                url  \
0         https://www.autoscout24.com//offers/audi-a1-sportback-1-4-tdi-s-tronic-xenon-navi-klima-diesel-black-bdab349a-caa5-41b0-98eb-c1345b84445e   
1                                      https://www.autoscout24.com//offers/audi-a1-1-8-tfsi-sport-gasoline-red-b2547f8a-e83f-6237-e053-e250040a56df   
2  https://www.autoscout24.com//offers/audi-a1-sportback-1-6-tdi-s-tronic-einparkhilfe-plus-music-diesel-black-6183cb6a-8570-4b86-a132-9b54214bca88   
3                             https://www.autoscout24.com//offers/audi-a1-1-4-tdi-design-s-tronic-diesel-brown-35c86585-2727-d61e-e053-e250040ad640   
4    https://www.autoscout24.com//offers/audi-a1-sportback-1-4-tdi-s-tronic-s-line-ext-admired-xe-diesel-black-b86b55d8-9c7c-4a58-b658-b97008a51efc   

  make_model                                   short_description body_type  \
0  

#### Helper functions

In [715]:
# remove \n
def remove_na(x):
    if isinstance(x, list):
        cleaned = [str(item).strip().replace('\n', '').replace('\\n', '') 
                   for item in x if str(item).strip() and str(item).strip() != '']
        if len(cleaned) == 0:
            return np.nan
        if len(cleaned) == 1:
            return cleaned[0]
        return cleaned
    elif isinstance(x, str):
        return x.strip().replace('\n', '').replace('\\n', '')
    return x

# extract pure numeric values
def extract_number(x):
    if isinstance(x, list):
        # Grab the first element of the list
        x = x[0] if x else pd.NA
    if pd.isna(x):
        return pd.NA
    # Convert to string 
    s = str(x).strip()
    # Find numbers
    m = re.search(r'-?(?:\d{1,5}(?:,\d{3})*|\d+)(?:\.\d+)?', s)
    if not m:
        return pd.NA
    # Remove commas 
    cleaned = m.group().replace(',', '')
    try:
        return cleaned
    except ValueError:
        return pd.NA

# split str in a list (e.g., type column into condition and fuel)
def extract_first(x):
    if isinstance(x, list) and len(x) >= 2:
        return x[0] # first item
    return x if not pd.isna(x) else pd.NA

def extract_last(x):
    if isinstance(x, list) and len(x) >= 2:
        return x[-1] # last item
    return pd.NA

# extract warranty duration in months
def extract_months(value):
    # Handle missing values safely (scalar or array)
    try:
        if pd.isna(value):
            return pd.NA
    except (TypeError, ValueError):
        # pd.isna failed is probably array-like, treat as invalid
        pass

    # If it's array-like (list, tuple, Series, ndarray, etc.) take first element
    if hasattr(value, '__len__') and not isinstance(value, str):
        if len(value) == 0:
            return pd.NA
        # Recurse on first element (handles nested weirdness)
        return extract_months(value[0])

    # convert to string
    try:
        s = str(value).strip()
    except Exception:
        return pd.NA

    # Pattern 1: "12 months"
    match = re.search(r'(\d+)\s*(?:months?)\b', s, re.IGNORECASE)
    if match:
        return int(match.group(1))

    # Pattern 2: inside brackets "[12 months, Euro 6]"
    match_list = re.search(r'\[.*?(\d+)\s*(?:months?).*?\]', s, re.IGNORECASE)
    if match_list:
        return int(match_list.group(1))

    return pd.NA

# split consumption
def split_list(input_data):
    if isinstance(input_data, list):
        if all(isinstance(i, list) for i in input_data):
            # for list of lists
            return pd.Series(input_data)  
        else:
            # for flat list 
            return pd.Series(input_data)
    return pd.Series([None] * 3)  # Return NaN if input is not a list


#### Remove \n

In [716]:
df = (df
      .apply(lambda col: col.apply(remove_na)) # in col values
      .rename(columns=remove_na) # in col names
)

#### Standardize column names

In [717]:
print(df.columns)

Index(['url', 'make_model', 'short_description', 'body_type', 'price', 'vat',
       'km', 'registration', 'prev_owner', 'kW', 'hp', 'Type',
       'Previous Owners', 'Next Inspection', 'Inspection new', 'Warranty',
       'Full Service', 'Non-smoking Vehicle', 'null', 'Make', 'Model',
       'Offer Number', 'First Registration', 'Body Color', 'Paint Type',
       'Body Color Original', 'Upholstery', 'Body', 'Nr. of Doors',
       'Nr. of Seats', 'Model Code', 'Gearing Type', 'Displacement',
       'Cylinders', 'Weight', 'Drive chain', 'Fuel', 'Consumption',
       'CO2 Emission', 'Emission Class', 'Comfort & Convenience',
       'Entertainment & Media', 'Extras', 'Safety & Security', 'description',
       'Emission Label', 'Gears', 'Country version', 'Electricity consumption',
       'Last Service Date', 'Other Fuel Types', 'Availability',
       'Last Timing Belt Service Date', 'Available from'],
      dtype='str')


In [718]:
df.columns = df.columns.str.lower().str.replace(r'[ &]', '_', regex=True).str.replace('___', '_')

print(df.columns)

Index(['url', 'make_model', 'short_description', 'body_type', 'price', 'vat',
       'km', 'registration', 'prev_owner', 'kw', 'hp', 'type',
       'previous_owners', 'next_inspection', 'inspection_new', 'warranty',
       'full_service', 'non-smoking_vehicle', 'null', 'make', 'model',
       'offer_number', 'first_registration', 'body_color', 'paint_type',
       'body_color_original', 'upholstery', 'body', 'nr._of_doors',
       'nr._of_seats', 'model_code', 'gearing_type', 'displacement',
       'cylinders', 'weight', 'drive_chain', 'fuel', 'consumption',
       'co2_emission', 'emission_class', 'comfort_convenience',
       'entertainment_media', 'extras', 'safety_security', 'description',
       'emission_label', 'gears', 'country_version', 'electricity_consumption',
       'last_service_date', 'other_fuel_types', 'availability',
       'last_timing_belt_service_date', 'available_from'],
      dtype='str')


#### Remove irrelevant columns

In [719]:
cols_to_drop = ['url', # metadata
                'short_description', # general column
                'description', # general column
                'kw', # 0 non-null values and redundant with hp
                'null', # empty list (df['null'].apply(lambda x: x == []).all() is true)
                'offer_number', # manufacturer/internal reference code
                'model_code', # manufacturer/internal reference code
                'make_model', # redundant with make and model columns
                'body_color_original', # redundant with body_color and paint_type columns
                'full_service' # redundant with co2_emission and emission_class
                ]
df.drop(columns=cols_to_drop, inplace=True)
print(df.shape)

(15919, 44)


#### Remove columns with too many nulls (> 70%)

In [720]:
threshold = 70
missing_percentage = df.isnull().mean() * 100
cols_to_drop = missing_percentage[missing_percentage > threshold].index
print(f"Columns missing values > {threshold}%:\n", cols_to_drop)
df.drop(columns=cols_to_drop, inplace=True)
print(df.shape)

Columns missing values > 70%:
 Index(['next_inspection', 'inspection_new', 'non-smoking_vehicle',
       'emission_label', 'electricity_consumption', 'last_service_date',
       'other_fuel_types', 'availability', 'last_timing_belt_service_date',
       'available_from'],
      dtype='str')
(15919, 34)


#### Clean quantitative features (integer or float)

##### **price**

In [721]:
print(df['price'])

0        15770
1        14500
2        14640
3        14500
4        16790
         ...  
15914    39950
15915    39885
15916    39875
15917    39700
15918    40999
Name: price, Length: 15919, dtype: int64


In [722]:
df['price'] = df['price'].apply(extract_number).astype('float64')

print(df['price'])

0        15770.0
1        14500.0
2        14640.0
3        14500.0
4        16790.0
          ...   
15914    39950.0
15915    39885.0
15916    39875.0
15917    39700.0
15918    40999.0
Name: price, Length: 15919, dtype: float64


##### **km**

In [723]:
print(df['km'])

0        56,013 km
1        80,000 km
2        83,450 km
3        73,000 km
4        16,200 km
           ...    
15914         - km
15915     9,900 km
15916        15 km
15917        10 km
15918         - km
Name: km, Length: 15919, dtype: str


In [724]:
df['km'] = pd.to_numeric(df['km'].apply(extract_number), errors='coerce').astype('float64')

print(df['km'])

0        56013.0
1        80000.0
2        83450.0
3        73000.0
4        16200.0
          ...   
15914        NaN
15915     9900.0
15916       15.0
15917       10.0
15918        NaN
Name: km, Length: 15919, dtype: float64


##### **hp** (kW)

In [725]:
print(df['hp'])

0         66 kW
1        141 kW
2         85 kW
3         66 kW
4         66 kW
          ...  
15914    147 kW
15915    165 kW
15916    146 kW
15917    147 kW
15918    165 kW
Name: hp, Length: 15919, dtype: str


In [726]:
df['hp_kw'] = df['hp'].apply(extract_number).astype('float64')
df.drop(columns=['hp'], inplace=True)

print(df['hp_kw'])

0         66.0
1        141.0
2         85.0
3         66.0
4         66.0
         ...  
15914    147.0
15915    165.0
15916    146.0
15917    147.0
15918    165.0
Name: hp_kw, Length: 15919, dtype: float64


##### **displacement** (cc)

In [727]:
print(df['displacement'])

0        1,422 cc
1        1,798 cc
2        1,598 cc
3        1,422 cc
4        1,422 cc
           ...   
15914    1,997 cc
15915    1,798 cc
15916    1,997 cc
15917    1,997 cc
15918    1,798 cc
Name: displacement, Length: 15919, dtype: str


In [728]:
df['displacement_cc'] = df['displacement'].apply(extract_number).astype('Int64')
df.drop(columns=['displacement'], inplace=True)

print(df['displacement_cc'])

0        1422
1        1798
2        1598
3        1422
4        1422
         ... 
15914    1997
15915    1798
15916    1997
15917    1997
15918    1798
Name: displacement_cc, Length: 15919, dtype: Int64


##### **warranty** (months)

In [729]:
print(df['warranty'].head(20))

0                           4 (Green)
1                                 NaN
2                  99 g CO2/km (comb)
3                                 NaN
4                              Euro 6
5                                 NaN
6                                 NaN
7                              Euro 6
8                 [12 months, Euro 6]
9                            3 months
10                                NaN
11                             Euro 6
12                             Euro 6
13                          12 months
14                                NaN
15                [12 months, Euro 6]
16                                NaN
17                                NaN
18    [6 months, 103 g CO2/km (comb)]
19                                   
Name: warranty, dtype: object


In [730]:
# extract warranty duration in months
df['warranty_months'] = df['warranty'].apply(extract_months).astype('Int64')
df.drop(columns=['warranty'], inplace=True)

# check missing values percentage introduced with creation of warranty_months
print("missing values percentage:", df['warranty_months'].isnull().mean() * 100, "%")

print(df['warranty_months'].head(20))

missing values percentage: 69.51441673471952 %
0     <NA>
1     <NA>
2     <NA>
3     <NA>
4     <NA>
5     <NA>
6     <NA>
7     <NA>
8       12
9        3
10    <NA>
11    <NA>
12    <NA>
13      12
14    <NA>
15      12
16    <NA>
17    <NA>
18       6
19    <NA>
Name: warranty_months, dtype: Int64


##### **weight** (kg)

In [731]:
print(df['weight'])

0        1,220 kg
1        1,255 kg
2             NaN
3        1,195 kg
4             NaN
           ...   
15914    1,758 kg
15915    1,708 kg
15916         NaN
15917    1,758 kg
15918    1,685 kg
Name: weight, Length: 15919, dtype: str


In [732]:
df['weight_kg'] = df['weight'].apply(extract_number).astype('float64')
df.drop(columns=['weight'], inplace=True)

print(df['weight_kg'])

0        1220.0
1        1255.0
2           NaN
3        1195.0
4           NaN
          ...  
15914    1758.0
15915    1708.0
15916       NaN
15917    1758.0
15918    1685.0
Name: weight_kg, Length: 15919, dtype: float64


##### **nr_of_doors** 

In [733]:
print(df['nr._of_doors'])

0        5
1        3
2        4
3        3
4        5
        ..
15914    5
15915    5
15916    5
15917    5
15918    5
Name: nr._of_doors, Length: 15919, dtype: str


In [734]:
df['nr._of_doors'] = df['nr._of_doors'].apply(extract_number).astype('Int64')

print(df['nr._of_doors'])

0        5
1        3
2        4
3        3
4        5
        ..
15914    5
15915    5
15916    5
15917    5
15918    5
Name: nr._of_doors, Length: 15919, dtype: Int64


##### **nr_of_seats** 

In [735]:
print(df['nr._of_seats'])

0        5
1        4
2        4
3        4
4        5
        ..
15914    5
15915    5
15916    7
15917    7
15918    5
Name: nr._of_seats, Length: 15919, dtype: str


In [736]:
df['nr._of_seats'] = df['nr._of_seats'].apply(extract_number).astype('Int64')

print(df['nr._of_seats'])

0        5
1        4
2        4
3        4
4        5
        ..
15914    5
15915    5
15916    7
15917    7
15918    5
Name: nr._of_seats, Length: 15919, dtype: Int64


##### **cylinders** 

In [737]:
print(df['cylinders'])

0          3
1          4
2        NaN
3          3
4          3
        ... 
15914      4
15915      4
15916      4
15917      4
15918      4
Name: cylinders, Length: 15919, dtype: str


In [738]:
df['cylinders'] = df['cylinders'].apply(extract_number).astype('Int64')

print(df['cylinders'])

0           3
1           4
2        <NA>
3           3
4           3
         ... 
15914       4
15915       4
15916       4
15917       4
15918       4
Name: cylinders, Length: 15919, dtype: Int64


##### **gears** 

In [739]:
print(df['gears'])

0        NaN
1          7
2        NaN
3          6
4        NaN
        ... 
15914      6
15915      7
15916      6
15917      6
15918    NaN
Name: gears, Length: 15919, dtype: str


In [740]:
df['gears'] = df['gears'].apply(extract_number).astype('Int64')

print(df['gears'])

0        <NA>
1           7
2        <NA>
3           6
4        <NA>
         ... 
15914       6
15915       7
15916       6
15917       6
15918    <NA>
Name: gears, Length: 15919, dtype: Int64


##### **registration**

In [741]:
print(df[['registration', 'first_registration']])

      registration first_registration
0          01/2016               2016
1          03/2017               2017
2          02/2016               2016
3          08/2016               2016
4          05/2016               2016
...            ...                ...
15914          -/-                NaN
15915      01/2019               2019
15916      03/2019               2019
15917      06/2019               2019
15918      01/2019               2019

[15919 rows x 2 columns]


In [742]:
# split registration into month and year
split = df['registration'].astype(str).str.split('/', expand=True)

df['registration_month'] = pd.to_numeric(split[0].apply(extract_number), errors='coerce').astype('Int64')

df['registration_year'] = pd.to_numeric(split[1].apply(extract_number), errors='coerce').astype('Int64')

df.drop(columns=['registration'], inplace=True)

# check if first_registration adds any value to registration_year
df['registration_year'] = df['registration_year'].fillna(df['first_registration'])

df.drop(columns=['first_registration'], inplace=True)

print(df[['registration_month', 'registration_year']])

       registration_month  registration_year
0                       1               2016
1                       3               2017
2                       2               2016
3                       8               2016
4                       5               2016
...                   ...                ...
15914                <NA>               <NA>
15915                   1               2019
15916                   3               2019
15917                   6               2019
15918                   1               2019

[15919 rows x 2 columns]


##### previous_owners

In [743]:
print(df[['previous_owners', 'prev_owner']])

                                                             previous_owners  \
0                                                                          2   
1                                                                        NaN   
2                                                                          1   
3                                                                          1   
4                                                                          1   
...                                                                      ...   
15914                                                                    NaN   
15915  [1, 7.4 l/100 km (comb), 9.2 l/100 km (city), 6.3 l/100 km (country)]   
15916                                               [1, 139 g CO2/km (comb)]   
15917                                                                    NaN   
15918                                                                      1   

              prev_owner  
0      2 pre

In [744]:
df['previous_owners'] = df['previous_owners'].apply(extract_number).astype('Int64')

df['prev_owner'] = df['prev_owner'].apply(extract_number).astype('Int64')

df['previous_owners'] = df['previous_owners'].fillna(df['prev_owner'])

df.drop(columns=['prev_owner'], inplace=True)

print(df[['previous_owners']])

       previous_owners
0                    2
1                 <NA>
2                    1
3                    1
4                    1
...                ...
15914             <NA>
15915                1
15916                1
15917             <NA>
15918                1

[15919 rows x 1 columns]


##### **consumption** (combination, city, country)

In [745]:
print(df['consumption'])

0        [['3.8 l/100 km (comb)'], ['4.3 l/100 km (city)'], ['3.5 l/100 km (country)']]
1        [['5.6 l/100 km (comb)'], ['7.1 l/100 km (city)'], ['4.7 l/100 km (country)']]
2        [['3.8 l/100 km (comb)'], ['4.4 l/100 km (city)'], ['3.4 l/100 km (country)']]
3        [['3.8 l/100 km (comb)'], ['4.3 l/100 km (city)'], ['3.5 l/100 km (country)']]
4        [['4.1 l/100 km (comb)'], ['4.6 l/100 km (city)'], ['3.8 l/100 km (country)']]
                                              ...                                      
15914    [['5.3 l/100 km (comb)'], ['6.2 l/100 km (city)'], ['4.7 l/100 km (country)']]
15915                [7.4 l/100 km (comb), 9.2 l/100 km (city), 6.3 l/100 km (country)]
15916    [['5.3 l/100 km (comb)'], ['6.2 l/100 km (city)'], ['4.7 l/100 km (country)']]
15917    [['5.3 l/100 km (comb)'], ['6.2 l/100 km (city)'], ['4.7 l/100 km (country)']]
15918    [['6.8 l/100 km (comb)'], ['8.7 l/100 km (city)'], ['5.7 l/100 km (country)']]
Name: consumption, Length: 15919

In [746]:
# split consumption into three columns
df[['consumption_comb_1/100km', 'consumption_city_1/100km', 'consumption_country_1/100km']] = df['consumption'].apply(split_list)
df.drop(columns=['consumption'], inplace=True)

print(df[['consumption_comb_1/100km', 'consumption_city_1/100km', 'consumption_country_1/100km']])

      consumption_comb_1/100km consumption_city_1/100km  \
0      ['3.8 l/100 km (comb)']  ['4.3 l/100 km (city)']   
1      ['5.6 l/100 km (comb)']  ['7.1 l/100 km (city)']   
2      ['3.8 l/100 km (comb)']  ['4.4 l/100 km (city)']   
3      ['3.8 l/100 km (comb)']  ['4.3 l/100 km (city)']   
4      ['4.1 l/100 km (comb)']  ['4.6 l/100 km (city)']   
...                        ...                      ...   
15914  ['5.3 l/100 km (comb)']  ['6.2 l/100 km (city)']   
15915      7.4 l/100 km (comb)      9.2 l/100 km (city)   
15916  ['5.3 l/100 km (comb)']  ['6.2 l/100 km (city)']   
15917  ['5.3 l/100 km (comb)']  ['6.2 l/100 km (city)']   
15918  ['6.8 l/100 km (comb)']  ['8.7 l/100 km (city)']   

      consumption_country_1/100km  
0      ['3.5 l/100 km (country)']  
1      ['4.7 l/100 km (country)']  
2      ['3.4 l/100 km (country)']  
3      ['3.5 l/100 km (country)']  
4      ['3.8 l/100 km (country)']  
...                           ...  
15914  ['4.7 l/100 km (country)']  
159

In [747]:
df['consumption_comb_1/100km'] = df['consumption_comb_1/100km'].apply(extract_number).astype('float64')
df['consumption_city_1/100km'] = df['consumption_city_1/100km'].apply(extract_number).astype('float64')
df['consumption_country_1/100km'] = df['consumption_country_1/100km'].apply(extract_number).astype('float64')

print(df[['consumption_comb_1/100km', 'consumption_city_1/100km', 'consumption_country_1/100km']])

       consumption_comb_1/100km  consumption_city_1/100km  \
0                           3.8                       4.3   
1                           5.6                       7.1   
2                           3.8                       4.4   
3                           3.8                       4.3   
4                           4.1                       4.6   
...                         ...                       ...   
15914                       5.3                       6.2   
15915                       7.4                       9.2   
15916                       5.3                       6.2   
15917                       5.3                       6.2   
15918                       6.8                       8.7   

       consumption_country_1/100km  
0                              3.5  
1                              4.7  
2                              3.4  
3                              3.5  
4                              3.8  
...                            ...  
15914   

##### **co2_emission**

In [748]:
print(df['co2_emission'])

0         99 g CO2/km (comb)
1        129 g CO2/km (comb)
2         99 g CO2/km (comb)
3         99 g CO2/km (comb)
4        109 g CO2/km (comb)
                ...         
15914    139 g CO2/km (comb)
15915    168 g CO2/km (comb)
15916    139 g CO2/km (comb)
15917    139 g CO2/km (comb)
15918    153 g CO2/km (comb)
Name: co2_emission, Length: 15919, dtype: object


In [749]:
df['co2_emission_g_CO2/km_comb'] = df['co2_emission'].apply(extract_number).astype('Int64')
df.drop(columns=['co2_emission'], inplace=True)

print(df['co2_emission_g_CO2/km_comb'])

0         99
1        129
2         99
3         99
4        109
        ... 
15914    139
15915    168
15916    139
15917    139
15918    153
Name: co2_emission_g_CO2/km_comb, Length: 15919, dtype: Int64


#### Clean categorical features (string)

##### **type**

In [750]:
print(df['type'])

0                          [Used, Diesel (Particulate Filter)]
1                                             [Used, Gasoline]
2                          [Used, Diesel (Particulate Filter)]
3                          [Used, Diesel (Particulate Filter)]
4                          [Used, Diesel (Particulate Filter)]
                                 ...                          
15914                       [New, Diesel (Particulate Filter)]
15915    [Used, Super 95 / Super Plus 98 (Particulate Filter)]
15916                                 [Pre-registered, Diesel]
15917                                 [Pre-registered, Diesel]
15918                                [Demonstration, Super 95]
Name: type, Length: 15919, dtype: object


In [751]:
# split into vehicle condition and fuel type
df['condition'] = df['type'].apply(extract_first)
df['fuel_type'] = df['type'].apply(extract_last)
df.drop(columns=['type'], inplace=True)

print(df[['condition', 'fuel_type', 'fuel']])

            condition                                      fuel_type  \
0                Used                    Diesel (Particulate Filter)   
1                Used                                       Gasoline   
2                Used                    Diesel (Particulate Filter)   
3                Used                    Diesel (Particulate Filter)   
4                Used                    Diesel (Particulate Filter)   
...               ...                                            ...   
15914             New                    Diesel (Particulate Filter)   
15915            Used  Super 95 / Super Plus 98 (Particulate Filter)   
15916  Pre-registered                                         Diesel   
15917  Pre-registered                                         Diesel   
15918   Demonstration                                       Super 95   

                                                fuel  
0                        Diesel (Particulate Filter)  
1                        

In [752]:
# merge fuel_type into fuel
df['fuel'] = df['fuel'].fillna(df['fuel_type'])
df.drop(columns=['fuel_type'], inplace=True)

##### **upholstery**

In [753]:
print(df['upholstery'])

0               Cloth, Black
1                Cloth, Grey
2               Cloth, Black
3                        NaN
4               Cloth, Black
                ...         
15914                    NaN
15915                  Cloth
15916    Full leather, Black
15917           Part leather
15918    Full leather, Brown
Name: upholstery, Length: 15919, dtype: str


In [754]:
# split into upholstery type and color
df['upholstery_type'] = df['upholstery'].apply(lambda x: x.split(',')[0] if isinstance(x, str) else pd.NA)
df['upholstery_color'] = df['upholstery'].apply(lambda x: x.split(',')[-1] if isinstance(x, str) and ',' in x else pd.NA)

df.drop(columns=['upholstery'], inplace=True)

print(df[['upholstery_type', 'upholstery_color']])

      upholstery_type upholstery_color
0               Cloth            Black
1               Cloth             Grey
2               Cloth            Black
3                 NaN              NaN
4               Cloth            Black
...               ...              ...
15914             NaN              NaN
15915           Cloth              NaN
15916    Full leather            Black
15917    Part leather              NaN
15918    Full leather            Brown

[15919 rows x 2 columns]


##### **emission_class**

In [755]:
print(df['emission_class'])
#print(df['emission_class'].unique())

0              Euro 6
1              Euro 6
2              Euro 6
3              Euro 6
4              Euro 6
             ...     
15914             NaN
15915             NaN
15916    Euro 6d-TEMP
15917          Euro 6
15918          Euro 6
Name: emission_class, Length: 15919, dtype: object


In [756]:
non_matching_values = df['emission_class'][~df['emission_class'].str.contains("Euro 6", na=False)].dropna()
print(non_matching_values)

55             Euro 5
124      [[], [], []]
142      [[], [], []]
305      [[], [], []]
347      [[], [], []]
             ...     
15545    [[], [], []]
15552    [[], [], []]
15573    [[], [], []]
15738    [[], [], []]
15779          Euro 4
Name: emission_class, Length: 725, dtype: object


In [757]:
# drop any value that can't be converted to a string
df['emission_class'] = df['emission_class'].apply(lambda x: np.nan if isinstance(x, list) else str(x))

print(df['emission_class'])
print(df['emission_class'].unique())

0              Euro 6
1              Euro 6
2              Euro 6
3              Euro 6
4              Euro 6
             ...     
15914             nan
15915             nan
15916    Euro 6d-TEMP
15917          Euro 6
15918          Euro 6
Name: emission_class, Length: 15919, dtype: str
<StringArray>
['Euro 6', 'nan', 'Euro 5', 'Euro 6d-TEMP', nan, 'Euro 6c', 'Euro 4',
 'Euro 6d']
Length: 8, dtype: str


#### Final dataset

In [758]:
print(df.shape)
print(df.head())
print(df.info())

(15919, 36)
  body_type    price               vat       km  previous_owners  make model  \
0    Sedans  15770.0    VAT deductible  56013.0                2  Audi    A1   
1    Sedans  14500.0  Price negotiable  80000.0             <NA>  Audi    A1   
2    Sedans  14640.0    VAT deductible  83450.0                1  Audi    A1   
3    Sedans  14500.0               NaN  73000.0                1  Audi    A1   
4    Sedans  16790.0               NaN  16200.0                1  Audi    A1   

  body_color paint_type    body  nr._of_doors  nr._of_seats gearing_type  \
0      Black   Metallic  Sedans             5             5    Automatic   
1        Red        NaN  Sedans             3             4    Automatic   
2      Black   Metallic  Sedans             4             4    Automatic   
3      Brown   Metallic  Sedans             3             4    Automatic   
4      Black   Metallic  Sedans             5             5    Automatic   

   cylinders drive_chain                         f

In [759]:
df.to_csv('phase1_cleaned_data.csv', index=False)
