# Auto Scout Project 
**Magnimind** 

**Author:** Mark    
**Date:** February 2026  
**GitHub:** [link](https://github.com/mjoslin-ai/Magnimind-Project-Auto-Scout)

In [663]:
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_columns', None)

## Project Phase:
### 1. Data Cleaning:
This phase involves removing broken, irrelevant, or redundant columns, and generating new
columns with meaningful values.

#### Data exploration

In [664]:
df = pd.read_json("scout_car.json", lines=True)
print(df.shape)
print(df.head())
print(df.info())

(15919, 54)
                                                 url make_model  \
0  https://www.autoscout24.com//offers/audi-a1-sp...    Audi A1   
1  https://www.autoscout24.com//offers/audi-a1-1-...    Audi A1   
2  https://www.autoscout24.com//offers/audi-a1-sp...    Audi A1   
3  https://www.autoscout24.com//offers/audi-a1-1-...    Audi A1   
4  https://www.autoscout24.com//offers/audi-a1-sp...    Audi A1   

                                   short_description body_type  price  \
0        Sportback 1.4 TDI S-tronic Xenon Navi Klima    Sedans  15770   
1                                     1.8 TFSI sport    Sedans  14500   
2  Sportback 1.6 TDI S tronic Einparkhilfe plus+m...    Sedans  14640   
3                            1.4 TDi Design S tronic    Sedans  14500   
4  Sportback 1.4 TDI S-Tronic S-Line Ext. admired...    Sedans  16790   

                vat         km registration         prev_owner  kW      hp  \
0    VAT deductible  56,013 km      01/2016  2 previous owners NaN  

#### Helper functions

In [665]:
# remove \n
def remove_na(x):
    if isinstance(x, list):
        cleaned = [str(item).strip().replace('\n', '').replace('\\n', '') 
                   for item in x if str(item).strip() and str(item).strip() != '']
        if len(cleaned) == 0:
            return np.nan
        if len(cleaned) == 1:
            return cleaned[0]
        return cleaned
    elif isinstance(x, str):
        return x.strip().replace('\n', '').replace('\\n', '')
    return x

# extract pure numeric values
def extract_number(x):
    if isinstance(x, list):
        # Grab the first element of the list
        x = x[0] if x else pd.NA
    
    if pd.isna(x):
        return pd.NA
    
    # Convert to string 
    s = str(x).strip()
    
    # Find numbers
    m = re.search(r'-?(?:\d{1,4}(?:,\d{3})*|\d+)(?:\.\d+)?', s)
    if not m:
        return pd.NA
    
    # Remove commas 
    cleaned = m.group().replace(',', '')
    
    try:
        return cleaned
    except ValueError:
        return pd.NA

#### Remove \n

In [666]:
df = (df
      .apply(lambda col: col.apply(remove_na)) # in col values
      .rename(columns=remove_na) # in col names
)

#### Standardize column names

In [667]:
print(df.columns)

Index(['url', 'make_model', 'short_description', 'body_type', 'price', 'vat',
       'km', 'registration', 'prev_owner', 'kW', 'hp', 'Type',
       'Previous Owners', 'Next Inspection', 'Inspection new', 'Warranty',
       'Full Service', 'Non-smoking Vehicle', 'null', 'Make', 'Model',
       'Offer Number', 'First Registration', 'Body Color', 'Paint Type',
       'Body Color Original', 'Upholstery', 'Body', 'Nr. of Doors',
       'Nr. of Seats', 'Model Code', 'Gearing Type', 'Displacement',
       'Cylinders', 'Weight', 'Drive chain', 'Fuel', 'Consumption',
       'CO2 Emission', 'Emission Class', 'Comfort & Convenience',
       'Entertainment & Media', 'Extras', 'Safety & Security', 'description',
       'Emission Label', 'Gears', 'Country version', 'Electricity consumption',
       'Last Service Date', 'Other Fuel Types', 'Availability',
       'Last Timing Belt Service Date', 'Available from'],
      dtype='str')


In [668]:
df.columns = df.columns.str.lower().str.replace(r'[ &]', '_', regex=True).str.replace('___', '_')

print(df.columns)

Index(['url', 'make_model', 'short_description', 'body_type', 'price', 'vat',
       'km', 'registration', 'prev_owner', 'kw', 'hp', 'type',
       'previous_owners', 'next_inspection', 'inspection_new', 'warranty',
       'full_service', 'non-smoking_vehicle', 'null', 'make', 'model',
       'offer_number', 'first_registration', 'body_color', 'paint_type',
       'body_color_original', 'upholstery', 'body', 'nr._of_doors',
       'nr._of_seats', 'model_code', 'gearing_type', 'displacement',
       'cylinders', 'weight', 'drive_chain', 'fuel', 'consumption',
       'co2_emission', 'emission_class', 'comfort_convenience',
       'entertainment_media', 'extras', 'safety_security', 'description',
       'emission_label', 'gears', 'country_version', 'electricity_consumption',
       'last_service_date', 'other_fuel_types', 'availability',
       'last_timing_belt_service_date', 'available_from'],
      dtype='str')


#### Remove irrelevant columns

In [669]:
cols_to_drop = ['url', # metadata
                'kw', # 0 non-null values and redundant with hp
                'null', # empty list (df['null'].apply(lambda x: x == []).all() is true)
                'offer_number', # manufacturer/internal reference code
                'model_code', # manufacturer/internal reference code
                'make_model' # redundant with make and model columns
                ]
df.drop(columns=cols_to_drop, inplace=True)
print(df.shape)

(15919, 48)


#### Remove columns with too many nulls (> 70%)

In [670]:
threshold = 70
missing_percentage = df.isnull().mean() * 100
cols_to_drop = missing_percentage[missing_percentage > threshold].index
print(f"Columns missing values > {threshold}%:\n", cols_to_drop)
df.drop(columns=cols_to_drop, inplace=True)
print(df.shape)

Columns missing values > 70%:
 Index(['next_inspection', 'inspection_new', 'non-smoking_vehicle',
       'emission_label', 'electricity_consumption', 'last_service_date',
       'other_fuel_types', 'availability', 'last_timing_belt_service_date',
       'available_from'],
      dtype='str')
(15919, 38)


#### Clean feature by feature

##### **price**

In [671]:
print(df['price'])

0        15770
1        14500
2        14640
3        14500
4        16790
         ...  
15914    39950
15915    39885
15916    39875
15917    39700
15918    40999
Name: price, Length: 15919, dtype: int64


In [672]:
df['price'] = pd.to_numeric(df['price'].apply(extract_number), errors='coerce').astype('float64')

print(df['price'])

0        1577.0
1        1450.0
2        1464.0
3        1450.0
4        1679.0
          ...  
15914    3995.0
15915    3988.0
15916    3987.0
15917    3970.0
15918    4099.0
Name: price, Length: 15919, dtype: float64


##### **km**

In [673]:
print(df['km'])

0        56,013 km
1        80,000 km
2        83,450 km
3        73,000 km
4        16,200 km
           ...    
15914         - km
15915     9,900 km
15916        15 km
15917        10 km
15918         - km
Name: km, Length: 15919, dtype: str


In [674]:
df['km'] = pd.to_numeric(df['km'].apply(extract_number), errors='coerce').astype('float64')

print(df['km'])

0        56013.0
1        80000.0
2        83450.0
3        73000.0
4        16200.0
          ...   
15914        NaN
15915     9900.0
15916       15.0
15917       10.0
15918        NaN
Name: km, Length: 15919, dtype: float64


##### **registration**

In [675]:
print(df[['registration', 'first_registration']])

      registration first_registration
0          01/2016               2016
1          03/2017               2017
2          02/2016               2016
3          08/2016               2016
4          05/2016               2016
...            ...                ...
15914          -/-                NaN
15915      01/2019               2019
15916      03/2019               2019
15917      06/2019               2019
15918      01/2019               2019

[15919 rows x 2 columns]


In [676]:
# split registration into month and year
split = df['registration'].astype(str).str.split('/', expand=True)

df['registration_month'] = pd.to_numeric(split[0].apply(extract_number), errors='coerce').astype('Int64')

df['registration_year'] = pd.to_numeric(split[1].apply(extract_number), errors='coerce').astype('Int64')

df.drop(columns=['registration'], inplace=True)

# check if first_registration adds any value to registration_year
#df['registration_year'] = df['registration_year'].fillna(df['first_registration'])

df.drop(columns=['first_registration'], inplace=True)

print(df[['registration_month', 'registration_year']])

       registration_month  registration_year
0                       1               2016
1                       3               2017
2                       2               2016
3                       8               2016
4                       5               2016
...                   ...                ...
15914                <NA>               <NA>
15915                   1               2019
15916                   3               2019
15917                   6               2019
15918                   1               2019

[15919 rows x 2 columns]


##### previous_owners

In [677]:
print(df[['previous_owners', 'prev_owner']])

                                         previous_owners         prev_owner
0                                                      2  2 previous owners
1                                                    NaN                NaN
2                                                      1   1 previous owner
3                                                      1   1 previous owner
4                                                      1   1 previous owner
...                                                  ...                ...
15914                                                NaN                NaN
15915  [1, 7.4 l/100 km (comb), 9.2 l/100 km (city), ...   1 previous owner
15916                           [1, 139 g CO2/km (comb)]   1 previous owner
15917                                                NaN                NaN
15918                                                  1   1 previous owner

[15919 rows x 2 columns]


In [678]:
df['previous_owners'] = pd.to_numeric(df['previous_owners'].apply(extract_number), errors='coerce').astype('Int64')

df['prev_owner'] = pd.to_numeric(df['prev_owner'].apply(extract_number), errors='coerce').astype('Int64')

df['previous_owners'] = df['previous_owners'].fillna(df['prev_owner'])

df.drop(columns=['prev_owner'], inplace=True)

print(df[['previous_owners']])

       previous_owners
0                    2
1                 <NA>
2                    1
3                    1
4                    1
...                ...
15914             <NA>
15915                1
15916                1
15917             <NA>
15918                1

[15919 rows x 1 columns]


##### **hp**

In [679]:
print(df['hp'])

0         66 kW
1        141 kW
2         85 kW
3         66 kW
4         66 kW
          ...  
15914    147 kW
15915    165 kW
15916    146 kW
15917    147 kW
15918    165 kW
Name: hp, Length: 15919, dtype: str


In [680]:
df['hp_kW'] = df['hp'].apply(extract_number).astype('float64')
df.drop(columns=['hp'], inplace=True)

print(df['hp_kW'])

0         66.0
1        141.0
2         85.0
3         66.0
4         66.0
         ...  
15914    147.0
15915    165.0
15916    146.0
15917    147.0
15918    165.0
Name: hp_kW, Length: 15919, dtype: float64


##### **type**

In [682]:
print(df['type'])

0                      [Used, Diesel (Particulate Filter)]
1                                         [Used, Gasoline]
2                      [Used, Diesel (Particulate Filter)]
3                      [Used, Diesel (Particulate Filter)]
4                      [Used, Diesel (Particulate Filter)]
                               ...                        
15914                   [New, Diesel (Particulate Filter)]
15915    [Used, Super 95 / Super Plus 98 (Particulate F...
15916                             [Pre-registered, Diesel]
15917                             [Pre-registered, Diesel]
15918                            [Demonstration, Super 95]
Name: type, Length: 15919, dtype: object


In [None]:
# split into vehicle condition and fuel type

#### Final dataset

In [681]:
print(df.shape)
print(df.head())
print(df.info())

(15919, 37)
                                   short_description body_type   price  \
0        Sportback 1.4 TDI S-tronic Xenon Navi Klima    Sedans  1577.0   
1                                     1.8 TFSI sport    Sedans  1450.0   
2  Sportback 1.6 TDI S tronic Einparkhilfe plus+m...    Sedans  1464.0   
3                            1.4 TDi Design S tronic    Sedans  1450.0   
4  Sportback 1.4 TDI S-Tronic S-Line Ext. admired...    Sedans  1679.0   

                vat       km                                 type  \
0    VAT deductible  56013.0  [Used, Diesel (Particulate Filter)]   
1  Price negotiable  80000.0                     [Used, Gasoline]   
2    VAT deductible  83450.0  [Used, Diesel (Particulate Filter)]   
3               NaN  73000.0  [Used, Diesel (Particulate Filter)]   
4               NaN  16200.0  [Used, Diesel (Particulate Filter)]   

   previous_owners            warranty        full_service  make model  \
0                2           4 (Green)                