# Auto Scout Project 
**Magnimind** 

**Author:** Mark    
**Date:** February 2026  
**GitHub:** [link](https://github.com/mjoslin-ai/Magnimind-Project-Auto-Scout)

In [303]:
import pandas as pd
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

## Project Phase:
### 1. Data Cleaning:
This phase involves removing broken, irrelevant, or redundant columns, and generating new
columns with meaningful values.

In [304]:
# Load the line-delimited JSON file (lines=True: one JSON record per line) and take a first look.
df = pd.read_json("scout_car.json", lines=True)
print(df.shape)
print(df.head())
# info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
df.info()

(15919, 54)
                                                 url make_model  \
0  https://www.autoscout24.com//offers/audi-a1-sp...    Audi A1   
1  https://www.autoscout24.com//offers/audi-a1-1-...    Audi A1   
2  https://www.autoscout24.com//offers/audi-a1-sp...    Audi A1   
3  https://www.autoscout24.com//offers/audi-a1-1-...    Audi A1   
4  https://www.autoscout24.com//offers/audi-a1-sp...    Audi A1   

                                   short_description body_type  price  \
0        Sportback 1.4 TDI S-tronic Xenon Navi Klima    Sedans  15770   
1                                     1.8 TFSI sport    Sedans  14500   
2  Sportback 1.6 TDI S tronic Einparkhilfe plus+m...    Sedans  14640   
3                            1.4 TDi Design S tronic    Sedans  14500   
4  Sportback 1.4 TDI S-Tronic S-Line Ext. admired...    Sedans  16790   

                vat         km registration         prev_owner  kW      hp  \
0    VAT deductible  56,013 km      01/2016  2 previous owners NaN  

#### Remove columns based on initial scan of data

In [305]:
# Columns judged useless after the initial scan, each with the reason for removal.
cols_to_drop = ['url', # metadata
                'kW', # 0 non-null values and redundant with hp
                'null', # empty list (df['null'].apply(lambda x: x == []).all() is true)
                'Offer Number', # manufacturer/internal reference code
                'Model Code' # manufacturer/internal reference code
                ]
# Reassign instead of mutating in place, and only drop columns that are still present:
# the original in-place drop raised KeyError when this cell was executed a second time.
df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
print(df.shape)

(15919, 49)


#### Remove columns with too many nulls (> 70%)

In [306]:
# Cutoff (in percent): columns with a larger share of missing values are removed.
threshold = 70
# Share of missing entries per column, expressed as a percentage.
missing_percentage = df.isna().mean().mul(100)
cols_to_drop = missing_percentage.loc[missing_percentage.gt(threshold)].index
print(f"Columns missing values > {threshold}%:\n", cols_to_drop)
df.drop(columns=cols_to_drop, inplace=True)
print(df.shape)

Columns missing values > 70%:
 Index(['Next Inspection', 'Inspection new', 'Emission Label',
       'Electricity consumption', 'Last Service Date', 'Other Fuel Types',
       'Availability', 'Last Timing Belt Service Date', 'Available from'],
      dtype='str')
(15919, 40)


#### Extracting Pure Numeric Values

In [307]:
# Re-inspect the frame after the column drops to spot text columns that encode numbers.
print(df.head())
# info() prints its own report and returns None; print(df.info()) added a stray "None" line.
df.info()

  make_model                                  short_description body_type  \
0    Audi A1        Sportback 1.4 TDI S-tronic Xenon Navi Klima    Sedans   
1    Audi A1                                     1.8 TFSI sport    Sedans   
2    Audi A1  Sportback 1.6 TDI S tronic Einparkhilfe plus+m...    Sedans   
3    Audi A1                            1.4 TDi Design S tronic    Sedans   
4    Audi A1  Sportback 1.4 TDI S-Tronic S-Line Ext. admired...    Sedans   

   price               vat         km registration         prev_owner      hp  \
0  15770    VAT deductible  56,013 km      01/2016  2 previous owners   66 kW   
1  14500  Price negotiable  80,000 km      03/2017                NaN  141 kW   
2  14640    VAT deductible  83,450 km      02/2016   1 previous owner   85 kW   
3  14500               NaN  73,000 km      08/2016   1 previous owner   66 kW   
4  16790               NaN  16,200 km      05/2016   1 previous owner   66 kW   

                                      Type Previou

**km:** str to float

In [308]:
# Convert mileage text such as "56,013 km" into a float column.
# The dtype guard keeps this cell re-runnable: once 'km' is numeric, the `.str`
# accessor would raise AttributeError on a second execution.
if not pd.api.types.is_numeric_dtype(df['km']):
    df['km'] = (
        df['km']
        .str.replace(r'[^\d]', '', regex=True)  # keep digits only: "56,013 km" -> "56013"
        .replace('', float('nan'))  # entries without any digit became '' -> mark as missing
        .astype('float')
    )
print(df['km'].head())
print("Number of missing values in km:", df['km'].isnull().sum())

0    56013.0
1    80000.0
2    83450.0
3    73000.0
4    16200.0
Name: km, dtype: float64
Number of missing values in km: 1024


**hp (kW):** str to float. Note: despite its name, the `hp` column holds power in kW (e.g. `66 kW`).

In [309]:
# Convert power text such as "66 kW" into a float column.
# NOTE(review): the column is named 'hp' but the sampled values carry a kW suffix — confirm the unit.
# The dtype guard keeps this cell re-runnable: once 'hp' is numeric, the `.str`
# accessor would raise AttributeError on a second execution.
if not pd.api.types.is_numeric_dtype(df['hp']):
    df['hp'] = (
        df['hp']
        .str.replace(r'[^\d]', '', regex=True)  # keep digits only: "66 kW" -> "66"
        .replace('', float('nan'))  # entries without any digit became '' -> mark as missing
        .astype('float')
    )
print(df['hp'].head())
print("Number of missing values in hp:", df['hp'].isnull().sum())

0     66.0
1    141.0
2     85.0
3     66.0
4     66.0
Name: hp, dtype: float64
Number of missing values in hp: 88


**registration:** split registration into month and year

In [310]:
# Look at the raw registration strings before parsing them.
print(df.loc[:, 'registration'])

0        01/2016
1        03/2017
2        02/2016
3        08/2016
4        05/2016
          ...   
15914        -/-
15915    01/2019
15916    03/2019
15917    06/2019
15918    01/2019
Name: registration, Length: 15919, dtype: str


In [None]:
def _digits_as_nullable_int(text):
    """Strip every non-digit character and cast to nullable Int64.

    Entries left with no digits at all (empty string after stripping) become <NA>.
    """
    digits_only = text.astype(str).str.replace(r'[^\d]', '', regex=True)
    return digits_only.replace('', float('nan')).astype('Int64')


# Split on '/': column 0 holds the part before the slash, column 1 the part after it.
split = df['registration'].astype(str).str.split('/', expand=True)
df['registration month'] = _digits_as_nullable_int(split[0])
df['registration year'] = _digits_as_nullable_int(split[1])
print(df[['registration month', 'registration year']])

       registration month  registration year
0                       1               2016
1                       3               2017
2                       2               2016
3                       8               2016
4                       5               2016
...                   ...                ...
15914                <NA>               <NA>
15915                   1               2019
15916                   3               2019
15917                   6               2019
15918                   1               2019

[15919 rows x 2 columns]
