# 1. Bibliotheken laden

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Daten laden

In [2]:
# Read the raw AutoScout24 export into the working DataFrame `df`
source_csv = 'AutoScout24_3172.csv'
df = pd.read_csv(source_csv)

# 3. Datensatz bereinigen

In [3]:
# Show the column names only: head(0) returns a frame with zero rows but all columns
df.head(0)

Unnamed: 0.1,Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,Consumption,Location


In [4]:
# First summary of the dataset. In the captured output, 'Unnamed: 0' is the only column
# describe() reports, i.e. the only numeric one; per the original note all other columns are strings.
df.describe()

Unnamed: 0.1,Unnamed: 0
count,63440.0
mean,31719.5
std,18313.694876
min,0.0
25%,15859.75
50%,31719.5
75%,47579.25
max,63439.0


In [5]:
# Remove the 'Unnamed: 0' column; rebinding `df` makes the change permanent for all later cells
df = df.drop(columns=["Unnamed: 0"])

In [6]:
# Show the column names again to confirm that 'Unnamed: 0' is gone
df.head(0)

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,Consumption,Location


In [7]:
# Show the first 5 rows (head() defaults to n=5)
df.head()

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,Consumption,Location
0,MERCEDES-BENZ GLK 250 CDI BlueEfficiency 4Mati...,CHF 18'799.–,05.2012,68'200 km,Automat,Diesel,204 PS (150 kW),6.5 l/100 km,8156 Oberhasli
1,AUDI RS e-tron GT quattro,CHF 155'990.–,Neues Fahrzeug,9 km,Automatikgetriebe,Elektro,599 PS (440 kW),469 km,4147 Aesch
2,BMW 120d xDrive Sport Line,CHF 36'900.–,05.2023,28'100 km,Automat,Diesel,190 PS (140 kW),5.2 l/100 km,7000 Chur
3,SEAT Leon ST 2.0 TSI Cupra 290 DSG,CHF 21'900.–,03.2016,86'000 km,Halbautomatisches Getriebe,Benzin,290 PS (213 kW),6.6 l/100 km,5417 Untersiggenthal
4,MERCEDES-BENZ B 250 e AMG Line 8G-DCT,CHF 37'900.–,11.2020,9'500 km,Halbautomatisches Getriebe,Plug-in hybrid Benzin/Elektro,217 PS (160 kW),1.5 l/100 km,6438 Ibach-Schwyz


In [8]:
# Clean the price strings step by step:
#   1. drop the "CHF " currency prefix
#   2. drop the ' thousands separator
#   3. strip the trailing "." / "–" characters
price_text = df['Price'].str.replace('CHF ', '', regex=False)
price_text = price_text.str.replace("'", '', regex=False)
df['Price'] = price_text.str.rstrip('.–')

In [9]:
# Clean the mileage strings: remove the ' thousands separator, then the " km" unit suffix.
# Both patterns are plain text, so they are matched literally (regex=False).
km_text = df['Km'].str.replace("'", "", regex=False)
df['Km'] = km_text.str.replace(" km", "", regex=False)

In [10]:
# Collect every row whose 'Km' value does not consist of digits only
# (na=False makes missing values count as non-numeric as well)
digits_only = df['Km'].str.contains('^[0-9]+$', na=False)
non_numeric_km = df[~digits_only]

In [11]:
# Display the rows whose 'Km' value is non-numeric
non_numeric_km

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,Consumption,Location
46793,HONDA Ny1 Elegance,31900,Neues Fahrzeug,-,Automatikgetriebe,Elektro,204 PS (150 kW),412 km,1214 Vernier / Genève


In [12]:
# Turn the "-" placeholder in df['Km'] into a real missing value (NaN) instead of an
# empty string, so isna()/to_numeric treat it as missing. Only cells that are exactly
# "-" are affected. The " km" suffix was already removed when 'Km' was cleaned earlier,
# so it is not stripped a second time here.
df['Km'] = df['Km'].mask(df['Km'].eq('-'))

In [13]:
# Show the first row of the DataFrame
df.head(1)

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,Consumption,Location
0,MERCEDES-BENZ GLK 250 CDI BlueEfficiency 4Mati...,18799,5.2012,68200,Automat,Diesel,204 PS (150 kW),6.5 l/100 km,8156 Oberhasli


In [14]:
# Split the combined "<n> PS (<m> kW)" text into its two numbers:
# capture group 0 (horsepower) replaces 'PS', capture group 1 (kilowatts) becomes 'kW'.
# Rows that do not match the pattern get NaN in both columns.
power_parts = df['PS'].str.extract(r'(\d+) PS \((\d+) kW\)')
df['PS'] = power_parts[0]
df['kW'] = power_parts[1]

In [15]:
# Inspect the first 5 rows, now with separate 'PS' and 'kW' columns
df.head()

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,Consumption,Location,kW
0,MERCEDES-BENZ GLK 250 CDI BlueEfficiency 4Mati...,18799,05.2012,68200,Automat,Diesel,204,6.5 l/100 km,8156 Oberhasli,150
1,AUDI RS e-tron GT quattro,155990,Neues Fahrzeug,9,Automatikgetriebe,Elektro,599,469 km,4147 Aesch,440
2,BMW 120d xDrive Sport Line,36900,05.2023,28100,Automat,Diesel,190,5.2 l/100 km,7000 Chur,140
3,SEAT Leon ST 2.0 TSI Cupra 290 DSG,21900,03.2016,86000,Halbautomatisches Getriebe,Benzin,290,6.6 l/100 km,5417 Untersiggenthal,213
4,MERCEDES-BENZ B 250 e AMG Line 8G-DCT,37900,11.2020,9500,Halbautomatisches Getriebe,Plug-in hybrid Benzin/Elektro,217,1.5 l/100 km,6438 Ibach-Schwyz,160


In [16]:
# Neues Fahrzeug & 
df['Date'] = df['Date'].str.replace('Neues Fahrzeug', '03.2024', regex=False)
df['Date'] = df['Date'].str.replace('Vorführmodell', '01.2023', regex=False)

# Alle Werte mit Format YYYY mit 01. ergänzen
df['Date'] = df['Date'].apply(lambda x: '01.' + x if len(x) == 4 and x.isdigit() else x)

# df['Date'] in Datentypen 'datetime' umwandeln
df['Date'] = pd.to_datetime(df['Date'], format='%m.%Y')

In [17]:
# Dummy-Variable für Vorführmodelle mit Wert 1 hinzufügen
df['Vorführmodell'] = np.where(df['Date'] == 'Vorführmodell', 1, 0)

In [18]:
# Split "<4-digit PLZ> <town>" into the postal code ('PLZ') and the town name ('Location').
# Rows that do not match this pattern get NaN in both columns.
plz_and_town = df['Location'].str.extract(r'(\d{4})\s(.*)')
df['PLZ'] = plz_and_town[0]
# Trim surrounding whitespace only: removing every space would glue multi-word town
# names together (e.g. "Vernier / Genève" -> "Vernier/Genève").
df['Location'] = plz_and_town[1].str.strip()

In [19]:
# Define the new column order
new_order = ['Car', 'Price', 'Date', 'Km', 'Transmission', 'Fuel', 'PS', 'kW', 'Consumption', 'PLZ', 'Location', 'Vorführmodell']
# .copy() gives `df` its own data, so the column assignments that follow do not
# trigger pandas' SettingWithCopyWarning on a column-subset selection
df = df[new_order].copy()

In [20]:
# Convert all numerically relevant string columns to numbers; values that cannot be
# parsed become NaN (errors='coerce'). 'Km' is included too: its text was cleaned
# earlier in the notebook, but without this conversion it would remain a string column.
numeric_columns = ['Price', 'Km', 'PS', 'kW', 'PLZ']
# assign() builds a new frame, so nothing is written into a column-subset selection
df = df.assign(**{column: pd.to_numeric(df[column], errors='coerce') for column in numeric_columns})

In [21]:
# Inspect the first 5 rows after reordering the columns and converting the dtypes
df.head()

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,kW,Consumption,PLZ,Location,Vorführmodell
0,MERCEDES-BENZ GLK 250 CDI BlueEfficiency 4Mati...,18799,2012-05-01,68200,Automat,Diesel,204.0,150.0,6.5 l/100 km,8156,Oberhasli,0
1,AUDI RS e-tron GT quattro,155990,2024-03-01,9,Automatikgetriebe,Elektro,599.0,440.0,469 km,4147,Aesch,0
2,BMW 120d xDrive Sport Line,36900,2023-05-01,28100,Automat,Diesel,190.0,140.0,5.2 l/100 km,7000,Chur,0
3,SEAT Leon ST 2.0 TSI Cupra 290 DSG,21900,2016-03-01,86000,Halbautomatisches Getriebe,Benzin,290.0,213.0,6.6 l/100 km,5417,Untersiggenthal,0
4,MERCEDES-BENZ B 250 e AMG Line 8G-DCT,37900,2020-11-01,9500,Halbautomatisches Getriebe,Plug-in hybrid Benzin/Elektro,217.0,160.0,1.5 l/100 km,6438,Ibach-Schwyz,0


In [22]:
# Export the cleaned data to CSV. index=False keeps the row index out of the file;
# otherwise it is written as an unnamed first column that shows up as 'Unnamed: 0'
# when the file is read back with pd.read_csv.
df.to_csv('AutoScout24_3172_cleaned.csv', index=False)