# 1. Bibliotheken laden

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Daten laden

In [2]:
# CSV laden und DataFrame 'df' kreieren
df = pd.read_csv('AutoScout24_3172.csv')

# 3. Datensatz bereinigen

In [3]:
# Spalten anzeigen
df.head(0)

Unnamed: 0.1,Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,Consumption,Location


In [4]:
# Datensatz erstmal beschreiben, 'Unnamed: 0' als einzige numerische Werte, alle anderen Spalten sind Strings
df.describe()

Unnamed: 0.1,Unnamed: 0
count,63440.0
mean,31719.5
std,18313.694876
min,0.0
25%,15859.75
50%,31719.5
75%,47579.25
max,63439.0


In [5]:
# 'Unnamed: 0' löschen, inplace=True verändert das 'df' permanent
df.drop("Unnamed: 0", axis=1, inplace=True)

In [6]:
df.head(0)

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,Consumption,Location


In [7]:
# Erste 5 Zeilen anzeigen
df.head()

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,Consumption,Location
0,MERCEDES-BENZ GLK 250 CDI BlueEfficiency 4Mati...,CHF 18'799.–,05.2012,68'200 km,Automat,Diesel,204 PS (150 kW),6.5 l/100 km,8156 Oberhasli
1,AUDI RS e-tron GT quattro,CHF 155'990.–,Neues Fahrzeug,9 km,Automatikgetriebe,Elektro,599 PS (440 kW),469 km,4147 Aesch
2,BMW 120d xDrive Sport Line,CHF 36'900.–,05.2023,28'100 km,Automat,Diesel,190 PS (140 kW),5.2 l/100 km,7000 Chur
3,SEAT Leon ST 2.0 TSI Cupra 290 DSG,CHF 21'900.–,03.2016,86'000 km,Halbautomatisches Getriebe,Benzin,290 PS (213 kW),6.6 l/100 km,5417 Untersiggenthal
4,MERCEDES-BENZ B 250 e AMG Line 8G-DCT,CHF 37'900.–,11.2020,9'500 km,Halbautomatisches Getriebe,Plug-in hybrid Benzin/Elektro,217 PS (160 kW),1.5 l/100 km,6438 Ibach-Schwyz


In [8]:
# Preise säubern
df['Price'] = df['Price'].str.replace('CHF ', '', regex=False).str.replace("'", '', regex=False).str.rstrip('.–')

In [9]:
# Km säubern
df['Km'] = df['Km'].str.replace("'", "").str.replace(" km", "", regex=False)

In [10]:
# Non-numerische Werte in df['Km'] finden
non_numeric_km = df[~df['Km'].str.contains('^[0-9]+$', na=False)]

In [11]:
# Zeile mit non-numerischen Werten in df['Km'] anzeigen
non_numeric_km

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,Consumption,Location
46793,HONDA Ny1 Elegance,31900,Neues Fahrzeug,-,Automatikgetriebe,Elektro,204 PS (150 kW),412 km,1214 Vernier / Genève


In [12]:
# Alle Werte in df['Km'] mit '- ersetzen'
df['Km'] = df['Km'].str.replace("-", "").str.replace(" km", "", regex=False)

In [13]:
# Erste Zeile des DataFrames anzeigen
df.head(1)

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,Consumption,Location
0,MERCEDES-BENZ GLK 250 CDI BlueEfficiency 4Mati...,18799,5.2012,68200,Automat,Diesel,204 PS (150 kW),6.5 l/100 km,8156 Oberhasli


In [14]:
# PS und kW splitten
df[['PS', 'kW']] = df['PS'].str.extract(r'(\d+) PS \((\d+) kW\)')

In [15]:
df.head()

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,Consumption,Location,kW
0,MERCEDES-BENZ GLK 250 CDI BlueEfficiency 4Mati...,18799,05.2012,68200,Automat,Diesel,204,6.5 l/100 km,8156 Oberhasli,150
1,AUDI RS e-tron GT quattro,155990,Neues Fahrzeug,9,Automatikgetriebe,Elektro,599,469 km,4147 Aesch,440
2,BMW 120d xDrive Sport Line,36900,05.2023,28100,Automat,Diesel,190,5.2 l/100 km,7000 Chur,140
3,SEAT Leon ST 2.0 TSI Cupra 290 DSG,21900,03.2016,86000,Halbautomatisches Getriebe,Benzin,290,6.6 l/100 km,5417 Untersiggenthal,213
4,MERCEDES-BENZ B 250 e AMG Line 8G-DCT,37900,11.2020,9500,Halbautomatisches Getriebe,Plug-in hybrid Benzin/Elektro,217,1.5 l/100 km,6438 Ibach-Schwyz,160


In [17]:
# Neues Fahrzeug & 
#df['Date'] = df['Date'].str.replace('Neues Fahrzeug', '03.2024', regex=False) # auf 07.2024 erhöhen
df['Date'] = df['Date'].str.replace('Vorführmodell', '01.2023', regex=False)

# Alle Werte mit Format YYYY mit 01. ergänzen
df['Date'] = df['Date'].apply(lambda x: '01.' + x if len(x) == 4 and x.isdigit() else x)

# df['Date'] in Datentypen 'datetime' umwandeln
df['Date'] = pd.to_datetime(df['Date'], format='%m.%Y')

ValueError: time data 'Neues Fahrzeug' does not match format '%m.%Y' (match)

In [16]:
# Dummy-Variable für Vorführmodelle mit Wert 1 hinzufügen
df['Vorführmodell'] = np.where(df['Date'] == 'Vorführmodell', 1, 0)

In [17]:
# PLZ von Location splitten
df[['PLZ', 'Location']] = df['Location'].str.extract(r'(\d{4})\s(.*)')
df['Location'] = df['Location'].str.replace(" ", "")

In [18]:
# Neue Spaltenordnung definieren
new_order = ['Car', 'Price', 'Date', 'Km', 'Transmission', 'Fuel', 'PS', 'kW', 'Consumption', 'PLZ', 'Location', 'Vorführmodell']
df = df[new_order]

In [19]:
# Alle numerisch relevanten Strings in Floats umwandeln
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
df['PS'] = pd.to_numeric(df['PS'], errors='coerce')
df['kW'] = pd.to_numeric(df['kW'], errors='coerce')
df['PLZ'] = pd.to_numeric(df['PLZ'], errors='coerce')

In [20]:
df.head()

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,kW,Consumption,PLZ,Location,Vorführmodell
0,MERCEDES-BENZ GLK 250 CDI BlueEfficiency 4Mati...,18799,05.2012,68200,Automat,Diesel,204.0,150.0,6.5 l/100 km,8156,Oberhasli,0
1,AUDI RS e-tron GT quattro,155990,Neues Fahrzeug,9,Automatikgetriebe,Elektro,599.0,440.0,469 km,4147,Aesch,0
2,BMW 120d xDrive Sport Line,36900,05.2023,28100,Automat,Diesel,190.0,140.0,5.2 l/100 km,7000,Chur,0
3,SEAT Leon ST 2.0 TSI Cupra 290 DSG,21900,03.2016,86000,Halbautomatisches Getriebe,Benzin,290.0,213.0,6.6 l/100 km,5417,Untersiggenthal,0
4,MERCEDES-BENZ B 250 e AMG Line 8G-DCT,37900,11.2020,9500,Halbautomatisches Getriebe,Plug-in hybrid Benzin/Elektro,217.0,160.0,1.5 l/100 km,6438,Ibach-Schwyz,0


In [21]:

# Assuming df is your DataFrame
# Exclude 'Neues Fahrzeug' rows and find duplicates (keeping the first occurrence)
duplicates = df[(df.duplicated(keep='first')) & (df['Date'] != 'Neues Fahrzeug')]

# To track all duplicates including the first occurrence, we can mark all duplicates and then filter
all_duplicates_mask = df.duplicated(keep=False)  # Marks all duplicates
# Exclude 'Neues Fahrzeug' rows from this mask
all_duplicates_mask &= df['Date'] != 'Neues Fahrzeug'
# Extract all duplicates based on the mask
all_duplicates = df[all_duplicates_mask]

# Now, remove duplicates from the original DataFrame while keeping 'Neues Fahrzeug' rows unchanged
df_neues_fahrzeug = df[df['Date'] == 'Neues Fahrzeug']
df_others = df[df['Date'] != 'Neues Fahrzeug'].drop_duplicates()

# Concatenate the non-duplicate and 'Neues Fahrzeug' rows back together
final_df = pd.concat([df_neues_fahrzeug, df_others], ignore_index=True)

# all_duplicates contains all duplicates except those marked as 'Neues Fahrzeug'


In [22]:
all_duplicates

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,kW,Consumption,PLZ,Location,Vorführmodell
229,SKODA Octavia Combi 2.0 TDI RS 4x4 DSG,36800,05.2022,31200,Halbautomatisches Getriebe,Diesel,200.0,147.0,6.2 l/100 km,9320,Arbon,0
266,JEEP COMPASS 1.3 PHEV Summ AWD,48700,02.2024,100,Automatikgetriebe,-,240.0,177.0,2.1 l/100 km,9300,Wittenbach,0
626,FIAT TIPO 1.5 Hybrid DCT Station Wagon,23889,10.2023,20,Automatikgetriebe,Voll-Hybrid Benzin/Elektro,130.0,96.0,5.4 l/100 km,5610,Wohlen(AG),0
887,VW E-UP ***TOP AUSSTATTUNG***,23900,11.2023,20,Automat,Elektro,82.0,60.0,225 km,8404,Winterthur,0
1059,BMW 530d xDr 48VTour Pure M S,59900,12.2022,23500,Automat,Mild-Hybrid Diesel/Elektro,286.0,210.0,6 l/100 km,8645,Rapperswil-Jona,0
...,...,...,...,...,...,...,...,...,...,...,...,...
62684,FIAT 500 1.0 N3 MildHybrid Swiss Edition,16490,02.2024,25,Schaltgetriebe manuell,Mild-Hybrid Benzin/Elektro,70.0,51.0,5.2 l/100 km,5032,AarauRohr,0
63039,CUPRA Born 58 kWh,36900,01.2023,50,Automatikgetriebe,Elektro,204.0,150.0,427 km,9200,Gossau,0
63055,SKODA Kamiq 1.5 TSI ACT Monte Carlo DSG,32990,04.2023,20,Halbautomatisches Getriebe,Benzin,150.0,110.0,-,6312,Steinhausen,0
63336,FIAT 500X 1.5 Hybrid Clross DCT,21500,05.2023,8,Automatikgetriebe,Mild-Hybrid Benzin/Elektro,130.0,96.0,5.7 l/100 km,8952,Schlieren,0


In [23]:
final_df

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,kW,Consumption,PLZ,Location,Vorführmodell
0,MERCEDES-BENZ GLK 250 CDI BlueEfficiency 4Mati...,18799,05.2012,68200,Automat,Diesel,204.0,150.0,6.5 l/100 km,8156,Oberhasli,0
1,BMW 120d xDrive Sport Line,36900,05.2023,28100,Automat,Diesel,190.0,140.0,5.2 l/100 km,7000,Chur,0
2,SEAT Leon ST 2.0 TSI Cupra 290 DSG,21900,03.2016,86000,Halbautomatisches Getriebe,Benzin,290.0,213.0,6.6 l/100 km,5417,Untersiggenthal,0
3,MERCEDES-BENZ B 250 e AMG Line 8G-DCT,37900,11.2020,9500,Halbautomatisches Getriebe,Plug-in hybrid Benzin/Elektro,217.0,160.0,1.5 l/100 km,6438,Ibach-Schwyz,0
4,FORD S-Max 2.0 TDCi 180 Titanium FPS,19700,03.2018,110700,Halbautomatisches Getriebe,Diesel,180.0,132.0,5.2 l/100 km,9320,Arbon,0
...,...,...,...,...,...,...,...,...,...,...,...,...
63115,FIAT 500 1.0 Hybrid Red,18990,Neues Fahrzeug,1,Schaltgetriebe manuell,Mild-Hybrid Benzin/Elektro,70.0,52.0,6 l/100 km,8274,Tägerwilen,0
63116,MERCEDES-BENZ GLC 200 AMG Line 4MATIC,89900,Neues Fahrzeug,50,Automatikgetriebe,Mild-Hybrid Benzin/Elektro,227.0,167.0,7.6 l/100 km,9500,Wil,0
63117,CUPRA LEON VZ 2.0 TSI DSG,48900,Neues Fahrzeug,15,Automatikgetriebe,Benzin,300.0,221.0,-,9200,Gossau,0
63118,MERCEDES-BENZ GLE 300 d AMG Line 4MATIC,118900,Neues Fahrzeug,50,Automatikgetriebe,Mild-Hybrid Diesel/Elektro,290.0,213.0,7.4 l/100 km,9500,Wil,0


In [24]:
df

Unnamed: 0,Car,Price,Date,Km,Transmission,Fuel,PS,kW,Consumption,PLZ,Location,Vorführmodell
0,MERCEDES-BENZ GLK 250 CDI BlueEfficiency 4Mati...,18799,05.2012,68200,Automat,Diesel,204.0,150.0,6.5 l/100 km,8156,Oberhasli,0
1,AUDI RS e-tron GT quattro,155990,Neues Fahrzeug,9,Automatikgetriebe,Elektro,599.0,440.0,469 km,4147,Aesch,0
2,BMW 120d xDrive Sport Line,36900,05.2023,28100,Automat,Diesel,190.0,140.0,5.2 l/100 km,7000,Chur,0
3,SEAT Leon ST 2.0 TSI Cupra 290 DSG,21900,03.2016,86000,Halbautomatisches Getriebe,Benzin,290.0,213.0,6.6 l/100 km,5417,Untersiggenthal,0
4,MERCEDES-BENZ B 250 e AMG Line 8G-DCT,37900,11.2020,9500,Halbautomatisches Getriebe,Plug-in hybrid Benzin/Elektro,217.0,160.0,1.5 l/100 km,6438,Ibach-Schwyz,0
...,...,...,...,...,...,...,...,...,...,...,...,...
63435,LAND ROVER Range Rover Evoque 2.0 TD4 Pure AT9,22890,09.2018,124000,Automat,Diesel,150.0,110.0,5.1 l/100 km,6370,Stans,0
63436,HYUNDAI Tucson 1.6 CRDi Vertex 4WD,26900,05.2019,67450,Halbautomatisches Getriebe,Diesel,136.0,100.0,6.6 l/100 km,8400,Winterthur,0
63437,CITROEN Grand C4 Picasso 2.0 BlueHDi Shine,13900,08.2016,57000,Schaltgetriebe manuell,Diesel,150.0,110.0,4.1 l/100 km,8618,OetwilamSee,0
63438,AUDI A8 4.2 FSI quattro tiptronic,15900,11.2010,157000,Automat,Benzin,372.0,273.0,9.5 l/100 km,8618,OetwilamSee,0


In [22]:
# csv exportieren
df.to_csv('AutoScout24_3172_cleaned.csv')