# Exploring eBay Car Sales Data

Corresponds to a DataQuest guided project.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Directory that holds the practice datasets, located under the user's home directory.
data_path = Path.home().joinpath("datasets", "tabular_practice")

# Read the listings file, decoding it as Latin-1.
autos = pd.read_csv(data_path.joinpath("autos.csv"), encoding="Latin-1")

In [3]:
autos

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,odometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2016-03-27 14:38:19,Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon,privat,Angebot,"$24,900",control,limousine,2011,automatik,239,q5,"100,000km",1,diesel,audi,nein,2016-03-27 00:00:00,0,82131,2016-04-01 13:47:40
49996,2016-03-28 10:50:25,Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+...,privat,Angebot,"$1,980",control,cabrio,1996,manuell,75,astra,"150,000km",5,benzin,opel,nein,2016-03-28 00:00:00,0,44807,2016-04-02 14:18:02
49997,2016-04-02 14:44:48,Fiat_500_C_1.2_Dualogic_Lounge,privat,Angebot,"$13,200",test,cabrio,2014,automatik,69,500,"5,000km",11,benzin,fiat,nein,2016-04-02 00:00:00,0,73430,2016-04-04 11:47:27
49998,2016-03-08 19:25:42,Audi_A3_2.0_TDI_Sportback_Ambition,privat,Angebot,"$22,900",control,kombi,2013,manuell,150,a3,"40,000km",11,diesel,audi,nein,2016-03-08 00:00:00,0,35683,2016-04-05 16:45:07


In [4]:
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dateCrawled          50000 non-null  object
 1   name                 50000 non-null  object
 2   seller               50000 non-null  object
 3   offerType            50000 non-null  object
 4   price                50000 non-null  object
 5   abtest               50000 non-null  object
 6   vehicleType          44905 non-null  object
 7   yearOfRegistration   50000 non-null  int64 
 8   gearbox              47320 non-null  object
 9   powerPS              50000 non-null  int64 
 10  model                47242 non-null  object
 11  odometer             50000 non-null  object
 12  monthOfRegistration  50000 non-null  int64 
 13  fuelType             45518 non-null  object
 14  brand                50000 non-null  object
 15  notRepairedDamage    40171 non-null  object
 16  date

Some first observations:

* Most columns have `object` types (i.e., string)
* Only 5 columns have missing values
* There are a number of date columns

We first convert the column names to more convenient ones.

In [5]:
# Mapping from the camelCase headers in the CSV to snake_case names.
# Headers that are not listed here keep their original name.
convert_names = dict(
    dateCrawled="date_crawled",
    yearOfRegistration="registration_year",
    monthOfRegistration="registration_month",
    notRepairedDamage="unrepaired_damage",
    dateCreated="ad_created",
    offerType="offer_type",
    vehicleType="vehicle_type",
    powerPS="power_ps",
    fuelType="fuel_type",
    nrOfPictures="nr_of_pictures",
    postalCode="postal_code",
    lastSeen="last_seen",
)
new_columns = [
    convert_names[column] if column in convert_names else column
    for column in autos.columns
]
autos.columns = new_columns
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   date_crawled        50000 non-null  object
 1   name                50000 non-null  object
 2   seller              50000 non-null  object
 3   offer_type          50000 non-null  object
 4   price               50000 non-null  object
 5   abtest              50000 non-null  object
 6   vehicle_type        44905 non-null  object
 7   registration_year   50000 non-null  int64 
 8   gearbox             47320 non-null  object
 9   power_ps            50000 non-null  int64 
 10  model               47242 non-null  object
 11  odometer            50000 non-null  object
 12  registration_month  50000 non-null  int64 
 13  fuel_type           45518 non-null  object
 14  brand               50000 non-null  object
 15  unrepaired_damage   40171 non-null  object
 16  ad_created          50

Let us first look at date columns and convert them to `datetime`, so we can work with them.

In [6]:
# Parse each timestamp column to datetime and report the range it covers.
for column in ("date_crawled", "ad_created", "last_seen"):
    parsed = pd.to_datetime(autos[column])
    autos[column] = parsed
    earliest, latest = parsed.min(), parsed.max()
    print(f"{column}: Between {earliest} and {latest}")

date_crawled: Between 2016-03-05 14:06:30 and 2016-04-07 14:36:56
ad_created: Between 2015-06-11 00:00:00 and 2016-04-07 00:00:00
last_seen: Between 2016-03-05 14:45:46 and 2016-04-07 14:58:50


Next, we convert `price` and `odometer` to numerical types.

In [7]:
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date_crawled        50000 non-null  datetime64[ns]
 1   name                50000 non-null  object        
 2   seller              50000 non-null  object        
 3   offer_type          50000 non-null  object        
 4   price               50000 non-null  object        
 5   abtest              50000 non-null  object        
 6   vehicle_type        44905 non-null  object        
 7   registration_year   50000 non-null  int64         
 8   gearbox             47320 non-null  object        
 9   power_ps            50000 non-null  int64         
 10  model               47242 non-null  object        
 11  odometer            50000 non-null  object        
 12  registration_month  50000 non-null  int64         
 13  fuel_type           45518 non-null  object    

In [8]:
# Strip the currency symbol / unit and the thousands separators, then cast to int.
# Raw strings are used for the patterns: "\$" and "\," in a normal string literal are
# invalid escape sequences and trigger a warning on recent Python versions.
autos["price"] = autos["price"].str.replace(r"[$,]", "", regex=True).astype(int)
autos["odometer"] = autos["odometer"].str.replace(r"km|,", "", regex=True).astype(int)

# Carry the unit in the column name now that it is no longer part of the values.
new_names = {"price": "price_dollars", "odometer": "odometer_km"}
autos = autos.rename(columns=new_names)
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date_crawled        50000 non-null  datetime64[ns]
 1   name                50000 non-null  object        
 2   seller              50000 non-null  object        
 3   offer_type          50000 non-null  object        
 4   price_dollars       50000 non-null  int64         
 5   abtest              50000 non-null  object        
 6   vehicle_type        44905 non-null  object        
 7   registration_year   50000 non-null  int64         
 8   gearbox             47320 non-null  object        
 9   power_ps            50000 non-null  int64         
 10  model               47242 non-null  object        
 11  odometer_km         50000 non-null  int64         
 12  registration_month  50000 non-null  int64         
 13  fuel_type           45518 non-null  object    

In [9]:
autos.describe()

Unnamed: 0,date_crawled,price_dollars,registration_year,power_ps,odometer_km,registration_month,ad_created,nr_of_pictures,postal_code,last_seen
count,50000,50000.0,50000.0,50000.0,50000.0,50000.0,50000,50000.0,50000.0,50000
mean,2016-03-21 13:43:25.697280,9840.044,2005.07328,116.35592,125732.7,5.72336,2016-03-20 19:41:34.656000,0.0,50813.6273,2016-03-30 04:04:36.309079808
min,2016-03-05 14:06:30,0.0,1000.0,0.0,5000.0,0.0,2015-06-11 00:00:00,0.0,1067.0,2016-03-05 14:45:46
25%,2016-03-13 14:51:25.750000128,1100.0,1999.0,70.0,125000.0,3.0,2016-03-13 00:00:00,0.0,30451.0,2016-03-23 11:11:33.500000
50%,2016-03-21 17:53:56,2950.0,2003.0,105.0,150000.0,6.0,2016-03-21 00:00:00,0.0,49577.0,2016-04-04 01:17:24
75%,2016-03-29 14:36:47.750000128,7200.0,2008.0,150.0,150000.0,9.0,2016-03-29 00:00:00,0.0,71540.0,2016-04-06 10:45:28.249999872
max,2016-04-07 14:36:56,100000000.0,9999.0,17700.0,150000.0,12.0,2016-04-07 00:00:00,0.0,99998.0,2016-04-07 14:58:50
std,,481104.4,105.712813,209.216627,40042.211706,3.711984,,0.0,25779.747957,


In [10]:
# Registration years sorted ascending (original row labels are kept), so that the
# smallest values can be inspected here and the largest ones with `tail()`.
year_reg = autos["registration_year"].sort_values()
year_reg.head(50)

22316    1000
49283    1001
24511    1111
35238    1500
10556    1800
32585    1800
28693    1910
42181    1910
15898    1910
3679     1910
30781    1910
33295    1910
45157    1910
22659    1910
46213    1910
21416    1927
22101    1929
11246    1931
2573     1934
2221     1934
26607    1937
39725    1937
21421    1937
23804    1937
26103    1938
24855    1939
25792    1941
13963    1941
11585    1943
11047    1948
1171     1950
14020    1950
32091    1950
35921    1951
44406    1951
23372    1952
35453    1953
40765    1954
25556    1954
36794    1955
24515    1955
7294     1956
23483    1956
347      1956
26570    1956
19914    1956
40077    1957
23044    1957
10358    1958
20105    1958
Name: registration_year, dtype: int64

In [11]:
year_reg.tail(20)

5763     2019
49185    2019
27578    2800
4549     4100
453      4500
42079    4800
22799    5000
4164     5000
49153    5000
24519    5000
27618    5911
8360     6200
25003    8888
49910    9000
13559    9000
6308     9996
8012     9999
14341    9999
33950    9999
38076    9999
Name: registration_year, dtype: int64

Values of `registration_year` above 2019 are clearly invalid. These rows should probably be removed.

In [12]:
# Build the mask from the frame's own column: `year_reg` is sorted, so a mask derived
# from it is not aligned with `autos` and pandas has to reindex it (with a warning).
autos[autos["registration_year"] > 2019]

  autos[year_reg > 2019]


Unnamed: 0,date_crawled,name,seller,offer_type,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,nr_of_pictures,postal_code,last_seen
453,2016-03-28 13:51:12,Armee_Jeep,privat,Angebot,9800,test,,4500,manuell,0,andere,5000,0,,jeep,,2016-03-28,0,7545,2016-04-06 17:45:49
4164,2016-03-29 18:39:40,Verkaufe_DESIGN_Streifen_/_Aufkleber_VW__Opel_...,privat,Angebot,49,control,,5000,,0,golf,5000,12,,volkswagen,,2016-03-29,0,74523,2016-04-06 04:16:14
4549,2016-04-01 21:57:05,Kompressor,privat,Angebot,1600,test,,4100,,0,,5000,0,,sonstige_autos,,2016-04-01,0,67686,2016-04-05 20:19:27
6308,2016-03-12 17:38:17,Kaufe_Autos_jeglicher,privat,Angebot,0,test,,9996,,0,,10000,0,,sonstige_autos,,2016-03-12,0,21244,2016-03-12 17:38:17
8012,2016-03-23 16:43:29,Opel_GT_Karosserie_mit_Brief!,privat,Angebot,700,test,,9999,,0,andere,10000,0,,opel,,2016-03-23,0,21769,2016-04-05 20:16:15
8360,2016-03-11 22:56:30,Vito_touret_119_Blue_Tec,privat,Angebot,42800,control,,6200,automatik,0,vito,10000,7,diesel,mercedes_benz,nein,2016-03-11,0,63739,2016-03-19 20:16:56
13559,2016-03-19 15:57:44,Saab_9000_CSE_Automatik_2_3_ltr._mit_EGSD,privat,Angebot,0,control,,9000,automatik,170,9000,150000,2,benzin,saab,nein,2016-03-19,0,32457,2016-03-21 21:18:11
14341,2016-03-23 01:36:20,Hole_kostenlos_ab,privat,Angebot,0,test,,9999,,0,,10000,0,,bmw,,2016-03-23,0,32689,2016-03-23 08:47:00
22799,2016-03-20 18:56:44,Subaru_Impreza_GT,privat,Angebot,9000,test,,5000,manuell,420,impreza,5000,6,benzin,subaru,nein,2016-03-20,0,34253,2016-04-07 02:45:30
24519,2016-03-05 17:53:37,4x_Winterreifen_auf_Alufelge_der_naechste_Wint...,privat,Angebot,250,test,,5000,,0,andere,5000,0,,seat,,2016-03-05,0,49124,2016-04-05 13:46:51


The 6 values below 1910 are also bogus. But there are also a lot of very old cars in there, if these values are correct.

In [13]:
# Build the mask from the frame's own column: `year_reg` is sorted, so a mask derived
# from it is not aligned with `autos` and pandas has to reindex it (with a warning).
autos[autos["registration_year"] < 1934]

  autos[year_reg < 1934]


Unnamed: 0,date_crawled,name,seller,offer_type,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,nr_of_pictures,postal_code,last_seen
3679,2016-04-04 00:36:17,Suche_Auto,privat,Angebot,1,test,,1910,,0,,5000,0,,sonstige_autos,,2016-04-04,0,40239,2016-04-04 07:49:15
10556,2016-04-01 06:02:10,UNFAL_Auto,privat,Angebot,450,control,,1800,,1800,,5000,2,,mitsubishi,nein,2016-04-01,0,63322,2016-04-01 09:42:30
11246,2016-03-26 19:49:59,Ford_Model_A_Roadster_Deluxe_1931,privat,Angebot,27500,control,cabrio,1931,manuell,39,andere,10000,7,benzin,ford,nein,2016-03-26,0,9322,2016-04-06 09:46:59
15898,2016-03-08 10:50:05,Tausch_alles_aus_meinen_Anzeigen_gegen_Auto,privat,Angebot,0,test,,1910,,0,,5000,0,,sonstige_autos,,2016-03-08,0,6108,2016-03-08 17:47:19
21416,2016-03-12 08:36:21,Essex_super_six__Ford_A,privat,Angebot,16500,control,cabrio,1927,manuell,40,andere,5000,5,benzin,ford,,2016-03-12,0,74821,2016-03-15 12:45:12
22101,2016-03-09 16:51:17,BMW_Andere,privat,Angebot,11500,test,cabrio,1929,manuell,15,andere,5000,1,,bmw,ja,2016-03-09,0,70569,2016-04-07 06:17:11
22316,2016-03-29 16:56:41,VW_Kaefer.__Zwei_zum_Preis_von_einem.,privat,Angebot,1500,control,,1000,manuell,0,kaefer,5000,0,benzin,volkswagen,,2016-03-29,0,48324,2016-03-31 10:15:28
22659,2016-03-14 08:51:18,Opel_Corsa_B,privat,Angebot,500,test,,1910,,0,corsa,150000,0,,opel,,2016-03-14,0,52393,2016-04-03 07:53:55
24511,2016-03-17 19:45:11,Trabant__wartburg__Ostalgie,privat,Angebot,490,control,,1111,,0,,5000,0,,trabant,,2016-03-17,0,16818,2016-04-07 07:17:29
28693,2016-03-22 17:48:41,Renault_Twingo,privat,Angebot,599,control,kleinwagen,1910,manuell,0,,5000,0,benzin,renault,,2016-03-22,0,70376,2016-04-06 09:16:59


We will remove rows with `registration_year` earlier than 1911 (1910 looks bogus as well) and later than 2019.

In [14]:
# Keep only rows whose registration year lies in 1911..2019 (both ends inclusive).
first_valid_year, last_valid_year = 1911, 2019
year_is_valid = autos["registration_year"].between(first_valid_year, last_valid_year)
autos = autos[year_is_valid]
autos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49967 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date_crawled        49967 non-null  datetime64[ns]
 1   name                49967 non-null  object        
 2   seller              49967 non-null  object        
 3   offer_type          49967 non-null  object        
 4   price_dollars       49967 non-null  int64         
 5   abtest              49967 non-null  object        
 6   vehicle_type        44903 non-null  object        
 7   registration_year   49967 non-null  int64         
 8   gearbox             47310 non-null  object        
 9   power_ps            49967 non-null  int64         
 10  model               47226 non-null  object        
 11  odometer_km         49967 non-null  int64         
 12  registration_month  49967 non-null  int64         
 13  fuel_type           45508 non-null  object        


In [15]:
autos.describe()

Unnamed: 0,date_crawled,price_dollars,registration_year,power_ps,odometer_km,registration_month,ad_created,nr_of_pictures,postal_code,last_seen
count,49967,49967.0,49967.0,49967.0,49967.0,49967.0,49967,49967.0,49967.0,49967
mean,2016-03-21 13:42:19.851542272,9843.673,2003.383653,116.342586,125803.830528,5.726079,2016-03-20 19:40:23.895771136,0.0,50814.244121,2016-03-30 04:04:02.862388992
min,2016-03-05 14:06:30,0.0,1927.0,0.0,5000.0,0.0,2015-06-11 00:00:00,0.0,1067.0,2016-03-05 14:45:46
25%,2016-03-13 14:51:25.500000,1100.0,1999.0,70.0,125000.0,3.0,2016-03-13 00:00:00,0.0,30450.0,2016-03-23 11:07:43
50%,2016-03-21 17:52:47,2950.0,2003.0,105.0,150000.0,6.0,2016-03-21 00:00:00,0.0,49577.0,2016-04-04 01:16:57
75%,2016-03-29 14:36:22.500000,7200.0,2008.0,150.0,150000.0,9.0,2016-03-29 00:00:00,0.0,71543.0,2016-04-06 10:45:29.500000
max,2016-04-07 14:36:56,100000000.0,2019.0,17700.0,150000.0,12.0,2016-04-07 00:00:00,0.0,99998.0,2016-04-07 14:58:50
std,,481263.2,7.5919,208.992958,39948.178503,3.710795,,0.0,25781.580135,


More observations:

* `power_ps` contains zeros
* `registration_month` contains zeros (valid months are 1 to 12)
* `nr_of_pictures` is constant 0
* `price_dollars` contains zeros

In [16]:
autos["power_ps"].value_counts()

power_ps
0       5475
75      3170
60      2195
150     2046
140     1883
        ... 
460        1
4400       1
242        1
1090       1
650        1
Name: count, Length: 448, dtype: int64

In [17]:
autos.query("power_ps == 0")

Unnamed: 0,date_crawled,name,seller,offer_type,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,nr_of_pictures,postal_code,last_seen
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,1350,test,kombi,2003,manuell,0,focus,150000,7,benzin,ford,nein,2016-04-01,0,39218,2016-04-01 14:38:50
8,2016-03-22 16:51:34,Seat_Arosa,privat,Angebot,250,test,,2000,manuell,0,arosa,150000,10,,seat,nein,2016-03-22,0,7426,2016-03-26 18:18:10
11,2016-03-16 18:45:34,Mercedes_A140_Motorschaden,privat,Angebot,350,control,,2000,,0,,150000,0,benzin,mercedes_benz,,2016-03-16,0,17498,2016-03-16 18:45:34
27,2016-03-27 18:45:01,Hat_einer_Ahnung_mit_Ford_Galaxy_HILFE,privat,Angebot,0,control,,2005,,0,,150000,0,,ford,,2016-03-27,0,66701,2016-03-27 18:45:01
32,2016-03-20 05:03:03,Corsa_mit_TÜV_5.2016,privat,Angebot,350,control,kleinwagen,1999,manuell,0,corsa,150000,7,benzin,opel,,2016-03-20,0,27619,2016-04-06 03:15:20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49984,2016-03-31 22:48:48,Student_sucht_ein__Anfaengerauto___ab_2000_BJ_...,privat,Angebot,0,test,,2000,,0,,150000,0,,sonstige_autos,,2016-03-31,0,12103,2016-04-02 19:44:53
49985,2016-04-02 16:38:23,Verkaufe_meinen_vw_vento!,privat,Angebot,1000,control,,1995,automatik,0,,150000,0,benzin,volkswagen,,2016-04-02,0,30900,2016-04-06 15:17:52
49989,2016-03-11 19:50:37,VW_Polo_zum_Ausschlachten_oder_Wiederaufbau,privat,Angebot,150,test,kleinwagen,1997,manuell,0,polo,150000,5,benzin,volkswagen,ja,2016-03-11,0,21244,2016-03-12 10:17:55
49991,2016-03-06 15:25:19,Kleinwagen,privat,Angebot,500,control,,2016,manuell,0,twingo,150000,0,benzin,renault,,2016-03-06,0,61350,2016-03-06 18:24:19


In [18]:
autos["registration_month"].value_counts()

registration_month
3     5070
0     5052
6     4367
5     4107
4     4102
7     3947
10    3651
12    3446
9     3389
11    3359
1     3281
8     3191
2     3005
Name: count, dtype: int64

In [19]:
autos.query("registration_month == 0 and power_ps == 0")

Unnamed: 0,date_crawled,name,seller,offer_type,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,nr_of_pictures,postal_code,last_seen
11,2016-03-16 18:45:34,Mercedes_A140_Motorschaden,privat,Angebot,350,control,,2000,,0,,150000,0,benzin,mercedes_benz,,2016-03-16,0,17498,2016-03-16 18:45:34
27,2016-03-27 18:45:01,Hat_einer_Ahnung_mit_Ford_Galaxy_HILFE,privat,Angebot,0,control,,2005,,0,,150000,0,,ford,,2016-03-27,0,66701,2016-03-27 18:45:01
46,2016-03-31 10:53:28,BMW_mit__Lpg,privat,Angebot,9000,control,,2005,automatik,0,,150000,0,,bmw,,2016-03-31,0,12351,2016-04-06 03:44:41
52,2016-03-25 18:50:03,Senator_A_3.0E_Karosserie_restauriert_m._viele...,privat,Angebot,3500,test,limousine,1985,,0,andere,5000,0,benzin,opel,nein,2016-03-25,0,63500,2016-04-07 00:46:00
59,2016-03-17 17:50:54,Mercedes_A_Klasse_W_168__A_140_gruen,privat,Angebot,700,control,,2016,manuell,0,a_klasse,150000,0,benzin,mercedes_benz,,2016-03-17,0,95356,2016-03-19 17:46:47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49926,2016-03-13 00:36:56,Vw_golf_cabrio,privat,Angebot,850,control,,2000,,0,,150000,0,,volkswagen,,2016-03-12,0,53227,2016-03-15 17:17:49
49932,2016-03-14 18:52:18,Mercedes_320_cdi_7_Sitzer_Avantgarde,privat,Angebot,1400,test,kombi,1999,automatik,0,,150000,0,,mercedes_benz,,2016-03-14,0,94315,2016-03-14 18:52:18
49984,2016-03-31 22:48:48,Student_sucht_ein__Anfaengerauto___ab_2000_BJ_...,privat,Angebot,0,test,,2000,,0,,150000,0,,sonstige_autos,,2016-03-31,0,12103,2016-04-02 19:44:53
49985,2016-04-02 16:38:23,Verkaufe_meinen_vw_vento!,privat,Angebot,1000,control,,1995,automatik,0,,150000,0,benzin,volkswagen,,2016-04-02,0,30900,2016-04-06 15:17:52


In [20]:
autos.sort_values("price_dollars").head(40)

Unnamed: 0,date_crawled,name,seller,offer_type,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,nr_of_pictures,postal_code,last_seen
36597,2016-03-21 14:48:25,Tausche_polo_86c_gegen_Action_cam,privat,Angebot,0,control,,1990,manuell,45,,125000,5,benzin,volkswagen,,2016-03-21,0,26655,2016-04-06 12:15:28
1975,2016-03-25 14:49:24,Suche_audi_a4_b5!!,privat,Angebot,0,control,limousine,1995,manuell,125,,150000,0,benzin,audi,,2016-03-25,0,30900,2016-04-06 18:17:32
13061,2016-03-05 23:50:59,Seat_Alhambra_1.9_Signo,privat,Angebot,0,test,bus,2001,manuell,116,alhambra,150000,10,diesel,seat,nein,2016-03-05,0,57577,2016-03-07 17:31:08
14904,2016-03-15 22:48:44,Peugeot_406_HDI_2.0__SONDERPREIS!!!!,privat,Angebot,0,control,kombi,2001,manuell,110,4_reihe,150000,12,diesel,peugeot,,2016-03-15,0,27313,2016-03-22 02:44:51
10649,2016-03-17 10:52:53,Verkaufe_oder_Tausche_mein_Bmw,privat,Angebot,0,test,cabrio,1996,,0,3er,150000,0,,bmw,,2016-03-17,0,6193,2016-03-22 18:45:38
1959,2016-03-07 18:43:57,Tausch_oder_Verkauf_!,privat,Angebot,0,control,kleinwagen,2003,manuell,0,corsa,125000,8,benzin,opel,,2016-03-07,0,99439,2016-03-09 07:46:29
24436,2016-03-25 23:53:29,Suche_Golf_2_VR6_Umbau,privat,Angebot,0,control,,1990,,0,golf,5000,0,,volkswagen,,2016-03-25,0,16352,2016-03-27 18:18:32
10646,2016-03-07 17:41:44,Ersatzteilspender_/_Schlachtfahrzeug,privat,Angebot,0,test,limousine,1993,manuell,90,escort,125000,8,benzin,ford,ja,2016-03-07,0,6667,2016-04-06 05:16:48
35968,2016-03-20 07:58:58,Tausche_iPhone6_16_gb_+Geld_gegen_Auto,privat,Angebot,0,control,,2000,,0,,150000,0,,bmw,,2016-03-20,0,53783,2016-03-21 14:30:59
18157,2016-03-07 22:56:13,Mercedes_Benz_Viano,privat,Angebot,0,test,bus,2008,,0,viano,150000,0,diesel,mercedes_benz,,2016-03-07,0,9603,2016-03-09 12:17:01


In [21]:
autos["price_dollars"].value_counts()

price_dollars
0        1413
500       779
1500      733
2500      642
1000      639
         ... 
414         1
79933       1
5198        1
18890       1
16995       1
Name: count, Length: 2355, dtype: int64

We do the following:

* Remove rows where both `power_ps` and `registration_month` are zero
* In the remaining rows, replace zeros in these two columns with NaN

In [22]:
# Drop a row only when BOTH fields are 0. `.copy()` yields an independent frame, so
# the column assignments below cannot end up writing into a view of the old `autos`.
autos = autos.query("power_ps > 0 or registration_month > 0").copy()
for name in ["power_ps", "registration_month"]:
    # Turn the remaining 0 placeholders into NaN. `mask` builds a new float column
    # rather than writing NaN into an int64 column through `.loc`, which is an
    # incompatible-dtype assignment that recent pandas versions warn about.
    autos[name] = autos[name].mask(autos[name] == 0)
autos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47930 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date_crawled        47930 non-null  datetime64[ns]
 1   name                47930 non-null  object        
 2   seller              47930 non-null  object        
 3   offer_type          47930 non-null  object        
 4   price_dollars       47930 non-null  int64         
 5   abtest              47930 non-null  object        
 6   vehicle_type        44016 non-null  object        
 7   registration_year   47930 non-null  int64         
 8   gearbox             46278 non-null  object        
 9   power_ps            44492 non-null  float64       
 10  model               45671 non-null  object        
 11  odometer_km         47930 non-null  int64         
 12  registration_month  44915 non-null  float64       
 13  fuel_type           44519 non-null  object        


In [23]:
autos["registration_month"].value_counts(dropna=False)

registration_month
3.0     5070
6.0     4367
5.0     4107
4.0     4102
7.0     3947
10.0    3651
12.0    3446
9.0     3389
11.0    3359
1.0     3281
8.0     3191
NaN     3015
2.0     3005
Name: count, dtype: int64

In [24]:
autos["power_ps"].value_counts(dropna=False)

power_ps
NaN       3438
75.0      3170
60.0      2195
150.0     2046
140.0     1883
          ... 
460.0        1
4400.0       1
242.0        1
1090.0       1
650.0        1
Name: count, Length: 448, dtype: int64

In [25]:
autos.describe()

Unnamed: 0,date_crawled,price_dollars,registration_year,power_ps,odometer_km,registration_month,ad_created,nr_of_pictures,postal_code,last_seen
count,47930,47930.0,47930.0,44492.0,47930.0,44915.0,47930,47930.0,47930.0,47930
mean,2016-03-21 13:46:49.411976192,9953.445,2003.480534,130.65922,126059.87899,6.370144,2016-03-20 19:40:00.050073088,0.0,50977.832944,2016-03-30 05:00:07.746067200
min,2016-03-05 14:06:30,0.0,1927.0,1.0,5000.0,1.0,2015-06-11 00:00:00,0.0,1067.0,2016-03-05 14:45:46
25%,2016-03-13 14:47:13.500000,1200.0,1999.0,80.0,125000.0,3.0,2016-03-13 00:00:00,0.0,30657.0,2016-03-23 12:43:38.249999872
50%,2016-03-21 17:55:04,3000.0,2004.0,116.0,150000.0,6.0,2016-03-21 00:00:00,0.0,49733.0,2016-04-04 04:46:52
75%,2016-03-29 14:46:10.500000,7499.0,2008.0,150.0,150000.0,9.0,2016-03-29 00:00:00,0.0,71665.0,2016-04-06 10:46:22
max,2016-04-07 14:36:56,100000000.0,2019.0,17700.0,150000.0,12.0,2016-04-07 00:00:00,0.0,99998.0,2016-04-07 14:58:50
std,,488755.6,7.427577,217.215042,39346.517923,3.34903,,0.0,25760.126387,


In [26]:
autos["power_ps"].dropna().sort_values().tail(50)

4464      1001.0
37651     1001.0
2220      1003.0
36191     1011.0
9968      1016.0
17930     1055.0
20421     1056.0
17990     1082.0
13903     1090.0
19352     1103.0
19659     1202.0
10813     1300.0
33330     1367.0
27580     1398.0
13573     1400.0
40625     1400.0
24197     1400.0
47948     1401.0
43870     1405.0
45465     1704.0
40091     1753.0
36973     1771.0
8000      1779.0
28399     1780.0
4405      1781.0
4777      1793.0
25381     1796.0
29518     1800.0
27179     1986.0
2670      1988.0
11914     1998.0
49263     1998.0
18720     2018.0
23660     2729.0
11311     3500.0
10659     3750.0
14608     4400.0
41673     5867.0
3753      6045.0
11009     6226.0
1699      6512.0
24943     7511.0
41172     8404.0
23742     9011.0
16743    14009.0
22592    15001.0
45671    15016.0
46986    16011.0
35039    16312.0
36421    17700.0
Name: power_ps, dtype: float64

In [27]:
autos.query("power_ps > 2000")

Unnamed: 0,date_crawled,name,seller,offer_type,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,nr_of_pictures,postal_code,last_seen
1699,2016-04-04 19:49:19,Opel_Corsa_1.0_Motor_ecotek,privat,Angebot,1200,test,limousine,2001,manuell,6512.0,corsa,150000,12.0,benzin,opel,,2016-04-04,0,47198,2016-04-06 22:16:46
3753,2016-04-03 18:47:14,VW_Polo_9n,privat,Angebot,4700,control,kleinwagen,2009,manuell,6045.0,polo,125000,12.0,benzin,volkswagen,nein,2016-04-03,0,48565,2016-04-05 19:17:39
10659,2016-03-18 11:47:40,VW_POLO_BASTLERFAHRZEUG,privat,Angebot,100,control,kleinwagen,1996,manuell,3750.0,polo,125000,4.0,benzin,volkswagen,ja,2016-03-18,0,12489,2016-04-03 21:17:20
11009,2016-03-15 21:55:11,Smart_fortwo_Silver_**Brabus_Style**,privat,Angebot,2550,control,kleinwagen,2004,automatik,6226.0,fortwo,125000,5.0,benzin,smart,nein,2016-03-15,0,12359,2016-04-07 07:45:21
11311,2016-04-01 02:36:43,Vw__zu__verkaufen__Caravelle_Confort_kurz_,privat,Angebot,8600,control,andere,2007,automatik,3500.0,transporter,150000,12.0,,volkswagen,,2016-04-01,0,30855,2016-04-07 05:17:11
14608,2016-03-29 20:38:00,Seat_Ibiza_fuer_Bastler,privat,Angebot,1000,test,kleinwagen,1996,manuell,4400.0,ibiza,90000,9.0,benzin,seat,ja,2016-03-29,0,32760,2016-04-04 05:55:48
16743,2016-03-17 23:53:06,Opel_Astra_j,privat,Angebot,9799,control,limousine,2011,manuell,14009.0,,80000,6.0,benzin,opel,nein,2016-03-17,0,86415,2016-03-23 17:18:38
18720,2016-03-24 11:50:41,Bmw_530d_tuev_euro_4_top_gepflegt_!,privat,Angebot,7498,test,,2017,automatik,2018.0,5er,150000,9.0,,bmw,nein,2016-03-24,0,59425,2016-03-30 12:46:26
22592,2016-04-04 14:59:24,Top_geopflegter_fiesta_st,privat,Angebot,5200,test,kleinwagen,2005,manuell,15001.0,fiesta,150000,12.0,benzin,ford,nein,2016-04-04,0,70372,2016-04-06 16:17:54
23660,2016-03-20 17:54:04,BMW_535_touring,privat,Angebot,9750,test,kombi,2005,automatik,2729.0,5er,150000,7.0,diesel,bmw,nein,2016-03-20,0,21376,2016-03-23 02:48:53


In [28]:
# `nr_of_pictures` is constant 0, so it can be dropped.
# NOTE: the result is not assigned, so this cell only displays the frame without the
# column; `autos` itself is unchanged here.
autos.drop(columns="nr_of_pictures")

Unnamed: 0,date_crawled,name,seller,offer_type,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,5000,control,bus,2004,manuell,158.0,andere,150000,3.0,lpg,peugeot,nein,2016-03-26,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,8500,control,limousine,1997,automatik,286.0,7er,150000,6.0,benzin,bmw,nein,2016-04-04,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,8990,test,limousine,2009,manuell,102.0,golf,70000,7.0,benzin,volkswagen,nein,2016-03-26,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,4350,control,kleinwagen,2007,automatik,71.0,fortwo,70000,6.0,benzin,smart,nein,2016-03-12,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,1350,test,kombi,2003,manuell,,focus,150000,7.0,benzin,ford,nein,2016-04-01,39218,2016-04-01 14:38:50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2016-03-27 14:38:19,Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon,privat,Angebot,24900,control,limousine,2011,automatik,239.0,q5,100000,1.0,diesel,audi,nein,2016-03-27,82131,2016-04-01 13:47:40
49996,2016-03-28 10:50:25,Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+...,privat,Angebot,1980,control,cabrio,1996,manuell,75.0,astra,150000,5.0,benzin,opel,nein,2016-03-28,44807,2016-04-02 14:18:02
49997,2016-04-02 14:44:48,Fiat_500_C_1.2_Dualogic_Lounge,privat,Angebot,13200,test,cabrio,2014,automatik,69.0,500,5000,11.0,benzin,fiat,nein,2016-04-02,73430,2016-04-04 11:47:27
49998,2016-03-08 19:25:42,Audi_A3_2.0_TDI_Sportback_Ambition,privat,Angebot,22900,control,kombi,2013,manuell,150.0,a3,40000,11.0,diesel,audi,nein,2016-03-08,35683,2016-04-05 16:45:07


In [29]:
autos.describe(include=['O'])

Unnamed: 0,name,seller,offer_type,abtest,vehicle_type,gearbox,model,fuel_type,brand,unrepaired_damage
count,47930,47930,47930,47930,44016,46278,45671,44519,47930,39636
unique,36990,2,1,2,8,2,245,7,40,2
top,Volkswagen_Golf_1.4,privat,Angebot,test,limousine,manuell,golf,benzin,volkswagen,nein
freq,75,47929,47930,24663,12675,36136,3883,29357,10244,34924


In [30]:
autos["seller"].value_counts()

seller
privat        47929
gewerblich        1
Name: count, dtype: int64

In [31]:
# `offer_type` has a single unique value and `nr_of_pictures` is always 0, so neither
# column carries information. Rebind the name instead of mutating the frame in place.
autos = autos.drop(columns=["offer_type", "nr_of_pictures"])

In [32]:
autos.describe(include=['O'])

Unnamed: 0,name,seller,abtest,vehicle_type,gearbox,model,fuel_type,brand,unrepaired_damage
count,47930,47930,47930,44016,46278,45671,44519,47930,39636
unique,36990,2,2,8,2,245,7,40,2
top,Volkswagen_Golf_1.4,privat,test,limousine,manuell,golf,benzin,volkswagen,nein
freq,75,47929,24663,12675,36136,3883,29357,10244,34924


In [33]:
# Print the frequency table of each low-cardinality column under a "[column]" header
categorical_columns = ("abtest", "vehicle_type", "fuel_type", "unrepaired_damage")
for column in categorical_columns:
    counts = autos[column].value_counts()
    print(f"[{column}]\n{counts}")


[abtest]
abtest
test       24663
control    23267
Name: count, dtype: int64
[vehicle_type]
vehicle_type
limousine     12675
kleinwagen    10510
kombi          8954
bus            4011
cabrio         3022
coupe          2496
suv            1960
andere          388
Name: count, dtype: int64
[fuel_type]
fuel_type
benzin     29357
diesel     14345
lpg          674
cng           70
hybrid        37
elektro       18
andere        18
Name: count, dtype: int64
[unrepaired_damage]
unrepaired_damage
nein    34924
ja       4712
Name: count, dtype: int64


At this point, the dataset looks reasonably cleaned.

In [34]:
# Column dtypes and non-null counts of the frame at this stage
autos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47930 entries, 0 to 49999
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date_crawled        47930 non-null  datetime64[ns]
 1   name                47930 non-null  object        
 2   seller              47930 non-null  object        
 3   price_dollars       47930 non-null  int64         
 4   abtest              47930 non-null  object        
 5   vehicle_type        44016 non-null  object        
 6   registration_year   47930 non-null  int64         
 7   gearbox             46278 non-null  object        
 8   power_ps            44492 non-null  float64       
 9   model               45671 non-null  object        
 10  odometer_km         47930 non-null  int64         
 11  registration_month  44915 non-null  float64       
 12  fuel_type           44519 non-null  object        
 13  brand               47930 non-null  object        


However, several columns still contain a considerable number of missing values:

In [35]:
# Number of missing values in every column
autos.isna().sum()

date_crawled             0
name                     0
seller                   0
price_dollars            0
abtest                   0
vehicle_type          3914
registration_year        0
gearbox               1652
power_ps              3438
model                 2259
odometer_km              0
registration_month    3015
fuel_type             3411
brand                    0
unrepaired_damage     8294
ad_created               0
postal_code              0
last_seen                0
dtype: int64

In [36]:
# Listings for which no vehicle type was recorded
vehicle_type_missing = autos["vehicle_type"].isna()
autos.loc[vehicle_type_missing]

Unnamed: 0,date_crawled,name,seller,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
8,2016-03-22 16:51:34,Seat_Arosa,privat,250,test,,2000,manuell,,arosa,150000,10.0,,seat,nein,2016-03-22,7426,2016-03-26 18:18:10
10,2016-03-15 01:41:36,VW_Golf_Tuning_in_siber/grau,privat,999,test,,2017,manuell,90.0,,150000,4.0,benzin,volkswagen,nein,2016-03-14,86157,2016-04-07 03:16:21
55,2016-03-07 02:47:54,Mercedes_E320_AMG_zu_Tauschen!,privat,1,test,,2017,automatik,224.0,e_klasse,125000,7.0,benzin,mercedes_benz,nein,2016-03-06,22111,2016-03-08 05:45:44
64,2016-04-05 07:36:19,Autotransport__Abschlepp_Schlepper,privat,40,test,,2011,,,5er,150000,5.0,,bmw,,2016-04-05,40591,2016-04-07 12:16:01
65,2016-04-04 19:30:39,Ford_Fiesta_zum_ausschlachten,privat,250,control,,2017,manuell,65.0,fiesta,125000,9.0,benzin,ford,,2016-04-04,65606,2016-04-05 12:22:12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49880,2016-03-30 08:52:57,E39_528i_an_Bastler,privat,0,control,,2017,manuell,193.0,5er,150000,4.0,,bmw,ja,2016-03-30,65468,2016-04-07 01:15:27
49919,2016-03-10 09:49:43,Fiat_Punto,privat,180,test,,2016,manuell,86.0,punto,150000,8.0,,fiat,ja,2016-03-10,59558,2016-03-10 10:39:58
49935,2016-04-01 21:48:20,Mercedes_A_klasse_angemeldet_mit_Tuef_und_Auto...,privat,800,test,,2017,automatik,101.0,a_klasse,150000,9.0,,mercedes_benz,nein,2016-04-01,39108,2016-04-01 21:48:20
49938,2016-03-28 18:45:06,Mercedes_Benz_A_160_Avantgarde,privat,2300,control,,2016,automatik,102.0,a_klasse,150000,6.0,benzin,mercedes_benz,nein,2016-03-28,13507,2016-04-07 00:44:35


In [37]:
# Frequency of each vehicle type; dropna=False makes missing values show up as their own row
autos["vehicle_type"].value_counts(dropna=False)

vehicle_type
limousine     12675
kleinwagen    10510
kombi          8954
bus            4011
NaN            3914
cabrio         3022
coupe          2496
suv            1960
andere          388
Name: count, dtype: int64

In [38]:
# Summary statistics with the default column selection (no `include` given)
autos.describe()

Unnamed: 0,date_crawled,price_dollars,registration_year,power_ps,odometer_km,registration_month,ad_created,postal_code,last_seen
count,47930,47930.0,47930.0,44492.0,47930.0,44915.0,47930,47930.0,47930
mean,2016-03-21 13:46:49.411976192,9953.445,2003.480534,130.65922,126059.87899,6.370144,2016-03-20 19:40:00.050073088,50977.832944,2016-03-30 05:00:07.746067200
min,2016-03-05 14:06:30,0.0,1927.0,1.0,5000.0,1.0,2015-06-11 00:00:00,1067.0,2016-03-05 14:45:46
25%,2016-03-13 14:47:13.500000,1200.0,1999.0,80.0,125000.0,3.0,2016-03-13 00:00:00,30657.0,2016-03-23 12:43:38.249999872
50%,2016-03-21 17:55:04,3000.0,2004.0,116.0,150000.0,6.0,2016-03-21 00:00:00,49733.0,2016-04-04 04:46:52
75%,2016-03-29 14:46:10.500000,7499.0,2008.0,150.0,150000.0,9.0,2016-03-29 00:00:00,71665.0,2016-04-06 10:46:22
max,2016-04-07 14:36:56,100000000.0,2019.0,17700.0,150000.0,12.0,2016-04-07 00:00:00,99998.0,2016-04-07 14:58:50
std,,488755.6,7.427577,217.215042,39346.517923,3.34903,,25760.126387,


In [39]:
# Listings claiming a registration year later than 2016
future_registration = "registration_year > 2016"
autos.query(future_registration)

Unnamed: 0,date_crawled,name,seller,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
10,2016-03-15 01:41:36,VW_Golf_Tuning_in_siber/grau,privat,999,test,,2017,manuell,90.0,,150000,4.0,benzin,volkswagen,nein,2016-03-14,86157,2016-04-07 03:16:21
55,2016-03-07 02:47:54,Mercedes_E320_AMG_zu_Tauschen!,privat,1,test,,2017,automatik,224.0,e_klasse,125000,7.0,benzin,mercedes_benz,nein,2016-03-06,22111,2016-03-08 05:45:44
65,2016-04-04 19:30:39,Ford_Fiesta_zum_ausschlachten,privat,250,control,,2017,manuell,65.0,fiesta,125000,9.0,benzin,ford,,2016-04-04,65606,2016-04-05 12:22:12
68,2016-04-03 17:36:59,Mini_cooper_s_clubman_/vollausstattung_/_Navi/...,privat,10990,test,,2017,manuell,174.0,clubman,100000,,,mini,nein,2016-04-03,83135,2016-04-05 17:26:26
84,2016-03-27 19:52:54,Renault_twingo,privat,900,control,,2018,,60.0,twingo,150000,,,renault,,2016-03-27,40589,2016-04-05 18:46:49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49770,2016-03-15 12:54:26,VW_Polo_6n_Tuev_Neu!__1.6_75PS,privat,999,control,,2018,manuell,75.0,polo,150000,12.0,benzin,volkswagen,nein,2016-03-15,24321,2016-04-06 02:16:02
49796,2016-03-09 09:38:38,Opel_corsa_1.4_zu_verkaufen,privat,4500,test,,2017,manuell,90.0,corsa,70000,7.0,benzin,opel,nein,2016-03-09,88433,2016-03-17 20:45:08
49841,2016-03-11 15:37:02,Passat_abzugeben.,privat,600,test,,2017,manuell,101.0,passat,150000,7.0,,volkswagen,,2016-03-11,53804,2016-03-11 16:41:14
49880,2016-03-30 08:52:57,E39_528i_an_Bastler,privat,0,control,,2017,manuell,193.0,5er,150000,4.0,,bmw,ja,2016-03-30,65468,2016-04-07 01:15:27


We also remove rows with `registration_year` after 2016: all listings were crawled in 2016, so a car cannot have been first registered later than that.

In [40]:
# Keep only cars registered in 2016 or earlier, then summarise the remaining frame
plausible_year = "registration_year <= 2016"
autos = autos.query(plausible_year)
autos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46175 entries, 0 to 49999
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date_crawled        46175 non-null  datetime64[ns]
 1   name                46175 non-null  object        
 2   seller              46175 non-null  object        
 3   price_dollars       46175 non-null  int64         
 4   abtest              46175 non-null  object        
 5   vehicle_type        44015 non-null  object        
 6   registration_year   46175 non-null  int64         
 7   gearbox             44685 non-null  object        
 8   power_ps            43039 non-null  float64       
 9   model               44149 non-null  object        
 10  odometer_km         46175 non-null  int64         
 11  registration_month  43440 non-null  float64       
 12  fuel_type           43423 non-null  object        
 13  brand               46175 non-null  object        


In [41]:
# Missing values per column after the registration-year filter
autos.isna().sum()

date_crawled             0
name                     0
seller                   0
price_dollars            0
abtest                   0
vehicle_type          2160
registration_year        0
gearbox               1490
power_ps              3136
model                 2026
odometer_km              0
registration_month    2735
fuel_type             2752
brand                    0
unrepaired_damage     7607
ad_created               0
postal_code              0
last_seen                0
dtype: int64

Finally, let us look at the price column:

In [42]:
# Distribution of the listing prices (count, mean, std, quartiles, extremes)
autos["price_dollars"].describe()

count    4.617500e+04
mean     9.657285e+03
std      4.912921e+05
min      0.000000e+00
25%      1.200000e+03
50%      3.100000e+03
75%      7.500000e+03
max      1.000000e+08
Name: price_dollars, dtype: float64

In [43]:
# The 50 highest-priced listings, shown in ascending price order
by_price = autos.sort_values("price_dollars")
by_price.tail(50)

Unnamed: 0,date_crawled,name,seller,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
36794,2016-04-01 09:47:52,Mercedes_Benz_190SL_FIN_Nr._63_top_History__In...,privat,99900,test,cabrio,1955,manuell,105.0,andere,90000,6.0,,mercedes_benz,nein,2016-04-01,74321,2016-04-07 06:17:29
29286,2016-03-22 15:58:56,Porsche_911_Turbo_PDK__Sport_Chrono_Garantie_D...,privat,104900,control,coupe,2011,automatik,500.0,911,30000,1.0,benzin,porsche,nein,2016-03-22,42111,2016-04-06 05:45:48
16964,2016-04-01 16:46:18,Bentley_Continental_Supersports,privat,105000,control,coupe,2010,automatik,630.0,,80000,1.0,benzin,sonstige_autos,nein,2016-04-01,70192,2016-04-05 11:49:31
17540,2016-03-30 00:49:49,Porsche_930_Turbo_classic_analytics_Gutachten_2+,privat,105000,test,coupe,1988,manuell,300.0,911,150000,6.0,benzin,porsche,nein,2016-03-29,76829,2016-04-06 21:18:20
49391,2016-03-18 00:55:16,"Lamborghini_Gallardo_LP560_4_E_Gear_""Callisto_...",privat,109999,test,coupe,2008,automatik,560.0,,30000,9.0,benzin,sonstige_autos,nein,2016-03-17,96052,2016-04-05 21:46:24
22060,2016-03-09 00:58:24,Tesla_Model_X90D_Autopilot_Leder_AHK_Kaltwette...,privat,114400,test,suv,2016,automatik,,,5000,3.0,elektro,sonstige_autos,nein,2016-03-08,33335,2016-03-24 09:46:27
7402,2016-03-22 19:48:09,Porsche_911_Carrera_4S_Cabrio_PDK__BOSE__NEU__...,privat,115000,test,cabrio,2016,automatik,400.0,911,5000,3.0,benzin,porsche,nein,2016-03-22,51379,2016-03-26 21:46:46
21783,2016-03-26 22:06:24,Porsche_911_Carrera_4S_Cabriolet_PDK,privat,115991,test,cabrio,2013,automatik,400.0,911,10000,5.0,benzin,porsche,nein,2016-03-26,65843,2016-04-07 00:17:34
33884,2016-03-26 21:55:12,Porsche_911_Carrera_4S_Cabriolet_PDK,privat,116000,control,cabrio,2013,automatik,430.0,911,30000,4.0,benzin,porsche,nein,2016-03-26,4808,2016-03-26 22:46:40
38814,2016-03-22 16:53:44,Porsche_Porsche_964_TURBO_S_X33_WLS_ab_Werk_35...,privat,119500,test,coupe,1992,manuell,355.0,911,150000,6.0,benzin,porsche,nein,2016-03-22,52062,2016-03-24 00:47:12


Prices of 350000 and above appear to be bogus, so we remove those rows.

In [44]:
# Discard listings priced at 350000 or more, then re-check the price distribution
price_filter = "price_dollars < 350000"
autos = autos.query(price_filter)
autos["price_dollars"].describe()

count     46163.000000
mean       5965.201287
std        9000.321862
min           0.000000
25%        1200.000000
50%        3100.000000
75%        7500.000000
max      345000.000000
Name: price_dollars, dtype: float64

Let us store the cleaned data for later use.

In [45]:
# Write the cleaned frame into the data directory, leaving out the row index
cleaned_path = data_path / "autos_cleaned.csv"
autos.to_csv(cleaned_path, index=False)

At this point, we can analyze some numerical values, grouped by `brand`. We restrict the analysis to the most frequent brands, i.e. those whose cumulative share of listings stays within 95% of all rows.

In [46]:
# Relative frequency of each brand (most frequent first) and its running total
brand_share = autos["brand"].value_counts(normalize=True)
brand_cumsum = brand_share.cumsum()
# Brands whose cumulative share does not exceed 95% of all listings
within_95_percent = brand_cumsum <= 0.95
dominant_brands = brand_cumsum[within_95_percent].index
dominant_brands

Index(['volkswagen', 'bmw', 'opel', 'mercedes_benz', 'audi', 'ford', 'renault',
       'peugeot', 'fiat', 'seat', 'skoda', 'mazda', 'nissan', 'smart',
       'citroen', 'toyota', 'hyundai', 'sonstige_autos', 'volvo', 'mini',
       'mitsubishi', 'honda'],
      dtype='object', name='brand')

In [47]:
# Select rows for these dominant brands
regex = "|".join(list(dominant_brands))
indicator = autos["brand"].str.contains(regex)
autos_selected = autos[indicator]
autos_selected.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43600 entries, 0 to 49999
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date_crawled        43600 non-null  datetime64[ns]
 1   name                43600 non-null  object        
 2   seller              43600 non-null  object        
 3   price_dollars       43600 non-null  int64         
 4   abtest              43600 non-null  object        
 5   vehicle_type        41531 non-null  object        
 6   registration_year   43600 non-null  int64         
 7   gearbox             42189 non-null  object        
 8   power_ps            40654 non-null  float64       
 9   model               41674 non-null  object        
 10  odometer_km         43600 non-null  int64         
 11  registration_month  40990 non-null  float64       
 12  fuel_type           41010 non-null  object        
 13  brand               43600 non-null  object        


Let us look at some stats, grouped w.r.t. `brand`.

In [48]:
# Mean price, engine power and mileage per brand, highest mean price first
stats = dict.fromkeys(["price_dollars", "power_ps", "odometer_km"], "mean")
brand_means = autos_selected.groupby("brand").agg(stats)
brand_means.sort_values("price_dollars", ascending=False)

Unnamed: 0_level_0,price_dollars,power_ps,odometer_km
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sonstige_autos,12903.018605,177.714286,90534.883721
mini,10551.350365,132.280702,88418.491484
audi,9262.15844,172.335759,129410.167818
mercedes_benz,8633.336174,168.339714,131190.849966
bmw,8218.786994,178.642159,132826.129283
skoda,6439.580132,107.523481,110735.099338
hyundai,5429.217391,110.02765,106065.217391
volkswagen,5382.602514,109.131886,128855.273917
toyota,5168.086735,112.773309,115663.265306
volvo,4868.016509,153.224138,139186.320755


Audi, Mercedes, BMW make sense up there. But "sonstige_autos" (German for "other cars") and "mini"? Let us look at the listings behind these two brands.

In [50]:
# Inspect the listings labelled with brand 'sonstige_autos'
other_brand = "brand == 'sonstige_autos'"
autos_selected.query(other_brand)

Unnamed: 0,date_crawled,name,seller,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
15,2016-04-01 12:06:20,Corvette_C3_Coupe_T_Top_Crossfire_Injection,privat,18900,test,coupe,1982,automatik,203.0,,80000,6.0,benzin,sonstige_autos,nein,2016-04-01,61276,2016-04-02 21:10:48
140,2016-03-19 19:47:07,Ssangyong_Actyon_SUV_2.0_xdi2wd_55000_km,privat,5400,control,suv,2008,manuell,141.0,,60000,3.0,diesel,sonstige_autos,nein,2016-03-19,94447,2016-03-26 00:18:23
152,2016-03-20 12:45:01,Ssanyong_Rexton_2.7,privat,4499,test,suv,2005,automatik,163.0,,150000,9.0,diesel,sonstige_autos,nein,2016-03-20,73312,2016-03-25 11:45:09
175,2016-03-19 15:48:21,MG_MGB_GT,privat,13800,control,coupe,1972,,,,20000,6.0,,sonstige_autos,nein,2016-03-19,53639,2016-03-23 02:17:04
259,2016-04-03 23:49:58,guenstiges_Auto_/_auch_defekt,privat,0,control,,2000,,,,5000,6.0,,sonstige_autos,,2016-04-03,89269,2016-04-06 07:16:22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49391,2016-03-18 00:55:16,"Lamborghini_Gallardo_LP560_4_E_Gear_""Callisto_...",privat,109999,test,coupe,2008,automatik,560.0,,30000,9.0,benzin,sonstige_autos,nein,2016-03-17,96052,2016-04-05 21:46:24
49612,2016-04-02 23:55:49,Opel_Corsa_1.2_Schiebedach_Voll_fahrbereit,privat,499,test,kleinwagen,1996,manuell,58.0,,150000,4.0,benzin,sonstige_autos,nein,2016-04-02,10999,2016-04-03 06:48:20
49720,2016-03-31 21:37:36,MG_Andere,privat,14500,test,coupe,1969,manuell,145.0,,50000,1.0,benzin,sonstige_autos,nein,2016-03-31,86911,2016-04-06 16:45:25
49745,2016-03-19 22:50:20,ALPINA_B12,privat,17500,test,limousine,1996,,387.0,,150000,5.0,,sonstige_autos,nein,2016-03-19,76131,2016-03-20 01:41:49


In [52]:
# Listings of brand 'mini', most expensive first
mini_listings = autos_selected.query("brand == 'mini'")
mini_listings.sort_values("price_dollars", ascending=False)

Unnamed: 0,date_crawled,name,seller,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
37400,2016-03-13 17:43:10,MINI_Mini_John_Cooper_Works_Coupe_Aut._Ps_279_...,privat,34500,control,kleinwagen,2015,automatik,279.0,cooper,20000,6.0,benzin,mini,nein,2016-03-13,97332,2016-03-16 15:48:00
11977,2016-03-26 13:55:48,MINI_Cooper_S_Clubman__Chilli__HeadUp__Navi__L...,privat,33300,control,kombi,2015,manuell,192.0,clubman,10000,11.0,benzin,mini,nein,2016-03-26,86568,2016-04-06 01:47:06
11791,2016-03-19 13:54:04,MINI_MINI_Cooper_S_3_T_Vollausstattung_NP_41000,privat,28400,test,kleinwagen,2014,automatik,192.0,cooper,20000,10.0,benzin,mini,nein,2016-03-19,99837,2016-04-06 20:18:43
33969,2016-03-24 22:42:46,MINI_Cooper_SD,privat,27300,control,limousine,2014,automatik,170.0,cooper,20000,12.0,diesel,mini,nein,2016-03-24,40667,2016-04-05 17:17:22
19925,2016-03-28 16:36:44,MINI_Mini_Cooper_S_/__LED___Panorama_____HiFi_...,privat,26999,test,kleinwagen,2015,manuell,192.0,cooper,20000,5.0,benzin,mini,nein,2016-03-28,49638,2016-04-06 21:44:51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31054,2016-03-21 14:54:12,LVM_Versicherung___BMW_Mini___Fahrzeug,privat,9,control,limousine,2016,manuell,,cooper,5000,3.0,andere,mini,,2016-03-21,51145,2016-03-21 14:54:12
43433,2016-03-23 13:39:28,mini_cooper_s,privat,0,test,coupe,2004,manuell,170.0,cooper,125000,,,mini,nein,2016-03-23,79639,2016-04-05 13:45:36
17351,2016-03-31 13:57:14,Mini_One_Sportfahrwerk_S___Scheckheftgepflegt,privat,0,control,kleinwagen,2003,manuell,90.0,one,100000,12.0,benzin,mini,,2016-03-31,26382,2016-04-06 07:15:50
3427,2016-03-18 08:37:19,Mini_Cooper_S_zu_verkaufen,privat,0,test,coupe,2004,manuell,163.0,cooper,150000,5.0,benzin,mini,nein,2016-03-18,74374,2016-04-05 22:15:35


In [53]:
# Frequency of each `unrepaired_damage` value in the cleaned frame (missing values are not counted)
autos["unrepaired_damage"].value_counts()

unrepaired_damage
nein    33983
ja       4577
Name: count, dtype: int64

In [54]:
# First 50 listing names, to get a feel for how the free-text titles look
listing_names = autos["name"]
listing_names.iloc[:50]

0                      Peugeot_807_160_NAVTECH_ON_BOARD
1            BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik
2                            Volkswagen_Golf_1.6_United
3     Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...
4     Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...
5     Chrysler_Grand_Voyager_2.8_CRD_Aut.Limited_Sto...
6     VW_Golf_III_GT_Special_Electronic_Green_Metall...
7                                  Golf_IV_1.9_TDI_90PS
8                                            Seat_Arosa
9             Renault_Megane_Scenic_1.6e_RT_Klimaanlage
12    Smart_smart_fortwo_coupe_softouch_pure_MHD_Pan...
13                                   Audi_A3_1.6_tuning
14    Renault_Clio_3__Dynamique_1.2__16_V;_viele_Ver...
15          Corvette_C3_Coupe_T_Top_Crossfire_Injection
16                                  Opel_Vectra_B_Kombi
17                            Volkswagen_Scirocco_2_G60
18                 Verkaufen_mein_bmw_e36_320_i_touring
19          mazda_tribute_2.0_mit_gas_und_tuev_n

In [55]:
# Number of listing names containing a double exclamation mark
has_double_bang = autos["name"].str.contains("!!")
has_double_bang.sum()

855

In [56]:
# Listings whose name contains a double exclamation mark
shouting = autos["name"].str.contains("!!")
autos.loc[shouting]

Unnamed: 0,date_crawled,name,seller,price_dollars,abtest,vehicle_type,registration_year,gearbox,power_ps,model,odometer_km,registration_month,fuel_type,brand,unrepaired_damage,ad_created,postal_code,last_seen
6,2016-03-20 17:55:21,VW_Golf_III_GT_Special_Electronic_Green_Metall...,privat,300,test,limousine,1995,manuell,90.0,golf,150000,8.0,benzin,volkswagen,,2016-03-20,31535,2016-03-23 02:48:59
50,2016-03-09 16:48:05,??_????????_??Top!!!_Peugeot_308_sw_Sport_?...,privat,5999,test,kombi,2008,manuell,150.0,3_reihe,125000,8.0,benzin,peugeot,nein,2016-03-09,55569,2016-04-07 06:17:09
189,2016-03-19 14:43:52,Toyota_Avensis_1.8_VVT_i_Combi_Sol_/_Sitzheizu...,privat,7399,test,kombi,2007,manuell,129.0,avensis,125000,,benzin,toyota,nein,2016-03-19,24943,2016-04-06 21:18:40
200,2016-03-28 20:41:51,Corvette_C6_Grand_Sport_Cabrio_Schalter_EU_Mod...,privat,56500,control,cabrio,2012,manuell,436.0,andere,30000,11.0,benzin,chevrolet,nein,2016-03-28,96129,2016-04-07 02:47:03
214,2016-03-14 15:57:01,Volkswagen_Sharan_2.0_TDI_DSG__Leder_SHZ_Tempo...,privat,15000,control,bus,2013,automatik,140.0,sharan,150000,5.0,diesel,volkswagen,nein,2016-03-14,52249,2016-03-21 05:45:05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49685,2016-03-29 18:51:13,Alfa_156_Sportwagon_nur_3Tage_!!!,privat,350,control,kombi,2002,manuell,166.0,156,150000,10.0,benzin,alfa_romeo,,2016-03-29,29574,2016-03-31 12:17:24
49746,2016-03-23 15:57:27,LUPO_1.0_*TOP_FLITZER*!!,privat,1300,test,kleinwagen,1999,manuell,50.0,lupo,150000,3.0,benzin,volkswagen,nein,2016-03-23,48268,2016-03-26 09:45:31
49792,2016-04-05 09:59:03,Fiat_500_C_1_4_Cabrio!!!_Lounge,privat,9400,control,cabrio,2010,manuell,99.0,500,80000,10.0,benzin,fiat,nein,2016-04-05,86511,2016-04-07 13:17:29
49976,2016-03-19 18:56:05,Audi_80_Avant_2.6_E__Vollausstattung!!_Einziga...,privat,5900,test,kombi,1992,automatik,150.0,80,150000,12.0,benzin,audi,nein,2016-03-19,36100,2016-04-07 06:16:44
