In [1]:
## Project: Exploring eBay Car Sales Data

In [2]:
# Load the listings stored in autos.csv into the `autos` DataFrame.

import pandas as pd

# The text encoding is named explicitly instead of relying on the default codec.
autos = pd.read_csv(filepath_or_buffer="autos.csv", encoding="Latin-1")


## Task: Display information about dataframe autos

In [3]:
# Display summary information about autos, followed by its first rows.
#
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called on its own; wrapping it in print() adds a stray "None"
# line after the report.

autos.info()
print(autos.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
dateCrawled            50000 non-null object
name                   50000 non-null object
seller                 50000 non-null object
offerType              50000 non-null object
price                  50000 non-null object
abtest                 50000 non-null object
vehicleType            44905 non-null object
yearOfRegistration     50000 non-null int64
gearbox                47320 non-null object
powerPS                50000 non-null int64
model                  47242 non-null object
odometer               50000 non-null object
monthOfRegistration    50000 non-null int64
fuelType               45518 non-null object
brand                  50000 non-null object
notRepairedDamage      40171 non-null object
dateCreated            50000 non-null object
nrOfPictures           50000 non-null int64
postalCode             50000 non-null int64
lastSeen               50000 non-null obj

## Task: Clean Column Names

In [4]:
# show the current (original) column names; the cleaning itself happens in the next cell

print(autos.columns)


Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'odometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')


In [5]:
# assign cleaned columns names to a new array
# function created to clean columns

def clean_col(col):
    """Return a cleaned version of the column label ``col``.

    Surrounding whitespace is stripped, four specific camelCase labels are
    swapped for snake_case replacements, and the result is lowercased.
    Labels without a dedicated replacement are therefore only stripped and
    lowercased (e.g. "powerPS" becomes "powerps").
    """
    # (old, new) pairs, applied in this order as plain substring replacements.
    renames = (
        ("yearOfRegistration", "registration_year"),
        ("monthOfRegistration", "registration_month"),
        ("notRepairedDamage", "unrepaired_damage"),
        ("dateCreated", "ad_created"),
    )
    cleaned = col.strip()
    for old_label, new_label in renames:
        cleaned = cleaned.replace(old_label, new_label)
    return cleaned.lower()

# Run every existing label through clean_col and install the cleaned labels
# as the frame's column index.
new_columns = [clean_col(original_label) for original_label in autos.columns]
autos.columns = new_columns

# print new column names
print(autos.columns)



Index(['datecrawled', 'name', 'seller', 'offertype', 'price', 'abtest',
       'vehicletype', 'registration_year', 'gearbox', 'powerps', 'model',
       'odometer', 'registration_month', 'fueltype', 'brand',
       'unrepaired_damage', 'ad_created', 'nrofpictures', 'postalcode',
       'lastseen'],
      dtype='object')


In [6]:
# Re-inspect the first five rows now that the column labels have been replaced.

print(autos.head(n=5))

           datecrawled                                               name  \
0  2016-03-26 17:47:46                   Peugeot_807_160_NAVTECH_ON_BOARD   
1  2016-04-04 13:38:56         BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik   
2  2016-03-26 18:57:24                         Volkswagen_Golf_1.6_United   
3  2016-03-12 16:58:10  Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...   
4  2016-04-01 14:38:50  Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...   

   seller offertype   price   abtest vehicletype  registration_year  \
0  privat   Angebot  $5,000  control         bus               2004   
1  privat   Angebot  $8,500  control   limousine               1997   
2  privat   Angebot  $8,990     test   limousine               2009   
3  privat   Angebot  $4,350  control  kleinwagen               2007   
4  privat   Angebot  $1,350     test       kombi               2003   

     gearbox  powerps   model   odometer  registration_month fueltype  \
0    manuell      158  andere  150,00

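### Note: The `autos` columns have been cleaned and renamed. All column names were converted to lowercase, and four of them (`registration_year`, `registration_month`, `unrepaired_damage`, `ad_created`) were also renamed to snake_case.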

## Task: Initial Exploration and Cleaning

In [7]:
# Summarise every column of autos, numeric and non-numeric alike. The table is
# the cell's last expression, so the notebook renders it.

autos.describe(include="all")

Unnamed: 0,datecrawled,name,seller,offertype,price,abtest,vehicletype,registration_year,gearbox,powerps,model,odometer,registration_month,fueltype,brand,unrepaired_damage,ad_created,nrofpictures,postalcode,lastseen
count,50000,50000,50000,50000,50000,50000,44905,50000.0,47320,50000.0,47242,50000,50000.0,45518,50000,40171,50000,50000.0,50000.0,50000
unique,48213,38754,2,2,2357,2,8,,2,,245,13,,7,40,2,76,,,39481
top,2016-03-10 15:36:24,Ford_Fiesta,privat,Angebot,$0,test,limousine,,manuell,,golf,"150,000km",,benzin,volkswagen,nein,2016-04-03 00:00:00,,,2016-04-07 06:17:27
freq,3,78,49999,49999,1421,25756,12859,,36993,,4024,32424,,30107,10687,35232,1946,,,8
mean,,,,,,,,2005.07328,,116.35592,,,5.72336,,,,,0.0,50813.6273,
std,,,,,,,,105.712813,,209.216627,,,3.711984,,,,,0.0,25779.747957,
min,,,,,,,,1000.0,,0.0,,,0.0,,,,,0.0,1067.0,
25%,,,,,,,,1999.0,,70.0,,,3.0,,,,,0.0,30451.0,
50%,,,,,,,,2003.0,,105.0,,,6.0,,,,,0.0,49577.0,
75%,,,,,,,,2008.0,,150.0,,,9.0,,,,,0.0,71540.0,


## Notes for the cells in the following lines:
### - Any columns that hold mostly one value and are candidates to be dropped
### - Any columns that need more investigation
### - Any examples of numeric data stored as text that need to be cleaned

In [8]:
# Count the missing values in each column (sum taken down the rows).

print(autos.isnull().sum(axis=0))


datecrawled              0
name                     0
seller                   0
offertype                0
price                    0
abtest                   0
vehicletype           5095
registration_year        0
gearbox               2680
powerps                  0
model                 2758
odometer                 0
registration_month       0
fueltype              4482
brand                    0
unrepaired_damage     9829
ad_created               0
nrofpictures             0
postalcode               0
lastseen                 0
dtype: int64


In [9]:
# Inspect the price and odometer columns: for each one, print the dtype and
# then the distinct values on the following line.

# price column
print(autos["price"].dtype, autos["price"].unique(), sep="\n")

# odometer column
print(autos["odometer"].dtype, autos["odometer"].unique(), sep="\n")

object
['$5,000' '$8,500' '$8,990' ... '$385' '$22,200' '$16,995']
object
['150,000km' '70,000km' '50,000km' '80,000km' '10,000km' '30,000km'
 '125,000km' '90,000km' '20,000km' '60,000km' '5,000km' '100,000km'
 '40,000km']


In [10]:
# Clean the price column by:
# - removing the non-numeric characters "$" and ","
# - converting the column to an integer type
# (the odometer column is cleaned and renamed in the next cell)
#
# regex=False makes both patterns plain literals. "$" is a regular-expression
# end-of-string anchor, so leaving the matching mode to the pandas default
# makes the result depend on the installed pandas version.

# price column cleanup step
autos["price"] = (
    autos["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(int)
)
print(autos["price"].unique())


[ 5000  8500  8990 ...   385 22200 16995]


In [11]:
# odometer cleanup: drop the "," separators and the "km" suffix, then store
# the readings as integers
autos["odometer"] = (
    autos["odometer"]
    .str.replace(",", "")
    .str.replace("km", "")
    .astype(int)
)
print(autos["odometer"].unique())

# carry the unit in the column label, then summarise the renamed column
autos.rename(columns={"odometer": "odometer_km"}, inplace=True)
odometer_desc = autos["odometer_km"].describe()
print(odometer_desc)


[150000  70000  50000  80000  10000  30000 125000  90000  20000  60000
   5000 100000  40000]
count     50000.000000
mean     125732.700000
std       40042.211706
min        5000.000000
25%      125000.000000
50%      150000.000000
75%      150000.000000
max      150000.000000
Name: odometer_km, dtype: float64


In [12]:
# print the column names to check that odometer was renamed to odometer_km
print(autos.columns)

Index(['datecrawled', 'name', 'seller', 'offertype', 'price', 'abtest',
       'vehicletype', 'registration_year', 'gearbox', 'powerps', 'model',
       'odometer_km', 'registration_month', 'fueltype', 'brand',
       'unrepaired_damage', 'ad_created', 'nrofpictures', 'postalcode',
       'lastseen'],
      dtype='object')


## Exploring the Odometer and Price Columns

In [24]:
# Explore the price column: distinct values, summary statistics, value
# frequencies and the extreme values.

print("unique values")
print(autos["price"].unique())

# describe column
print("")
print(autos["price"].describe())

# frequency of each distinct price
print("\n")
print("value counts")
print("\n")
print("all counts")
print(autos["price"].value_counts())

# The two views below take the first five rows of the frequency table and
# order them by price (the index). Each label names the sort direction that
# is actually applied by the call beneath it.
print("\n")
print("ascending order")
print(autos["price"].value_counts().head().sort_index(ascending=True))
print("\n")
print("descending order")
print(autos["price"].value_counts().head().sort_index(ascending=False))

# extreme values
print("\n")
print("max price")
print(autos["price"].max())
print("\n")
print("min price")
print(autos["price"].min())

unique values
[ 5000  8500  8990 ...   385 22200 16995]

count    5.000000e+04
mean     9.840044e+03
std      4.811044e+05
min      0.000000e+00
25%      1.100000e+03
50%      2.950000e+03
75%      7.200000e+03
max      1.000000e+08
Name: price, dtype: float64


value counts


all counts
0         1421
500        781
1500       734
2500       643
1000       639
1200       639
600        531
800        498
3500       498
2000       460
999        434
750        433
900        420
650        419
850        410
700        395
4500       394
300        384
2200       382
950        379
1100       376
1300       371
3000       365
550        356
1800       355
5500       340
1250       335
350        335
1600       327
1999       322
          ... 
46200        1
29600        1
13480        1
21700        1
7373         1
3279         1
4286         1
188          1
17830        1
9130         1
910          1
238          1
2671         1
69900        1
151990       1
2479         1
4510  

In [14]:
# outliers

#print("unique values")
#print(autos["registration_year"].unique())

def display_unique(col_name):
    """Print the distinct values of column ``col_name`` in the global ``autos`` frame."""
    distinct_values = autos[col_name].unique()
    print(distinct_values)
    
# Walk over every column: print a blank gap, the column name, then its
# distinct values.
for column_name in autos.columns:
    print("\n", column_name, sep="\n")
    display_unique(column_name)
    




datecrawled
['2016-03-26 17:47:46' '2016-04-04 13:38:56' '2016-03-26 18:57:24' ...
 '2016-03-28 10:50:25' '2016-03-08 19:25:42' '2016-03-14 00:42:12']


name
['Peugeot_807_160_NAVTECH_ON_BOARD'
 'BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik' 'Volkswagen_Golf_1.6_United'
 ... 'Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon'
 'Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+Reifen_neu_!!'
 'Fiat_500_C_1.2_Dualogic_Lounge']


seller
['privat' 'gewerblich']


offertype
['Angebot' 'Gesuch']


price
[ 5000  8500  8990 ...   385 22200 16995]


abtest
['control' 'test']


vehicletype
['bus' 'limousine' 'kleinwagen' 'kombi' nan 'coupe' 'suv' 'cabrio'
 'andere']


registration_year
[2004 1997 2009 2007 2003 2006 1995 1998 2000 2017 2010 1999 1982 1990
 2015 2014 1996 1992 2005 2002 2012 2011 2008 1985 2016 1994 1986 2001
 2018 2013 1972 1993 1988 1989 1967 1973 1956 1976 4500 1987 1991 1983
 1960 1969 1950 1978 1980 1984 1963 1977 1961 1968 1934 1965 1971 1966
 1979 1981 1970 1974 1910 1975 5000

## Cleanup outliers for car prices. Counts with

In [22]:
# nrofpictures column: check whether it holds anything other than 0

# frequency table followed by summary statistics
print(autos["nrofpictures"].value_counts(), autos["nrofpictures"].describe(), sep="\n")

# number of missing entries, printed under a "null" heading
print("null", autos["nrofpictures"].isnull().sum(), sep="\n")



0    50000
Name: nrofpictures, dtype: int64
count    50000.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: nrofpictures, dtype: float64
null
0


In [26]:
# reminder of the range-filter idiom: df[df["col"].between(x,y)]

# largest and smallest price, each printed under its own heading
print("\n", "max price", autos["price"].max(), sep="\n")
print("\n", "min price", autos["price"].min(), sep="\n")

# distinct price values
print(autos["price"].unique())



max price
99999999


min price
0
[ 5000  8500  8990 ...   385 22200 16995]
