### 1. Setup workspace

In [1]:
import pandas as pd
import numpy as np
import re

cars_price = pd.read_csv('data/cars_train.csv')

### 2. Explore the dataset

In [2]:
cars_price.head()

Unnamed: 0,Id,city,year,manufacturer,make,condition,cylinders,fuel,odometer,title_status,...,paint_color,lat,long,county_fips,county_name,state_fips,state_code,state_name,weather,price
0,559327,baltimore,2006.0,,Scion tc,excellent,4 cylinders,gas,190000.0,clean,...,blue,39.287,-76.6476,24510.0,Baltimore City,24.0,MD,Maryland,56.0,3200
1,1429566,carbondale,2018.0,dodge,charger sxt,,,gas,,clean,...,,37.72,-89.2158,17077.0,Jackson,17.0,IL,Illinois,48.0,30620
2,931606,thumb,1997.0,ford,f 250 2 wheel dr pickup,fair,,gas,,clean,...,white,43.4833,-83.3835,26157.0,Tuscola,26.0,MI,Michigan,45.0,1800
3,1265412,laredo,2003.0,ram,,,8 cylinders,gas,,clean,...,,27.850069,-99.668883,48479.0,Webb,48.0,TX,Texas,67.0,4500
4,1133731,ocala,2000.0,dodge,1500,,,gas,,clean,...,,29.165,-81.5399,12069.0,Lake,12.0,FL,Florida,65.0,1400


### 3. Data cleaning

#### 3.1 Setup dataframe

In [3]:
#Remove all location info
location_columns = ['city', 'lat', 'long', 'county_fips', 'county_name', 'state_fips', 'state_code', 'state_name']
cars_price = cars_price.drop(columns=location_columns, axis=1)

#Remove 'Id' column
cars_price = cars_price.drop('Id', axis=1)

#Remove unpopulated columns
cars_price = cars_price.drop(columns=['drive', 'size', 'type', 'paint_color', 'condition', 'weather'], axis=1)

#Rename columns
cars_price.rename(columns={'manufacturer':'brand', 'make':'model'}, inplace=True)

cars_price.head()

Unnamed: 0,year,brand,model,cylinders,fuel,odometer,title_status,transmission,price
0,2006.0,,Scion tc,4 cylinders,gas,190000.0,clean,automatic,3200
1,2018.0,dodge,charger sxt,,gas,,clean,automatic,30620
2,1997.0,ford,f 250 2 wheel dr pickup,,gas,,clean,automatic,1800
3,2003.0,ram,,8 cylinders,gas,,clean,automatic,4500
4,2000.0,dodge,1500,,gas,,clean,automatic,1400


#### 3.2 Clean 'brand' column

In [4]:
#Transform 'brand' column to lowercase string
cars_price['brand'] = cars_price['brand'].astype('str').str.lower() 

#Create a dictionary to format the variations of the marks
brands_typo = {'^alfa$':'alfa-romeo', 
               '^aston$':'aston-martin', 
               '^chev$':'chevrolet', '^chevy$':'chevrolet', 
               '^harley$':'harley-davidson',
               '^infinity$':'infiniti',
               '^landrover$':'land-rover', '^land rover$':'land-rover', '^rover$':'land-rover',
               '^mercedes$':'mercedes-benz',
               '^porche$':'porsche',
               '^vw$':'volkswagen',
               '^nan$':'' #Replace 'nan' with empty string
              }

#Clean 'brand' column using the dictionary
cars_price['brand'] = cars_price[['brand']].replace(brands_typo, regex=True)

#Cleaned list of brands
brands = sorted(cars_price.brand.astype('str').unique()) #[1:] is used to remove empty string ''
if '' in brands:
    brands.remove('')

cars_price.head()

Unnamed: 0,year,brand,model,cylinders,fuel,odometer,title_status,transmission,price
0,2006.0,,Scion tc,4 cylinders,gas,190000.0,clean,automatic,3200
1,2018.0,dodge,charger sxt,,gas,,clean,automatic,30620
2,1997.0,ford,f 250 2 wheel dr pickup,,gas,,clean,automatic,1800
3,2003.0,ram,,8 cylinders,gas,,clean,automatic,4500
4,2000.0,dodge,1500,,gas,,clean,automatic,1400


In [5]:
cars_price.brand.unique()

array(['', 'dodge', 'ford', 'ram', 'toyota', 'lincoln', 'chevrolet',
       'jaguar', 'land-rover', 'volvo', 'volkswagen', 'acura', 'honda',
       'nissan', 'cadillac', 'jeep', 'subaru', 'gmc', 'buick', 'pontiac',
       'hyundai', 'audi', 'mazda', 'infiniti', 'bmw', 'saturn',
       'mercedes-benz', 'kia', 'lexus', 'chrysler', 'datsun',
       'mitsubishi', 'mercury', 'mini', 'fiat', 'harley-davidson',
       'ferrari', 'porsche', 'alfa-romeo', 'aston-martin', 'morgan'],
      dtype=object)

#### 3.3 Clean 'model' column

In [6]:
#Transform 'model' column to lowercase string and replace 'nan' with empty string
cars_price['model'] = cars_price['model'].astype('str').str.lower()
cars_price['model'] = cars_price['model'].map({'nan':''}).fillna(cars_price['model'])

#Clean symbols and strip the column
cars_price['model'] =  [re.sub(r'[^\s\w]','', str(model)) for model in cars_price['model']]
cars_price['model'] = cars_price['model'].str.strip()

cars_price.head()

Unnamed: 0,year,brand,model,cylinders,fuel,odometer,title_status,transmission,price
0,2006.0,,scion tc,4 cylinders,gas,190000.0,clean,automatic,3200
1,2018.0,dodge,charger sxt,,gas,,clean,automatic,30620
2,1997.0,ford,f 250 2 wheel dr pickup,,gas,,clean,automatic,1800
3,2003.0,ram,,8 cylinders,gas,,clean,automatic,4500
4,2000.0,dodge,1500,,gas,,clean,automatic,1400


##### 3.3.1 Extract years from 'model' column and complete 'year' column with it

In [7]:
#Transform 'year' column to lowercase string
cars_price['year'] = cars_price['year'].astype('str').str.lower()

#Create 'extracted_year' column from 'model'
from_1900_to_2019 = '(19[0-9][0-9]|20[0-1][0-9])'
cars_price['extracted_year'] = cars_price.model.str.extract(from_1900_to_2019, expand=False).fillna('')

#If 'year' is empty, use 'extracted_year' instead
def complete_year(row):
    
    value_in_year = row['year']!=''
    value_in_extracted_year = row['extracted_year']!=''
    
    if value_in_extracted_year and not value_in_year:
        return row['extracted_year']
    else:
        return row['year']
    
cars_price['year'] = cars_price.apply(lambda row: complete_year(row), axis=1)

#Remove years from 'model' column and 'extracted_year' column
cars_price['model'] = cars_price['model'].str.replace(from_1900_to_2019, '')
cars_price = cars_price.drop('extracted_year', axis=1)

cars_price.head()

Unnamed: 0,year,brand,model,cylinders,fuel,odometer,title_status,transmission,price
0,2006.0,,scion tc,4 cylinders,gas,190000.0,clean,automatic,3200
1,2018.0,dodge,charger sxt,,gas,,clean,automatic,30620
2,1997.0,ford,f 250 2 wheel dr pickup,,gas,,clean,automatic,1800
3,2003.0,ram,,8 cylinders,gas,,clean,automatic,4500
4,2000.0,dodge,1500,,gas,,clean,automatic,1400


##### 3.3.2 Extract brands from 'model' column and complete 'brand' column with it

In [8]:
#Add new transformations to brand_dict
brands_typo.update({'^cadillaj$':'cadillac',
                    '^chervolet$':'chevrolet', '^cheverolet$': 'chevrolet',
                    '^volksvagen$':'volkswagen', '^volkswagon$':'volkswagen',
                    '^mecedez$':'mercedes-benz', '^benz$':'mercedes-benz',
                    '^nissaan$':'nissan',
                    '^totota$':'toyota',
                    '^suburu$':'subaru',
                    '^crystler$':'chrysler', '^crysler$':'chrysler',   
            })

#Add new brands listed in 'model' column to 'brands' list
brands.extend(['nissan', 'lamborghini', 'mustang', 'suzuki', 'kawasaki', 'scion', 'yamaha', 'thomas', 'oldsmobile'])
brands = list(set(brands)) #Remove duplicates

#Clean 'model' column using the dictionary
cars_price['model'] = cars_price[['model']].replace(brands_typo, regex=True)

#Convert 'model' field into a list
cars_price['extracted_brand'] = cars_price["model"].str.split(" ", n=-1, expand=False) 

#When 'model' contains a item that already exists in 'brands', export it in the new column 'exported_brand'
def brand_in_model(row):    
    brand = list(set(row['extracted_brand']).intersection(brands))
    return brand

cars_price['extracted_brand'] = cars_price.apply(lambda row: brand_in_model(row), axis=1)

#Convert column into string again
cars_price['extracted_brand'] = cars_price['extracted_brand'].apply(lambda brand: ','.join(map(str, brand)))

#Split 'extracted brand' to preserve only the first match with brands list
split_brand = cars_price['extracted_brand'].str.split(',', n=1, expand=True)
cars_price['extracted_brand'] = split_brand[0]

#Where 'brand' is empty, replace it with 'extracted_brand'
def complete_brand(row):

    value_in_brand = row['brand']!=''
    value_in_extracted_brand = row['extracted_brand']!=''
    
    if value_in_extracted_brand and not value_in_brand:
        return row['extracted_brand']
    else:
        return row['brand']

cars_price['brand'] = cars_price.apply(lambda row: complete_brand(row), axis=1)

#Drop 'model' and 'extracted_brand' columns
cars_price = cars_price.drop(['model', 'extracted_brand'], axis=1)

cars_price.head()

Unnamed: 0,year,brand,cylinders,fuel,odometer,title_status,transmission,price
0,2006.0,scion,4 cylinders,gas,190000.0,clean,automatic,3200
1,2018.0,dodge,,gas,,clean,automatic,30620
2,1997.0,ford,,gas,,clean,automatic,1800
3,2003.0,ram,8 cylinders,gas,,clean,automatic,4500
4,2000.0,dodge,,gas,,clean,automatic,1400


#### 3.4 Clean 'cylinders' column

In [9]:
#Transform 'cylinders' column to lowercase string
cars_price['cylinders'] = cars_price['cylinders'].astype('str').str.lower()

#Remove non-defining substrings
cars_price['cylinders'] = cars_price.cylinders.replace({r'.*(\d+).*': r'\1'}, regex=True)
cars_price['cylinders'] = cars_price.cylinders.replace({r'other': ''})
cars_price['cylinders'] = cars_price.cylinders.replace({r'nan': ''})

#Insert '0' number of cylinders when fuel is 'electric'
cars_price.loc[cars_price.fuel == 'electric', 'cylinders'] = '0'

#Insert 'electric' fuel when number of cylinders is '0'
cars_price.loc[cars_price.cylinders == '0', 'fuel'] = 'electric'

cars_price.head()

Unnamed: 0,year,brand,cylinders,fuel,odometer,title_status,transmission,price
0,2006.0,scion,4.0,gas,190000.0,clean,automatic,3200
1,2018.0,dodge,,gas,,clean,automatic,30620
2,1997.0,ford,,gas,,clean,automatic,1800
3,2003.0,ram,8.0,gas,,clean,automatic,4500
4,2000.0,dodge,,gas,,clean,automatic,1400


#### 3.5 Clean 'fuel'

In [10]:
#Transform 'fuel' column to lowercase string
cars_price['fuel'] = cars_price['fuel'].astype('str').str.lower()

#Remove 'nan'
cars_price['fuel'] = cars_price.fuel.replace({r'nan': ''})

cars_price.head()

Unnamed: 0,year,brand,cylinders,fuel,odometer,title_status,transmission,price
0,2006.0,scion,4.0,gas,190000.0,clean,automatic,3200
1,2018.0,dodge,,gas,,clean,automatic,30620
2,1997.0,ford,,gas,,clean,automatic,1800
3,2003.0,ram,8.0,gas,,clean,automatic,4500
4,2000.0,dodge,,gas,,clean,automatic,1400


#### 3.6 Clean 'odometer'

In [11]:
#Transform 'odometer' column to lowercase string
cars_price['odometer'] = cars_price['odometer'].astype('str').str.lower()

#Remove 'nan'
cars_price['odometer'] = cars_price.odometer.replace({r'nan': ''})

cars_price.head()

Unnamed: 0,year,brand,cylinders,fuel,odometer,title_status,transmission,price
0,2006.0,scion,4.0,gas,190000.0,clean,automatic,3200
1,2018.0,dodge,,gas,,clean,automatic,30620
2,1997.0,ford,,gas,,clean,automatic,1800
3,2003.0,ram,8.0,gas,,clean,automatic,4500
4,2000.0,dodge,,gas,,clean,automatic,1400


#### 3.7 Clean 'title_status'

In [12]:
#Transform 'title_status' column to lowercase string
cars_price['title_status'] = cars_price['title_status'].astype('str').str.lower()

#Remove 'nan'
cars_price['title_status'] = cars_price.title_status.replace({r'nan': ''})

cars_price.head()

Unnamed: 0,year,brand,cylinders,fuel,odometer,title_status,transmission,price
0,2006.0,scion,4.0,gas,190000.0,clean,automatic,3200
1,2018.0,dodge,,gas,,clean,automatic,30620
2,1997.0,ford,,gas,,clean,automatic,1800
3,2003.0,ram,8.0,gas,,clean,automatic,4500
4,2000.0,dodge,,gas,,clean,automatic,1400


#### 3.8 Clean 'tranmission'

In [13]:
#Transform 'transmission' column to lowercase string
cars_price['transmission'] = cars_price['transmission'].astype('str').str.lower()

#Remove non-defining substrings
cars_price['transmission'] = cars_price.transmission.replace({r'nan': ''}, regex=True)

#Insert 'automatic' when 'fuel' is hybrid or electric
cars_price.loc[cars_price.transmission == 'automatic', 'fuel'] = 'electric'
cars_price.loc[cars_price.transmission == 'automatic', 'fuel'] = 'hybrid'

cars_price.head()

Unnamed: 0,year,brand,cylinders,fuel,odometer,title_status,transmission,price
0,2006.0,scion,4.0,hybrid,190000.0,clean,automatic,3200
1,2018.0,dodge,,hybrid,,clean,automatic,30620
2,1997.0,ford,,hybrid,,clean,automatic,1800
3,2003.0,ram,8.0,hybrid,,clean,automatic,4500
4,2000.0,dodge,,hybrid,,clean,automatic,1400


### 3. Empty-data processing and values rescalation

In [22]:
#Transform numeric columns to float
cars_price['year'] = pd.to_numeric(cars_price['year'], errors = 'coerce')
cars_price['cylinders'] = pd.to_numeric(cars_price['cylinders'], errors = 'coerce')
cars_price['odometer'] = pd.to_numeric(cars_price['odometer'], errors = 'coerce') 

year_global_avg = cars_price['year'].mean()
cylinders_global_avg = cars_price['cylinders'].mean()
odometer_global_avg = cars_price['odometer'].mean()

Unnamed: 0_level_0,year,cylinders,odometer,price
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,1998.556284,5.977439,134911.703466,141750.853742
acura,2006.653015,5.384951,124387.743374,8977.344289
alfa-romeo,1988.444444,4.358974,57804.55,14113.793651
aston-martin,2005.518519,5.2,26367.352941,47356.37037
audi,2008.303974,5.122125,99013.282702,12327.706314


In [55]:
#Calculate brands ang global averages
brands_avg = cars_price.groupby('brand').mean()
brands_avg.head()

#Fill in the gaps with brands averages, when possible
def complete_missing_year(row):
    
    year = row['year'] 
    
    if np.isnan(year):
        try:
            year = brands_avg.loc[row['brand'], 'year']
        except:
            year = year_global_avg 
    return year

cars_price['year'] = cars_price.apply(lambda row: complete_missing_year(row), axis=1)

def complete_missing_cylinders(row):
    
    cylinders = row['cylinders'] 
    
    if np.isnan(cylinders):
        try:
            cylinders = brands_avg.loc[row['brand'], 'cylinders']
        except:
            cylinders = cylinders_global_avg
    return cylinders

cars_price['cylinders'] = cars_price.apply(lambda row: complete_missing_cylinders(row), axis=1)

def complete_missing_odometer(row):
    
    odometer = row['odometer'] 
    
    if np.isnan(odometer):
        try:
            odometer = brands_avg.loc[row['brand'], 'odometer']
        except:
            odometer = odometer_global_avg
    return odometer

cars_price['odometer'] = cars_price.apply(lambda row: complete_missing_odometer(row), axis=1)

#Replace empty non-numeric strings with 'unknown'
cars_price = cars_price.replace({r'': 'unknown'})
cars_price.head()

### 4. Export model

In [None]:
clean_train_dataset.to_csv(r'data/clean_train_dataset.csv', index=False)