# DeLorean
## Back to the future to estimate cars prices

### 1. Setup workspace

In [1]:
import pandas as pd
import numpy as np
import re

#Append folder for custom libraries
import sys
sys.path.append('libraries')

#Import custom library
import shinypanda

cars_price = pd.read_csv('data/cars_test.csv')

### 2. Explore the dataset

In [2]:
#Head of the dataframe
cars_price.head()

Unnamed: 0,Id,city,year,manufacturer,make,condition,cylinders,fuel,odometer,title_status,...,type,paint_color,lat,long,county_fips,county_name,state_fips,state_code,state_name,weather
0,974298,duluth,2006.0,ford,f-250 super duty,good,8 cylinders,gas,154400.0,clean,...,pickup,white,47.746524,-90.357742,27031.0,Cook,27.0,MN,Minnesota,43.0
1,1051884,kansascity,1987.0,chevrolet,,,,gas,,clean,...,,,38.373182,-93.776859,29083.0,Henry/Rives,29.0,MO,Missouri,52.0
2,684464,palmsprings,2010.0,jeep,liberty sport,,6 cylinders,gas,127722.0,clean,...,SUV,,33.741059,-116.356434,6065.0,Riverside,6.0,CA,California,59.0
3,1255387,sanmarcos,2003.0,chevrolet,tahoe,fair,8 cylinders,gas,,clean,...,SUV,white,30.026266,-98.133363,48209.0,Hays,48.0,TX,Texas,67.0
4,1195520,tampa,2006.0,lexus,gs 300,,,gas,,clean,...,,,27.8688,-82.7344,12103.0,Pinellas,12.0,FL,Florida,65.0


In [3]:
cars_price.count()

Id              253073
city            253073
year            252162
manufacturer    233025
make            242961
condition       150378
cylinders       151927
fuel            251490
odometer        169963
title_status    252700
transmission    251723
drive           156207
size             88234
type            150050
paint_color     151286
lat             253073
long            253073
county_fips     244441
county_name     244441
state_fips      244441
state_code      244441
state_name      253073
weather         244357
dtype: int64

In [4]:
#Type of data
#cars_price.dtypes

In [5]:
#Descriptive stats
#cars_price.describe()

### 3. Data cleaning
#### In order to use sklearn algorithms, the entire dataset must contain operable numerical data

#### 3.1 Setup dataframe

In [6]:
#Remove all location info but State Code (could affect to taxes)
location_columns = ['city', 'lat', 'long', 'county_fips', 'county_name', 'state_fips', 'state_code', 'state_name']
cars_price = cars_price.drop(columns=location_columns, axis=1)

#Remove not relevant or unknown columns
cars_price = cars_price.drop(columns=['Id', 'weather'], axis=1)

#Remove unpopulated columns
cars_price = cars_price.drop(columns=['drive', 'size', 'type', 'paint_color', 'condition', 'cylinders', 'odometer'], axis=1)

#Rename columns
cars_price.rename(columns={'manufacturer':'brand', 'make':'model'}, inplace=True)

cars_price.head()

Unnamed: 0,year,brand,model,fuel,title_status,transmission
0,2006.0,ford,f-250 super duty,gas,clean,automatic
1,1987.0,chevrolet,,gas,clean,automatic
2,2010.0,jeep,liberty sport,gas,clean,automatic
3,2003.0,chevrolet,tahoe,gas,clean,automatic
4,2006.0,lexus,gs 300,gas,clean,automatic


#### 3.2 Clean 'brand' column

In [7]:
#Transform column type from 'object' to string
cars_price['brand'] = cars_price['brand'].astype('str') 

brands = sorted(cars_price.brand.unique())
print('Original brands list:', brands)

Original brands list: ['acura', 'alfa', 'alfa-romeo', 'aston', 'aston-martin', 'audi', 'bmw', 'buick', 'cadillac', 'chev', 'chevrolet', 'chevy', 'chrysler', 'datsun', 'dodge', 'ferrari', 'fiat', 'ford', 'gmc', 'harley', 'harley-davidson', 'honda', 'hyundai', 'infiniti', 'infinity', 'jaguar', 'jeep', 'kia', 'land rover', 'landrover', 'lexus', 'lincoln', 'mazda', 'mercedes', 'mercedes-benz', 'mercedesbenz', 'mercury', 'mini', 'mitsubishi', 'morgan', 'nan', 'nissan', 'pontiac', 'porche', 'ram', 'rover', 'saturn', 'subaru', 'toyota', 'volkswagen', 'volvo', 'vw']


In [8]:
#Remove NaN
cars_price['brand'] = cars_price.brand.replace({r'nan': ''}, regex=True)

#Fix duplicity: 'alfa' and 'alfa-romeo'
cars_price['brand'] = cars_price.brand.replace({r'alfa-romeo': 'alfa'}, regex=True)
cars_price['brand'] = cars_price.brand.replace({r'alfa': 'alfa-romeo'}, regex=True)

#Fix duplicity: 'aston' and 'aston-martin'
cars_price['brand'] = cars_price.brand.replace({r'aston-martin': 'aston'}, regex=True)
cars_price['brand'] = cars_price.brand.replace({r'aston': 'aston-martin'}, regex=True)

#Fix duplicity: 'chev', 'chevy' and 'aston-martin'
cars_price['brand'] = cars_price.brand.replace({r'chevrolet': 'chev'}, regex=True)
cars_price['brand'] = cars_price.brand.replace({r'chevy': 'chev'}, regex=True)
cars_price['brand'] = cars_price.brand.replace({r'chev': 'chevrolet'}, regex=True)

#Fix duplicity: 'harley' and 'harley-davidson'
cars_price['brand'] = cars_price.brand.replace({r'harley-davidson': 'harley'}, regex=True)
cars_price['brand'] = cars_price.brand.replace({r'harley': 'harley-davidson'}, regex=True)

#Fix misspelling: 'infinity' instead of 'infinity'
cars_price['brand'] = cars_price.brand.replace({r'infinity': 'infiniti'}, regex=True)

#Fix duplicity: 'rover', 'landrover' and 'land rover'
cars_price['brand'] = cars_price.brand.replace({r'landrover': 'land-rovex'}, regex=True)
cars_price['brand'] = cars_price.brand.replace({r'land rover': 'land-rovex'}, regex=True)
cars_price['brand'] = cars_price.brand.replace({r'rover': 'land-rovex'}, regex=True)
cars_price['brand'] = cars_price.brand.replace({r'land-rovex': 'land-rover'}, regex=True)

#Fix duplicity: 'landrover' and 'land rover'
cars_price['brand'] = cars_price.brand.replace({r'mercedes-benz': 'mercedes'}, regex=True)
cars_price['brand'] = cars_price.brand.replace({r'mercedes': 'mercedes-benz'}, regex=True)

#Fix misspelling: 'porsche' instead of 'porche'
cars_price['brand'] = cars_price.brand.replace({r'porche': 'porsche'}, regex=True)

#Fix duplicity: 'vw' and 'volkswagen'
cars_price['brand'] = cars_price.brand.replace({r'vw': 'volkswagen'}, regex=True)

brands = sorted(cars_price.brand.unique())[1:] #[1:] is used to remove empty string ''
print('Clean brands list:', brands)

Clean brands list: ['acura', 'alfa-romeo', 'aston-martin', 'audi', 'bmw', 'buick', 'cadillac', 'chevrolet', 'chrysler', 'datsun', 'dodge', 'ferrari', 'fiat', 'ford', 'gmc', 'harley-davidson', 'honda', 'hyundai', 'infiniti', 'jaguar', 'jeep', 'kia', 'land-rover', 'lexus', 'lincoln', 'mazda', 'mercedes-benz', 'mercedes-benzbenz', 'mercury', 'mini', 'mitsubishi', 'morgan', 'nissan', 'pontiac', 'porsche', 'ram', 'saturn', 'subaru', 'toyota', 'volkswagen', 'volvo']


#### 3.3 Curate 'model' column

In [9]:
#Transform column type from 'object' to string and make it lowercase
cars_price['model'] = cars_price['model'].astype('str').str.lower()

#Fix detected misspelling
cars_price['model'] = cars_price.model.replace({r'cadillaj': 'cadillac'}, regex=True)

cars_price['model'] = cars_price.model.replace({r'chervolet': 'chevrolet'}, regex=True)
cars_price['model'] = cars_price.model.replace({r'cheverolet': 'chevrolet'}, regex=True)

cars_price['model'] = cars_price.model.replace({r'volksvagen': 'volkswagen'}, regex=True)
cars_price['model'] = cars_price.model.replace({r'volkswagon': 'volkswagen'}, regex=True)

cars_price['model'] = cars_price.model.replace({r'ercedes': 'mercedes-benz'}, regex=True)
cars_price['model'] = cars_price.model.replace({r'mercedes-benzbenz': 'mercedes-benz'}, regex=True)
cars_price['model'] = cars_price.model.replace({r'mecedez': 'mercedes-benz'}, regex=True)
cars_price['model'] = cars_price.model.replace({r'benz': 'mercedes-benz'}, regex=True)

cars_price['model'] = cars_price.model.replace({r'nissaan': 'nissan'}, regex=True)

cars_price['model'] = cars_price.model.replace({r'totota': 'toyota'}, regex=True)

cars_price['model'] = cars_price.model.replace({r'suburu': 'subaru'}, regex=True)

cars_price['model'] = cars_price.model.replace({r'crystler': 'chrysler'}, regex=True)
cars_price['model'] = cars_price.model.replace({r'crysler': 'chrysler'}, regex=True)

#Add new brands listed in 'model' to 'brands' list
brands.extend(['nissan', 'lamborghini', 'mustang', 'suzuki', 'kawasaki', 'scion', 'yamaha', 'thomas', 'oldsmobile'])
print('Expanded brands list:', brands)

Expanded brands list: ['acura', 'alfa-romeo', 'aston-martin', 'audi', 'bmw', 'buick', 'cadillac', 'chevrolet', 'chrysler', 'datsun', 'dodge', 'ferrari', 'fiat', 'ford', 'gmc', 'harley-davidson', 'honda', 'hyundai', 'infiniti', 'jaguar', 'jeep', 'kia', 'land-rover', 'lexus', 'lincoln', 'mazda', 'mercedes-benz', 'mercedes-benzbenz', 'mercury', 'mini', 'mitsubishi', 'morgan', 'nissan', 'pontiac', 'porsche', 'ram', 'saturn', 'subaru', 'toyota', 'volkswagen', 'volvo', 'nissan', 'lamborghini', 'mustang', 'suzuki', 'kawasaki', 'scion', 'yamaha', 'thomas', 'oldsmobile']


In [10]:
#Split 'model' data
cars_price['extracted_brand'] = cars_price["model"].str.split(" ", n=-1, expand=False) 

#When 'model' contains a string that already exists in 'brands', export it in a new column
def brand_in_model(row):    
    brand = list(set(row['extracted_brand']).intersection(brands))
    return brand

#Apply the function and convert column to string again
cars_price['extracted_brand'] = cars_price.apply(lambda row: brand_in_model(row), axis=1)
cars_price['extracted_brand'] = cars_price['extracted_brand'].apply(lambda brand: ','.join(map(str, brand)))

#Split 'extracted brand' to preserve only the first match with brands list
split_brand = cars_price['extracted_brand'].str.split(',', n=1, expand=True)
cars_price['extracted_brand'] = split_brand[0]

found_brands = sorted(cars_price['extracted_brand'].unique())[1:] #[1:] is used to remove empty string ''
print('Brands found in \'model\' column:', found_brands)

Brands found in 'model' column: ['chevrolet', 'chrysler', 'kawasaki', 'lamborghini', 'mercedes-benz', 'mustang', 'oldsmobile', 'porsche', 'scion', 'subaru', 'suzuki', 'thomas', 'toyota', 'volkswagen', 'yamaha']


In [11]:
#Show new content in table
brand_in_model = cars_price['extracted_brand'] != ''
empty_brand = cars_price['brand'] == ''

cars_price[empty_brand & brand_in_model].head()

Unnamed: 0,year,brand,model,fuel,title_status,transmission,extracted_brand
16,1986.0,,porsche 944 turbo (951),gas,clean,manual,porsche
47,2008.0,,porsche cayman s,gas,clean,manual,porsche
63,1995.0,,porsche 911,gas,clean,other,porsche
101,2003.0,,porsche boxster,gas,clean,automatic,porsche
103,2014.0,,mustang,other,parts only,other,mustang


In [12]:
#Remove brands from 'model' column
for brand in brands:
    cars_price['model'] = cars_price.model.replace({brand: ''}, regex=True)

#Clean symbols and strip 'model' column
cars_price['model'] =  [re.sub(r'[^\s\w]','', str(model)) for model in cars_price['model']]
cars_price['model'] = cars_price['model'].str.strip()

#Extract years from 'model' column
pattern = '(19[0-9][0-9]|20[0-1][0-9])'
cars_price['extracted_year'] = cars_price.model.str.extract(pattern, expand=False)

#Remove years from 'model' column
cars_price['model'] = [re.sub(pattern,'', str(model)) for model in cars_price['model']]

#Show new content in table
brand_in_model = cars_price['extracted_brand'] != ''
empty_brand = cars_price['brand'] == ''
year_in_model = cars_price['extracted_year'].notnull()

cars_price[empty_brand & brand_in_model & year_in_model].head()

Unnamed: 0,year,brand,model,fuel,title_status,transmission,extracted_brand,extracted_year
207,2006.0,,tc,gas,clean,automatic,scion,2006
606,1987.0,,cutlass,gas,clean,automatic,oldsmobile,1987
853,2002.0,,gt,gas,clean,automatic,mustang,2002
1820,2007.0,,911 targa 4,gas,clean,manual,porsche,2007
2426,2001.0,,alero,gas,lien,automatic,oldsmobile,2001


In [13]:
#Convert year-columns types to string
cars_price['year'] = cars_price['year'].astype(str)
cars_price['extracted_year'] = cars_price['extracted_year'].astype(str)

#Replace 'nan' year with empty strings
cars_price['year'] =  cars_price['year'].replace({r'nan': ''}, regex=True)

#Replace 'extracted_year'
cars_price['extracted_year'] =  cars_price['extracted_year'].replace({r'nan': ''}, regex=True)

#Show new content in table
empty_year = cars_price['year']==''
extracted_year = cars_price['extracted_year']!=''

cars_price[empty_year & extracted_year].head()

Unnamed: 0,year,brand,model,fuel,title_status,transmission,extracted_brand,extracted_year
3581,,,school bus,diesel,clean,automatic,thomas,2001
13181,,,school bus,diesel,clean,automatic,thomas,2000
45546,,,school bus,diesel,clean,automatic,thomas,2004
54858,,,school bus,diesel,clean,automatic,thomas,2005
62722,,,school bus,diesel,clean,automatic,thomas,2005


In [14]:
#Where 'brand' is empty, replace it with 'extracted_brand'
def complete_brand(row):
    
    brand_doesnt_exist = row['brand']==''
    extracted_brand_exists = row['extracted_brand']!=''
    
    if brand_doesnt_exist and extracted_brand_exists:
        return row['extracted_brand']
    else:
        return row['brand']

cars_price['brand'] = cars_price.apply(lambda row: complete_brand(row), axis=1)

#Where 'year' is empty, replace it with 'extracted_year'
def complete_year(row):
    
    year_doesnt_exist = row['year']==''
    extracted_year_exists = row['extracted_year']!=''
    
    if year_doesnt_exist and extracted_year_exists:
        return row['extracted_year']
    else:
        return row['year']

cars_price['year'] = cars_price.apply(lambda row: complete_year(row), axis=1)

#Remove 'extracted_brand' and 'extracted_year' columns
cars_price = cars_price.drop(['extracted_brand', 'extracted_year'], axis=1)

#Replace empty fields in 'brand' and 'year' columns with 'unknown'
cars_price['brand'] = cars_price['brand'].replace('', 'unknown')
cars_price['year'] = cars_price['year'].replace('', 'unknown')

cars_price.head()

Unnamed: 0,year,brand,model,fuel,title_status,transmission
0,2006.0,ford,f250 super duty,gas,clean,automatic
1,1987.0,chevrolet,,gas,clean,automatic
2,2010.0,jeep,liberty sport,gas,clean,automatic
3,2003.0,chevrolet,tahoe,gas,clean,automatic
4,2006.0,lexus,gs 300,gas,clean,automatic


#### 3.4 Clean 'fuel'

In [15]:
print ('Original fuel types:', list(cars_price['fuel'].unique()))

Original fuel types: ['gas', 'diesel', 'other', nan, 'hybrid', 'electric']


In [16]:
#Fill 'nan' with 'unknown'
cars_price['fuel'] = cars_price['fuel'].fillna(value='unknown')

print ('Current fuel types:', list(cars_price['fuel'].unique()))

Current fuel types: ['gas', 'diesel', 'other', 'unknown', 'hybrid', 'electric']


#### 3.5 Clean 'title_status'

In [17]:
print ('Original title status:', list(cars_price['title_status'].unique()))

Original title status: ['clean', 'missing', 'rebuilt', 'salvage', 'lien', 'parts only', nan]


In [18]:
#Change 'other' and fill 'nan' with 'unknown'
cars_price['title_status'] = cars_price['title_status'].fillna(value='unknown')
cars_price['title_status'] = cars_price['title_status'].replace('other', 'unknown')

print ('Current title status:', list(cars_price['title_status'].unique()))

Current title status: ['clean', 'missing', 'rebuilt', 'salvage', 'lien', 'parts only', 'unknown']


#### 3.6 Clean 'tranmission'

In [19]:
print ('Original transmission types:', list(cars_price['transmission'].unique()))

Original transmission types: ['automatic', 'manual', 'other', nan]


In [20]:
#Fill 'nan' with 'unknown'
cars_price['transmission'] = cars_price['transmission'].fillna(value='unknown')

print ('Current transmission types:', list(cars_price['transmission'].unique()))

Current transmission types: ['automatic', 'manual', 'other', 'unknown']


### 3. Export model

#### 3.1 Last columns settings

In [21]:
#Drop model column
cars_price = cars_price.drop(columns=['model'], axis=1)

#Turn 'year' to int and fill 'nan'
cars_price['year'] = pd.to_numeric(cars_price['year'], errors='coerce')
mean_year = round(cars_price['year'].mean())
cars_price['year'] = cars_price['year'].fillna(mean_year)

#Get dummies
clean_train_dataset = pd.get_dummies(cars_price)
clean_train_dataset.head()

Unnamed: 0,year,brand_acura,brand_alfa-romeo,brand_aston-martin,brand_audi,brand_bmw,brand_buick,brand_cadillac,brand_chevrolet,brand_chrysler,...,title_status_lien,title_status_missing,title_status_parts only,title_status_rebuilt,title_status_salvage,title_status_unknown,transmission_automatic,transmission_manual,transmission_other,transmission_unknown
0,2006.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1987.0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,2010.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2003.0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,2006.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [22]:
#Set dataframes
training_dataset = pd.read_csv('data/clean_train_dataset.csv')
test_dataset = pd.get_dummies(cars_price)

#Remove 'unknown' columns from 'training_dataset'
training_cols = [c for c in training_dataset.columns if c.lower() != 'unknown']
training_dataset = training_dataset[training_cols]

#Remove 'unknown' columns from 'test_dataset'
test_cols = [c for c in test_dataset.columns if c.lower() != 'unknown']
test_dataset = test_dataset[test_cols]

#In training_dataset, but not in test_dataset
training_diff = training_dataset.columns.difference(test_dataset.columns)

#Create columns in test_dataset (without 'price')
for column in training_diff[:-1]:
    test_dataset[column] = 0
    
#In test_dataset, but not in training
test_diff = test_dataset.columns.difference(training_dataset.columns)

#Create columns in training_dataset
for column in test_diff:
    training_dataset[column] = 0

#### 3.2 Setup model

In [23]:
from sklearn import linear_model

x_columns = [col for col in training_dataset.columns.values if col != 'price'] 

cars_x = training_dataset[x_columns] 
cars_y = training_dataset['price']

cars_model = linear_model.LinearRegression() 
cars_model.fit(cars_x, cars_y)
cars_model.score(cars_x, cars_y)

0.0001936339417157873

In [24]:
#Generate prediction
regression = cars_model.predict(test_dataset)

#Read a clean dataset
submission = pd.read_csv('data/cars_test.csv')

#Add price prediction and remove negative values
submission['price'] = abs(regression)

#Remove all columns but 'Id' and 'price'
submission = submission[['Id', 'price']]

In [30]:
submission.to_csv(r'data/submission.csv', index=False)