In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dask import delayed
from dask.distributed import Client, progress
import dask.array as da
import dask.dataframe as dd
from dask_ml.preprocessing import DummyEncoder, StandardScaler
from dask_ml.model_selection import train_test_split
import joblib
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression

import pandas as pd
import spacy

import warnings
warnings.filterwarnings("ignore")
pd.options.display.float_format = '{:.3f}'.format

In [2]:
def print_full(x):
    """Print ``x`` in full, without pandas' row truncation.

    Parameters
    ----------
    x : sized pandas object (e.g. a Series or DataFrame)
        Object to print; ``len(x)`` is used as the temporary row limit.

    Returns
    -------
    None

    The ``display.max_rows`` option is overridden only while printing. The
    caller's previous value is restored afterwards (even if printing raises),
    instead of being reset to the pandas default.
    """
    with pd.option_context('display.max_rows', len(x)):
        print(x)
    
def print_nulls(ddf):
    """Print the percentage of missing values in every column of ``ddf``.

    Parameters
    ----------
    ddf : Dask or pandas DataFrame
        Frame to inspect. Lazy (Dask) results are computed before printing;
        eager (pandas) results are printed as they are.

    Returns
    -------
    None (the return value of ``print_full``).
    """
    null_mask = ddf.isnull()
    # The boolean mask has no missing entries, so count() is the row count.
    pct_null = null_mask.sum()*100/null_mask.count()

    if hasattr(pct_null, 'compute'):
        pct_null = pct_null.compute()

    return print_full(pct_null)

In [3]:
def plot_hist_boxplot(df):
    '''
    Plot a histogram and a boxplot side by side for every column of ``df``.

    Accepts only numerical columns; filter the frame with a mask first
    (e.g. ``df.select_dtypes(include='number')``).
    One figure row is drawn per column and the figure is shown with
    ``plt.show()``. Nothing is returned.
    '''
    n_columns = df.shape[1]
    if n_columns == 0:
        # No columns means no axes to draw; skip figure creation entirely.
        return

    # squeeze=False keeps `axes` two-dimensional even for a single column.
    fig, axes = plt.subplots(n_columns, 2, figsize=(20, n_columns*5), squeeze=False)

    # one (histogram, boxplot) pair of axes per column
    for (hist_ax, box_ax), column in zip(axes, df.columns):

        # plot a histogram, labelled with the column name
        hist_ax.hist(df[column])
        hist_ax.set_ylabel(column, size='xx-large')

        # plot a boxplot; whiskers sit at the 2.5th and 97.5th percentiles,
        # so the fliers are the points outside the central 95% of the data
        box_ax.boxplot(df[column], whis=[2.5, 97.5])
    plt.show()

In [5]:
# Address of an already-running Dask scheduler.
# NOTE(review): hard-coded host/port — presumably tied to one local scheduler
# session; confirm it is still valid before a fresh run.
SCHEDULER_ADDRESS = 'tcp://127.0.0.1:43705'

client = Client(SCHEDULER_ADDRESS)
client

0,1
Client  Scheduler: tcp://127.0.0.1:43705  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 12.00 GB


In [6]:
# Read the first 100,000 rows with pandas to understand the data structure
df = pd.read_csv('used_cars_data.csv', nrows=100000)

In [7]:
# Map every column name to the dtype pandas inferred for the sample
dtypes = df.dtypes.to_dict()

In [8]:
# Force object dtype ('O') on the columns below.
dtypes.update(dict.fromkeys(
    [
        # These are retyped because they are being dropped
        'sp_id',
        'seller_rating',
        'vehicle_damage_category',
        'is_certified',
        'combine_fuel_economy',
        'listing_id',
        'savings_amount',

        # This contains problem values, probably 4 digit codes
        'dealer_zip',
    ],
    'O',
))


In [9]:
# Wrap the pandas sample in a Dask DataFrame split into 5 partitions.
# NOTE(review): the `dtypes` type map is only consumed by the commented-out
# read_csv call below, so it is unused on this code path.

df = dd.from_pandas(df, npartitions=5)

# Full-file load (disabled); NOTE(review): uses an absolute, machine-specific path.
#df = dd.read_csv('/home/owen/Jupyter/Final/used_cars_data.csv', dtype=dtypes)

In [10]:
# Percentage of missing values per column, before any cleaning
print_nulls(df)

vin                         0.000
back_legroom                4.954
bed                        99.393
bed_height                 91.877
bed_length                 91.877
body_type                   0.401
cabin                      98.333
city                        0.000
city_fuel_economy          16.016
combine_fuel_economy      100.000
daysonmarket                0.000
dealer_zip                  0.000
description                 2.543
engine_cylinders            2.998
engine_displacement         5.339
engine_type                 2.998
exterior_color              0.000
fleet                      43.075
frame_damaged              43.075
franchise_dealer            0.000
franchise_make             25.416
front_legroom               4.954
fuel_tank_volume            4.954
fuel_type                   2.459
has_accidents              43.075
height                      4.954
highway_fuel_economy       16.016
horsepower                  5.339
interior_color              0.000
isCab         

In [11]:
### Operations to clean the dataset
'''
Retain only some of the features.
Many contain redundant information or information that I am not using on this pass.
In particular, I am not using location data on this pass.
'''

# Limit the model to vehicles with NHTSA data available
is_after_1984 = df['year'] > 1984
df = df[is_after_1984]


# Retain the columns first to reduce memory footprint
kept = [
    'vin',
    'make_name',  # Use make and body type instead of model
    # 'model_name',
    'body_type',
    'year',
    'engine_type',  # This contains the number of cylinders as well as their configuration.
    'wheel_system',
    'transmission',
    # 'trimId',
    # 'trim_name',
    'horsepower',
    'maximum_seating',
    'mileage',

    # Target Variable
    'price',

    # Fuel Economy Variables
    'fuel_type',
    'city_fuel_economy',
    'highway_fuel_economy',
    'combine_fuel_economy',

    'daysonmarket',
    'franchise_dealer',
    'is_cpo',

    # These Features would need NLP
    'description',
    'major_options',
    'exterior_color',
    'interior_color',

    # Have cleaning method but probably not necessary
    'front_legroom',
    'back_legroom',

    # Vehicle History Features
    'owner_count',
    'has_accidents',
    'salvage',
    'theft_title',
    'fleet',
    'frame_damaged',
]

df = df.loc[:, kept].copy()


# First drop duplicate listings of the same vehicle (same vin)
df = df.drop_duplicates(subset='vin')


# Use vin as index
# TODO: may additionally need to add a vin checker here
df = df.set_index('vin')

# persist() returns a new collection rather than modifying df in place;
# rebind the name so the following steps actually use the persisted result.
df = df.persist()

# Generate a make/model/yearname column?
    # Would be most helpful if/when I merge outside data.
    # Dictionary of these and each has its own model?
        # Is that just LIFT?
    
# Drop almost half the dataset because it is missing history?, Drop those features?
'''
In this pass I'm going to drop those features. I want to see how good the estimation is first.
Added to the drop below

The following features are all 47.553% Null. Seems to correspond to VIN history data

['fleet'
, 'frame_damaged'
, 'isCab'
, 'has_accidents'
, 'salvage'
, 'theft_title']

'''

# Color and Description both need to be parsed
'''
Just dropping them in this pass. Added to the drop below
'''    


# Define a function to retype columns with unit inches
def strip_inches(t):
    """Convert a length such as ``'35.1 in'`` to a float number of inches.

    Parameters
    ----------
    t : any
        Raw cell value; it is converted with ``str`` before parsing.

    Returns
    -------
    float
        The parsed value, or ``np.nan`` when the text (after removing one
        trailing ``'in'`` unit) is not a finite number.
    """
    text = str(t).strip()
    # Remove the unit as a suffix. str.strip(' in') would instead strip any
    # run of the characters ' ', 'i', 'n' from both ends of the text.
    if text.endswith('in'):
        text = text[:-len('in')]
    try:
        value = float(text)
    except ValueError:
        return np.nan
    # 'nan' / 'inf' parse as floats but are not usable measurements.
    return value if np.isfinite(value) else np.nan

# Retype the legroom columns from text to numbers.
cols = ['back_legroom', 'front_legroom']

for col in cols:
    # strip_inches returns floats (NaN when parsing fails), so the declared
    # output dtype must be float64 rather than int64.
    df[col] = df[col].apply(strip_inches, meta=(col, 'float64'))
    

# Clean and retype seating capacity 
def fix_seating(r):
    """Convert a seating capacity such as ``'5 seats'`` to an integer.

    Parameters
    ----------
    r : any
        Raw cell value; it is converted with ``str`` before parsing.

    Returns
    -------
    int or float
        The parsed seat count as an ``int``, or ``np.nan`` when the text
        (after removing one trailing ``'seats'``/``'seat'`` unit) is not an
        integer.
    """
    text = str(r).strip()
    # Remove the unit as a suffix. str.strip(' seats') would instead strip
    # any run of the characters ' ', 's', 'e', 'a', 't' from both ends.
    for suffix in ('seats', 'seat'):
        if text.endswith(suffix):
            text = text[:-len(suffix)]
            break
    try:
        return int(text)
    except ValueError:
        return np.nan
    
# Stay lazy: assign the Dask series directly instead of computing it into a
# pandas Series first. fix_seating returns NaN for unparseable values, so the
# declared output dtype is float64 rather than int64.
df['maximum_seating'] = df['maximum_seating'].apply(
    fix_seating, meta=('maximum_seating', 'float64'))


# Clean up boolean columns into numerical: 1 where the value equals True,
# 0 for everything else (including missing values).
# A vectorised comparison replaces the per-row lambda, so no `meta` inference
# is needed; `== True` is deliberate because a plain astype(int) would fail on
# missing values.
df['is_cpo'] = (df['is_cpo'] == True).astype('int64')

df['franchise_dealer'] = (df['franchise_dealer'] == True).astype('int64')


# Decide what to do about fuel economy

    # EPA generates combined by weighting city 55% and highway 45%
    # NOTE(review): this is an arithmetic weighted mean, so the new column is an
    # exact linear combination of the two input columns; the published EPA
    # combined figure may be computed differently (harmonic weighting) — confirm.
df['combine_fuel_economy'] = df['city_fuel_economy']*0.55 + df['highway_fuel_economy']*0.45
    # Would be cool to see a model that incorporates a ratio as input?
    
    
    
# See if I can bin TrimID better
'''
Drop them in this pass
'''



# Drop the features that are set aside on this pass
dropped = [
    'description',
    'major_options',
    'exterior_color',
    'interior_color',
    'fleet',
    'frame_damaged',
    'has_accidents',
    'salvage',
    'theft_title',
    'owner_count',
    # 'trimId',
    # 'trim_name',
]
df = df.drop(dropped, axis=1).copy()


In [12]:
# Percentage of missing values per column, after cleaning and dropping features
print_nulls(df)

make_name               0.000
body_type               0.394
year                    0.000
engine_type             2.940
wheel_system            4.569
transmission            1.002
horsepower              5.282
maximum_seating         4.918
mileage                 3.554
price                   0.000
fuel_type               2.404
city_fuel_economy      15.962
highway_fuel_economy   15.962
combine_fuel_economy   15.962
daysonmarket            0.000
franchise_dealer        0.000
is_cpo                  0.000
front_legroom           5.707
back_legroom            8.224
dtype: float64


In [13]:
# Drop every row that still has a missing value in any remaining column
df = df.dropna()

In [14]:
#plot_hist_boxplot(df.select_dtypes(include='number'))

In [15]:
# Apply a log transformation to price
df['price'] = np.log(df['price'])
# Convert the text columns to categoricals, then materialise the Dask frame
# with compute(); from here on df is no longer lazy.
df = df.categorize().compute()
# Expand the categorical columns into indicator (dummy) columns
de = DummyEncoder()
df = de.fit_transform(df)

In [16]:
# NOTE(review): the only visible use of `scaler` is the commented-out
# fit_transform call in the next cell, so it is currently unused.
scaler = StandardScaler()

In [17]:
#X = scaler.fit_transform(df.drop('price', axis=1))

In [18]:
# Separate the (log-transformed) price target from the feature matrix
y = df['price']
X = df.drop(labels='price', axis=1)

In [19]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1312,
)


In [20]:
# Baseline linear regression, evaluated with 4-fold cross-validation on the
# training split only.
lrm = LinearRegression()

# Run the cross-validation under joblib's 'dask' parallel backend
with joblib.parallel_backend('dask'):
    scores = cross_validate(estimator=lrm, X=X_train, y=y_train, cv=4)

scores

{'fit_time': array([0.48899007, 1.07382751, 0.67618299, 1.12537146]),
 'score_time': array([0.01127696, 0.0143826 , 0.02260137, 0.01280069]),
 'test_score': array([0.91946004, 0.92089638, 0.92199615, 0.92168143])}

In [21]:
# Inspect the encoded frame (features plus log price)
df

Unnamed: 0_level_0,year,horsepower,maximum_seating,mileage,price,city_fuel_economy,highway_fuel_economy,combine_fuel_economy,daysonmarket,franchise_dealer,...,wheel_system_4X2,transmission_A,transmission_Dual Clutch,transmission_M,transmission_CVT,fuel_type_Gasoline,fuel_type_Hybrid,fuel_type_Flex Fuel Vehicle,fuel_type_Diesel,fuel_type_Biodiesel
vin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19UDE2F30HA008509,2017,201.000,5.000,47370.000,9.433,25.000,35.000,29.500,159,0,...,0,1,0,0,0,1,0,0,0,0
19UDE2F30KA001602,2019,201.000,5.000,30022.000,10.043,24.000,34.000,28.500,18,1,...,0,1,0,0,0,1,0,0,0,0
19UDE2F30LA008101,2020,201.000,5.000,10.000,10.216,24.000,34.000,28.500,24,1,...,0,0,1,0,0,1,0,0,0,0
19UDE2F30LA009541,2020,201.000,5.000,10.000,10.216,24.000,34.000,28.500,34,1,...,0,0,1,0,0,1,0,0,0,0
19UDE2F30LA010642,2020,201.000,5.000,10.000,10.216,24.000,34.000,28.500,6,1,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZFBHRFBBXL6S00026,2020,178.000,5.000,13.000,10.312,21.000,28.000,24.150,39,1,...,0,1,0,0,0,1,0,0,0,0
ZFBNFYA12KP776265,2019,177.000,5.000,73.000,10.188,24.000,30.000,26.700,518,1,...,0,1,0,0,0,1,0,0,0,0
ZFBNFYA14KP835784,2019,177.000,5.000,120.000,10.251,24.000,30.000,26.700,281,1,...,0,1,0,0,0,1,0,0,0,0
ZFBNFYB19KP799640,2019,177.000,5.000,52.000,10.269,24.000,30.000,26.700,438,1,...,0,1,0,0,0,1,0,0,0,0


In [22]:
# Inspect the feature matrix.
# NOTE(review): the saved output below shows standardised values, but X as
# defined in this notebook is unscaled (the scaler call is commented out), so
# the output appears to be stale; re-run from a fresh kernel to confirm.
X

Unnamed: 0_level_0,year,horsepower,maximum_seating,mileage,city_fuel_economy,highway_fuel_economy,combine_fuel_economy,daysonmarket,franchise_dealer,is_cpo,...,wheel_system_4X2,transmission_A,transmission_Dual Clutch,transmission_M,transmission_CVT,fuel_type_Gasoline,fuel_type_Hybrid,fuel_type_Flex Fuel Vehicle,fuel_type_Diesel,fuel_type_Biodiesel
vin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19UDE2F30HA008509,-0.166,-0.500,-0.458,0.404,0.512,1.034,0.765,0.782,-1.669,-0.325,...,-0.037,0.460,-0.056,-0.129,-0.428,0.225,-0.141,-0.156,-0.066,-0.025
19UDE2F30KA001602,0.432,-0.500,-0.458,-0.019,0.341,0.862,0.589,-0.552,0.599,3.074,...,-0.037,0.460,-0.056,-0.129,-0.428,0.225,-0.141,-0.156,-0.066,-0.025
19UDE2F30LA008101,0.731,-0.500,-0.458,-0.749,0.341,0.862,0.589,-0.495,0.599,-0.325,...,-0.037,-2.175,17.856,-0.129,-0.428,0.225,-0.141,-0.156,-0.066,-0.025
19UDE2F30LA009541,0.731,-0.500,-0.458,-0.749,0.341,0.862,0.589,-0.401,0.599,-0.325,...,-0.037,-2.175,17.856,-0.129,-0.428,0.225,-0.141,-0.156,-0.066,-0.025
19UDE2F30LA010642,0.731,-0.500,-0.458,-0.749,0.341,0.862,0.589,-0.665,0.599,-0.325,...,-0.037,-2.175,17.856,-0.129,-0.428,0.225,-0.141,-0.156,-0.066,-0.025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZFBHRFBBXL6S00026,0.731,-0.777,-0.458,-0.749,-0.174,-0.168,-0.175,-0.353,0.599,-0.325,...,-0.037,0.460,-0.056,-0.129,-0.428,0.225,-0.141,-0.156,-0.066,-0.025
ZFBNFYA12KP776265,0.432,-0.789,-0.458,-0.748,0.341,0.175,0.273,4.177,0.599,-0.325,...,-0.037,0.460,-0.056,-0.129,-0.428,0.225,-0.141,-0.156,-0.066,-0.025
ZFBNFYA14KP835784,0.432,-0.789,-0.458,-0.747,0.341,0.175,0.273,1.936,0.599,-0.325,...,-0.037,0.460,-0.056,-0.129,-0.428,0.225,-0.141,-0.156,-0.066,-0.025
ZFBNFYB19KP799640,0.432,-0.789,-0.458,-0.748,0.341,0.175,0.273,3.420,0.599,-0.325,...,-0.037,0.460,-0.056,-0.129,-0.428,0.225,-0.141,-0.156,-0.066,-0.025
