In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dask import delayed
from dask.distributed import Client, progress
import dask.array as da
import dask.dataframe as dd
from dask_ml.preprocessing import DummyEncoder, StandardScaler
from dask_ml.model_selection import train_test_split
import joblib
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression

import pandas as pd
import spacy

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library warnings raised by later cells — consider
# narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")
# Render floats with three decimal places in pandas output.
pd.options.display.float_format = '{:.3f}'.format

In [2]:
def print_full(x):
    """Print a pandas object without row truncation.

    Parameters
    ----------
    x : pandas.Series or pandas.DataFrame
        Object to print; its length is used as the temporary row limit.

    Returns
    -------
    None

    Notes
    -----
    ``display.max_rows`` is raised only for the duration of the call.
    ``option_context`` restores whatever value was active beforehand (not
    the pandas default), and does so even if printing raises.
    """
    with pd.option_context('display.max_rows', len(x)):
        print(x)
    
def print_nulls(ddf):
    """Print the percentage of null values in each column of a Dask DataFrame.

    Parameters
    ----------
    ddf : dask.dataframe.DataFrame
        Frame to inspect; the per-column result is computed eagerly.

    Returns
    -------
    None
        The percentages are printed in full via ``print_full``.
    """
    null_mask = ddf.isnull()
    # The boolean mask contains no nulls itself, so count() is the number of rows.
    pct_null = null_mask.sum() * 100 / null_mask.count()

    return print_full(pct_null.compute())

In [3]:
def plot_hist_boxplot(df):
    '''
    Plot a histogram and a boxplot, side by side, for every column of df.

    Accepts only numerical columns;
    use a mask (e.g. df.select_dtypes(include='number')) to select them first.

    Missing values are dropped per column before plotting, because NaN
    entries break the histogram binning and the boxplot statistics.
    '''
    dim = df.shape[1]
    plt.figure(figsize=(20, dim*5))

    # One grid row per column: histogram on the left, boxplot on the right.
    for i, column in enumerate(df.columns):
        values = df[column].dropna()

        # plot a histogram
        plt.subplot(dim, 2, (i+1)*2-1)
        plt.hist(values)
        plt.ylabel(column, size='xx-large')

        # plot a boxplot
        # Whiskers sit at the 2.5th and 97.5th percentiles, so the points drawn
        # as outliers are those outside the central 95% of the data.
        plt.subplot(dim, 2, (i+1)*2)
        plt.boxplot(values, whis=[2.5, 97.5])
    plt.show()

In [4]:
# Start a local Dask cluster: 3 workers, 1 thread each, 2GB memory limit per worker.
client = Client(n_workers=3, threads_per_worker=1, memory_limit='2GB')
# Bare expression so the notebook displays the scheduler / dashboard summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:44661  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 3  Cores: 3  Memory: 6.00 GB


In [5]:
# Read the first 50,000 rows with pandas to understand the data structure.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv('/home/owen/Jupyter/Final/used_cars_data.csv', nrows=50000)

In [6]:
# Column name -> dtype, as inferred by pandas from the sample read above.
dtypes = df.dtypes.to_dict()

In [7]:
# Columns forced to the generic object dtype ('O') in the type map.
object_typed = (
    # These are retyped because they are being dropped
    'sp_id',
    'seller_rating',
    'vehicle_damage_category',
    'is_certified',
    'combine_fuel_economy',
    'listing_id',
    'savings_amount',
    # This contains problem values probably 4 digit codes
    'dealer_zip',
)
dtypes.update(dict.fromkeys(object_typed, 'O'))


In [8]:
# Load the data into Dask.
# For now the pandas sample is wrapped in a 5-partition Dask DataFrame; the type
# map is only used by the disabled full-file read below.
# NOTE(review): `df` is rebound from a pandas to a Dask DataFrame here, so this
# cell presumably cannot be re-run on its own — confirm before relying on it.

df = dd.from_pandas(df, npartitions=5)

# Full-dataset alternative (disabled): read the whole CSV using the type map.
#df = dd.read_csv('/home/owen/Jupyter/Final/used_cars_data.csv', dtype=dtypes)

In [9]:
# Percentage of missing values per column, before any cleaning.
print_nulls(df)

vin                         0.000
back_legroom                4.994
bed                        99.152
bed_height                 93.136
bed_length                 93.136
body_type                   0.458
cabin                      98.648
city                        0.000
city_fuel_economy          14.886
combine_fuel_economy      100.000
daysonmarket                0.000
dealer_zip                  0.000
description                 4.224
engine_cylinders            2.872
engine_displacement         5.344
engine_type                 2.872
exterior_color              0.000
fleet                      39.884
frame_damaged              39.884
franchise_dealer            0.000
franchise_make             31.212
front_legroom               4.994
fuel_tank_volume            4.994
fuel_type                   2.470
has_accidents              39.884
height                      4.994
highway_fuel_economy       14.886
horsepower                  5.344
interior_color              0.000
isCab         

In [10]:
### Operations to clean the dataset
# Retain only some of the features.
# Many contain redundant information or information that I am not using on this pass.
# In particular I am not using location data on this pass.

# Limit the model to vehicles with NHTSA data available
df = df[df['year'] > 1984]


# Retain the columns first to reduce memory footprint
kept = ['vin'
        , 'make_name' # Use make and body type instead of model
        #, 'model_name'
        , 'body_type'
        , 'year'
        , 'engine_type' # This contains the number of cylinders as well as their configuration.
        , 'wheel_system'
        , 'transmission'
        #, 'trimId'
        #, 'trim_name'
        , 'horsepower'
        , 'maximum_seating'
        , 'mileage'

        # Target Variable
        , 'price'

        # Fuel Economy Variables
        , 'fuel_type'
        , 'city_fuel_economy'
        , 'highway_fuel_economy'
        , 'combine_fuel_economy'

        , 'daysonmarket'
        , 'franchise_dealer'
        , 'is_cpo'

        # These Features would need NLP
        , 'description'
        , 'major_options'
        , 'exterior_color'
        , 'interior_color'

        # Have cleaning method but probably not necessary
        , 'front_legroom'
        , 'back_legroom'

        # Vehicle History Features
        , 'owner_count'
        , 'has_accidents'
        , 'salvage'
        , 'theft_title'
        , 'fleet'
        , 'frame_damaged'
       ]

df = df.loc[:, kept].copy()


# First drop duplicates: keep one listing per VIN
df = df.drop_duplicates(subset='vin')


# Use vin as index?
# May additionally need to add a vin checker here
df = df.set_index('vin')

# persist() does not modify df in place: it returns a new collection backed by
# the computed results. The result must be rebound, otherwise it is discarded
# and every later step recomputes the filtering, de-duplication and indexing.
df = df.persist()

# Generate a make/model/yearname column?
    # Would be most helpful if/when I merge outside data.
    # Dictionary of these and each has its own model?
        # Is that just LIFT?
    
# Drop almost half the dataset because it is missing history?, Drop those features?
'''
In this pass I'm going to drop those features. I want to see how good the estimation is first.
Added to the drop below

The following features are all 47.553% Null. Seems to correspond to VIN history data

['fleet'
, 'frame_damaged'
, 'isCab'
, 'has_accidents'
, 'salvage'
, 'theft_title']

'''

# Color and Description both need to be parsed
'''
Just dropping them in this pass. Added to the drop below
'''    


# Define a function to retype columns with unit inches
def strip_inches(t):
    """Convert a length such as ``'38.1 in'`` to a float.

    Parameters
    ----------
    t : object
        Raw cell value; it is converted with ``str`` before parsing.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the remaining text is not a
        valid float (for example ``'--'``, ``None`` or a NaN cell).
    """
    # strip() removes any leading/trailing run of the characters ' ', 'i' and 'n',
    # which takes off the ' in' unit suffix.
    text = str(t).strip(' in')
    try:
        return float(text)
    except ValueError:
        # Only a parse failure means "missing"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return np.nan

# Legroom columns hold text with an inch unit; parse them to numbers.
cols = ['back_legroom', 'front_legroom']

for col in cols:
    # strip_inches returns floats (np.nan when unparseable), so the declared
    # output dtype must be float64: int64 cannot represent NaN.
    df[col] = df[col].apply(strip_inches, meta=(col, 'float64'))
    

# Clean and retype seating capacity 
def fix_seating(r):
    """Convert a seating value such as ``'5 seats'`` to an int.

    Parameters
    ----------
    r : object
        Raw cell value; it is converted with ``str`` before parsing.

    Returns
    -------
    int or float
        The seat count as an ``int``, or ``np.nan`` when the remaining text
        is not a valid integer (for example ``'--'``, ``None`` or a NaN cell).
    """
    # strip() removes any leading/trailing run of the characters in ' seats',
    # which takes off the unit suffix.
    text = str(r).strip(' seats')
    try:
        return int(text)
    except ValueError:
        # Only a parse failure means "missing"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return np.nan
    
# Keep the column lazy and declare float64: fix_seating yields np.nan for
# unparseable values, which int64 cannot hold. There is no .compute() here,
# since that would pull the whole column to the client only to assign it
# straight back into the Dask frame.
df['maximum_seating'] = df['maximum_seating'].apply(
    fix_seating, meta=('maximum_seating', 'float64'))


# Clean up boolean columns into numerical
# Anything that is not True (False or missing) becomes 0. `meta` declares the
# int64 result so Dask does not have to guess the output dtype.
for flag_col in ['is_cpo', 'franchise_dealer']:
    df[flag_col] = df[flag_col].apply(lambda x: 1 if x == True else 0,
                                      meta=(flag_col, 'int64'))


# Decide what to do about fuel economy

# EPA generates combined by weighting city 55% and highway 45%
city_part = df['city_fuel_economy'] * 0.55
highway_part = df['highway_fuel_economy'] * 0.45
df['combine_fuel_economy'] = city_part + highway_part
# Would be cool to see a model that incorporates a ratio as input?
    
    
    
# See if I can bin TrimID better
'''
Drop them in this pass
'''



# Drop some features: free text / colours (would need parsing) and the
# vehicle-history columns, none of which are used on this pass.
dropped = [
    'description',
    'major_options',
    'exterior_color',
    'interior_color',
    'fleet',
    'frame_damaged',
    'has_accidents',
    'salvage',
    'theft_title',
    'owner_count',
    # 'trimId',
    # 'trim_name',
]
df = df.drop(dropped, axis=1).copy()


In [11]:
# Percentage of missing values per column after the cleaning steps above.
print_nulls(df)

make_name               0.000
body_type               0.448
year                    0.000
engine_type             2.802
wheel_system            4.614
transmission            0.853
horsepower              5.276
maximum_seating         4.948
mileage                 3.655
price                   0.000
fuel_type               2.404
city_fuel_economy      14.823
highway_fuel_economy   14.823
combine_fuel_economy   14.823
daysonmarket            0.000
franchise_dealer        0.000
is_cpo                  0.000
front_legroom           5.860
back_legroom            8.186
dtype: float64


In [12]:
# Drop every row that still has a missing value in any remaining column.
df = df.dropna()

In [13]:
#plot_hist_boxplot(df.select_dtypes(include='number'))

In [14]:
# Apply a log transformation to price
# NOTE(review): price is overwritten and `df` is rebound below, so this cell is
# not safe to re-run on its own.
df['price'] = np.log(df['price'])
# categorize() prepares the categorical columns for encoding; compute() then
# materializes the whole frame on the client (presumably as a pandas DataFrame).
df = df.categorize().compute()
# Dummy-encode the categorical columns.
de = DummyEncoder()
df = de.fit_transform(df)

In [15]:
# Standardizer for the feature columns; it is fitted when X is built.
scaler = StandardScaler()

In [16]:
# Standardize every column except the target.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the training features — consider fitting
# on the training portion only.
X = scaler.fit_transform(df.drop('price', axis=1))

In [17]:
# Target: the log-transformed price.
y = df['price']

In [18]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=1312)


In [19]:
# Baseline model: plain linear regression on the standardized features.
lrm = LinearRegression()

# 4-fold cross-validation on the training split, with joblib using the Dask backend.
with joblib.parallel_backend('dask'):
    scores = cross_validate(lrm, X_train, y_train, cv=4)
    
# Bare expression so the notebook displays the fit/score times and per-fold test scores.
scores

{'fit_time': array([0.29964042, 0.34453011, 0.21441698, 0.35865569]),
 'score_time': array([0.02055049, 0.01479721, 0.00572181, 0.01215172]),
 'test_score': array([0.92161174, 0.9224681 , 0.92209996, 0.92464731])}