# Import Libraries

In [1]:
# Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns ; sns.set()

# For Text Preprocessing
import re

# For Splitting the data
from sklearn.model_selection import train_test_split

# For Feature Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

# For Imputing missing values
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer

pd.set_option('display.max_columns', None)

# Define classes & functions

## Cleaning Price

In [2]:
def filter_price(df):
    """Remove, in place, the rows whose price is outside the 1000-70000 range.

    A row is dropped when its ``price`` is strictly below 1000 or strictly
    above 70000, so both bounds themselves are kept. Rows whose comparison
    evaluates to False (including missing prices) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``price`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the rows have been dropped.
    """
    too_cheap = df['price'] < 1000
    too_expensive = df['price'] > 70000
    out_of_range = (too_cheap | too_expensive).to_numpy()

    # Drop by index label, exactly as the rows were selected.
    df.drop(index = df.index[out_of_range], inplace = True)

    print('\tPrice filtered\n')
    return df

## Cleaning odometer

In [3]:
def filter_odometer(df):
    """Remove, in place, the rows whose odometer reading is 1,000,000 or more.

    Rows for which the comparison is False (including missing readings) are
    kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding an ``odometer`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the rows have been dropped.
    """
    implausible = (df['odometer'] >= 1000000).to_numpy()
    df.drop(index = df.index[implausible], inplace = True)

    print('\tOdometer filtered\n')
    return df

## Encode categorical features

In [4]:
def encode_cylinders(df):
    """Replace the ``cylinders`` text with the first number found in it.

    The first run of digits in each value is extracted and converted to a
    float; values without any digits end up as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a string ``cylinders`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``cylinders`` as floats.
    """
    # Column 0 of the extract result is the single capture group.
    first_number = df['cylinders'].str.extract(r'(\d+)')[0]
    df['cylinders'] = list(map(float, first_number))

    print('\t"Cylinders" encoded\n')
    return df

In [5]:
def encode_condition(df):
    """Ordinal-encode the ``condition`` column (salvage=0 ... new=5).

    Labels that are not in the mapping, and missing values, are left as they
    are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``condition`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``condition`` encoded.
    """
    condition_dict = {'salvage': 0,
                      'fair': 1,
                      'good': 2,
                      'excellent': 3,
                      'like new': 4,
                      'new': 5}

    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: that chained in-place form only updates a temporary
    # object once pandas Copy-on-Write is active, leaving `df` unchanged.
    # infer_objects() keeps the column numeric on pandas versions where
    # replace() no longer downcasts an object column by itself.
    df['condition'] = df['condition'].replace(condition_dict).infer_objects()

    print('\t"Condition" encoded\n')
    return df

In [6]:
def encode_fuel(df):
    """Binarise ``fuel`` into ``fuel_gas`` (1 for 'gas', 0 for the other labels).

    Labels that are not in the mapping, and missing values, are left as they
    are. The column is renamed from ``fuel`` to ``fuel_gas``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``fuel`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the encoded, renamed column.
    """
    fuel_dict = {'gas': 1,
                 'other': 0,
                 'diesel': 0,
                 'hybrid': 0,
                 'electric': 0}

    # Assign the result back: replace(inplace=True) on a column selection is
    # a chained in-place call that does not reach `df` under pandas
    # Copy-on-Write. infer_objects() keeps the encoded column numeric.
    df['fuel'] = df['fuel'].replace(fuel_dict).infer_objects()
    df.rename(columns = {'fuel': 'fuel_gas'}, inplace = True)

    print('\t"Fuel" encoded\n')
    return df

In [7]:
def encode_title_status(df):
    """Binarise ``title_status`` into ``title_status_ok``.

    'clean' and 'rebuilt' become 1; 'salvage', 'lien', 'missing' and
    'parts only' become 0. Labels that are not in the mapping, and missing
    values, are left as they are. The column is renamed from
    ``title_status`` to ``title_status_ok``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``title_status`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the encoded, renamed column.
    """
    title_status_dict = {'clean': 1,
                         'rebuilt': 1,
                         'salvage': 0,
                         'lien': 0,
                         'missing': 0,
                         'parts only': 0}

    # Assign the result back: replace(inplace=True) on a column selection is
    # a chained in-place call that does not reach `df` under pandas
    # Copy-on-Write. infer_objects() keeps the encoded column numeric.
    df['title_status'] = df['title_status'].replace(title_status_dict).infer_objects()
    df.rename(columns = {'title_status': 'title_status_ok'}, inplace = True)

    print('\t"Title status" encoded\n')
    return df

In [8]:
def encode_transmission(df):
    """Binarise ``transmission`` into ``transmission_automatic``.

    'automatic' becomes 1; 'other' and 'manual' become 0. Labels that are not
    in the mapping, and missing values, are left as they are. The column is
    renamed from ``transmission`` to ``transmission_automatic``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``transmission`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the encoded, renamed column.
    """
    transmission_dict = {'automatic': 1,
                         'other': 0,
                         'manual': 0}

    # Assign the result back: replace(inplace=True) on a column selection is
    # a chained in-place call that does not reach `df` under pandas
    # Copy-on-Write. infer_objects() keeps the encoded column numeric.
    df['transmission'] = df['transmission'].replace(transmission_dict).infer_objects()
    df.rename(columns = {'transmission': 'transmission_automatic'}, inplace = True)

    print('\t"Transmission" encoded\n')
    return df

In [9]:
def encode_type(df):
    """Keep the four most frequent ``type`` labels and relabel the rest 'other'.

    The four labels are taken from the value counts of the frame passed in,
    so two calls on different frames may keep different labels. Missing
    values are not counted and therefore also become 'other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``type`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``type`` reduced to at most five labels.
    """
    frequent_types = set(df['type'].value_counts().index[:4])

    relabelled = []
    for car_type in df['type']:
        relabelled.append(car_type if car_type in frequent_types else 'other')
    df['type'] = relabelled

    print('\tType encoded\n')
    return df

In [10]:
def encode_paint_color(df):
    """Reduce ``paint_color`` to 'white', 'grey', 'black', 'blue' or 'other'.

    'silver' is first merged into 'grey'. Every value that is not one of the
    four kept colours, including missing values, becomes 'other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``paint_color`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``paint_color`` regrouped.
    """
    top_4_colors = ['white', 'grey', 'black', 'blue']

    # Work on the returned Series rather than replace(inplace=True) on the
    # column selection: that chained in-place call does not reach `df` under
    # pandas Copy-on-Write, which would send 'silver' to 'other'.
    merged_colors = df['paint_color'].replace({'silver': 'grey'})

    df['paint_color'] = [color if color in top_4_colors else 'other'
                         for color in merged_colors]

    print('\tPaint color encoded\n')
    return df

## Feature Engineering with External/Scraped Data

In [11]:
def states_to_regions(df, states_data):
    """Replace state codes in ``state`` by their region and rename the column.

    A mapping from lower-cased ``State Code`` to ``Region`` is built from
    ``states_data`` and applied to ``df['state']``; values without a match
    are left as they are. The column is renamed to ``regions_usa``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``state`` column. It is modified in place.
    states_data : pandas.DataFrame
        Lookup table with ``State Code`` and ``Region`` columns. It is only
        read, never modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``state`` replaced by ``regions_usa``.
    """
    # Lower-case into a local list so the caller's lookup table is not
    # overwritten as a side effect.
    lowered_codes = [code.lower() for code in states_data['State Code']]
    d_regions = dict(zip(lowered_codes, states_data['Region'].values))

    # Assign the result back: replace(inplace=True) on a column selection is
    # a chained in-place call that does not reach `df` under pandas
    # Copy-on-Write.
    df['state'] = df['state'].replace(d_regions)
    df.rename(columns = {'state': 'regions_usa'}, inplace = True)

    print('\tStates data encoded into Regions\n')
    return df

In [12]:
def manufacturers_origin(df, manufacturers_data):
    """Replace each manufacturer by the label of the column that lists it.

    Steps:
      1. Hyphens in brand names are turned into spaces, except for
         'mercedes-benz'.
      2. Each brand is looked up in ``manufacturers_data``; when a cell
         equals the brand, the brand maps to that cell's column label (first
         match scanning row by row, left to right).
      3. Five brands are mapped by hand, overriding any lookup result.
      4. The mapping is applied to ``df['manufacturer']``; brands without a
         mapping are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``manufacturer`` column. It is modified in place.
    manufacturers_data : pandas.DataFrame
        Lookup table whose cells are brand names and whose column labels are
        the values written into ``df``. It is only read.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``manufacturer`` regrouped.
    """
    # Change some compound names in the dataset.
    hyphen_fixes = {
        brand: brand.replace('-', ' ')
        for brand in set(df['manufacturer'])
        if isinstance(brand, str) and '-' in brand and brand != 'mercedes-benz'
    }
    if hyphen_fixes:
        # Assigned back rather than replace(inplace=True) on the column
        # selection, which does not reach `df` under pandas Copy-on-Write.
        df['manufacturer'] = df['manufacturer'].replace(hyphen_fixes)

    # Index the lookup table once (cell text -> column label) instead of
    # scanning the whole table twice per brand. setdefault keeps the first
    # match in row-major order. Walking the cells directly also avoids
    # depending on DataFrame.stack() dropping NaN entries.
    column_labels = list(manufacturers_data.columns)
    origin_by_name = {}
    for row in manufacturers_data.to_numpy():
        for column_label, cell in zip(column_labels, row):
            if isinstance(cell, str):
                origin_by_name.setdefault(cell, column_label)

    # Create dictionary "manufacturer - continent".
    dict_cars_origin = {}
    for brand in set(df['manufacturer']):
        brand_key = f'{brand}'
        if brand_key in origin_by_name:
            dict_cars_origin[brand_key] = origin_by_name[brand_key]

    # Adding missing brands (the trailing space is part of the label).
    dict_cars_origin['saturn'] = 'north america '
    dict_cars_origin['rover'] = 'europe '
    dict_cars_origin['pontiac'] = 'north america '
    dict_cars_origin['mercury'] = 'north america '
    dict_cars_origin['harley davidson'] = 'north america '

    # Replacing manufacturers.
    df['manufacturer'] = df['manufacturer'].replace(dict_cars_origin)

    print('\tManufacturer data regrouped by manufacturer origin (continent)\n')
    return df

## Defining functions to apply fit/transform methods from sklearn

In [13]:
def apply_imputer(train_set, test_set):
    """Drop sparse rows, one-hot encode, and impute the remaining missing values.

    Steps:
      1. Rows with more than 4 missing values are removed from both frames
         and the index is reset.
      2. Both frames are one-hot encoded with ``pd.get_dummies``.
      3. One reference dummy per encoded feature is dropped from the
         training frame, and the test frame is aligned to the training
         columns.
      4. An ``IterativeImputer`` is fitted on the training frame only and
         applied to both. Imputed values are clipped to each column's
         training min/max.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Pre-processed frames. They are not modified; new frames are
        returned.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)`` with identical columns and no missing
        values.

    Raises
    ------
    KeyError
        If one of the reference dummy columns is absent from the encoded
        training frame.
    """
    # Filter rows with too many missing values. Build new frames instead of
    # dropping in place, so the caller's objects are left untouched.
    max_missing_per_row = 4
    train_set = train_set[train_set.isnull().sum(axis = 1) <= max_missing_per_row]
    test_set = test_set[test_set.isnull().sum(axis = 1) <= max_missing_per_row]

    train_set = train_set.reset_index(drop = True)
    test_set = test_set.reset_index(drop = True)

    # One-hot encode categorical features. dtype=float makes the dummy
    # columns the same dtype whatever pandas' default (uint8 or bool) is.
    train_set = pd.get_dummies(train_set, dtype = float)
    test_set = pd.get_dummies(test_set, dtype = float)

    dummies_to_drop = ['manufacturer_europe ', 'drive_4wd', 'type_other',
                       'paint_color_other', 'regions_usa_Northeast']
    train_set = train_set.drop(columns = dummies_to_drop)

    # The two frames were encoded independently, so a category present in
    # only one of them yields different columns. Align the test frame to the
    # training layout: dummies absent from the test frame are filled with 0,
    # and columns the imputer was not fitted on are discarded.
    test_set = test_set.reindex(columns = train_set.columns, fill_value = 0.0)

    # Apply imputer, bounding imputed values by the observed training range.
    imp = IterativeImputer(random_state = 42, skip_complete = True,
                           min_value = train_set.min().to_numpy(),
                           max_value = train_set.max().to_numpy())

    imp.fit(train_set)

    train = imp.transform(train_set)
    test = imp.transform(test_set)

    train_set = pd.DataFrame(train, columns = train_set.columns)
    test_set = pd.DataFrame(test, columns = test_set.columns)

    print('Missing data handled with Iterative Imputer')
    return train_set, test_set

In [14]:
def apply_robust_feature_scaling(train_set, test_set):
    """Scale every column except the first with a RobustScaler fitted on train.

    The first column of each frame is passed through unscaled; all remaining
    columns are transformed by a ``RobustScaler`` fitted on the training
    frame only. Both results are labelled with the training frame's column
    names.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Numeric frames with the same column layout.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set_std, test_set_std)``, new frames with a default index.
    """
    scaler = RobustScaler()
    scaler.fit(train_set.iloc[:, 1:].values)

    column_names = list(train_set.columns)

    def _scale(frame):
        # Keep column 0 as a single unscaled column, scale the rest, and
        # stitch both back together in the original column order.
        first_column = frame.iloc[:, 0].values.reshape(len(frame), 1)
        scaled_rest = scaler.transform(frame.iloc[:, 1:].values)
        combined = np.concatenate((first_column, scaled_rest), axis = 1)
        return pd.DataFrame(combined, columns = column_names)

    train_set_std = _scale(train_set)
    test_set_std = _scale(test_set)

    print('Data standardized')

    return train_set_std, test_set_std

# Build Pipeline

In [15]:
def preprocessing_pipeline(df, states_data, manufacturers_data):
    """Run every cleaning and encoding step on ``df``, in a fixed order.

    Each step's name is printed before it runs. ``states_to_regions`` also
    receives ``states_data`` and ``manufacturers_origin`` also receives
    ``manufacturers_data``; every other step is called with the frame only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    states_data : pandas.DataFrame
        Lookup table forwarded to ``states_to_regions``.
    manufacturers_data : pandas.DataFrame
        Lookup table forwarded to ``manufacturers_origin``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last step.
    """
    # Each entry pairs a step with the extra positional arguments it needs.
    steps = [
        (filter_price, ()),
        (filter_odometer, ()),
        (encode_cylinders, ()),
        (encode_condition, ()),
        (encode_fuel, ()),
        (encode_title_status, ()),
        (encode_transmission, ()),
        (encode_type, ()),
        (encode_paint_color, ()),
        (states_to_regions, (states_data,)),
        (manufacturers_origin, (manufacturers_data,)),
    ]

    for step, extra_args in steps:
        print(f'Function being applied: "{step.__name__}"')
        df = step(df, *extra_args)

    return df

# Import Data

In [16]:
# Read the three input CSVs from the working directory: the raw listings and
# the two lookup tables that are later passed to preprocessing_pipeline as
# `states_data` (df_regions) and `manufacturers_data` (df_car_origin).
raw_data = pd.read_csv('vehicles.csv')
df_regions = pd.read_csv('states.csv')
df_car_origin = pd.read_csv('scraped_car_origin.csv')

## Remove Unnecessary features

In [17]:
# Columns discarded before splitting; `raw_data` itself is left untouched.
features_to_drop = [
    'id', 'url', 'region_url', 'VIN', 'image_url', 'description',
    'posting_date', 'region', 'lat', 'long', 'county', 'size', 'model',
]

data = raw_data.drop(columns = features_to_drop)

## Split Data

In [18]:
# Hold out 20% of the rows as the test split; random_state pins the shuffle.
train_set, test_set = train_test_split(data, test_size = 0.2, random_state = 42) 
# Bare last expression: displays the training split's (rows, columns) shape.
train_set.shape

(341504, 13)

# Apply main preprocessing 

In [19]:
# Run the cleaning/encoding steps on the training split and rebind the name
# to the result, so re-running this cell alone re-processes processed data.
train_set = preprocessing_pipeline(train_set, df_regions, df_car_origin)

Function being applied: "filter_price"
	Price filtered

Function being applied: "filter_odometer"
	Odometer filtered

Function being applied: "encode_cylinders"
	"Cylinders" encoded

Function being applied: "encode_condition"
	"Condition" encoded

Function being applied: "encode_fuel"
	"Fuel" encoded

Function being applied: "encode_title_status"
	"Title status" encoded

Function being applied: "encode_transmission"
	"Transmission" encoded

Function being applied: "encode_type"
	Type encoded

Function being applied: "encode_paint_color"
	Paint color encoded

Function being applied: "states_to_regions"
	States data encoded into Regions

Function being applied: "manufacturers_origin"
	Manufacturer data regrouped by manufacturer origin (continent)



In [20]:
# Same steps on the test split. NOTE(review): encode_type picks its four
# kept labels from the frame it receives, so this call uses the test split's
# own counts - confirm they match the labels kept for the training split.
test_set = preprocessing_pipeline(test_set, df_regions, df_car_origin)

Function being applied: "filter_price"
	Price filtered

Function being applied: "filter_odometer"
	Odometer filtered

Function being applied: "encode_cylinders"
	"Cylinders" encoded

Function being applied: "encode_condition"
	"Condition" encoded

Function being applied: "encode_fuel"
	"Fuel" encoded

Function being applied: "encode_title_status"
	"Title status" encoded

Function being applied: "encode_transmission"
	"Transmission" encoded

Function being applied: "encode_type"
	Type encoded

Function being applied: "encode_paint_color"
	Paint color encoded

Function being applied: "states_to_regions"
	States data encoded into Regions

Function being applied: "manufacturers_origin"
	Manufacturer data regrouped by manufacturer origin (continent)



## Applying imputer

In [21]:
# Drop sparse rows, one-hot encode, and fill the remaining gaps; the imputer
# is fitted on the training frame and then applied to both frames.
train_set, test_set = apply_imputer(train_set, test_set)

Missing data handled with Iterative Imputer


## Applying feature scaling

In [22]:
# Scale every column except the first, using a RobustScaler fitted on the
# training frame only.
train_set, test_set = apply_robust_feature_scaling(train_set, test_set)

Data standardized


# Exporting preprocessed sets

In [23]:
# Check results: report each processed frame's shape and its share of the
# combined row count (computed after filtering, not from the original split).
tot = len(train_set) + len(test_set)

print(f'train_set shape = {train_set.shape},  contains {round(len(train_set) / tot * 100)}% of total data \n' 
      f'test_set shape  =  {test_set.shape},  contains {round(len(test_set) / tot * 100)}% of total data')

train_set shape = (300999, 23),  contains 80% of total data 
test_set shape  =  (75366, 23),  contains 20% of total data


In [24]:
# Write both processed frames to CSV in the working directory; index=False
# leaves the row index out of the files.
train_set.to_csv('train_set_preprocessed.csv', index = False)
test_set.to_csv('test_set_preprocessed.csv', index = False)