# Car Residual Value Modeling
_**Part 1: Analyze and summarize the data!**_

---

---

This is a problem of finding correlations among multiple time series (TS). Before looking for correlations between series, we first need to remove the within-series dependence (e.g. a trend) from each one; otherwise spurious correlations will appear.

## Load libraries

In [301]:
import datetime

import numpy as np
import pandas as pd
from bokeh.io import output_notebook
from bokeh.palettes import viridis
from bokeh.plotting import figure, show
from pandas import Series
from pandas_datareader import data
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
# Configure Bokeh so that later show(...) calls render inside the notebook.
output_notebook()

## Data preprocessing (imputation, scaling, and change to TS type)

In [302]:
# Names of the series analysed in this notebook (kept as a tuple for later cells).
symbols = (
    'Population_Civilian', 'CPI', 'CCI', 'Auto_Loans', 'Exist_HomeSales',
    'Vehicle_Loans', 'Gas', 'GDP_Real', 'Housing_Starts', 'Indus_Prod',
    'Manufac_Index', 'Bk_LoanRt', 'Prod_Index', 'Consump_Exp',
    'Prod_Price_Index', 'HH_Debt', 'New_Auto_Loans', 'UMich_CCI',
    'Unemp_Rate', 'Loan_Loss', 'cnt_M_U', 'cnt_M_N', 'cnt_S_U', 'cnt_S_N',
    'cnt_mkt_U', 'cnt_mkt_N', 'BB_avg_value',
)

# Monthly date range; not used by this cell, kept for any later consumer.
rng = pd.date_range('10/1/2005', periods=111, freq='M')

# NOTE(review): absolute, machine-specific path -- point this at your own copy
# of the sample data (ideally relative to a configurable data directory).
DATA_PATH = '/Users/meliu/Downloads/sample_data.csv'

# NOTE: `data` rebinds the name imported by `from pandas_datareader import data`.
# The name is kept because every later cell reads the frame through `data`.
data = pd.read_csv(DATA_PATH)

# Taking care of missing data: replace NaNs with the column mean in every
# column except the last two. `SimpleImputer` replaces
# `sklearn.preprocessing.Imputer`, which was removed in scikit-learn 0.22.
# NOTE(review): the last two columns are presumably `BB_avg_value` and
# `actual_date`; confirm against the CSV header.
imputed_cols = data.columns[:-2]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
data[imputed_cols] = imputer.fit_transform(data[imputed_cols])

# Feature Scaling: standardise (zero mean, unit variance) every column except
# the last one, which is presumably the `actual_date` column used below.
scaled_cols = data.columns[:-1]
sc_data = StandardScaler()
data[scaled_cols] = sc_data.fit_transform(data[scaled_cols])

# Change to a time-series frame: parse the dates and use them as the index.
data['actual_date'] = pd.to_datetime(data['actual_date'])
data = data.set_index('actual_date')

data.head()

Unnamed: 0_level_0,Population_Civilian,CPI,CCI,Auto_Loans,Exist_HomeSales,Vehicle_Loans,Gas,GDP_Real,Housing_Starts,Indus_Prod,...,UMich_CCI,Unemp_Rate,Loan_Loss,cnt_M_U,cnt_M_N,cnt_S_U,cnt_S_N,cnt_mkt_U,cnt_mkt_N,BB_avg_value
actual_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006-10-01,-1.683866,-2.012447,0.362871,5.94627e-16,2.895876,-0.723017,-1.36593,-0.866607,2.114364,0.347377,...,1.564016,-1.517373,-1.083306,-1.505885,0.656598,-1.573073,-0.630947,-1.582178,0.254309,2.246052
2006-11-01,-1.647896,-2.003114,0.371481,5.94627e-16,2.8784,-0.702292,-1.371177,-0.861386,2.390928,0.326357,...,1.424086,-1.462439,-1.079512,-1.505885,0.646455,-1.573073,-0.644597,-1.582178,0.245453,2.034131
2006-12-01,-1.616148,-1.900449,0.375782,5.94627e-16,3.018211,-0.681567,-1.243498,-0.856164,2.667492,0.50879,...,1.386772,-1.517373,-1.075719,-1.505885,0.644363,-1.573073,-0.622229,-1.582178,0.242176,2.034131
2007-01-01,-1.531383,-1.868996,0.376974,5.94627e-16,1.82982,-0.660842,-1.406157,-0.850942,1.827297,0.421868,...,1.87186,-1.407505,-1.071925,-1.684806,0.646455,-1.772533,-0.030191,-1.825951,0.555588,1.76923
2007-02-01,-1.502607,-1.795357,0.36866,5.94627e-16,1.917201,-0.447584,-1.3292,-0.786274,2.075855,0.623852,...,1.349457,-1.462439,-1.071925,-1.691591,0.644685,-1.758338,-0.436062,-1.834885,0.263231,1.663269


## Car value plot

In [303]:
# One palette entry per column of `data`, so more series can be added to the
# list below without running out of colours.
numlines = len(data.columns)
mypalette = viridis(numlines)

p = figure(
    width=1000,
    height=600,
    x_axis_type="datetime",
    title="BB_avg_value",
    x_axis_label="actual_date",
    y_axis_label="BB_avg_value",
)
for color_ix, symbol in enumerate(['BB_avg_value']):
    # `legend_label` replaces the `legend` keyword, which was deprecated in
    # Bokeh 1.4 and removed in Bokeh 3.0.
    p.line(
        data.index.values,
        data[symbol].values,
        legend_label=symbol,
        line_color=mypalette[color_ix],
        line_width=2,
    )
show(p)

## Detrend the data with linear regression to make it stationary

In [304]:
# Remove a straight-line trend from every column of `data`.
# Fixes two NameErrors on a fresh kernel: the original called `numpy.reshape`
# although numpy is imported only as `np`, and used `LinearRegression` without
# importing it (it now comes from the notebook's import cell).
series = data

# fit linear model: the regressor is the row position 0..n-1, shaped (n, 1)
X = np.arange(len(series)).reshape(-1, 1)
y = series.values
model = LinearRegression()
model.fit(X, y)  # y is 2-D, so a separate line is fitted for each column

# calculate trend
trend = model.predict(X)

# detrend: keep only the residuals, written back into `data` in place
data.iloc[:, :] = y - trend
data.head()


Unnamed: 0_level_0,Population_Civilian,CPI,CCI,Auto_Loans,Exist_HomeSales,Vehicle_Loans,Gas,GDP_Real,Housing_Starts,Indus_Prod,...,UMich_CCI,Unemp_Rate,Loan_Loss,cnt_M_U,cnt_M_N,cnt_S_U,cnt_S_N,cnt_mkt_U,cnt_mkt_N,BB_avg_value
actual_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006-10-01,0.027138,-0.332411,-0.43301,-0.305008,3.292231,0.424699,-1.070647,0.603067,1.993692,1.030293,...,2.216398,-1.455837,-1.52607,-0.301701,0.459488,-0.36794,0.306615,-0.68956,1.120506,0.585395
2006-11-01,0.031999,-0.353623,-0.40993,-0.299462,3.267549,0.424556,-1.081263,0.581567,2.272451,0.996857,...,2.064607,-1.402022,-1.514226,-0.323596,0.452929,-0.389851,0.275918,-0.705789,1.095901,0.403668
2006-12-01,0.032637,-0.281505,-0.391158,-0.293917,3.400153,0.424413,-0.958953,0.560067,2.551209,1.166873,...,2.015431,-1.458075,-1.502381,-0.34549,0.45442,-0.411763,0.28124,-0.722019,1.076876,0.433862
2007-01-01,0.086293,-0.280598,-0.375495,-0.288371,2.204555,0.424271,-1.126981,0.538568,1.713208,1.067534,...,2.488658,-1.349325,-1.490537,-0.546305,0.460097,-0.633135,0.85623,-0.982022,1.374538,0.199154
2007-02-01,0.08396,-0.237505,-0.369339,-0.282826,2.284731,0.616661,-1.055393,0.576514,1.96396,1.257102,...,1.954394,-1.405378,-1.482487,-0.574984,0.46191,-0.640852,0.433314,-1.007185,1.066433,0.123387


## After detrending plot

In [306]:
# Same plot as the car value plot above, drawn from the current contents of
# `data` (the section header places this after the detrending step).
numlines = len(data.columns)
mypalette = viridis(numlines)

p = figure(
    width=1000,
    height=600,
    x_axis_type="datetime",
    title="BB_avg_value after detrending",
    x_axis_label="actual_date",
    y_axis_label="BB_avg_value",
)
for color_ix, symbol in enumerate(['BB_avg_value']):
    # `legend_label` replaces the `legend` keyword, which was deprecated in
    # Bokeh 1.4 and removed in Bokeh 3.0.
    p.line(
        data.index.values,
        data[symbol].values,
        legend_label=symbol,
        line_color=mypalette[color_ix],
        line_width=2,
    )
show(p)

## Calculate Correlations

In [291]:
# Pairwise Pearson correlation between all columns of `data`;
# the resulting square frame is displayed as the cell output.
corr_df = data.corr(
    method='pearson',
)
corr_df

Unnamed: 0,Population_Civilian,CPI,CCI,Auto_Loans,Exist_HomeSales,Vehicle_Loans,Gas,GDP_Real,Housing_Starts,Indus_Prod,...,UMich_CCI,Unemp_Rate,Loan_Loss,cnt_M_U,cnt_M_N,cnt_S_U,cnt_S_N,cnt_mkt_U,cnt_mkt_N,BB_avg_value
Population_Civilian,1.0,-0.076344,-0.314403,-0.055554,0.483417,0.323075,-0.238571,0.537747,0.696443,0.619961,...,0.578814,-0.751381,-0.747917,-0.67979,0.669507,-0.606274,0.472538,-0.750261,0.605693,0.474371
CPI,-0.076344,1.0,0.375634,0.350539,-0.27575,0.298449,0.870111,0.111069,-0.107512,0.134161,...,-0.434101,0.00194,0.05723,0.072787,0.187148,-0.092321,0.349782,-0.037991,0.09293,0.370109
CCI,-0.314403,0.375634,1.0,0.409686,-0.216789,0.080869,0.456011,-0.114221,-0.314467,-0.193373,...,-0.256107,0.358479,0.425545,0.341935,-0.178471,0.195601,0.021602,0.279612,-0.216234,-0.062912
Auto_Loans,-0.055554,0.350539,0.409686,1.0,-0.031163,0.303932,0.497353,0.003772,-0.193982,-0.138125,...,-0.152014,0.263245,0.277046,0.458056,0.053209,0.354619,0.296773,0.351071,0.010674,0.105175
Exist_HomeSales,0.483417,-0.27575,-0.216789,-0.031163,1.0,0.457338,-0.176014,0.49695,0.765867,0.504208,...,0.771091,-0.552681,-0.597368,-0.28065,0.424125,-0.228649,0.375528,-0.411884,0.596133,0.497051
Vehicle_Loans,0.323075,0.298449,0.080869,0.303932,0.457338,1.0,0.479093,0.674595,0.578993,0.60369,...,0.445124,-0.481209,-0.446732,-0.049188,0.636999,-0.048554,0.791959,-0.211556,0.768953,0.648314
Gas,-0.238571,0.870111,0.456011,0.497353,-0.176014,0.479093,1.0,0.14073,-0.09369,0.127248,...,-0.300558,0.113546,0.153389,0.402224,0.201127,0.200593,0.479163,0.240853,0.204896,0.343997
GDP_Real,0.537747,0.111069,-0.114221,0.003772,0.49695,0.674595,0.14073,1.0,0.763852,0.867377,...,0.574701,-0.785603,-0.678443,-0.443957,0.63553,-0.387108,0.624961,-0.56245,0.803869,0.646486
Housing_Starts,0.696443,-0.107512,-0.314467,-0.193982,0.765867,0.578993,-0.09369,0.763852,1.0,0.877845,...,0.789398,-0.907013,-0.897307,-0.548484,0.649891,-0.498873,0.565061,-0.692078,0.848003,0.690464
Indus_Prod,0.619961,0.134161,-0.193373,-0.138125,0.504208,0.60369,0.127248,0.867377,0.877845,1.0,...,0.614928,-0.925097,-0.855844,-0.535168,0.665318,-0.554828,0.640576,-0.693596,0.864162,0.745966


## Create a Heat Map

In [307]:
# Draw the correlation matrix as a grid of coloured squares.
# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` returns the same
# ndarray on old and new pandas alike.
corr_matrix = corr_df.values

# Take the axis labels from `corr_df` itself so every square is guaranteed to
# carry the label of the row/column it was computed from, regardless of how
# the CSV columns happen to be ordered.
factors = list(corr_df.columns)
N = len(factors)

x = []
y = []
colors = []
for i in range(N):
    for j in range(N):
        x.append(factors[j])
        y.append(factors[i])
        cor = corr_matrix[i, j]
        # |cor| near 1 -> red, |cor| near 0 -> blue; the sign is not encoded.
        rgb = (int(abs(cor) * 255), 0, int((1 - abs(cor)) * 255))
        colors.append('#%02x%02x%02x' % rgb)

p2 = figure(x_range=factors, y_range=factors)

# One unit square per (column, row) pair on the categorical axes.
p2.rect(x, y, color=colors, width=1, height=1)

show(p2)

## Close Inspection of the parameters

In [294]:
# Correlation of every series with BB_avg_value (one column of corr_df).
corr_df.loc[:, 'BB_avg_value']

Population_Civilian    0.474371
CPI                    0.370109
CCI                   -0.062912
Auto_Loans             0.105175
Exist_HomeSales        0.497051
Vehicle_Loans          0.648314
Gas                    0.343997
GDP_Real               0.646486
Housing_Starts         0.690464
Indus_Prod             0.745966
Manufac_Index          0.725142
Bk_LoanRt              0.713502
Prod_Index            -0.079396
Consump_Exp            0.698522
Prod_Price_Index       0.269914
HH_Debt               -0.000337
New_Auto_Loans        -0.032393
UMich_CCI              0.403320
Unemp_Rate            -0.716399
Loan_Loss             -0.711467
cnt_M_U               -0.359981
cnt_M_N                0.595551
cnt_S_U               -0.418432
cnt_S_N                0.617735
cnt_mkt_U             -0.541888
cnt_mkt_N              0.722038
BB_avg_value           1.000000
Name: BB_avg_value, dtype: float64

Positive Correlation: market new # (cnt_mkt_N), segment new # (cnt_S_N), model new # (cnt_M_N), Consump_Exp, Bk_LoanRt, Manufac_Index, Indus_Prod, Housing_Starts, GDP_Real, Vehicle_Loans

Negative Correlation: Unemp_Rate, Loan_Loss, market used # (cnt_mkt_U), segment used # (cnt_S_U), model used # (cnt_M_U)