# Consider only the below columns and prepare a prediction model for predicting Price.

Columns = "Price","Age_08_04","KM","HP","cc","Doors","Gears","Quarterly_Tax","Weight"


## Import Libraries 

In [1]:
# Silence warnings since they do not affect the analysis.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Data load and manipulation
import numpy as np
import pandas as pd

# Data visualization and analysis.
import matplotlib.pyplot as plt
import seaborn as sns

# Data preprocessing
from sklearn.model_selection import train_test_split

# Machine learning models
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression, Ridge, Lasso,ElasticNet

# Evaluation metrics
from statsmodels.tools.eval_measures import meanabs
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Setting to avoid output turn into scrollable frames when plottong pairplot:
# Ref: https://stackoverflow.com/questions/41641205/how-to-avoid-output-into-scrollable-frames-in-jupyter-notebook/41646557

In [4]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [5]:
# Matplotlib configurations

# Display interactive plots. Used this since convenient for displaying plots in github.
# %matplotlib notebook
%matplotlib notebook
# Font and figure size:
# Ref: https://stackoverflow.com/questions/3899980/how-to-change-the-font-size-on-a-matplotlib-plot
SMALL_SIZE = 8
MEDIUM_SIZE = 9
BIGGER_SIZE = 12

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

## Load data and perform initial checks

In [6]:
corolla_df = pd.read_csv('ToyotaCorolla.csv', encoding='latin1') # Encoding set to latin1 to fix file read error.
corolla_df.head()

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Central_Lock,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,1,1,1,0,0,0,1,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,...,1,0,1,0,0,0,1,0,0,0
2,3,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90,1,...,0,0,1,0,0,0,1,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90,0,...,0,0,1,0,0,0,1,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90,0,...,1,1,1,0,1,0,1,0,0,0


In [7]:
corolla_df1 = corolla_df[["Age_08_04","KM","HP","cc","Doors","Gears","Quarterly_Tax","Weight","Price"]]
corolla_df1.head()

Unnamed: 0,Age_08_04,KM,HP,cc,Doors,Gears,Quarterly_Tax,Weight,Price
0,23,46986,90,2000,3,5,210,1165,13500
1,23,72937,90,2000,3,5,210,1165,13750
2,24,41711,90,2000,3,5,210,1165,13950
3,26,48000,90,2000,3,5,210,1165,14950
4,30,38500,90,2000,3,5,210,1170,13750


### Metadata
- Price  -- Offer Price in EUROs
- Age_08_04 -- Age in months as in August 2004
- KM -- Accumulated Kilometers on odometer
- HP -- Horse Power
- cc -- Cylinder Volume in cubic centimeters
- Doors -- Number of doors
- Gears -- Number of gear positions
- Quarterly_Tax -- Quarterly road tax in EUROs
- Weight -- Weight in Kilograms

The data is collected from a used car dealership in europe.

### Objective 
To prepare a prediction model for predicting price of a used toyota corolla.

## Exploratory Data Analysis

In [8]:
corolla_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Age_08_04      1436 non-null   int64
 1   KM             1436 non-null   int64
 2   HP             1436 non-null   int64
 3   cc             1436 non-null   int64
 4   Doors          1436 non-null   int64
 5   Gears          1436 non-null   int64
 6   Quarterly_Tax  1436 non-null   int64
 7   Weight         1436 non-null   int64
 8   Price          1436 non-null   int64
dtypes: int64(9)
memory usage: 101.1 KB


### Observations:
- Data is for toyota corolla cars, probably from a second hand car dealership.
- All columns have numeric data, the datatypes are appropriate and there are no null values.

## Assumptions Check:

### 1. Linearity | 2. Normality | 3. Multicollinearity

In [9]:
### Correlation matrix:
corr_matrix = corolla_df1.corr()
corr_matrix.style.background_gradient(cmap = 'PuOr_r').set_precision(2)

Unnamed: 0,Age_08_04,KM,HP,cc,Doors,Gears,Quarterly_Tax,Weight,Price
Age_08_04,1.0,0.51,-0.16,-0.1,-0.15,-0.01,-0.2,-0.47,-0.88
KM,0.51,1.0,-0.33,0.1,-0.04,0.02,0.28,-0.03,-0.57
HP,-0.16,-0.33,1.0,0.04,0.09,0.21,-0.3,0.09,0.31
cc,-0.1,0.1,0.04,1.0,0.08,0.01,0.31,0.34,0.13
Doors,-0.15,-0.04,0.09,0.08,1.0,-0.16,0.11,0.3,0.19
Gears,-0.01,0.02,0.21,0.01,-0.16,1.0,-0.01,0.02,0.06
Quarterly_Tax,-0.2,0.28,-0.3,0.31,0.11,-0.01,1.0,0.63,0.22
Weight,-0.47,-0.03,0.09,0.34,0.3,0.02,0.63,1.0,0.58
Price,-0.88,-0.57,0.31,0.13,0.19,0.06,0.22,0.58,1.0


In [10]:
## Scatterplot between variables along with KDE.
sns.pairplot(corolla_df1,kind='scatter', diag_kind='kde')
plt.show()

<IPython.core.display.Javascript object>

## Observations:
- Linearity: None of the input features are linearly correlated with output. Linearity test fails.
- Normality: Apart fro "KM" and "Price" which have a skew normal distribution, the rest of the features have either bimodal or narrow, higly heaked distributions. Normality test fails.
- Multicollinearity: None of the input features are significantly correlated among themselves. All pair wise correlations are between +/-0.8, ie -0.8< orrelation oefficient < 0.8. Hence all input features are independent. Multicollinearity test passes.

### No autocorrelation:
The data is not time series, hence there is no autocorrelation

### Building a regression model using statsmodels, training and evaluation
(Also Homoscedasticity and zero residual mean test)

In [11]:
lin_model = smf.ols('Price ~ Age_08_04 + KM + HP + cc + Doors + Gears + Quarterly_Tax + Weight',
                    data=corolla_df1).fit()

In [12]:
# Coefficients:
lin_model.params

Intercept       -5573.106358
Age_08_04        -121.658402
KM                 -0.020817
HP                 31.680906
cc                 -0.121100
Doors              -1.616641
Gears             594.319936
Quarterly_Tax       3.949081
Weight             16.958632
dtype: float64

In [13]:
# t and pvalues:
print(lin_model.tvalues, '\n', lin_model.pvalues)

Intercept        -3.948666
Age_08_04       -46.511852
KM              -16.621622
HP               11.241018
cc               -1.344222
Doors            -0.040410
Gears             3.016007
Quarterly_Tax     3.014535
Weight           15.879803
dtype: float64 
 Intercept         8.241949e-05
Age_08_04        3.354724e-288
KM                7.538439e-57
HP                3.757218e-28
cc                1.790902e-01
Doors             9.677716e-01
Gears             2.606549e-03
Quarterly_Tax     2.619148e-03
Weight            2.048576e-52
dtype: float64


**Remark:** Since the distribution of the features are not normal, it may be erroneous to use pvalues to judge if a feature is significant.

In [14]:
# Actual and prediced price:
y = corolla_df1.Price
y_pred = lin_model.fittedvalues

In [15]:
# Rsquared values:
# Evaluation metrics:
print("R2 Score           : ", round(lin_model.rsquared,5)) # R2 and Adj R2 decrease
print("Adjusted R2 score  : ", round(lin_model.rsquared_adj,5))
print("AIC                : ", round(lin_model.aic,4))
print("BIC                : ", round(lin_model.bic,4)) # AIC BIC Increase
print("mean absolute error: ", round(meanabs(y,y_pred),4))

R2 Score           :  0.86376
Adjusted R2 score  :  0.863
AIC                :  24769.0766
BIC                :  24816.5032
mean absolute error:  1001.5466


### 5. Homoscedasticity 


In [16]:
def get_standardized_values(vals):
    return(vals - vals.mean())/vals.std()

In [17]:
fig,ax = plt.subplots()
ax.scatter(get_standardized_values(lin_model.fittedvalues)
            ,get_standardized_values(lin_model.resid))

ax.plot([-2.5,5.5], [0,0], 'r-')
ax.set_title('Residual Plot')
ax.set_xlabel('Standardized Fitted values')
ax.set_ylabel('Standardized residual values')
ax.set_xlim(-2.5,5.5)
ax.set_ylim(-6.5,6.5)
plt.show()

<IPython.core.display.Javascript object>

###  Observations
- The points are not randomly scattered above and below the zero line. Homoscedasticity test fails

### 6. Zero residual mean

In [18]:
fig,ax = plt.subplots() 
ax.scatter(x = y, y = y_pred)
ax.plot([4000,33000],[4000,33000], 'r-')
ax.set_xlim(4000,33000)
ax.set_ylim(4000,33000)
ax.set_title('Zero residual mean: actual v/ predicted price')
ax.set_xlabel('Actual price')
ax.set_ylabel('Predicted price')

plt.axis('square')
plt.show()

<IPython.core.display.Javascript object>

###  Observations
- The points are not tightly scattered around the diagonal. Zero residual mean test fails.

### Test for normality of residuals (Q-Q plot)

In [19]:
qqplot = sm.qqplot(lin_model.resid,line='q')
plt.title("Normal Q-Q plot of residuals")
plt.show()

<IPython.core.display.Javascript object>

###  Observations
- From the nature of the above Q-Q plot for residuals, we infer that the residual distribution, although normal, is fat tailed this implies the presence of outliers in the distributions and would mean inaccurate predictions.

## Linear regression using sklearn

In [20]:
corolla_df2 = corolla_df1.copy()

In [21]:
# train test split:
X = corolla_df2.iloc[:, :-1] 
y = corolla_df2.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lm = LinearRegression()
lm.fit(X_train,y_train)
y_pred_lm = (lm.predict(X_test))

In [22]:
# Model evaluation:
print("Coefficients for original data    : ", lm.coef_)
# Mean absolute error:
print("\nMean absolute error               : %.2f" % mean_absolute_error(y_test, y_pred_lm))
# Mean squared error
print("Mean squared error                : %.2f" % mean_squared_error(y_test, y_pred_lm))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination      : %.2f" % r2_score(y_test, y_pred_lm))

Coefficients for original data    :  [-1.19692339e+02 -2.07609498e-02  2.85215331e+01 -1.08496529e-01
  5.57323418e+00  5.66402069e+02  1.42716995e+00  1.93639260e+01]

Mean absolute error               : 995.59
Mean squared error                : 1950244.99
Coefficient of determination      : 0.85


## Improving the model 

- Log transformation of "Weight" and "KM" did not imporve the correlation between the respective variable and price. The change was only in the third decimal, thus it was not pursued.
- From the pair plot, it is evident that the discreet variables - 'HP', 'Doors', 'Gears', 'Quarterly_Tax' have a somewhat linear relationship with price, thus if we transform them appropriately and choose the best features, we can hope to get some improvement in accuracy[1].

[1] Ref:https://towardsdatascience.com/feature-transformation-for-multiple-linear-regression-in-python-8648ddf070b8

### Analyzing the discreet variables

###  HP

In [23]:
X['HP'].describe()

count    1436.000000
mean      101.502089
std        14.981080
min        69.000000
25%        90.000000
50%       110.000000
75%       110.000000
max       192.000000
Name: HP, dtype: float64

Choose the bins [69,90,110,192] from the percentiles

In [24]:
bins = [69,90,110,192]
bins_hp = pd.cut(X['HP'],bins)

fig,ax = plt.subplots()
bins_hp.value_counts().plot(kind='bar')
ax.set_xlabel("HP")
plt.show()

<IPython.core.display.Javascript object>

HP falls of in a linear manner.

In [25]:
# Converting the discreet variable to binary
bins_hp = bins_hp.cat.as_unordered()
hp_dummy = pd.get_dummies(bins_hp)

In [26]:
hp_dummy.head()

Unnamed: 0,"(69, 90]","(90, 110]","(110, 192]"
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


###  Doors

In [27]:
X['Doors'].describe()

count    1436.000000
mean        4.033426
std         0.952677
min         2.000000
25%         3.000000
50%         4.000000
75%         5.000000
max         5.000000
Name: Doors, dtype: float64

Choose bins = [2,3,4,5] from percentiles.

In [28]:
bins = [2,3,4,5]
bins_doors = pd.cut(X['Doors'],bins)

fig,ax = plt.subplots()
bins_doors.value_counts().plot(kind='bar')
ax.set_xlabel("Doors")
plt.show()

<IPython.core.display.Javascript object>

Although "Doors" does not fall  in a linear manner, it has a decreasing trend.

In [29]:
bins_doors = bins_doors.cat.as_unordered()
doors_dummy = pd.get_dummies(bins_doors)

### Gears

In [30]:
X['Gears'].describe()

count    1436.000000
mean        5.026462
std         0.188510
min         3.000000
25%         5.000000
50%         5.000000
75%         5.000000
max         6.000000
Name: Gears, dtype: float64

In [31]:
bins = [3,5,6]
bins_gears = pd.cut(X['Gears'],bins)

fig,ax = plt.subplots()
bins_gears.value_counts().plot(kind='bar')
ax.set_xlabel("Gears")
plt.show()

<IPython.core.display.Javascript object>

Only two bars, one dominant and other minor, hence Not transforming this feature.

### Quarterly tax

In [32]:
X['Quarterly_Tax'].describe()

count    1436.000000
mean       87.122563
std        41.128611
min        19.000000
25%        69.000000
50%        85.000000
75%        85.000000
max       283.000000
Name: Quarterly_Tax, dtype: float64

In [33]:
bins = [19,69,85,283]
bins_qtax = pd.cut(X['Quarterly_Tax'],bins)

fig,ax = plt.subplots()
bins_qtax.value_counts().plot(kind='bar')
ax.set_xlabel("Quarterly_Tax")
plt.show()

<IPython.core.display.Javascript object>

Although "Quarterly tax" does not fall  in a linear manner, it has a decreasing trend.

In [34]:
bins_qtax = bins_qtax.cat.as_unordered()
qtax_dummy = pd.get_dummies(bins_qtax)

### CC

In [35]:
X['cc'].describe()

count     1436.00000
mean      1576.85585
std        424.38677
min       1300.00000
25%       1400.00000
50%       1600.00000
75%       1600.00000
max      16000.00000
Name: cc, dtype: float64

In [36]:
bins = [1300,1400,1600,16000]
bins_cc = pd.cut(X['cc'],bins)

fig,ax = plt.subplots()
bins_cc.value_counts().plot(kind='bar')

<IPython.core.display.Javascript object>

<AxesSubplot:>

Although there are three bars, first bar is dominant and others are minor, hence Not transforming this feature.

In [37]:
corolla_df3 = corolla_df2.copy()

# train test split:
X1 = corolla_df3.iloc[:, :-1] 
y1 = corolla_df3.iloc[:,-1]

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

In [38]:
# Function to transform the discrete variables in the intup dataframe and return the transformed dataframe.
def descrete_var_transform(X_df,hp = 'HP',doors = 'Doors', qtax = 'Quarterly_Tax'):
    bins_hp = [69,90,110,192]
    X_hp = pd.cut(X_df[hp],bins_hp)
    X_hp = X_hp.cat.as_unordered()
    hp_dummy = pd.get_dummies(X_hp)
        
    bins_doors = [2,3,4,5]
    X_doors = pd.cut(X_df[doors],bins_doors)
    X_doors = X_doors.cat.as_unordered()
    doors_dummy = pd.get_dummies(X_doors)
    
    bins_qtax = [19,69,85,283]
    X_qtax = pd.cut(X_df[qtax],bins_qtax)
    X_qtax = X_qtax.cat.as_unordered()
    qtax_dummy = pd.get_dummies(X_qtax)
    
    X_new = pd.concat([X_df,hp_dummy,doors_dummy,qtax_dummy], axis=1)
    X_new = X_new.drop([hp,doors,qtax],axis=1)
    return X_new

In [39]:
X_train_tf = descrete_var_transform(X_train1,'HP')
X_test_tf = descrete_var_transform(X_test1,'HP')

lm_tf = LinearRegression()

lm_tf.fit(X_train_tf,y_train1)
y_pred_tf = lm_tf.predict(X_test_tf)

# Model evaluation:
print("Coefficients for original data    : ", lm_tf.coef_)
# Mean absolute error:
print("\nMean absolute error               : %.2f" % mean_absolute_error(y_test1, y_pred_tf))
# Mean squared error
print("Mean squared error                : %.2f" % mean_squared_error(y_test1, y_pred_tf))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination      : %.2f" % r2_score(y_test1, y_pred_tf))

Coefficients for original data    :  [-1.21805812e+02 -2.04307651e-02  1.61285508e-02  6.08337601e+02
  1.83057307e+01  1.79711085e+03  1.77813274e+03  5.03987541e+03
 -7.30314391e+01  2.49870185e+02 -4.71395560e+01  1.84108948e+02
  4.18414925e+02  8.54853240e+01]

Mean absolute error               : 938.83
Mean squared error                : 1704907.10
Coefficient of determination      : 0.87


## Observations:
- There is a significant reduction in mean absolute error and mean square error. R2 score has jumped from 0.85 to 0.87. 

## Conclusion:
From the above analysis we see that, using a linear model with some transformations on the discreet variables, we can make reasonable predictions fot the used car price. 