In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import seaborn as sns
from sklearn import preprocessing

In [2]:
df = pd.read_csv(r'C:/Users/Nathan/Desktop/car_sales.csv')
df.head()

Unnamed: 0,Manufacturer,Model,Sales_in_thousands,__year_resale_value,Vehicle_type,Price_in_thousands,Engine_size,Horsepower,Wheelbase,Width,Length,Curb_weight,Fuel_capacity,Fuel_efficiency,Latest_Launch,Power_perf_factor
0,Acura,Integra,16.919,16.36,Passenger,21.5,1.8,140.0,101.2,67.3,172.4,2.639,13.2,28.0,2/2/2012,58.28015
1,Acura,TL,39.384,19.875,Passenger,28.4,3.2,225.0,108.1,70.3,192.9,3.517,17.2,25.0,6/3/2011,91.370778
2,Acura,CL,14.114,18.225,Passenger,,3.2,225.0,106.9,70.6,192.0,3.47,17.2,26.0,1/4/2012,
3,Acura,RL,8.588,29.725,Passenger,42.0,3.5,210.0,114.6,71.4,196.6,3.85,18.0,22.0,3/10/2011,91.389779
4,Audi,A4,20.397,22.255,Passenger,23.99,1.8,150.0,102.6,68.2,178.0,2.998,16.4,27.0,10/8/2011,62.777639


In [3]:
#Replace NaN with mean()
df['Engine_size'].fillna((df['Engine_size'].mean()), inplace=True)
df['Horsepower'].fillna((df['Horsepower'].mean()), inplace=True)
df['__year_resale_value'].fillna((df['__year_resale_value'].mean()), inplace=True)
df['Price_in_thousands'].fillna((df['Price_in_thousands'].mean()), inplace=True)
df['Power_perf_factor'].fillna((df['Power_perf_factor'].mean()), inplace=True)
df['Fuel_capacity'].fillna((df['Fuel_capacity'].mean()), inplace=True)
df['Curb_weight'].fillna((df['Curb_weight'].mean()), inplace=True)

In [4]:
df.head(20)

Unnamed: 0,Manufacturer,Model,Sales_in_thousands,__year_resale_value,Vehicle_type,Price_in_thousands,Engine_size,Horsepower,Wheelbase,Width,Length,Curb_weight,Fuel_capacity,Fuel_efficiency,Latest_Launch,Power_perf_factor
0,Acura,Integra,16.919,16.36,Passenger,21.5,1.8,140.0,101.2,67.3,172.4,2.639,13.2,28.0,2/2/2012,58.28015
1,Acura,TL,39.384,19.875,Passenger,28.4,3.2,225.0,108.1,70.3,192.9,3.517,17.2,25.0,6/3/2011,91.370778
2,Acura,CL,14.114,18.225,Passenger,27.390755,3.2,225.0,106.9,70.6,192.0,3.47,17.2,26.0,1/4/2012,77.043591
3,Acura,RL,8.588,29.725,Passenger,42.0,3.5,210.0,114.6,71.4,196.6,3.85,18.0,22.0,3/10/2011,91.389779
4,Audi,A4,20.397,22.255,Passenger,23.99,1.8,150.0,102.6,68.2,178.0,2.998,16.4,27.0,10/8/2011,62.777639
5,Audi,A6,18.78,23.555,Passenger,33.95,2.8,200.0,108.7,76.1,192.0,3.561,18.5,22.0,8/9/2011,84.565105
6,Audi,A8,1.38,39.0,Passenger,62.0,4.2,310.0,113.0,74.0,198.2,3.902,23.7,21.0,2/27/2012,134.656858
7,BMW,323i,19.747,18.072975,Passenger,26.99,2.5,170.0,107.3,68.4,176.0,3.179,16.6,26.0,6/28/2011,71.191207
8,BMW,328i,9.231,28.675,Passenger,33.4,2.8,193.0,107.3,68.5,176.0,3.197,16.6,24.0,1/29/2012,81.877069
9,BMW,528i,17.527,36.125,Passenger,38.9,2.8,193.0,111.4,70.9,188.0,3.472,18.5,25.0,4/4/2011,83.998724


#### Now we are going to build a model that predicts the "Price_in_thousands" based on some features. We are going to try different models and see which one works best

In [5]:
#import modules first
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

Now we are going to use the list of features to predict the 'Price_in_thousands', split the data into training and testing and determine the [Math Processing Error] on the test data

In [6]:
features = ['Engine_size','Horsepower','__year_resale_value', 'Power_perf_factor']    
X = df[features]
Y = df['Price_in_thousands']

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=1)

In [8]:
print("number of test samples :", x_test.shape[0])
print("number of training samples:",x_train.shape[0])

number of test samples : 24
number of training samples: 133


In [9]:
lre=LinearRegression()

lre.fit(x_train, y_train)

LinearRegression()

In [10]:
lre.score(x_test, y_test)

0.9693472723491181

We are going to create a pipeline object that scales the data, performs a polynomial transform and fits a linear regression model.

In [11]:
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.pipeline import Pipeline

In [12]:
Input=[('scale',StandardScaler()),('polynomial', PolynomialFeatures(include_bias=False)),('model',LinearRegression())]

In [13]:
pipe=Pipeline(Input)
pipe

Pipeline(steps=[('scale', StandardScaler()),
                ('polynomial', PolynomialFeatures(include_bias=False)),
                ('model', LinearRegression())])

In [14]:
pipe.fit(X,df['Price_in_thousands'])

Pipeline(steps=[('scale', StandardScaler()),
                ('polynomial', PolynomialFeatures(include_bias=False)),
                ('model', LinearRegression())])

In [15]:
lre.intercept_

-2.114327731610029

In [16]:
lre.coef_

array([-0.92726473, -0.62217006,  0.17302958,  1.88089039])

In [17]:
pipe.score(X,df['Price_in_thousands'])

0.9956347658652809

Now we are going to build a model using a Ridge Regression Model

In [18]:
#Using a Ridge Regression model
from sklearn.linear_model import Ridge

In [21]:
RigeModel=Ridge(alpha=0.1)
RigeModel.fit(x_train, y_train)

Ridge(alpha=0.1)

In [20]:
RigeModel.score(x_test, y_test)

0.9693111141972607

In [22]:
RigeModel.score(x_test, y_test)

0.9693111141972607

Now we are going to use a Polynomial Transform


In [23]:
pr=PolynomialFeatures(degree=2)

In [24]:
x_train_pr=pr.fit_transform(x_train[features])

In [25]:
x_test_pr=pr.fit_transform(x_test[features])

In [26]:
RigeModel=Ridge(alpha=0.1)

In [28]:
RigeModel.fit(x_train_pr, y_train)
RigeModel.score(x_test_pr, y_test)

0.9595058664356463

## Conclusion

The answer to the question if a predictive model can be built to predict a car's price is YES. A predictive model can be built. I found that creating a pipeline object that scales the data, performs a polynomial transform and fits a linear regression model is the closest we can get to predict the price of a car based on some selected features. Our model has an accuracy of 99.56%.