# Analysing used car data to get meaningful insights


Import the data set using pandas.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# %pip install pandas seaborn matplotlib numpy


In [None]:
# Source CSV and the column names to apply to it; the names are supplied here
# through `names=` instead of being read from the file.
file_path = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/auto.csv"
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]
df = pd.read_csv(file_path, names=headers)
# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
# Turn every '?' placeholder into NaN so pandas' missing-data tools
# (isna, dropna, fillna) can see it.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df.replace('?', np.nan, inplace=True)

In [None]:
df.info()

In [None]:
# Store price as 64-bit floats so it can be used in numeric computations.
df['price'] = df['price'].astype('float64')

In [None]:
# Convert the remaining numeric-valued columns to float, one column at a time.
for col in ('horsepower', 'peak-rpm', 'normalized-losses'):
    df[col] = df[col].astype(float)


In [None]:
df.head(5)

In [None]:
# Convert the engine geometry columns to float, one column at a time.
for col in ('stroke', 'bore'):
    df[col] = df[col].astype(float)

In [None]:
df.isna().sum()

In [None]:
df.describe()

In [None]:
# Impute missing normalized-losses with the column mean.
# The filled column is assigned back to df: calling replace(..., inplace=True)
# on a selected column is a chained in-place update that does not modify df
# under pandas Copy-on-Write, and the np.NaN alias no longer exists in NumPy 2.0.
avg = df['normalized-losses'].mean()
df['normalized-losses'] = df['normalized-losses'].fillna(avg)

In [None]:
# Drop, in place, every row whose price is missing.
df.dropna(axis='index', subset=['price'], inplace=True)

In [None]:
# Impute missing door counts with the most frequent value.
# The filled column is assigned back to df: calling replace(..., inplace=True)
# on a selected column is a chained in-place update that does not modify df
# under pandas Copy-on-Write, and the np.NaN alias no longer exists in NumPy 2.0.
vals = df['num-of-doors'].value_counts().idxmax()
df['num-of-doors'] = df['num-of-doors'].fillna(vals)

In [None]:
# Impute the remaining missing values. Each filled column is assigned back to
# df: calling replace(..., inplace=True) on a selected column is a chained
# in-place update that does not modify df under pandas Copy-on-Write, and the
# np.NaN alias no longer exists in NumPy 2.0.

# horsepower, bore and stroke: fill with the column mean.
avgs_1 = df['horsepower'].mean()
df['horsepower'] = df['horsepower'].fillna(avgs_1)
avgs_2 = df['bore'].mean()
df['bore'] = df['bore'].fillna(avgs_2)
avgs_3 = df['stroke'].mean()
df['stroke'] = df['stroke'].fillna(avgs_3)

# peak-rpm: fill with the most frequent value (despite the variable name,
# this is the mode, not a mean).
avgs_4 = df['peak-rpm'].value_counts().idxmax()
df['peak-rpm'] = df['peak-rpm'].fillna(avgs_4)





In [None]:
df.isna().sum()

### Data Standardization

Convert some column values into the units required for the analysis (fuel consumption from mpg to L/100).

In [None]:
# Replace each mpg column with its L/100 equivalent (235 / mpg).
# pop() removes the source column and returns it, so the new column is
# appended and the old one dropped in a single statement.
df['city-L/100'] = 235 / df.pop('city-mpg')
df['highway-L/100'] = 235 / df.pop('highway-mpg')

### Data Normalisation
***Normalisation is the process of scaling multiple variables into a similar range***

In [None]:
# Simple feature scaling: divide each column by its own maximum so the
# largest value becomes 1.
df['length'] = df['length'].div(df['length'].max())
df['width'] = df['width'].div(df['width'].max())


In [None]:
# Min-max scaling: (value - min) / (max - min), mapping height onto [0, 1].
df['height'] = (
    df['height']
    .sub(df['height'].min())
    .div(df['height'].max() - df['height'].min())
)



In [None]:
# Bins
# Transforming continuous values into discrete categorical bins

In [None]:
# Cast horsepower to integers ahead of binning.
# NOTE(review): astype(int) truncates fractional values (such as a mean-imputed
# horsepower) instead of rounding them — confirm this is intended.
df['horsepower']=df['horsepower'].astype(int,copy=True)

In [None]:
# Histogram of horsepower (matplotlib's default bin count) to inspect its
# distribution before binning it into categories.
plt.hist(df['horsepower'])
plt.xlabel('Horsepower')
plt.ylabel('Count')
plt.title('Horsepower bins')

In [None]:
# Four equally spaced edges between the smallest and largest horsepower give
# three equal-width bins, labelled Low / Medium / High.
bins = np.linspace(
    start=df['horsepower'].min(),
    stop=df['horsepower'].max(),
    num=4,
)
labels = ['Low', 'Medium', 'High']
# include_lowest=True keeps the minimum value inside the first bin.
df['horsepower-label'] = pd.cut(
    df['horsepower'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)
df.head()

In [None]:
df['horsepower-label'].value_counts()

In [None]:
# Bar chart of how many cars fall into each horsepower bin.
# value_counts() orders its result by frequency, not by bin, so the counts are
# re-indexed by `labels` to guarantee every bar is paired with its own label.
bin_counts = df['horsepower-label'].value_counts().reindex(labels)
plt.bar(labels, bin_counts)
plt.xlabel('horsepower')
plt.ylabel('count')
plt.title('horsepower bins')

In [None]:
# Bar chart of the number of cars per body style.
# The labels are taken from the counts' own index instead of a hard-coded
# list, so each bar is always labelled with the category it actually counts,
# whatever the frequency order turns out to be.
body_style_counts = df['body-style'].value_counts()
label_ = list(body_style_counts.index)
plt.bar(label_, body_style_counts)

In [None]:
# Indicator Variable
# Dummy is used for categorical data to divide it into different columns and value with 0,1
df.columns

In [None]:
# One-hot encode 'aspiration': get_dummies replaces that column with one
# indicator column per category and leaves the other columns unchanged.
dummies=pd.get_dummies(df,columns=['aspiration'])

In [None]:
dummies.head(5)


In [None]:
df=dummies

In [None]:
df.head(5)

In [None]:
# df.to_csv('Used_Car_Data')

In [None]:
# Correlation matrix over the numeric columns only.
# include='number' selects every numeric dtype; listing only float64/int64
# would silently drop columns stored with another integer width (for example
# the int32 that astype(int) produces on some platforms).
numeric_data = df.select_dtypes(include='number')
numeric_data.corr()

In [None]:
df[['bore','horsepower','stroke','compression-ratio']].corr()

In [None]:
# Scatter plot of price against engine-size with a fitted regression line;
# the y axis is anchored at zero.
sns.regplot(data=df, x='engine-size', y='price')
plt.ylim(bottom=0)

In [None]:
# Correlation between engine-size and price (expected to be positive).
df[['engine-size','price']].corr()

In [None]:
# Regression plot of price against highway fuel consumption (highway-L/100),
# with the x axis anchored at zero, followed by the pair's correlation.
sns.regplot(data=df, x='highway-L/100', y='price')
plt.xlim(left=0)
df[['highway-L/100', 'price']].corr()

In [None]:
df[['stroke','price']].corr()

In [None]:
# Regression plot of price against stroke.
# Stroke is the candidate predictor and price the target, so stroke goes on
# the x axis and price on the y axis, matching the other predictor-vs-price
# plots in this notebook; the price axis is anchored at zero.
sns.regplot(x='stroke', y='price', data=df)
plt.ylim(0)

In [None]:
sns.boxplot(x='body-style',y='price',data=df)


In [None]:
sns.boxplot(x='engine-location',y='price',data=df)

In [None]:
sns.boxplot(x='drive-wheels',y='price',data=df)

In [None]:
import sqlite3
%load_ext sql
import prettytable
prettytable.DEFAULT='DEFAULT'
conn=sqlite3.Connection('Used_Car_Data.db')
cursor=conn.cursor()
%sql sqlite:///Used_Car_Data.db


In [None]:
# Write the cleaned frame to the Used_Car table.
# if_exists='replace' makes the cell re-runnable: the database file persists
# between runs, and the default ('fail') raises once the table already exists.
df.to_sql('Used_Car', conn, index=False, if_exists='replace')


In [None]:
# List the tables present in the database by querying sqlite_master.
query = '''Select name from sqlite_master where type='table' '''
# execute() returns the cursor itself, so the rows can be fetched from `output`.
output = cursor.execute(query)
print(output.fetchall())

In [None]:
%sql SELECT make,Round(AVG(price),2) as average_brand_price FROM USED_CAR group by make  

In [None]:
df.head()

In [None]:
# Count the cars per engine location and present the result as a one-column
# frame named 'value_counts', indexed by 'engine-location'.
# The Series is named before to_frame() because the column name that
# value_counts() produces differs between pandas versions, which made the
# previous rename(columns={'engine-location': ...}) a silent no-op on newer ones.
engine_loc_counts = (
    df['engine-location']
    .value_counts()
    .rename('value_counts')
    .to_frame()
    .rename_axis('engine-location')
)
engine_loc_counts.head(10)


In [None]:
df.groupby('body-style')['price'].mean()

## Model Development


We will develop several models that will predict the price of the car using the variables or features. This is just an estimate but should give us an objective idea of how much the car should cost.

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
lm=LinearRegression()
lm

In [None]:
# Simple linear regression set-up: highway-L/100 is the single predictor and
# price the target. Both are selected with double brackets, so X and Y are
# one-column DataFrames (2-D) rather than Series.
X=df[['highway-L/100']]
Y=df[['price']]


In [None]:
lm.fit(X,Y)


In [None]:
Yhat=lm.predict(X)
Yhat[0:5]

In [None]:
lm.coef_

In [None]:
lm.intercept_

In [None]:
lm1=LinearRegression()
lm1

In [None]:
X1=df[['engine-size']]
Y1=df[['price']]


In [None]:
lm1.fit(X1,Y1)

In [None]:
Yhat1=lm1.predict(X1)
Yhat1[0:5]

In [None]:
lm1.intercept_

In [None]:
lm1.coef_

In [None]:
# Fitted line: yhat = 166.86 * engine-size - 7963.34

In [None]:
# Regression plot of price against engine-size, with the x axis anchored at
# zero; the columns are referenced by name through `data`.
sns.regplot(data=df, x='engine-size', y='price')
plt.xlim(left=0)

In [None]:
df[['engine-size','price']].corr()

In [None]:
# Observed prices (red points) with the engine-size model's predictions
# overlaid as a green line.
plt.scatter(df['engine-size'], df['price'], color='red')
plt.plot(df['engine-size'], Yhat1, color='green')
plt.xlabel('Engine Size')
plt.ylabel('Price')

In [None]:
# Residuals of the engine-size model: observed price minus predicted price.
# Y is the price frame defined for the highway-L/100 model; it holds the same
# column as Y1, so the subtraction against Yhat1 is still price - prediction.
residuals=Y-Yhat1

In [None]:
# Residual plot for the engine-size model.
# residplot() regresses y on x itself and plots the residuals of that fit, so
# it is given the raw predictor and target rather than pre-computed residuals
# (which would plot residuals of residuals).
sns.residplot(x='engine-size', y='price', data=df, lowess=True)

In [None]:
# Compare the distribution of actual prices with the engine-size model's
# predictions.
# The predictions come back as a 2-D column because the model was fitted on a
# one-column frame; they are flattened so seaborn draws them as one variable
# with the requested colour rather than treating the array as wide-form data.
sns.kdeplot(df['price'], color='red', label='Actual')
sns.kdeplot(np.ravel(Yhat1), color='blue', label='Predicted')
plt.xlabel('Price')
# A KDE shows density, not counts.
plt.ylabel('Density')
plt.legend()
plt.show()

## Multiple Regression

If we want to use more variables in our model to predict car price, we can use Multiple Linear Regression. Multiple Linear Regression is very similar to Simple Linear Regression, but this method is used to explain the relationship between one continuous response (dependent) variable and two or more predictor (independent) variables. Most real-world regression models involve multiple predictors. We will illustrate the structure by using four predictor variables, but the results generalize to any number of predictors:

In [None]:
lm2=LinearRegression()
lm2

In [None]:
X2=df[['horsepower','curb-weight','engine-size','highway-L/100']]
lm2.fit(X2,df[['price']])


In [None]:
lm2.intercept_

In [None]:
lm2.coef_

In [None]:
Yhat_=lm2.predict(X2)
Yhat_[0:5]

In [None]:
# Compare the distribution of actual prices with the multiple-regression
# predictions.
# The predictions are flattened to 1-D so seaborn draws them as one variable
# with the requested colour rather than treating a 2-D array as wide-form data.
sns.kdeplot(df['price'], color='red', label='Actual')
sns.kdeplot(np.ravel(Yhat_), color='green', label='Predicted')
plt.title('Actual vs predicted values')
plt.legend()

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

# Error metrics and R^2 of the multiple-regression model, evaluated on the
# same data it was fitted on.
mae = mean_absolute_error(df['price'], Yhat_)
mse = mean_squared_error(df['price'], Yhat_)
r2 = lm2.score(X2, df['price'])
print(f'The mae is {mae} , mse is {mse} , r2 is {r2}')

In [None]:
# Residuals of the multiple-regression model: observed price minus the
# prediction, kept as a one-column DataFrame.
difference=df[['price']]-Yhat_

In [None]:
# Residuals-vs-fitted plot for the multiple-regression model.
# residplot() regresses y on x and plots the residuals of that fit, so it is
# given the model's predictions as x and the observed price as y. For a
# least-squares fit with an intercept, regressing the target on its own fitted
# values has slope 1 and intercept 0, so the residuals drawn are the model's.
# Passing pre-computed residuals as y would plot residuals of residuals.
sns.residplot(x=np.ravel(Yhat_), y=df['price'], color='blue', lowess=True)
plt.xlabel('Predicted price')
plt.ylabel('Residual')

## Polynomial Regression

Polynomial regression is a particular case of the general linear regression model or multiple linear regression models.

We get non-linear relationships by squaring or setting higher-order terms of the predictor variables.

In [None]:
def PlotPolly(model, independent_variable, dependent_variabble, Name):
    """Plot observed data together with a fitted polynomial curve.

    Parameters
    ----------
    model : callable
        Fitted polynomial (for example an ``np.poly1d``); it is called with an
        array of x values and must return the matching y values.
    independent_variable : array-like
        Observed predictor values, drawn on the x axis.
    dependent_variabble : array-like
        Observed target values, drawn on the y axis.
    Name : str
        Name of the predictor; used for the x-axis label and in the title.
    """
    # Evaluate the curve over the observed range of the predictor. A fixed
    # 15-55 window only suits a predictor on that scale and would draw the
    # curve away from the data for anything else (e.g. highway-L/100).
    x_values = np.asarray(independent_variable, dtype=float)
    x_new = np.linspace(np.nanmin(x_values), np.nanmax(x_values), 100)
    y_new = model(x_new)

    plt.plot(independent_variable, dependent_variabble, '.', x_new, y_new, '-')
    # Title names the predictor actually plotted instead of a fixed one.
    plt.title(f'Polynomial Fit with Matplotlib for Price ~ {Name}')
    ax = plt.gca()
    ax.set_facecolor((0.898, 0.898, 0.898))
    plt.xlabel(Name)
    plt.ylabel('Price of Cars')

    plt.show()
    plt.close()

In [None]:
x=df['highway-L/100']
y=df['price']


In [None]:
# Least-squares cubic fit of y on x; `f` holds the coefficients (highest power
# first) and `p` wraps them as a callable polynomial.
f = np.polyfit(x, y, deg=3)
p = np.poly1d(f)
print(f)

In [None]:
# Plot the cubic fit. x holds highway-L/100 (not highway-mpg), so the plot is
# labelled with that name.
PlotPolly(p, x, y, 'highway-L/100')

In [None]:
np.polyfit(x,y,3)

In [None]:
# Fit an 11th-degree polynomial to the same x/y for comparison with the cubic
# fit; c1 holds the coefficients and c2 the callable polynomial.
# NOTE(review): a degree this high on a single predictor is likely to overfit
# and may be numerically ill-conditioned — confirm it is intentional.
c1=np.polyfit(x,y,11)
c2=np.poly1d(c1)
print(c1)

In [None]:
PlotPolly(c2,x,y,'highway-L/100')

### Polynomial Features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
pr=PolynomialFeatures(2)
pr

In [None]:
z_pr=pr.fit_transform(X2)

In [None]:
X2.shape

In [None]:
z_pr.shape

### Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [None]:
input=[('scale',StandardScaler()),('polynomial',PolynomialFeatures(include_bias=False)),('model',LinearRegression())]

In [None]:
pipe=Pipeline(input)
pipe

In [None]:
X2=X2.astype(float)
pipe.fit(X2,df['price'])

In [None]:
ypipe=pipe.predict(X2)
ypipe[0:4]

In [None]:
sns.kdeplot(df['price'],color='red')
sns.kdeplot(ypipe,color='blue')


In [None]:
pipe.score(X2,df['price'])

In [None]:
# Mean absolute error and mean squared error of the pipeline's predictions,
# evaluated against the observed prices.
mae_pr = mean_absolute_error(df['price'], ypipe)
mse_pr = mean_squared_error(df['price'], ypipe)
print(f'{mae_pr},{mse_pr}')
