# Car Sales Prediction Using Deep Learning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sklearn as sk
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
# Notebook-wide plot defaults: 6x4 inch figures with the axes grid switched on.
plt.rcParams.update({'figure.figsize': (6, 4), 'axes.grid': True})

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

**Data Analysis**

In [None]:
# Data audit helper for continuous variables
def continuous_var_summary(x):
    """Build a one-column audit of a numeric Series.

    Parameters
    ----------
    x : pandas.Series
        Numeric column to summarise.

    Returns
    -------
    pandas.Series
        Count, missing count, sum, mean, median, standard deviation, variance,
        minimum, a ladder of percentiles (P1 ... P99) and maximum, labelled
        ``N``, ``NMISS``, ``SUM``, ``MEAN``, ``MEDIAN``, ``STD``, ``VAR``,
        ``MIN``, ``P1`` ... ``P99``, ``MAX`` in that order.
    """
    summary = {
        'N': x.count(),
        'NMISS': x.isnull().sum(),
        'SUM': x.sum(),
        'MEAN': x.mean(),
        'MEDIAN': x.median(),
        'STD': x.std(),
        'VAR': x.var(),
        'MIN': x.min(),
    }

    # Percentile ladder, emitted between MIN and MAX
    percentile_points = (
        ('P1', 0.01), ('P5', 0.05), ('P10', 0.10), ('P25', 0.25), ('P50', 0.50),
        ('P75', 0.75), ('P90', 0.90), ('P95', 0.95), ('P99', 0.99),
    )
    for label, fraction in percentile_points:
        summary[label] = x.quantile(fraction)

    summary['MAX'] = x.max()

    return pd.Series(list(summary.values()), index=list(summary.keys()))

In [None]:
def categorical_var_summary(x):
    """Summarise a categorical column: counts plus its most frequent value.

    Parameters
    ----------
    x : pandas.Series
        Column to audit.

    Returns
    -------
    pandas.Series
        Indexed by ``N`` (non-null count), ``NMISS`` (null count), ``MODE``
        (most frequent value), ``FREQ`` (occurrences of the mode) and
        ``PERCENT`` (``FREQ`` as a percentage of ``N``, rounded to 2 places).
        When the column has no non-null values, ``MODE`` and ``PERCENT`` are
        NaN and ``FREQ`` is 0.
    """
    labels = ['N', 'NMISS', 'MODE', 'FREQ', 'PERCENT']
    n_present = x.count()
    n_missing = x.isnull().sum()

    # value_counts() is empty for an all-null (or empty) column; indexing into
    # it would raise IndexError and PERCENT would divide by zero.
    if n_present == 0:
        return pd.Series([n_present, n_missing, float('nan'), 0, float('nan')], index=labels)

    # Single-row frame: column 0 holds the modal value, column 1 its frequency
    Mode = x.value_counts().sort_values(ascending = False).iloc[0:1].reset_index()
    mode_value = Mode.iloc[0, 0]
    mode_freq = Mode.iloc[0, 1]
    return pd.Series([n_present, n_missing, mode_value, mode_freq,
                          round(mode_freq * 100/n_present, 2)],
                  index = labels)

In [None]:
# Missing value imputation for categorical and continuous variables
def missing_imputation(x, stats = 'mean'):
    """Fill missing values in a single column.

    Parameters
    ----------
    x : pandas.Series
        Column to impute.
    stats : str, default 'mean'
        For ``float64``/``int64`` columns: ``'mean'`` fills with the column
        mean, any other value fills with the median. Ignored for other dtypes.

    Returns
    -------
    pandas.Series
        ``float64``/``int64`` columns are filled with the mean or median; all
        other columns are filled with the (first) mode. A column with no
        non-null values has no mode and is returned unchanged.
    """
    if (x.dtypes == 'float64') or (x.dtypes == 'int64'):
        fill_value = x.mean() if stats == 'mean' else x.median()
        return x.fillna(fill_value)

    # Series.mode() returns a Series labelled 0..k-1. Passing that Series to
    # fillna aligns on index labels, so only a NaN sitting at label 0 would be
    # filled. Use the scalar first mode instead.
    modes = x.mode()
    if modes.empty:
        return x
    return x.fillna(modes.iloc[0])

In [None]:
# A utility function to create dummy variables
def create_dummies(df, colname):
    """Replace one column of ``df`` with its dummy (indicator) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    colname : str
        Column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``colname`` removed and its indicator columns
        (prefixed with ``colname``, first level dropped) appended on the right.
    """
    indicator_cols = pd.get_dummies(df[colname], prefix=colname, drop_first=True)
    widened = pd.concat([df, indicator_cols], axis=1)
    return widened.drop(columns=colname)

In [None]:
# Raw car-sales workbook, read from a path relative to the working directory
DATA_FILE = 'car_sales.xlsx'
cars = pd.read_excel(DATA_FILE)

In [None]:
# Preview the first rows rather than rendering the whole frame as cell output
cars.head()

In [None]:
# Summary statistics for every column; include='all' covers non-numeric columns too
cars.describe(include='all')

In [None]:
# Partition the columns by dtype: float64/int64 are treated as continuous,
# object columns as categorical. Columns of any other dtype land in neither frame.
column_dtypes = cars.dtypes
is_continuous = (column_dtypes == 'float64') | (column_dtypes == 'int64')
is_categorical = column_dtypes == 'object'

cars_conti_vars = cars.loc[:, is_continuous]
cars_cat_vars = cars.loc[:, is_categorical]

In [None]:
# Audit table: one row per continuous column, statistics rounded to 2 decimals
cars_conti_vars.apply(continuous_var_summary).T.round(2)

In [None]:
# Audit table: one row per categorical column (N, NMISS, MODE, FREQ, PERCENT)
cars_cat_vars.apply(categorical_var_summary).T

**Outlier Treatment**

In [None]:
def _clip_to_p1_p99(col):
    """Cap a column at its own 1st and 99th percentiles."""
    lower_cap = col.quantile(0.01)
    upper_cap = col.quantile(0.99)
    return col.clip(lower=lower_cap, upper=upper_cap)


cars_conti_vars = cars_conti_vars.apply(_clip_to_p1_p99)

# Re-run the audit to see the effect of the capping
cars_conti_vars.apply(continuous_var_summary).T.round(2)

**Missing Value Treatment**

In [None]:
# Impute column by column with the default strategy of missing_imputation
cars_conti_vars, cars_cat_vars = (
    frame.apply(missing_imputation) for frame in (cars_conti_vars, cars_cat_vars)
)

# Audit again; NMISS shows what is still missing after imputation
cars_conti_vars.apply(continuous_var_summary).T.round(1)

**Dealing with Categorical Values**

In [None]:
# Number of rows per manufacturer
cars_cat_vars['Manufacturer'].value_counts()

In [None]:
# Categorical features that get one-hot encoded
categorical_features = ['Manufacturer', 'Vehicle_type']

# Start from the raw frame so the cell can be re-run safely, but re-apply the
# imputation: selecting straight from `cars` would silently discard the
# missing-value treatment done earlier. `apply` also returns a new frame, so
# the column assignment below no longer writes into a slice of `cars`
# (which triggered SettingWithCopyWarning).
cars_cat_vars = cars[categorical_features].apply(missing_imputation)

for c_feature in categorical_features:
    cars_cat_vars[c_feature] = cars_cat_vars[c_feature].astype('category')
    cars_cat_vars = create_dummies(cars_cat_vars, c_feature)

**Final Data**

In [None]:
# Modelling frame: treated continuous columns side by side with the dummy columns
model_frames = [cars_conti_vars, cars_cat_vars]
cars_new = pd.concat(model_frames, axis='columns')
cars_new.head(3)

In [None]:
# Distribution of the raw target.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(cars_new.Sales_in_thousands, kde=True, stat='density')
plt.show()

In [None]:
# Apply a log transformation: it compresses the scale of the target so its
# distribution is closer to normal. log1p(x) computes log(1 + x); its inverse
# is expm1, not exp.
cars_new['ln_sales_in_thousands'] = np.log1p(cars_new['Sales_in_thousands'])

# Distribution of the transformed target.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(cars_new.ln_sales_in_thousands, kde=True, stat='density')
plt.show()


In [None]:
# List the columns currently in the modelling frame
cars_new.columns

In [None]:
# Remove the two columns that are excluded from modelling
excluded_columns = ['Power_perf_factor', '__year_resale_value']
cars_new = cars_new.drop(columns=excluded_columns)

In [None]:
# Preview the frame after the column drops; per the original note, the dropped
# variables were removed for their low correlation with Y.
cars_new.head(3)

In [None]:
# (n_rows, n_columns) of the modelling frame
cars_new.shape

In [None]:
# Feature (input / independent) columns are everything except the raw target
# and its log transform. Index.difference returns them in sorted order.
target_columns = ['ln_sales_in_thousands', 'Sales_in_thousands']
feature_columns = cars_new.columns.difference(target_columns)
feature_columns

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Separate Target Variable and Predictor Variables
TargetVariable = ['ln_sales_in_thousands']

X = cars_new.drop(columns=['ln_sales_in_thousands', 'Sales_in_thousands'])
y = cars_new[TargetVariable].values

# Split the data into training and testing sets BEFORE scaling, so the test
# rows never contribute to the scaler statistics (fitting the scalers on the
# full data leaks test-set information into training).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Standardization of data ###
PredictorScaler = StandardScaler()
TargetVarScaler = StandardScaler()

# Storing the fit objects for later reference; fitted on the training rows only
PredictorScalerFit = PredictorScaler.fit(X_train)
TargetVarScalerFit = TargetVarScaler.fit(y_train)

# Generating the standardized values of the train and test partitions
X_train = PredictorScalerFit.transform(X_train)
X_test = PredictorScalerFit.transform(X_test)
y_train = TargetVarScalerFit.transform(y_train)
y_test = TargetVarScalerFit.transform(y_test)

# Quick sanity check with the shapes of Training and testing datasets
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the standardized training data.
# fit() returns the estimator itself, so construction and fitting chain.
LR = LinearRegression().fit(X_train, y_train)
LR


In [None]:
# Linear-regression predictions for the test rows, on the standardized target scale
y_pred =  LR.predict(X_test)


In [None]:
# Undo the target standardization before scoring. The neural-network metrics
# later in the notebook are computed on the log-sales scale, so evaluating the
# linear model on standardized values made MAE/MSE incomparable between the two.
lr_actual_log = TargetVarScalerFit.inverse_transform(y_test)
lr_pred_log = TargetVarScalerFit.inverse_transform(y_pred)

mae = mean_absolute_error(lr_actual_log, lr_pred_log)
mse = mean_squared_error(lr_actual_log, lr_pred_log)
r2 = r2_score(lr_actual_log, lr_pred_log)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2 Score:", r2)

In [None]:
# Frame restricted to the predictor columns; later cells read predictors.columns
# to label the back-transformed test data.
predictor_names = cars_new.drop(columns=['ln_sales_in_thousands', 'Sales_in_thousands']).columns
predictors = pd.DataFrame(data=cars_new, columns=predictor_names)

In [None]:
# Undo the target standardization for the linear-regression predictions and
# for the held-out truth (both come back on the log-sales scale)
Predictions = TargetVarScalerFit.inverse_transform(y_pred)
y_test_orig = TargetVarScalerFit.inverse_transform(y_test)

# Undo the predictor standardization for the test rows
LRTest_Data = PredictorScalerFit.inverse_transform(X_test)

# Test rows with the actual and predicted values appended as the last two columns
LRTestingData = pd.DataFrame(data=LRTest_Data, columns=predictors.columns).assign(
    Value=y_test_orig,
    PredictedValue=Predictions,
)
LRTestingData.head(10)

In [None]:
# Back-transform from the log scale to sales in thousands.
# ln_sales_in_thousands was built as log(sales + 1), so the inverse is
# exp(v) - 1 (np.expm1); plain np.exp left every value too high by 1.
LRresult = pd.DataFrame({
    'Actual': np.expm1(LRTestingData['Value']),
    'Predicted': np.expm1(LRTestingData['PredictedValue']),
})
LRresult = LRresult.astype({'Actual': float, 'Predicted': float})

LRresult.head(10)

In [None]:
## Applying deep learning
# importing the libraries
from keras.models import Sequential
from keras.layers import Dense

# create ANN model
model = Sequential()

# First (hidden) layer: 5 ReLU units. The input width is taken from the
# training matrix instead of a hard-coded 28, so the model keeps working when
# the number of feature columns changes.
model.add(Dense(units=5, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))

# The output neuron is a single fully connected node
# since we will be predicting a single number
model.add(Dense(1, kernel_initializer='normal'))

# Compiling the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Fitting the ANN to the Training set
model.fit(X_train, y_train, batch_size=10, epochs=100, verbose=1)

In [None]:
# Neural-network predictions for the test rows, mapped back from the
# standardized target scale to the log-sales scale
Predictions = TargetVarScalerFit.inverse_transform(model.predict(X_test))

# Held-out truth on the same log-sales scale
y_test_orig = TargetVarScalerFit.inverse_transform(y_test)

# Test predictors mapped back to their original units
Test_Data = PredictorScalerFit.inverse_transform(X_test)

# Test rows with the actual and predicted values appended as the last two columns
TestingData = pd.DataFrame(data=Test_Data, columns=predictors.columns).assign(
    Value=y_test_orig,
    PredictedValue=Predictions,
)
TestingData.head(10)

In [None]:
# Neural-network metrics on the log-sales scale.
# R2 is a goodness-of-fit score for regression, not an accuracy, so it is
# labelled as such.
print('R2 score of the model is', r2_score(y_test_orig, Predictions))
print('MSE', mean_squared_error(y_test_orig, Predictions))
print('MAE', mean_absolute_error(y_test_orig, Predictions))

In [None]:
# Back-transform from the log scale to sales in thousands.
# ln_sales_in_thousands was built as log(sales + 1), so the inverse is
# exp(v) - 1 (np.expm1); plain np.exp left every value too high by 1.
fresult = pd.DataFrame({
    'Actual': np.expm1(TestingData['Value']),
    'Predicted': np.expm1(TestingData['PredictedValue']),
})
fresult = fresult.astype({'Actual': float, 'Predicted': float})

fresult.head(10)

In [None]:
# Side-by-side comparison of the two models. The 'Actual' column is taken from
# the linear-regression frame only, since both frames share the same test rows.
comparison_columns = {
    'Actual(All in thousands)': LRresult.Actual,
    'Linear Regression.Predicted': LRresult.Predicted,
    'Neural Network.Predicted': fresult.Predicted,
}
Gresult = pd.DataFrame(comparison_columns)
Gresult.head(10)

In [None]:
# Predicted vs. actual sales for both models on one explicit 8x8 inch axes
plt.rcParams['axes.grid'] = True
f, ax = plt.subplots(figsize=(8, 8))

sns.lineplot(x=LRresult['Actual'], y=LRresult['Predicted'], label='Linear Regression', ax=ax)
sns.lineplot(x=fresult['Actual'], y=fresult['Predicted'], label='Neural Network', ax=ax)

# A figure should stand alone when the notebook is skimmed
ax.set(
    title='Predicted vs. actual sales',
    xlabel='Actual sales (thousands)',
    ylabel='Predicted sales (thousands)',
)
plt.show()
