# Car Sales Prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sklearn as sk
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
plt.rcParams['figure.figsize'] = 6, 4
plt.rcParams['axes.grid'] = True

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

**Data Analysis**

In [None]:
#Data Audit
def continuous_var_summary(x):
    """Build a one-column audit of descriptive statistics for a numeric Series.

    Parameters
    ----------
    x : pandas.Series
        Column to summarise.

    Returns
    -------
    pandas.Series
        Values labelled 'N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR',
        'MIN', the percentiles 'P1' ... 'P99', and 'MAX', in that order.
    """
    percentile_points = [('P1', 0.01), ('P5', 0.05), ('P10', 0.10), ('P25', 0.25),
                         ('P50', 0.50), ('P75', 0.75), ('P90', 0.90), ('P95', 0.95),
                         ('P99', 0.99)]

    # Counts and moments first, in the order the audit table expects.
    labels = ['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN']
    values = [x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),
              x.std(), x.var(), x.min()]

    # Then one entry per percentile, and the maximum last.
    for label, q in percentile_points:
        labels.append(label)
        values.append(x.quantile(q))
    labels.append('MAX')
    values.append(x.max())

    return pd.Series(values, index = labels)

In [None]:
def categorical_var_summary(x):
    """Summarise a categorical Series: counts, most frequent value and its share.

    Parameters
    ----------
    x : pandas.Series
        Column to summarise.

    Returns
    -------
    pandas.Series
        Labelled 'N' (non-missing count), 'NMISS' (missing count), 'MODE'
        (most frequent value), 'FREQ' (its count) and 'PERCENT' (FREQ as a
        percentage of N, rounded to 2 decimals). For a column with no
        non-missing values, MODE and PERCENT are NaN and FREQ is 0 instead of
        raising IndexError.
    """
    labels = ['N', 'NMISS', 'MODE', 'FREQ', 'PERCENT']
    n_valid = x.count()
    n_missing = x.isnull().sum()

    # value_counts() already returns the counts in descending order, so the
    # first entry is the mode; no second sort is needed.
    counts = x.value_counts()
    if counts.empty:
        # Nothing to count: avoid indexing an empty result and dividing by zero.
        return pd.Series([n_valid, n_missing, float('nan'), 0, float('nan')],
                         index = labels)

    mode_value = counts.index[0]
    mode_freq = counts.iloc[0]
    return pd.Series([n_valid, n_missing, mode_value, mode_freq,
                      round(mode_freq * 100/n_valid, 2)],
                     index = labels)

In [None]:
# Missing value imputation for categorical and continuous variables
def missing_imputation(x, stats = 'mean'):
    """Fill missing values in a single column.

    Parameters
    ----------
    x : pandas.Series
        Column to impute.
    stats : str, default 'mean'
        For float64/int64 columns: 'mean' fills with the mean, any other value
        fills with the median. Ignored for columns of other dtypes.

    Returns
    -------
    pandas.Series
        A new Series. float64/int64 columns are filled with the chosen
        statistic; every other column is filled with its most frequent value.
        A column with no non-missing values is returned unchanged (as a copy).
    """
    if (x.dtypes == 'float64') | (x.dtypes == 'int64'):
        fill_value = x.mean() if stats == 'mean' else x.median()
        return x.fillna(fill_value)

    # Series.mode() returns a Series indexed 0..k-1. Passing that Series to
    # fillna would align on index labels and leave most rows unfilled, so
    # take the first mode as a scalar fill value instead.
    modes = x.mode()
    if modes.empty:
        return x.copy()
    return x.fillna(modes.iloc[0])

In [None]:
# A utility function to create dummy variables
def create_dummies(df, colname):
    """Replace one column of ``df`` with its dummy (indicator) columns.

    The dummy columns are prefixed with ``colname``, the first level is
    dropped, and they are appended after the remaining columns. The input
    frame is not modified; a new frame is returned.
    """
    indicator_cols = pd.get_dummies(df[colname], prefix = colname, drop_first = True)
    remaining_cols = df.drop(colname, axis = 1)
    return pd.concat([remaining_cols, indicator_cols], axis = 1)

In [None]:
# Load the raw car sales workbook; the path is relative to the working directory.
cars = pd.read_excel('car_sales.xlsx')

In [None]:
# Preview the first ten rows of the raw data.
cars.head(10)

In [None]:
# Summary statistics for every column, numeric or not (include='all').
cars.describe(include='all')

In [None]:
# Split the columns by dtype: float64/int64 columns are treated as continuous,
# object columns as categorical. Columns of any other dtype land in neither frame.
cars_conti_vars = cars.select_dtypes(include = ['float64', 'int64'])
cars_cat_vars = cars.select_dtypes(include = ['object'])
# Preview the first rows only instead of rendering the whole frame.
cars_cat_vars.head()

In [None]:
# Audit table for the continuous columns: one row per variable, rounded to 2 decimals.
cars_conti_vars.apply(continuous_var_summary).T.round(2)

In [None]:
# Audit table for the categorical columns: one row per variable.
cars_cat_vars.apply(categorical_var_summary).T

**Outlier Treatment**

In [None]:
def _clip_to_p1_p99(col):
    """Cap a column at its own 1st and 99th percentiles."""
    floor, ceiling = col.quantile(0.01), col.quantile(0.99)
    return col.clip(lower = floor, upper = ceiling)

# Cap every continuous column, then re-run the audit to inspect the effect.
cars_conti_vars = cars_conti_vars.apply(_clip_to_p1_p99)
cars_conti_vars.apply(continuous_var_summary).T.round(2)

**Missing Value Treatment**

In [None]:
# Impute missing values column by column (missing_imputation with its default
# stats='mean'), then re-run the audit on the continuous columns.
cars_conti_vars = cars_conti_vars.apply(missing_imputation)
cars_cat_vars = cars_cat_vars.apply(missing_imputation)
cars_conti_vars.apply(continuous_var_summary).T.round(1)

**Dealing with Categorical Values**

In [None]:
# Number of rows per manufacturer.
cars_cat_vars.Manufacturer.value_counts()

In [None]:
# Encode Manufacturer and Vehicle_type as dummy (indicator) columns.
# The two columns are selected from `cars` and passed through
# missing_imputation so the categorical imputation is not lost here.
# .apply returns a new frame, so the column assignment in the loop never
# writes into a slice of `cars`.
DUMMY_FEATURES = ['Manufacturer', 'Vehicle_type']
cars_cat_vars = cars[DUMMY_FEATURES].apply(missing_imputation)

for c_feature in DUMMY_FEATURES:
    cars_cat_vars[c_feature] = cars_cat_vars[c_feature].astype('category')
    cars_cat_vars = create_dummies(cars_cat_vars, c_feature)

**Final Data**

In [None]:
# Assemble the modelling frame: continuous columns side by side with the dummy columns.
cars_new = pd.concat([cars_conti_vars, cars_cat_vars], axis = 1)
cars_new.head(5)

In [None]:
# First five rows of the modelling frame, selected by position.
cars_new.iloc[0:5,:]

In [None]:
# Distribution of the raw target. sns.distplot is deprecated in seaborn;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(cars_new.Sales_in_thousands, kde = True, stat = 'density')
plt.show()

In [None]:
# Apply a log transformation: log(1 + x) rescales the data and is intended to
# bring the distribution closer to normal. np.log1p computes log(1 + x) directly.
cars_new['ln_sales_in_thousands'] = np.log1p(cars_new['Sales_in_thousands'])

# Distribution of the transformed target (histplot replaces the deprecated sns.distplot).
sns.histplot(cars_new.ln_sales_in_thousands, kde = True, stat = 'density')
plt.show()


In [None]:
# Linearity: correlation matrix (ranges from 1 to -1).
# Computed over every column of cars_new and also written to corrm.csv in the
# working directory; only the first five rows are displayed.
corrm = cars_new.corr()
corrm.to_csv('corrm.csv')
corrm.head(5)

In [None]:
# Column names of the raw frame.
cars.columns

In [None]:
# Drop the resale-value column. Re-binding the name (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second
# execution would otherwise raise KeyError because the column is already gone.
cars_new = cars_new.drop(columns = ['__year_resale_value'], errors = 'ignore')

In [None]:
# Dropping variables based on low correlation with Y (currently disabled):
#cars_new.drop(['four_year_resale_value', 'Power_perf_factor'], axis = 1, inplace = True)
cars_new.head(5)

In [None]:
# (rows, columns) of the modelling frame.
cars_new.shape

In [None]:
# Splitting the data: separate the feature (input/independent) columns from the
# dependent variable. Features are every column except the raw and the
# log-transformed sales.
#cars_new.columns
feature_columns = cars_new.columns.difference(['ln_sales_in_thousands', 'Sales_in_thousands'])
feature_columns

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the feature columns. Note: this scaler is fitted on all rows,
# i.e. before any train/test split is made.
standard = StandardScaler()
x = cars_new.loc[:, feature_columns].values
scaled_data = standard.fit_transform(x)
scaled_data


In [None]:
# Split first, then fit the scaler on the training rows only, so that test-set
# statistics do not leak into the features the models are trained on.
# The target is the log-transformed sales column.
train_X_raw, test_X_raw, train_y, test_y = train_test_split(
    cars_new.loc[:, feature_columns].values,
    cars_new['ln_sales_in_thousands'],
    test_size = 0.2, random_state = 1000)

train_scaler = StandardScaler().fit(train_X_raw)
train_X = train_scaler.transform(train_X_raw)
test_X = train_scaler.transform(test_X_raw)

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression fitted on the training split.
LR = LinearRegression()
LR.fit(train_X,train_y)


In [None]:
# Predict the log-scale target for the test split.
y_pred =  LR.predict(test_X)

In [None]:
# Score the linear model on the test split and print each metric on its own line.
mae = mean_absolute_error(test_y, y_pred)
mse = mean_squared_error(test_y, y_pred)
r2 = r2_score(test_y, y_pred)
for metric_label, metric_value in (
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("R2 Score:", r2),
):
    print(metric_label, metric_value)

In [None]:
# Actual vs predicted values for the first ten test rows (linear regression).
print("Linear Regression")
pred_df=pd.DataFrame({'Actual Value':test_y,'Predicted Value':y_pred})
pred_df.head(10)

In [None]:
# Scatter of actual vs predicted values with a fitted line; ci=None disables the confidence band.
sns.regplot(x=test_y,y=y_pred,ci=None,color ='blue');
plt.title("Linear Regression")

## Applying AdaBoost Regression

In [None]:
from sklearn.ensemble import AdaBoostRegressor
# AdaBoost regressor with 40 estimators and a fixed random_state, fitted on the training split.
model=AdaBoostRegressor(n_estimators=40,random_state=4)
model.fit(train_X,train_y)


In [None]:
# Test-split predictions and R2 for the AdaBoost model.
y_predAB=model.predict(test_X)
print("R2 Score Using Adaboost",r2_score(test_y,y_predAB))


In [None]:
# Actual vs predicted values for the first ten test rows (AdaBoost).
print("Adaboost Regression")
pred_df=pd.DataFrame({'Actual Value':test_y,'Predicted Value':y_predAB})
pred_df.head(10)

In [None]:
# Scatter of actual vs AdaBoost-predicted values with a fitted line (no confidence band).
sns.regplot(x=test_y,y=y_predAB,ci=None,color ='blue');
plt.title("Adaboost Regression")

## Applying Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 15 trees and a fixed random_state; re-binds the shared `model` name.
model=RandomForestRegressor(n_estimators=15,random_state=4)
model.fit(train_X,train_y)


In [None]:
# Test-split predictions and R2 for the random forest; the value is printed without a label.
y_predRF=model.predict(test_X)
r2=r2_score(test_y,y_predRF)
print(r2)

In [None]:
# Actual vs predicted values for the first ten test rows (random forest).
print("Random Forest Regression")
pred_df=pd.DataFrame({'Actual Value':test_y,'Predicted Value':y_predRF})
pred_df.head(10)

In [None]:
# Scatter of actual vs random-forest-predicted values with a fitted line (no confidence band).
sns.regplot(x=test_y,y=y_predRF,ci=None,color ='blue');
plt.title("Random Forest")

## Applying KNN Regression

In [None]:
from sklearn import neighbors

# Score a KNN regressor for every k in 1..50 and record the test-split R2 per k.
# NOTE(review): k is chosen on the test split, so the reported R2 is optimistic.
r2_val={}
for n_neighbors in range(1, 51):
    candidate=neighbors.KNeighborsRegressor(n_neighbors=n_neighbors)
    candidate.fit(train_X,train_y)
    r2_val[n_neighbors]=r2_score(test_y,candidate.predict(test_X))

v=list(r2_val.values())
k=list(r2_val.keys())

# Refit with the best-scoring k so that `model`, `y_predKNN` and `r2` describe
# the same model as the score printed below, rather than whichever k the loop
# happened to finish on. Ties resolve to the smallest k.
best_k=max(r2_val, key=r2_val.get)
model=neighbors.KNeighborsRegressor(n_neighbors=best_k)
model.fit(train_X,train_y)
y_predKNN=model.predict(test_X)
r2=r2_val[best_k]
print("max r2 score in KNN: ",max(v))


In [None]:
# Actual vs predicted values for the first ten test rows, using y_predKNN as left by the KNN cell.
print("KNeighbors")
pred_df=pd.DataFrame({'Actual Value':test_y,'Predicted Value':y_predKNN})
pred_df.head(10)

In [None]:
# Scatter of actual vs KNN-predicted values with a fitted line (no confidence band).
sns.regplot(x=test_y,y=y_predKNN,ci=None,color ='blue');
plt.title("KNeighbors")

## Applying Support Vector Machine (SVM) Regression

In [None]:
from sklearn.svm import SVR
# RBF-kernel support vector regressor. train_y is passed as-is, exactly as for
# the other models: it is already one-dimensional, and Series.ravel() is
# deprecated in recent pandas releases.
model=SVR(kernel='rbf')
model.fit(train_X,train_y)


In [None]:
# Test-split predictions and R2 for the SVR model.
y_predSVM=model.predict(test_X)
r2=r2_score(test_y,y_predSVM)
print("R2 Score in SVR",r2)

In [None]:
# Actual vs predicted values for the first ten test rows (SVR).
print("Support Vector regression")
pred_df=pd.DataFrame({'Actual Value':test_y,'Predicted Value':y_predSVM})
pred_df.head(10)

In [None]:
# Scatter of actual vs SVR-predicted values with a fitted line (no confidence band).
sns.regplot(x=test_y,y=y_predSVM,ci=None,color ='blue');
plt.title("SVR")

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Regression tree with ccp_alpha=0.02 and a fixed random_state,
# fitted on the training split and scored on the test split.
dtree=DecisionTreeRegressor(ccp_alpha=0.02,random_state=30)
dtree.fit(train_X,train_y)
y_predDT=dtree.predict(test_X)
print("Decision Tree r2 score")
# The bare expression below is rendered as the cell's output.
r2_score(test_y,y_predDT)

In [None]:
# Actual vs predicted values for the first ten test rows (decision tree).
print("Decision Tree")
pred_df=pd.DataFrame({'Actual Value':test_y,'Predicted Value':y_predDT})
pred_df.head(10)

In [None]:
from sklearn import tree
# Draw the fitted tree. feature_names maps each split back to its column name;
# the training matrix is built from cars_new.loc[:, feature_columns], so the
# order matches. Without it the nodes are labelled by feature position only.
plt.figure(figsize=(15,10))
tree.plot_tree(dtree, feature_names=list(feature_columns))
plt.show()

In [None]:
# Scatter of actual vs decision-tree-predicted values with a fitted line (no confidence band).
sns.regplot(x=test_y,y=y_predDT,ci=None,color ='blue');
plt.title("Decision Tree")

In [None]:
# Collect the actual test values and every model's predictions in one frame,
# one column per model, and show the first ten rows.
result=pd.DataFrame({'test values':test_y,
    'Linear Regression':y_pred,'Adaboost':y_predAB,'Random Forest':y_predRF,'KNeighbors':y_predKNN,'SVR':y_predSVM,'Decision Tree':y_predDT})
result.head(10)

In [None]:
# Overlay each model's predictions against the actual test values on one axes.
f = plt.figure()
f.set_figwidth(12)
f.set_figheight(8)
ax = f.gca()

# One line per model column of `result`; 'test values' is the shared x axis.
for method in result.columns.drop('test values'):
    sns.lineplot(x='test values', y=method, data=result, label=method, ax=ax)

# Labels are set after plotting so seaborn's automatic axis labels cannot
# replace them; the y axis holds predicted values, not method names.
ax.set_xlabel('Test Values')
ax.set_ylabel('Predicted Value');