# Motorcycle sales analysis

In [17]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns",100)

from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error,r2_score

from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv("sales_data.csv") #https://www.kaggle.com/datasets/devijeganath/motorcycle-sales-analysis

### EDA

In [16]:
df

Unnamed: 0,warehouse,client_type,product_line,quantity,unit_price,total,payment,day,month,year
0,0,0,4,8,16.85,134.83,0,1,6,2021
1,1,0,0,9,19.29,173.61,2,1,6,2021
2,1,0,1,8,32.93,263.45,0,1,6,2021
3,1,1,3,16,37.84,605.44,1,1,6,2021
4,0,0,5,2,60.48,120.96,0,1,6,2021
...,...,...,...,...,...,...,...,...,...,...
995,0,0,2,9,32.87,295.83,0,28,8,2021
996,2,1,0,32,10.03,320.96,1,28,8,2021
997,2,1,2,12,32.80,393.64,1,28,8,2021
998,1,0,3,5,48.25,241.23,2,28,8,2021


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   warehouse     1000 non-null   int32  
 1   client_type   1000 non-null   int32  
 2   product_line  1000 non-null   int32  
 3   quantity      1000 non-null   int64  
 4   unit_price    1000 non-null   float64
 5   total         1000 non-null   float64
 6   payment       1000 non-null   int32  
 7   day           1000 non-null   int32  
 8   month         1000 non-null   int32  
 9   year          1000 non-null   int32  
dtypes: float64(2), int32(7), int64(1)
memory usage: 50.9 KB


In [5]:
df.isnull().sum() #We examine the empty lines.

date            0
warehouse       0
client_type     0
product_line    0
quantity        0
unit_price      0
total           0
payment         0
dtype: int64

In [7]:
df["product_line"].value_counts()

Breaking system          230
Suspension & traction    228
Electrical system        193
Frame & body             166
Miscellaneous            122
Engine                    61
Name: product_line, dtype: int64

In [8]:
df["client_type"].value_counts()

Retail       775
Wholesale    225
Name: client_type, dtype: int64

In [9]:
df["warehouse"].value_counts()

Central    480
North      340
West       180
Name: warehouse, dtype: int64

In [10]:
df["payment"].value_counts()

Credit card    659
Transfer       225
Cash           116
Name: payment, dtype: int64

### Feature Engineering

In [11]:
df["product_line"]=df["product_line"].map({"Breaking system":0,"Suspension & traction":1,"Electrical system":2,"Frame & body":3,"Miscellaneous":4,"Engine":5})
#We replace the data in product_line with 0, 1, 2, 3, 4 and 5.
df['product_line']=df['product_line'].astype(int) #We change the data type of product_line to integer.

df["client_type"]=df["client_type"].map({"Retail":0,"Wholesale":1})
df['client_type']=df['client_type'].astype(int)

df["warehouse"]=df["warehouse"].map({"Central":0,"North":1,"West":2})
df['warehouse']=df['warehouse'].astype(int)

df["payment"]=df["payment"].map({"Credit card":0,"Transfer":1,"Cash":2})
df['payment']=df['payment'].astype(int)

In [12]:
df["date"]=pd.to_datetime(df["date"])
df["day"]=(df["date"]).dt.day
df["month"]=(df["date"]).dt.month
df["year"]=(df["date"]).dt.year
del df["date"]
df['day']=df['day'].astype(int)
df['month']=df['month'].astype(int)
df['year']=df['year'].astype(int)
#We divide date into day, month and year.

In [20]:
abs(df.corr()["total"].sort_values(ascending=False)) #We look at their correlations.

total           1.000000
quantity        0.870207
client_type     0.656483
unit_price      0.372942
payment         0.275685
product_line    0.217674
month           0.039390
warehouse       0.031947
day             0.061492
year                 NaN
Name: total, dtype: float64

### Regression

In [23]:
x,y=df.drop(["total","month","warehouse","day","year"],axis=1),df[["total"]]
x=scaler.fit_transform(x)
x.shape

(1000, 5)

In [25]:
def algo_test(x,y):
    L = LinearRegression()
    E = ElasticNet()
    R = Ridge()
    Lass = Lasso()
    ETR=ExtraTreeRegressor()
    GBR=GradientBoostingRegressor()
    XGBC= XGBRegressor()
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2,random_state=13)
    algos = [L,E,R,Lass,ETR,GBR,XGBC]
    algo_names = ['Linear','ElasticNet','Ridge','Lasso','Extra Tree','Gradient Boosting','XGradientBooting']
    r_squared = []
    rmse = []
    mae = []
    result = pd.DataFrame(columns = ['R_Squared','RMSE','MAE'],index = algo_names)
    for algo in algos:
        algo.fit(x_train,y_train)    
        r_squared.append(r2_score(y_test,algo.predict(x_test)))
        rmse.append(mean_squared_error(y_test, algo.predict(x_test))**.5)
        mae.append(mean_absolute_error(y_test, algo.predict(x_test)))
    result.R_Squared = r_squared
    result.RMSE = rmse
    result.MAE= mae
    return result.sort_values('R_Squared', ascending=False)

In [26]:
algo_test(x,y)

Unnamed: 0,R_Squared,RMSE,MAE
Gradient Boosting,0.993412,29.562517,13.045627
XGradientBooting,0.987806,40.219284,12.4101
Extra Tree,0.987079,41.40125,16.97645
Lasso,0.859722,136.411966,72.963078
Linear,0.859273,136.630471,75.497196
Ridge,0.859268,136.632623,73.758383
ElasticNet,0.327339,298.714716,187.590509


In [27]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=42)

In [28]:
model=Sequential()
model.add(Dense(60,activation="relu"))
model.add(Dense(60,activation="relu"))
model.add(Dense(60,activation="relu"))
model.add(Dense(60,activation="relu"))
model.add(Dense(60,activation="relu"))
model.add(Dense(1))
model.compile(optimizer="adam",loss="mse")

In [29]:
model.fit(x_train,y_train,validation_data=(x_test,y_test),batch_size=10,epochs=500)
model.summary()

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 225/500
Epoch 226/500
Epoch 227/500
Epoch 228/500
Epoch 229/500
Epoch 

Epoch 317/500
Epoch 318/500
Epoch 319/500
Epoch 320/500
Epoch 321/500
Epoch 322/500
Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 379/500
Epoch 380/500
Epoch 381/500
Epoch 382/500
Epoch 383/500
Epoch 384/500
Epoch 385/500
Epoch 386/500
Epoch 387/500
Epoch 

Epoch 475/500
Epoch 476/500
Epoch 477/500
Epoch 478/500
Epoch 479/500
Epoch 480/500
Epoch 481/500
Epoch 482/500
Epoch 483/500
Epoch 484/500
Epoch 485/500
Epoch 486/500
Epoch 487/500
Epoch 488/500
Epoch 489/500
Epoch 490/500
Epoch 491/500
Epoch 492/500
Epoch 493/500
Epoch 494/500
Epoch 495/500
Epoch 496/500
Epoch 497/500
Epoch 498/500
Epoch 499/500
Epoch 500/500
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (10, 60)                  360       
                                                                 
 dense_1 (Dense)             (10, 60)                  3660      
                                                                 
 dense_2 (Dense)             (10, 60)                  3660      
                                                                 
 dense_3 (Dense)             (10, 60)                  3660      
                      

In [30]:
tahmin=model.predict(x_test)



In [31]:
r2_score(tahmin,y_test)

0.9992183336843911

In [32]:
(mean_squared_error(tahmin,y_test))**0.5

7.017635927850327