In [1]:
import pandas as pd
import numpy as np
import datetime
import nbformat

import seaborn as sns
pal = sns.color_palette()

import plotly.express as px
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly import tools
import plotly.io as pio
import io
pio.renderers.default = "vscode"

import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')

from sklearn.preprocessing import  StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, explained_variance_score, r2_score, mean_squared_error, f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from IPython.display import display
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"
pd.options.display.max_columns = 50
%matplotlib inline
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

 # FILE READING AND DATA CLEANING

In [3]:
# Show every column when a frame is displayed (no truncation)
pd.options.display.max_columns = None

# Read the pricing CSV from the local assets folder (pandas' default separator is used)
print("Loading dataset...")
df = pd.read_csv("./assets/get_around_pricing_project.csv")
print("...Done.")

Loading dataset...
...Done.


In [4]:
# Quick overview of the frame: size, one sample row, summary statistics, missing values
n_rows, n_cols = df.shape
print("Number of rows : {}".format(n_rows))
print("Number of columns : {}".format(n_cols))
print()

print("Display of dataset: ")
display(df.head(1))
print()

# include='all' also summarises the non-numeric columns (unique / top / freq)
print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")

# Count nulls once, then report them as absolute counts and as a share of all rows
missing_counts = df.isnull().sum()
pd.DataFrame({
    'Missing Records': missing_counts,
    'Percentage (%)': 100 * missing_counts / len(df),
})


Number of rows : 4843
Number of columns : 15

Display of dataset: 


Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106



Basics statistics: 


Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4843.0,4843,4843.0,4843.0,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843.0
unique,,28,,,4,10,8,2,2,2,2,2,2,2,
top,,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,,969,,,4641,1633,1606,2662,3839,3865,3881,2613,3674,4514,
mean,2421.0,,140962.8,128.98823,,,,,,,,,,,121.214536
std,1398.198007,,60196.74,38.99336,,,,,,,,,,,33.568268
min,0.0,,-64.0,0.0,,,,,,,,,,,10.0
25%,1210.5,,102913.5,100.0,,,,,,,,,,,104.0
50%,2421.0,,141080.0,120.0,,,,,,,,,,,119.0
75%,3631.5,,175195.5,135.0,,,,,,,,,,,136.0



Percentage of missing values: 


Unnamed: 0,Missing Records,Percentage (%)
Unnamed: 0,0,0.0
model_key,0,0.0
mileage,0,0.0
engine_power,0,0.0
fuel,0,0.0
paint_color,0,0.0
car_type,0,0.0
private_parking_available,0,0.0
has_gps,0,0.0
has_air_conditioning,0,0.0


In [5]:
# Drop the 'Unnamed: 0' column (a leftover row counter, see the describe() output:
# min 0, max = number of rows - 1), which is useless as a feature.
# errors='ignore' keeps the cell idempotent: re-running it once the column is
# already gone no longer raises a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head(1)


Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106


In [14]:
# Number of cars per `model_key` value, most frequent first
df.model_key.value_counts()

model_key
Citroën        969
Renault        916
BMW            827
Peugeot        642
Audi           526
Nissan         275
Mitsubishi     231
Mercedes        97
Volkswagen      65
Toyota          53
SEAT            46
Subaru          44
Opel            33
Ferrari         33
PGO             33
Maserati        18
Suzuki           8
Porsche          6
Ford             5
KIA Motors       3
Alfa Romeo       3
Fiat             2
Lexus            2
Lamborghini      2
Mini             1
Mazda            1
Honda            1
Yamaha           1
Name: count, dtype: int64

# EDA

In [23]:
# Pie chart of the ten most frequent `model_key` values.
# Note: the percentages shown are relative to these ten values only,
# not to the whole dataset.

# A seaborn palette is a list of (r, g, b) float tuples, which Plotly does not
# understand as colors. Convert it to hex strings so the palette is applied.
colors = list(pal.as_hex())

# value_counts() is sorted by decreasing frequency, so head(10) is the top ten
df_model = df['model_key'].value_counts().head(10)
label = df_model.index
size = df_model.values

trace = go.Pie(labels=label, values=size, marker=dict(colors=colors), hole=.2)

data = [trace]
layout = go.Layout(
    title='Percentage of Ten First Cars by Model'
)

fig = go.Figure(data=data, layout=layout)


py.iplot(fig)

On constate que Citroën est la marque la plus présente dans la flotte, suivie de Renault.

In [27]:
# Distribution of the daily rental price: one bar per distinct price,
# bar height = number of rows with that price
df_price = df['rental_price_per_day'].value_counts()

# Bars are colored by their own height using the 'Jet' scale, with a color bar
bar_marker = dict(color=df_price.values, colorscale='Jet', showscale=True)
trace = go.Bar(x=df_price.index, y=df_price.values, marker=bar_marker)

data = [trace]
layout = go.Layout(
    title='Rental Price Per Day',
    xaxis={'tickangle': 15},
    yaxis={'title': 'Number of observations'},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

On constate que les prix des locations se situent majoritairement entre 90€ et 140€ par jour, en fonction du modèle et des caractéristiques de la voiture. La voiture la plus chère coûte 422€ par jour et la moins chère coûte 10€.

In [67]:
# Share of each fuel type within every car model
# (normalize="index" makes each model's row sum to 1)
cont_table = pd.crosstab(index=df["model_key"], columns=df["fuel"], normalize="index")

# Grouped bar chart: one bar per fuel type for each model
fig = px.bar(cont_table, x=cont_table.index, y=cont_table.columns, barmode='group')

# Titles, percent formatting of the y axis, and legend header
fig.update_layout(
    title="Percentage Of Cars by Model and Fuel",
    xaxis=dict(title="Car's Model"),
    yaxis=dict(title="Percentage", tickformat='.2%'),
    legend=dict(title="Fuel"),
)

# Render the figure
fig.show()

On constate que le diesel est de loin le type de carburant le plus utilisé et que les voitures de marque Porsche dans la flotte sont à 50% hybrides-essence et à 50% électriques : aucune n'a de moteur purement thermique.

In [73]:
# Mean rental price and mean mileage per car model (`model_key`).
#
# The two metrics differ by several orders of magnitude (see describe():
# price ~1e2, mileage ~1e5), so stacking them on a single axis hides the price
# bars and sums unrelated quantities. Each metric therefore gets its own y axis
# and the bars are drawn side by side.

# Aggregate both metrics at once, ordered by decreasing mean price
df_a = (
    df.groupby('model_key')[['rental_price_per_day', 'mileage']]
    .mean()
    .sort_values(by='rental_price_per_day', ascending=False)
    .round(2)
)

# Left axis: mean rental price per day
trace1 = go.Bar(
    x=df_a.index,
    y=df_a.rental_price_per_day,
    name='Price',
    marker=dict(
        color = 'yellow'
    ),
    offsetgroup=1  # distinct offset groups keep bars on different axes from overlapping
)

# Right axis: mean mileage
trace2 = go.Bar(
    x=df_a.index,
    y=df_a.mileage,
    name='Mileage',
    marker=dict(
        color = 'orange'
    ),
    yaxis='y2',
    offsetgroup=2
)

data = [trace1, trace2]
layout = go.Layout(
    xaxis=dict(tickangle=15),
    title='Mean Rental Price & Mileage by Car Model',
    yaxis=dict(title='Mean rental price per day'),
    yaxis2=dict(title='Mean mileage', overlaying='y', side='right'),
    # Horizontal legend above the plot so it does not cover the right-hand axis
    legend=dict(orientation='h', yanchor='bottom', y=1.02),
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [83]:
# Peek at the first row to recall the available columns
df.iloc[:1]

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106


In [86]:
# Correlation matrix between the rental price, the numeric features and the boolean options

corr_features = [
    'rental_price_per_day', 'mileage', 'engine_power',
    'private_parking_available', 'has_gps', 'has_air_conditioning',
    'automatic_car', 'has_getaround_connect', 'has_speed_regulator',
    'winter_tires',
]
correlation = df[corr_features].corr()
cols = correlation.columns.values
corr = correlation.values

# Heatmap of the coefficients (color scale reversed)
trace = go.Heatmap(z=corr, x=cols, y=cols, reversescale=True)

# One text annotation per cell showing the coefficient rounded to two decimals,
# built row by row (outer loop = y label, inner loop = x label)
annotations = [
    dict(
        x=col_label,
        y=row_label,
        text='{:.2f}'.format(value),
        font=dict(color='white'),
        showarrow=False,
    )
    for row_label, row_values in zip(cols, corr)
    for col_label, value in zip(cols, row_values)
]

# Fixed-size layout with a wide left margin so the long row labels fit
layout = go.Layout(
    title="Correlation Matrix for variables",
    autosize=False,
    height=600,
    width=800,
    margin=dict(l=200),
    yaxis=dict(tickfont=dict(size=8)),
    xaxis=dict(tickfont=dict(size=8)),
    annotations=annotations,
)

# Assemble the figure from the heatmap trace and the layout
fig = go.Figure(data=[trace], layout=layout)

# Render the figure
fig.show()
