# Importing libraries and dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    classification_report
)
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings(
    "ignore", category=DeprecationWarning
)  # to avoid deprecation warnings

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import math

# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=[
        "#4B9AC7",
        "#4BE8E0",
        "#9DD4F3",
        "#97FBF6",
        "#2A7FAF",
        "#23B1AB",
        "#0E3449",
        "#015955",
    ]
)
pio.templates.default = "jedha"
pio.renderers.default = "svg"

In [30]:
# Import dataset
print("Loading dataset...")
dataset = pd.read_csv("src/uber.csv")
print("Dataset loaded successfully.")
print(f"Shape of dataset: {dataset.shape}")
print("First 5 rows:")
print(dataset.head())
print()

Loading dataset...
Dataset loaded successfully.
Shape of dataset: (20000, 9)
First 5 rows:
   Unnamed: 0                            key  fare_amount  \
0    48462598    2015-05-07 10:24:44.0000004         13.0   
1     6637611    2014-07-09 09:14:04.0000002          5.5   
2     8357193  2013-11-11 18:51:00.000000240          8.5   
3    40466112   2014-05-22 01:54:00.00000069         19.0   
4    35405035    2011-06-21 23:37:33.0000002          7.7   

           pickup_datetime  pickup_longitude  pickup_latitude  \
0  2015-05-07 10:24:44 UTC        -73.971664        40.797035   
1  2014-07-09 09:14:04 UTC        -73.991635        40.749855   
2  2013-11-11 18:51:00 UTC        -73.982352        40.777042   
3  2014-05-22 01:54:00 UTC        -73.991455        40.751700   
4  2011-06-21 23:37:33 UTC        -73.974749        40.756255   

   dropoff_longitude  dropoff_latitude  passenger_count  
0         -73.958939         40.777649                1  
1         -73.988250         40.741

# Basic exploring and cleaning

Display basic statistics about the dataset. Do you notice some inconsistent values?

In [31]:
# Basic statistics and dataset overview
print("Dataset Overview:")
print(f"Number of rows: {dataset.shape[0]}")
print(f"Number of columns: {dataset.shape[1]}")
print()

# Display first few rows of the dataset
print("Preview of the dataset:")
display(dataset.head())
print()

# Basic statistics
print("Basic statistics for numerical and categorical columns:")
data_desc = dataset.describe(include="all")
display(data_desc)
print()

# Percentage of missing values for each column
print("Percentage of missing values in each column:")
missing_values = 100 * dataset.isnull().sum() / dataset.shape[0]
display(missing_values)

Dataset Overview:
Number of rows: 20000
Number of columns: 9

Preview of the dataset:


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,48462598,2015-05-07 10:24:44.0000004,13.0,2015-05-07 10:24:44 UTC,-73.971664,40.797035,-73.958939,40.777649,1
1,6637611,2014-07-09 09:14:04.0000002,5.5,2014-07-09 09:14:04 UTC,-73.991635,40.749855,-73.98825,40.741341,2
2,8357193,2013-11-11 18:51:00.000000240,8.5,2013-11-11 18:51:00 UTC,-73.982352,40.777042,-73.995912,40.759757,1
3,40466112,2014-05-22 01:54:00.00000069,19.0,2014-05-22 01:54:00 UTC,-73.991455,40.7517,-73.936357,40.812327,1
4,35405035,2011-06-21 23:37:33.0000002,7.7,2011-06-21 23:37:33 UTC,-73.974749,40.756255,-73.952276,40.778332,1



Basic statistics for numerical and categorical columns:


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,20000.0,20000,20000.0,20000,20000.0,20000.0,20000.0,20000.0,20000.0
unique,,20000,,19967,,,,,
top,,2015-05-07 10:24:44.0000004,,2012-08-28 14:03:00 UTC,,,,,
freq,,1,,2,,,,,
mean,27679490.0,,11.358151,,-72.490431,39.918498,-72.459891,39.923345,1.69015
std,16011230.0,,9.89199,,10.461597,6.051561,10.564266,6.90152,1.311384
min,3949.0,,-23.7,,-75.419276,-74.00619,-75.423067,-73.991765,0.0
25%,13834760.0,,6.0,,-73.992075,40.734733,-73.991423,40.734105,1.0
50%,27697240.0,,8.5,,-73.981904,40.752554,-73.980305,40.752997,1.0
75%,41480820.0,,12.5,,-73.967229,40.767075,-73.963509,40.768348,2.0



Percentage of missing values in each column:


Unnamed: 0           0.0
key                  0.0
fare_amount          0.0
pickup_datetime      0.0
pickup_longitude     0.0
pickup_latitude      0.0
dropoff_longitude    0.0
dropoff_latitude     0.0
passenger_count      0.0
dtype: float64

Drop the useless columns and the rows containing outliers.

In [32]:
to_drop = ['Unnamed: 0', 'key']

# Vérifiez que les colonnes existent avant de les supprimer
print("Colonnes avant suppression :", dataset.columns)

# Suppression des colonnes
dataset = dataset.drop(columns=to_drop)  # axis=1 est implicite dans `drop(columns=...)`

# Afficher un aperçu du dataset
print("Preview of the dataset après suppression:")
display(dataset.head())

Colonnes avant suppression : Index(['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count'],
      dtype='object')
Preview of the dataset après suppression:


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,13.0,2015-05-07 10:24:44 UTC,-73.971664,40.797035,-73.958939,40.777649,1
1,5.5,2014-07-09 09:14:04 UTC,-73.991635,40.749855,-73.98825,40.741341,2
2,8.5,2013-11-11 18:51:00 UTC,-73.982352,40.777042,-73.995912,40.759757,1
3,19.0,2014-05-22 01:54:00 UTC,-73.991455,40.7517,-73.936357,40.812327,1
4,7.7,2011-06-21 23:37:33 UTC,-73.974749,40.756255,-73.952276,40.778332,1


In [33]:
# Filtrer pour conserver uniquement les valeurs >= 0 dans la colonne 'valeurs'
df_filtered = dataset[dataset['fare_amount'] >= 0]

print("DataFrame après suppression des valeurs en dessous de 0 :")
print(df_filtered)

DataFrame après suppression des valeurs en dessous de 0 :
       fare_amount          pickup_datetime  pickup_longitude  \
0            13.00  2015-05-07 10:24:44 UTC        -73.971664   
1             5.50  2014-07-09 09:14:04 UTC        -73.991635   
2             8.50  2013-11-11 18:51:00 UTC        -73.982352   
3            19.00  2014-05-22 01:54:00 UTC        -73.991455   
4             7.70  2011-06-21 23:37:33 UTC        -73.974749   
...            ...                      ...               ...   
19995         8.90  2010-07-29 23:59:00 UTC        -74.002582   
19996        38.33  2013-05-15 17:03:07 UTC        -73.870932   
19997         5.70  2012-03-08 18:39:00 UTC        -73.945258   
19998         6.50  2012-02-13 09:51:00 UTC        -73.974400   
19999        14.90  2010-12-19 21:06:01 UTC        -73.964427   

       pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count  
0            40.797035         -73.958939         40.777649                1  
1  

In [34]:
# Basic statistics
print("Basic statistics for numerical and categorical columns:")
data_desc = df_filtered.describe(include="all")
display(df_filtered)
print()

Basic statistics for numerical and categorical columns:


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,13.00,2015-05-07 10:24:44 UTC,-73.971664,40.797035,-73.958939,40.777649,1
1,5.50,2014-07-09 09:14:04 UTC,-73.991635,40.749855,-73.988250,40.741341,2
2,8.50,2013-11-11 18:51:00 UTC,-73.982352,40.777042,-73.995912,40.759757,1
3,19.00,2014-05-22 01:54:00 UTC,-73.991455,40.751700,-73.936357,40.812327,1
4,7.70,2011-06-21 23:37:33 UTC,-73.974749,40.756255,-73.952276,40.778332,1
...,...,...,...,...,...,...,...
19995,8.90,2010-07-29 23:59:00 UTC,-74.002582,40.744520,-73.987415,40.720005,2
19996,38.33,2013-05-15 17:03:07 UTC,-73.870932,40.773763,-73.982838,40.761912,1
19997,5.70,2012-03-08 18:39:00 UTC,-73.945258,40.778640,-73.955853,40.772418,2
19998,6.50,2012-02-13 09:51:00 UTC,-73.974400,40.753947,-73.987952,40.734953,3





# Feature engineering

Dealing with datetime objects
    
Convert the pickup_datetime column into datetime format. Use panda's dt module to create the following columns:

Year

Month

Day

Weekday: contains the name of the day of week

Then, you can drop the column pickup_datetime.

In [None]:
df_filtered.loc[:, 'pickup_datetime'] = pd.to_datetime(df_filtered['pickup_datetime'], errors='coerce')

display(dataset.head())

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,13.0,2015-05-07 10:24:44 UTC,-73.971664,40.797035,-73.958939,40.777649,1
1,5.5,2014-07-09 09:14:04 UTC,-73.991635,40.749855,-73.98825,40.741341,2
2,8.5,2013-11-11 18:51:00 UTC,-73.982352,40.777042,-73.995912,40.759757,1
3,19.0,2014-05-22 01:54:00 UTC,-73.991455,40.7517,-73.936357,40.812327,1
4,7.7,2011-06-21 23:37:33 UTC,-73.974749,40.756255,-73.952276,40.778332,1


In [38]:
print(df_filtered['pickup_datetime'].dtype)

object


In [39]:
invalid_dates = df_filtered[df_filtered['pickup_datetime'].isna()]
print("Valeurs non convertibles :")
print(invalid_dates)

Valeurs non convertibles :
Empty DataFrame
Columns: [fare_amount, pickup_datetime, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count]
Index: []


In [40]:
# Si df_filtered est une vue, les modifications peuvent ne pas être appliquées comme prévu. Essayez de créer une copie explicite :
df_filtered = df_filtered.copy()
df_filtered['pickup_datetime'] = pd.to_datetime(df_filtered['pickup_datetime'], errors='coerce')

In [41]:
print(df_filtered['pickup_datetime'].dtype)

datetime64[ns, UTC]


In [46]:
df_filtered.loc[:, "year"] = df_filtered["pickup_datetime"].dt.year
df_filtered.loc[:, "month"] = df_filtered["pickup_datetime"].dt.month
df_filtered.loc[:, "day"] = df_filtered["pickup_datetime"].dt.day

display(df_filtered.head())

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day
0,13.0,2015-05-07 10:24:44+00:00,-73.971664,40.797035,-73.958939,40.777649,1,2015,5,7
1,5.5,2014-07-09 09:14:04+00:00,-73.991635,40.749855,-73.98825,40.741341,2,2014,7,9
2,8.5,2013-11-11 18:51:00+00:00,-73.982352,40.777042,-73.995912,40.759757,1,2013,11,11
3,19.0,2014-05-22 01:54:00+00:00,-73.991455,40.7517,-73.936357,40.812327,1,2014,5,22
4,7.7,2011-06-21 23:37:33+00:00,-73.974749,40.756255,-73.952276,40.778332,1,2011,6,21


In [47]:
weekday = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}

df_filtered.loc[:, "weekday"] = df_filtered["pickup_datetime"].dt.weekday.map(weekday)
df_filtered = df_filtered.drop("pickup_datetime", axis=1)

display(df_filtered.head())

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,weekday
0,13.0,-73.971664,40.797035,-73.958939,40.777649,1,2015,5,7,Thursday
1,5.5,-73.991635,40.749855,-73.98825,40.741341,2,2014,7,9,Wednesday
2,8.5,-73.982352,40.777042,-73.995912,40.759757,1,2013,11,11,Monday
3,19.0,-73.991455,40.7517,-73.936357,40.812327,1,2014,5,22,Thursday
4,7.7,-73.974749,40.756255,-73.952276,40.778332,1,2011,6,21,Tuesday


# Haversine formula

It would be very interesting to compute the ride distance from the GPS coordinates. Haversine formula allows to do this 🤓:

In [51]:
def haversine(lon_1, lon_2, lat_1, lat_2):
    
    lon_1, lon_2, lat_1, lat_2 = map(np.radians, [lon_1, lon_2, lat_1, lat_2])  # Convert degrees to Radians
    
    
    diff_lon = lon_2 - lon_1
    diff_lat = lat_2 - lat_1
    

    distance_km = 2*6371*np.arcsin(np.sqrt(np.sin(diff_lat/2.0)**2 + np.cos(lat_1) * np.cos(lat_2) * np.sin(diff_lon/2.0)**2)) # earth radius: 6371km
    
    return distance_km

In [54]:
df_filtered.loc[:, "ride_distance"] = df_filtered[['pickup_longitude', 'dropoff_longitude', 'pickup_latitude', 'dropoff_latitude']].apply(
    lambda x: haversine(x['pickup_longitude'], x['dropoff_longitude'], x['pickup_latitude'], x['dropoff_latitude']), axis=1
)

display(df_filtered.head())

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,weekday,ride_distance
0,13.0,-73.971664,40.797035,-73.958939,40.777649,1,2015,5,7,Thursday,2.407225
1,5.5,-73.991635,40.749855,-73.98825,40.741341,2,2014,7,9,Wednesday,0.988729
2,8.5,-73.982352,40.777042,-73.995912,40.759757,1,2013,11,11,Monday,2.235651
3,19.0,-73.991455,40.7517,-73.936357,40.812327,1,2014,5,22,Thursday,8.183379
4,7.7,-73.974749,40.756255,-73.952276,40.778332,1,2011,6,21,Tuesday,3.099698


# Preprocessing

Separate the target from the features

In [55]:
# Separate target variable Y from features X
print("Separating labels from features...")
target_variable = "fare_amount"

X = df_filtered.drop(columns=[target_variable])
Y = df_filtered[target_variable]

print("...Done.")
print()

print("Y (Target variable):")
print(Y.head())
print("\nX (Features):")
print(X.head())

Separating labels from features...
...Done.

Y (Target variable):
0    13.0
1     5.5
2     8.5
3    19.0
4     7.7
Name: fare_amount, dtype: float64

X (Features):
   pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  \
0        -73.971664        40.797035         -73.958939         40.777649   
1        -73.991635        40.749855         -73.988250         40.741341   
2        -73.982352        40.777042         -73.995912         40.759757   
3        -73.991455        40.751700         -73.936357         40.812327   
4        -73.974749        40.756255         -73.952276         40.778332   

   passenger_count  year  month  day    weekday  ride_distance  
0                1  2015      5    7   Thursday       2.407225  
1                2  2014      7    9  Wednesday       0.988729  
2                1  2013     11   11     Monday       2.235651  
3                1  2014      5   22   Thursday       8.183379  
4                1  2011      6   21    Tuesday

Detect names of numeric/categorical features

In [58]:
# Automatically detect names of numeric and categorical columns
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(exclude=["int64", "float64"]).columns.tolist()

print("Found numeric features:", numeric_features)
print("Found categorical features:", categorical_features)

Found numeric features: ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'ride_distance']
Found categorical features: ['year', 'month', 'day', 'weekday']


In [64]:
# Convertir uniquement 'year', 'month', et 'day' en types numériques
X[['year', 'month', 'day']] = X[['year', 'month', 'day']].astype(int)

In [65]:
# Automatically detect names of numeric and categorical columns
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(exclude=["int64", "float64"]).columns.tolist()

print("Found numeric features:", numeric_features)
print("Found categorical features:", categorical_features)

Found numeric features: ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year', 'month', 'day', 'ride_distance']
Found categorical features: ['weekday']


Make a train/test splitting with test_size = 0.2

In [68]:
# Divide dataset into Train and Test sets
print("Dividing dataset into train and test sets...")
# Using stratify=Y to ensure balanced class distribution in both sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
print("Dataset division complete.")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
print()

Dividing dataset into train and test sets...
Dataset division complete.
Training set size: 15999 samples
Test set size: 4000 samples



Make all the necessary preprocessings.

Hint: in this exercise, we'll first create a baseline model with a multivariate linear regression. So don't forget to make all the transformations that are required for this kind of model 😉

In [69]:
# Pipeline pour les features numériques
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Mise à l'échelle des données
])

print("Pipeline pour les features numériques créée avec succès.")

Pipeline pour les features numériques créée avec succès.


In [70]:
# Pipeline pour les features catégoriques
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))  # Encodage en One-Hot, ignorer les catégories inconnues
])

print("Pipeline pour les features catégoriques créée avec succès.")

Pipeline pour les features catégoriques créée avec succès.


In [71]:
# Créer un objet preprocessor qui décrit tous les traitements à appliquer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),  # Pipeline pour les variables numériques
        ('cat', categorical_transformer, categorical_features)  # Pipeline pour les variables catégoriques
    ]
)

print("Preprocessor créé avec succès.")

Preprocessor créé avec succès.


In [72]:
# Prétraitement sur l'ensemble d'entraînement
print("Prétraitement sur l'ensemble d'entraînement...")
print(X_train[:5])  # Affichage des premières lignes avant transformation
X_train = preprocessor.fit_transform(X_train)  # Appliquer fit + transform sur l'ensemble d'entraînement
print("...Fini !\n")
print("Premières lignes de X_train après prétraitement :")
print(X_train[:5], "\n")  # X_train est désormais un tableau NumPy

# Prétraitement sur l'ensemble de test
print("Prétraitement sur l'ensemble de test...")
print(X_test[:5])  # Affichage des premières lignes avant transformation
X_test = preprocessor.transform(X_test)  # Appliquer seulement transform sur l'ensemble de test
print("...Fini !\n")
print("Premières lignes de X_test après prétraitement :")
print(X_test[:5], "\n")  # X_test est désormais un tableau NumPy

Prétraitement sur l'ensemble d'entraînement...
       pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  \
13811        -73.961563        40.780048         -73.973564         40.792052   
16120        -73.972808        40.749658         -73.982309         40.735853   
1380         -73.988100        40.764807         -74.001052         40.746947   
14763        -73.959093        40.767957         -73.967107         40.755797   
17378        -73.975608        40.760969         -73.967462         40.759989   

       passenger_count  year  month  day    weekday  ride_distance  
13811                1  2012     12   30     Sunday       1.674074  
16120                1  2009      8    5  Wednesday       1.731199  
1380                 1  2013     10   23  Wednesday       2.265861  
14763                2  2012      5   20     Sunday       1.511233  
17378                2  2014      9   23    Tuesday       0.694690  
...Fini !

Premières lignes de X_train après prétrai

# Baseline: Linear Regression

Train a linear regression model and evaluate its performances. Is it satisfying?

In [None]:
# Entraîner le modèle de régression linéaire
print("Entraînement du modèle de régression linéaire...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
print("...Fini !\n")

# Afficher les coefficients du modèle
print(f"Coefficients du modèle : {regressor.coef_}")
print(f"Ordonnée à l'origine (intercept) : {regressor.intercept_}\n")