In [1]:
import pandas as pd
import numpy as np
!pip install -U ydata-profiling

Collecting ydata-profiling
  Downloading ydata_profiling-4.18.0-py2.py3-none-any.whl.metadata (22 kB)
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Downloading visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting minify-html>=0.15.0 (from ydata-profiling)
  Downloading minify_html-0.18.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting filetype>=1.0.0 (from ydata-profiling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting phik<0.13,>=0.12.5 (from ydata-profiling)
  Downloading phik-0.12.5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.2 (from ydata-profiling)
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting dacite<2,>=1.9 (from ydata-profiling)
  Downloading

In [3]:
df = pd.read_csv('NYC_Taxi_dataset_with_anomalies.csv')
df = df.drop(columns=['Unnamed: 0'])
df.head(2)

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_distance_miles,fare_amount,passenger_count,payment_type
0,2023-02-20 17:27:00,2023-02-20 17:49:00,40.808941,-73.914482,40.807336,-73.90527,1.03,5.84$,5.0,Credit Card
1,2023-02-28 19:41:00,2023-02-28 20:07:00,40.685842,-73.855449,40.663358,-73.826745,4.02,15.6£,4.0,Unknown


# Skimming Data

In [39]:
pd.DataFrame({
    'feature': df.columns.values,
    'data_type': df.dtypes.values,
    'null_value(%)': df.isna().mean().values * 100,
    'neg_value(%)': [len(df[col][df[col] < 0]) / len(df) * 100 if col in df.select_dtypes(include=[np.number]).columns else 0 for col in df.columns],
    '0_value(%)': [len(df[col][df[col] == 0]) / len(df) * 100 if col in df.select_dtypes(include=[np.number]).columns else 0 for col in df.columns],
    'duplicate' : df.duplicated().sum(),
    'n_unique': df.nunique().values,
    'sample_unique': [df[col].unique() for col in df.columns]}
).round(3)

Unnamed: 0,feature,data_type,null_value(%),neg_value(%),0_value(%),duplicate,n_unique,sample_unique
0,pickup_datetime,object,5.097,0.0,0.0,12,758,"[2023-02-20 17:27:00, 2023-02-28 19:41:00, 202..."
1,dropoff_datetime,object,0.0,0.0,0.0,12,797,"[2023-02-20 17:49:00, 2023-02-28 20:07:00, 202..."
2,pickup_latitude,float64,0.0,0.0,0.0,12,803,"[40.808941, 40.685842, 40.668055, 40.765394, 4..."
3,pickup_longitude,float64,0.0,100.0,0.0,12,802,"[-73.914482, -73.855449, -74.04505, -73.753324..."
4,dropoff_latitude,float64,0.0,0.0,0.0,12,802,"[40.807336, 40.663358, 40.642572, 40.775285, 4..."
5,dropoff_longitude,float64,0.0,100.0,0.0,12,802,"[-73.90527, -73.826745, -74.055617, -73.775441..."
6,trip_distance_miles,float64,4.854,0.0,0.0,12,332,"[1.03, 4.02, 3.04, 2.67, 3.11, 4.18, 2.27, 4.0..."
7,fare_amount,object,4.612,0.0,0.0,12,673,"[5.84$, 15.6£, 2192.65¥, 0, 13.7€, 10.08£, 10...."
8,passenger_count,float64,4.854,0.0,0.0,12,5,"[5.0, 4.0, 3.0, 2.0, 1.0, nan]"
9,payment_type,object,0.0,0.0,0.0,12,10,"[Credit Card, Unknown, Cash, CREDIT CARD , cr..."


### Synthèse skimming

- **Valeurs manquantes** :
  - `pickup_datetime` (5,1%), `trip_distance_miles` (4,85%), `fare_amount` (4,61%), `passenger_count` (4,85%).

- **Formats incorrects** :
  - `pickup_datetime` et `dropoff_datetime` : doivent être en `datetime` (actuellement `object`).
  - `fare_amount` : mélange de devises ($, £, ¥, €) et format `object`.

- **Doublons de ligne** :
  - 12 doublons

- **Standardisation** :
  - `payment_type` : valeurs non uniformes (ex: "Credit Card" vs "CREDIT CARD").



In [40]:
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Profiling Report")
html = profile.to_html()
with open("profiling_report.html", "w", encoding="utf-8") as f:
    f.write(html)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/10 [00:00<?, ?it/s][A
 50%|█████     | 5/10 [00:00<00:00, 43.92it/s][A
100%|██████████| 10/10 [00:00<00:00, 42.97it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
profile.to_notebook_iframe()

### Synthèse du Rapport de Profiling

- **Nombre d'observations** : 824
- **Cellules manquantes** : 160 (1.9%)
- **Lignes dupliquées** : 12 (1.5%)
- On peut lire egalement la presence de correlation entre plusieurs variables**Corrélations** : Approfondir l'analyse des interactions entre les variables.
- Les colonnes de **dates** ont toutes des valeurs valides
- La colonne passenger_count a des valeurs toutes comprises [1,5] ce qui est correct



### Assessment Scénario Driver

In [4]:
cols = ["fare_amount", "trip_distance_miles", "pickup_datetime", "dropoff_datetime", "pickup_latitude", "pickup_longitude"]
driver_scenario_df = df[cols]

# 1) Complétude (%) par colonne
compl_pct = (driver_scenario_df.notna().mean() * 100).round(2)
compl_df = compl_pct.reset_index()
compl_df.columns = ["feature", "completeness_pourcent"]
print("--- Complétude (%) ---")
display(compl_df)

--- Complétude (%) ---


Unnamed: 0,feature,completeness_pourcent
0,fare_amount,95.39
1,trip_distance_miles,95.15
2,pickup_datetime,94.9
3,dropoff_datetime,100.0
4,pickup_latitude,100.0
5,pickup_longitude,100.0


1- Evaluons l'accuracy des attributs

In [5]:
acc_df = driver_scenario_df.copy()
# Référence temporelle
now = pd.Timestamp.now()

# Parsing des dates
pickup = pd.to_datetime(acc_df['pickup_datetime'], errors='coerce')
dropoff = pd.to_datetime(acc_df['dropoff_datetime'], errors='coerce')

# Masks de validité de dates (pas NaT, pas de dates dans le futur, et bon ordre de date)
valid_pickup = pickup.notna() & (pickup <= now) & ~((pickup >= dropoff) & dropoff.notna())
valid_dropoff = dropoff.notna() & (dropoff <= now) & ~((dropoff <= pickup) & pickup.notna())

pu_accuracy_pct = valid_pickup.mean() * 100
do_accuracy_pct = valid_dropoff.mean() * 100

print(f"Accuracy pickup_datetime (en %): {pu_accuracy_pct:.2f}")
print(f"Accuracy dropoff_datetime (en %): {do_accuracy_pct:.2f}")





Accuracy pickup_datetime (en %): 94.90
Accuracy dropoff_datetime (en %): 95.15


In [6]:
acc_df['fare_amount']

Unnamed: 0,fare_amount
0,5.84$
1,15.6£
2,2192.65¥
3,0
4,13.7€
...,...
819,6.75€
820,13.3£
821,13.57€
822,13.52€


Comme vu précédement les valeurs de fare_amount sont des string et pas sous les memes unités.

In [7]:
# Fonction pour extraire la valeur numérique et convertir en USD
def convert_to_usd(fare):
    if isinstance(fare, str):
        if '€' in fare:
            amount = float(fare.replace('€', '').strip())
            return amount * 1.18  # 1 EUR = 1.18 USD
        elif '£' in fare:
            amount = float(fare.replace('£', '').strip())
            return amount * 1.33  # 1 GBP = 1.33 USD
        elif '¥' in fare:
            amount = float(fare.replace('¥', '').strip())
            return amount * 0.009  # 1 JPY = 0.009 USD
        elif '$' in fare:
            amount = float(fare.replace('$', '').strip())
            return amount  # Déjà en USD
        else:
            return float(fare)  # Si pas de symbole (ex: '0')
    else:
        return float(fare)  # Si déjà numérique


acc_df['dollar_fareamount'] = acc_df['fare_amount'].apply(convert_to_usd)

# Afficher le résultat
acc_df['dollar_fareamount'].head(5)


Unnamed: 0,dollar_fareamount
0,5.84
1,20.748
2,19.73385
3,0.0
4,16.166


In [45]:
#fare_amount accuracy (valeur negative ou 0), (accuracy %)
neg_fare_count = (acc_df['dollar_fareamount'] < 0).sum()
zero_fare_count = (acc_df['dollar_fareamount'] == 0).sum()
total = len(acc_df)
fare_accuracy_pct = ((total - (neg_fare_count + zero_fare_count)) / total) * 100
print(f"fare_amount accuracy en % : {fare_accuracy_pct:.2f}")


fare_amount accuracy en % : 97.09


In [46]:
# accuracy pour trip_distance_miles (valeur negative ou 0), (accuracy %)
neg_distance_count = (acc_df['trip_distance_miles'] < 0).sum()
zero_distance_count = (acc_df['trip_distance_miles'] == 0).sum()
total = len(acc_df)
distance_accuracy_pct = ((total - (neg_distance_count + zero_distance_count)) / total) * 100
print(f"trip_distance_miles accuracy en % : {distance_accuracy_pct:.2f}")

trip_distance_miles accuracy en % : 100.00


### Consistancy

In [10]:
# date inconsistent (pickup >= dropoff)

acc_df['temporal_inconsistent'] = (
    acc_df['pickup_datetime'].notna() &
    acc_df['dropoff_datetime'].notna() &
    (acc_df['pickup_datetime'] >= acc_df['dropoff_datetime'])
).astype(int)

n_bad = acc_df['temporal_inconsistent'].sum()
pct_bad = n_bad / len(acc_df) * 100
print(f"Nombre d'enregistrements temporal_inconsistent : {n_bad} ({pct_bad:.2f}%)")

# afficher quelques exemples pour inspection
display(acc_df[acc_df['temporal_inconsistent'] == 1].head())

Nombre d'enregistrements temporal_inconsistent : 0 (0.00%)


Unnamed: 0,fare_amount,trip_distance_miles,pickup_datetime,dropoff_datetime,pickup_latitude,pickup_longitude,dollar_fareamount,temporal_inconsistent


# Relevance

In [18]:
pickup = pd.to_datetime(acc_df['pickup_datetime'], errors='coerce')
dropoff = pd.to_datetime(acc_df['dropoff_datetime'], errors='coerce')

# 1) Prix valide : non NaN et > 0
cond_fare_driver = acc_df['dollar_fareamount'].notna() & (acc_df['dollar_fareamount'] > 0)

# 2) Distance valide : non NaN et > 0
cond_dist_driver = acc_df['trip_distance_miles'].notna() & (acc_df['trip_distance_miles'] > 0)

# 3) Temps cohérent : pickup < dropoff, dates non NaT
cond_time_driver = pickup.notna() & dropoff.notna() & (pickup < dropoff)

# 4) Coordonnées pickup plausibles (fenêtre NYC)
nyc_min_lat, nyc_max_lat = 40.5, 40.9
nyc_min_lon, nyc_max_lon = -74.25, -73.7
cond_geo_driver = (
    acc_df['pickup_latitude'].between(nyc_min_lat, nyc_max_lat)
    & acc_df['pickup_longitude'].between(nyc_min_lon, nyc_max_lon)
)

# Condition globale de relevance Driver
driver_relevance_conditions = (
    cond_fare_driver &
    cond_dist_driver &
    cond_time_driver &
    cond_geo_driver
)

driver_scenario_df['is_relevant_driver'] = driver_relevance_conditions.astype(int)
driver_relevance_score = driver_scenario_df['is_relevant_driver'].mean() * 100

print(f"Relevance (Driver) : {driver_relevance_score:.2f}% de trajets pertinents pour le scénario Driver")

Relevance (Driver) : 79.37% de trajets pertinents pour le scénario Driver


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  driver_scenario_df['is_relevant_driver'] = driver_relevance_conditions.astype(int)


# Assessment Passenger

In [11]:
passenger_cols = ["fare_amount", "pickup_datetime", "dropoff_datetime", "passenger_count"]
passenger_df = df[passenger_cols].copy()

# Complétude (%) par colonne pour Passenger
compl_pct = (passenger_df.notna().mean() * 100).round(2)
compl_df = compl_pct.reset_index()
compl_df.columns = ["feature", "completeness_pourcent"]

print("--- Complétude Passenger (%) ---")
display(compl_df)


--- Complétude Passenger (%) ---


Unnamed: 0,feature,completeness_pourcent
0,fare_amount,95.39
1,pickup_datetime,94.9
2,dropoff_datetime,100.0
3,passenger_count,95.15


In [12]:
acc_pass_df = passenger_df.copy()
now = pd.Timestamp.now()

# Conversion fare_amount -> USD
acc_pass_df["fare_amount_usd"] = acc_pass_df["fare_amount"].apply(convert_to_usd)
acc_pass_df["fare_amount_usd"] = pd.to_numeric(acc_pass_df["fare_amount_usd"], errors="coerce")

# Dates
pickup = pd.to_datetime(acc_pass_df["pickup_datetime"], errors="coerce")
dropoff = pd.to_datetime(acc_pass_df["dropoff_datetime"], errors="coerce")

valid_pickup = pickup.notna() & (pickup <= now) & ~(pickup >= dropoff)
valid_dropoff = dropoff.notna() & (dropoff <= now) & ~(dropoff <= pickup)

pu_accuracy_pct = valid_pickup.mean() * 100
do_accuracy_pct = valid_dropoff.mean() * 100

print(f"Accuracy pickup_datetime Passenger (en %) : {pu_accuracy_pct:.2f}")
print(f"Accuracy dropoff_datetime Passenger (en %) : {do_accuracy_pct:.2f}")

# Accuracy fare_amount_usd : > 0
neg_fare_count = (acc_pass_df["fare_amount_usd"] <= 0).sum()
total_pass = len(acc_pass_df)
fare_acc_pass = ((total_pass - neg_fare_count) / total_pass) * 100
print(f"Accuracy fare_amount (USD) Passenger (en %) : {fare_acc_pass:.2f}")

Accuracy pickup_datetime Passenger (en %) : 94.90
Accuracy dropoff_datetime Passenger (en %) : 95.15
Accuracy fare_amount (USD) Passenger (en %) : 97.09


In [13]:
# 3) Consistency temporelle Passenger
cons_pass_df = acc_pass_df.copy()
temporal_inconsistent = (
    pickup.notna()
    & dropoff.notna()
    & (dropoff <= pickup)
)

nb_bad = temporal_inconsistent.sum()
pct_bad = nb_bad / len(cons_pass_df) * 100
print(f"Nombre d'enregistrements temporalinconsistent (Passenger) : {nb_bad} ({pct_bad:.2f}%)")
display(cons_pass_df[temporal_inconsistent].head())

Nombre d'enregistrements temporalinconsistent (Passenger) : 0 (0.00%)


Unnamed: 0,fare_amount,pickup_datetime,dropoff_datetime,passenger_count,fare_amount_usd


In [15]:
acc_pass_df["fare_amount_usd"] = acc_pass_df["fare_amount"].apply(convert_to_usd)
acc_pass_df["fare_amount_usd"] = pd.to_numeric(acc_pass_df["fare_amount_usd"], errors="coerce")

# Lignes dont le montant est irrécupérable (NaN après conversion)
mask_nan = acc_pass_df["fare_amount_usd"].isna()
print("Nb NaN :", mask_nan.sum())
display(acc_pass_df.loc[mask_nan, ["fare_amount", "fare_amount_usd"]].head(30))


Nb NaN : 38


Unnamed: 0,fare_amount,fare_amount_usd
23,,
27,,
45,,
60,,
69,,
99,,
166,,
182,,
214,,
221,,


In [16]:
mask_bad_value = acc_pass_df["fare_amount_usd"].notna() & (acc_pass_df["fare_amount_usd"] <= 0)
print("Nb valeurs <= 0 :", mask_bad_value.sum())
display(acc_pass_df.loc[mask_bad_value, ["fare_amount", "fare_amount_usd"]].head(30))


Nb valeurs <= 0 : 24


Unnamed: 0,fare_amount,fare_amount_usd
3,0,0.0
50,0,0.0
165,0,0.0
184,0,0.0
328,0,0.0
332,0,0.0
336,0,0.0
346,0,0.0
362,0,0.0
372,0,0.0


In [17]:
# 4) Relevance Passenger (avec fare_amount_usd)
passenger_relevance_conditions = (
    acc_pass_df["fare_amount_usd"].notna() &
    pickup.notna() &
    dropoff.notna() &
    (acc_pass_df["fare_amount_usd"] > 0) &
    (pickup < dropoff)
)

passenger_df["is_relevant_passenger"] = passenger_relevance_conditions.astype(int)
passenger_relevance_score = passenger_df["is_relevant_passenger"].mean() * 100
print(f"Relevance (Passenger) : {passenger_relevance_score:.2f}% de trajets pertinents pour le scénario Passenger")


Relevance (Passenger) : 87.86% de trajets pertinents pour le scénario Passenger
