In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw trips file and discard the 'Unnamed: 0' column in one pass
# (presumably a row index that was written out with the CSV -- confirm).
df = pd.read_csv('NYC_Taxi_dataset_with_anomalies.csv').drop(columns=['Unnamed: 0'])

# Quick look at the first two records.
df.head(n=2)

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_distance_miles,fare_amount,passenger_count,payment_type
0,2023-02-20 17:27:00,2023-02-20 17:49:00,40.808941,-73.914482,40.807336,-73.90527,1.03,5.84$,5.0,Credit Card
1,2023-02-28 19:41:00,2023-02-28 20:07:00,40.685842,-73.855449,40.663358,-73.826745,4.02,15.6£,4.0,Unknown


# Skimming Data

In [5]:
# Column-by-column data-quality overview: dtype, share of nulls, negatives and
# zeros, duplicated-row count, cardinality and the distinct values.
#
# The numeric column set is resolved once (it used to be recomputed for every
# column in both comprehensions), and the negative/zero shares come from
# vectorised boolean masks instead of chained indexing. NaN compares as False,
# so missing values are never counted as negative or zero. Non-numeric columns
# are reported as 0, as before.
numeric_cols = df.select_dtypes(include=[np.number]).columns
n_rows = len(df)
neg_pct = (
    (df[numeric_cols] < 0).sum().div(n_rows).mul(100)
    .reindex(df.columns, fill_value=0).astype(float)
)
zero_pct = (
    (df[numeric_cols] == 0).sum().div(n_rows).mul(100)
    .reindex(df.columns, fill_value=0).astype(float)
)

pd.DataFrame({
    'feature': df.columns.values,
    'data_type': df.dtypes.values,
    'null_value(%)': df.isna().mean().values * 100,
    'neg_value(%)': neg_pct.values,
    '0_value(%)': zero_pct.values,
    # Frame-wide count of fully duplicated rows, repeated on every line.
    'duplicate' : df.duplicated().sum(),
    'n_unique': df.nunique().values,
    'sample_unique': [df[col].unique() for col in df.columns]}
).round(3)

Unnamed: 0,feature,data_type,null_value(%),neg_value(%),0_value(%),duplicate,n_unique,sample_unique
0,pickup_datetime,object,5.097,0.0,0.0,12,758,"[2023-02-20 17:27:00, 2023-02-28 19:41:00, 202..."
1,dropoff_datetime,object,0.0,0.0,0.0,12,797,"[2023-02-20 17:49:00, 2023-02-28 20:07:00, 202..."
2,pickup_latitude,float64,0.0,0.0,0.0,12,803,"[40.808941, 40.685842, 40.668055, 40.765394, 4..."
3,pickup_longitude,float64,0.0,100.0,0.0,12,802,"[-73.914482, -73.855449, -74.04505, -73.753324..."
4,dropoff_latitude,float64,0.0,0.0,0.0,12,802,"[40.807336, 40.663358, 40.642572, 40.775285, 4..."
5,dropoff_longitude,float64,0.0,100.0,0.0,12,802,"[-73.90527, -73.826745, -74.055617, -73.775441..."
6,trip_distance_miles,float64,4.854,0.0,0.0,12,332,"[1.03, 4.02, 3.04, 2.67, 3.11, 4.18, 2.27, 4.0..."
7,fare_amount,object,4.612,0.0,0.0,12,673,"[5.84$, 15.6£, 2192.65¥, 0, 13.7€, 10.08£, 10...."
8,passenger_count,float64,4.854,0.0,0.0,12,5,"[5.0, 4.0, 3.0, 2.0, 1.0, nan]"
9,payment_type,object,0.0,0.0,0.0,12,10,"[Credit Card, Unknown, Cash, CREDIT CARD , cr..."


### Synthèse skimming

- **Valeurs manquantes** :
  - `pickup_datetime` (5,1%), `trip_distance_miles` (4,85%), `fare_amount` (4,61%), `passenger_count` (4,85%).

- **Formats incorrects** :
  - `pickup_datetime` et `dropoff_datetime` : doivent être en `datetime` (actuellement `object`).
  - `fare_amount` : mélange de devises ($, £, ¥, €) et format `object`.

- **Doublons de ligne** :
  - 12 doublons 

- **Standardisation** :
  - `payment_type` : valeurs non uniformes (ex: "Credit Card" vs "CREDIT CARD").



In [6]:
from ydata_profiling import ProfileReport

# Build the profiling report for the loaded frame, render it to an HTML
# string, and persist that string next to the notebook.
profile = ProfileReport(df, title="Profiling Report")
html = profile.to_html()

with open("profiling_report.html", mode="w", encoding="utf-8") as report_file:
    report_file.write(html)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 193.42it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Show the profiling report built in the previous cell directly in the notebook
# output (per the method name, embedded as an iframe).
profile.to_notebook_iframe()

### Synthèse du Rapport de Profiling

- **Nombre d'observations** : 824
- **Cellules manquantes** : 160 (1.9%)
- **Lignes dupliquées** : 12 (1.5%)
- **Corrélations** : on peut également lire la présence de corrélations entre plusieurs variables ; il faudra approfondir l'analyse des interactions entre les variables.
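- Les colonnes de **dates** ne contiennent que des valeurs valides parmi les valeurs renseignées (`pickup_datetime` présente toutefois 5,1 % de valeurs manquantes)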
- La colonne `passenger_count` a toutes ses valeurs renseignées comprises dans l'intervalle [1, 5], ce qui est correct



### Assessment Scénario Driver

In [11]:
# Columns under assessment for the driver scenario.
cols = [
    "fare_amount",
    "trip_distance_miles",
    "pickup_datetime",
    "dropoff_datetime",
    "pickup_latitude",
    "pickup_longitude",
]
driver_scenario_df = df.loc[:, cols]

# 1) Completeness (%) per column: share of non-null values, rounded to 2 decimals.
compl_pct = driver_scenario_df.notna().mean().mul(100).round(2)

# Turn the Series into a two-column table (feature name, completeness).
compl_df = compl_pct.rename_axis("feature").reset_index(name="completeness_pourcent")

print("--- Complétude (%) ---")
display(compl_df)

--- Complétude (%) ---


Unnamed: 0,feature,completeness_pourcent
0,fare_amount,95.39
1,trip_distance_miles,95.15
2,pickup_datetime,94.9
3,dropoff_datetime,100.0
4,pickup_latitude,100.0
5,pickup_longitude,100.0


#### Dans l'étape 01 on avait 