In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the raw trip records; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer=r"yellow_tripdata_2015-01.csv")

In [5]:
import pandas as pd

def sample_uniform_by_day(
    df: pd.DataFrame,
    datetime_col: str = "tpep_pickup_datetime",
    n_per_day: int | None = None,
    frac_per_day: float | None = None,
    random_state: int = 42
) -> pd.DataFrame:
    """
    Samples uniformly at random within each day.
- Choose either `n_per_day` (fixed number of rows per day) or `frac_per_day` (fraction per day).
- If `n_per_day` > size of a given day, take all rows from that day.
    """
    if (n_per_day is None) == (frac_per_day is None):
        raise ValueError("Spécifie exactement l'un de `n_per_day` ou `frac_per_day`.")

    # Make sure the time column is set to datetime
    df = df.copy()
    df[datetime_col] = pd.to_datetime(df[datetime_col], errors="coerce")
    df = df.dropna(subset=[datetime_col])

    # Key ‘date’ (calendar day) from the time of pick-up
    df["_date"] = df[datetime_col].dt.date

    if n_per_day is not None:
        # Sampling with a fixed number per day (without replacement)
        out = (
            df.groupby("_date", group_keys=False)
              .apply(lambda g: g.sample(n=min(n_per_day, len(g)),
                                        replace=False,
                                        random_state=random_state))
        )
    else:
         # Sampling with one fraction per day
        out = (
            df.groupby("_date", group_keys=False)
              .apply(lambda g: g.sample(frac=frac_per_day,
                                        replace=False,
                                        random_state=random_state))
        )

    return out.drop(columns=["_date"])


# Fraction per day (frac_per_day=0.01 keeps 1% of each day's rows)

In [7]:
# Keep 1% of every day's rows; the fixed seed makes the draw repeatable.
df_sample = sample_uniform_by_day(
    df,
    frac_per_day=0.010,
    random_state=42,
)

  .apply(lambda g: g.sample(frac=frac_per_day,


In [8]:
# (rows, columns) of the per-day sample drawn above.
df_sample.shape

(127490, 19)

In [9]:
# DataFrame.dropna() returns a new frame and leaves its input untouched, so the
# result has to be assigned back for rows with missing values to be removed.
df_sample = df_sample.dropna()
df_sample

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
9690050,2,2015-01-01 16:07:13,2015-01-01 16:35:55,1,18.26,-73.979401,40.760380,3,N,-74.176826,40.694511,2,66.0,0.0,0.0,0.00,16.35,0.3,82.65
2813105,2,2015-01-01 15:59:47,2015-01-01 16:02:06,1,0.66,-73.980637,40.730099,1,N,-73.983124,40.722569,2,4.0,0.0,0.5,0.00,0.00,0.3,4.80
1965740,2,2015-01-01 04:05:37,2015-01-01 04:08:25,5,0.33,-73.971985,40.749748,1,N,-73.971886,40.746304,2,4.0,0.5,0.5,0.00,0.00,0.3,5.30
11762488,1,2015-01-01 10:10:53,2015-01-01 10:27:01,3,7.30,-74.010773,40.714050,1,N,-73.950073,40.779720,1,22.5,0.0,0.5,5.00,0.00,0.0,28.30
6320277,1,2015-01-01 03:30:14,2015-01-01 03:46:46,1,2.60,-73.982277,40.742973,1,N,-73.990211,40.712654,2,12.5,0.5,0.5,0.00,0.00,0.0,13.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455735,1,2015-01-31 02:20:39,2015-01-31 02:37:49,2,2.40,-74.002457,40.718697,1,N,-73.974197,40.731674,1,12.5,0.5,0.5,2.76,0.00,0.3,16.56
11885579,1,2015-01-31 15:55:06,2015-01-31 16:06:46,1,1.50,0.000000,0.000000,1,N,0.000000,0.000000,1,9.0,0.0,0.5,1.95,0.00,0.3,11.75
280131,2,2015-01-31 16:04:28,2015-01-31 16:25:14,1,4.84,-73.967323,40.766136,1,N,-73.949982,40.821941,2,18.5,0.0,0.5,0.00,0.00,0.3,19.30
2393737,1,2015-01-31 13:43:19,2015-01-31 13:57:11,1,1.60,-73.983215,40.730484,1,N,-73.999733,40.743397,1,10.0,0.0,0.5,2.00,0.00,0.3,12.80


In [10]:
# Dimensions again, after the dropna() cell above.
df_sample.shape

(127490, 19)

Next step: understand what each column means and drop the columns that are not needed.

In [12]:
# Keep only trips with a strictly positive trip_distance; rows whose distance
# is zero, negative or missing do not satisfy the comparison and are dropped.
df_sample = df_sample.loc[df_sample["trip_distance"].gt(0)]


In [13]:
# Dimensions after keeping only rows with trip_distance > 0.
df_sample.shape

(126705, 19)

In [14]:
# Remove the store_and_fwd_flag column. drop() returns a new frame, so running
# this cell a second time raises KeyError because the column is already gone.
df_sample = df_sample.drop('store_and_fwd_flag', axis="columns")

In [26]:
# Display the sample now that store_and_fwd_flag has been dropped.
df_sample

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
9690050,2,2015-01-01 16:07:13,2015-01-01 16:35:55,1,18.26,-73.979401,40.760380,3,-74.176826,40.694511,2,66.0,0.0,0.0,0.00,16.35,0.3,82.65
2813105,2,2015-01-01 15:59:47,2015-01-01 16:02:06,1,0.66,-73.980637,40.730099,1,-73.983124,40.722569,2,4.0,0.0,0.5,0.00,0.00,0.3,4.80
1965740,2,2015-01-01 04:05:37,2015-01-01 04:08:25,5,0.33,-73.971985,40.749748,1,-73.971886,40.746304,2,4.0,0.5,0.5,0.00,0.00,0.3,5.30
11762488,1,2015-01-01 10:10:53,2015-01-01 10:27:01,3,7.30,-74.010773,40.714050,1,-73.950073,40.779720,1,22.5,0.0,0.5,5.00,0.00,0.0,28.30
6320277,1,2015-01-01 03:30:14,2015-01-01 03:46:46,1,2.60,-73.982277,40.742973,1,-73.990211,40.712654,2,12.5,0.5,0.5,0.00,0.00,0.0,13.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455735,1,2015-01-31 02:20:39,2015-01-31 02:37:49,2,2.40,-74.002457,40.718697,1,-73.974197,40.731674,1,12.5,0.5,0.5,2.76,0.00,0.3,16.56
11885579,1,2015-01-31 15:55:06,2015-01-31 16:06:46,1,1.50,0.000000,0.000000,1,0.000000,0.000000,1,9.0,0.0,0.5,1.95,0.00,0.3,11.75
280131,2,2015-01-31 16:04:28,2015-01-31 16:25:14,1,4.84,-73.967323,40.766136,1,-73.949982,40.821941,2,18.5,0.0,0.5,0.00,0.00,0.3,19.30
2393737,1,2015-01-31 13:43:19,2015-01-31 13:57:11,1,1.60,-73.983215,40.730484,1,-73.999733,40.743397,1,10.0,0.0,0.5,2.00,0.00,0.3,12.80


In [28]:
# Remove the passenger_count column (re-running raises KeyError once it is gone).
df_sample = df_sample.drop('passenger_count', axis="columns")

In [30]:
# Remove the payment_type column (re-running raises KeyError once it is gone).
df_sample = df_sample.drop('payment_type', axis="columns")

In [32]:
# Remove the mta_tax column (re-running raises KeyError once it is gone).
df_sample = df_sample.drop('mta_tax', axis="columns")

In [34]:
# Remove the improvement_surcharge column (re-running raises KeyError once it is gone).
df_sample = df_sample.drop('improvement_surcharge', axis="columns")