In [20]:
import pandas as pd
import numpy as np

In [21]:
# Load the raw taxi trips and discard the index column that was saved into the CSV
df = pd.read_csv('NYC_Taxi_dataset_with_anomalies.csv').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_distance_miles,fare_amount,passenger_count,payment_type
0,2023-02-20 17:27:00,2023-02-20 17:49:00,40.808941,-73.914482,40.807336,-73.90527,1.03,5.84$,5.0,Credit Card
1,2023-02-28 19:41:00,2023-02-28 20:07:00,40.685842,-73.855449,40.663358,-73.826745,4.02,15.6£,4.0,Unknown
2,2023-02-14 08:37:00,2023-02-14 09:07:00,40.668055,-74.04505,40.642572,-74.055617,3.04,2192.65¥,5.0,Credit Card
3,2023-01-16 23:54:00,2023-01-17 00:40:00,40.765394,-73.753324,40.775285,-73.775441,2.67,0,4.0,Credit Card
4,2023-02-11 01:16:00,2023-02-11 01:35:00,40.815841,-73.727328,40.836115,-73.746976,3.11,13.7€,3.0,Credit Card


In [22]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

np.int64(12)

#### Suppression des lignes dupliquées

In [23]:
# Remove the duplicated rows, then re-count to confirm none are left
df = df.drop_duplicates()
df.duplicated().sum()

np.int64(0)

In [24]:
# Work on an independent copy so the de-duplicated frame `df` stays untouched
df_to_improve = df.copy()


In [25]:
# Inspect column dtypes to spot columns that still need parsing or conversion
df_to_improve.dtypes

pickup_datetime         object
dropoff_datetime        object
pickup_latitude        float64
pickup_longitude       float64
dropoff_latitude       float64
dropoff_longitude      float64
trip_distance_miles    float64
fare_amount             object
passenger_count        float64
payment_type            object
dtype: object

In [26]:
# Parse every date-like column to datetime; with errors='coerce', values that
# cannot be parsed become NaT instead of raising.
date_columns = ['pickup_datetime', 'dropoff_datetime']
for col in date_columns:
    df_to_improve[col] = pd.to_datetime(df_to_improve[col], errors='coerce')

In [27]:
# Date-parsing status: how many timestamps were parsed (non-NaT) and how many
# lie after the moment this cell is executed.
now = pd.Timestamp.now()

report = {
    "pickup_parsed": int(df_to_improve['pickup_datetime'].count()),
    "dropoff_parsed": int(df_to_improve['dropoff_datetime'].count()),
    "pickup_in_future": int(df_to_improve['pickup_datetime'].gt(now).sum()),
    "dropoff_in_future": int(df_to_improve['dropoff_datetime'].gt(now).sum()),
}
report

{'pickup_parsed': 772,
 'dropoff_parsed': 812,
 'pickup_in_future': 0,
 'dropoff_in_future': 38}

On a 40 lignes sur 800 (~5 %) avec des dates dans le futur, ce qui est assez peu dans notre contexte ;
nous allons procéder à la suppression de ces lignes.

In [28]:
# Reference instant for the "future date" test (depends on when the cell is run)
now = pd.Timestamp.now()

# Keep only trips whose pickup AND dropoff are not later than `now`.
# Comparisons against NaT evaluate to False, so rows whose pickup or dropoff
# could not be parsed are removed by this filter as well.
df_clean_without_future_date = df_to_improve.loc[
    lambda trips: trips['pickup_datetime'].le(now) & trips['dropoff_datetime'].le(now)
].copy()

In [29]:
# Rows whose drop-off timestamp is earlier than the pick-up timestamp
inverted_dates = df_clean_without_future_date.loc[
    lambda trips: trips['dropoff_datetime'].lt(trips['pickup_datetime'])
]
len(inverted_dates)

0

Aucune ligne n'a une date de dropoff antérieure à la date de pickup.

####  Normalize fares -> fare_amount_usd


In [30]:
# Currency markers recognised in raw fare strings, with the factor applied to
# convert the amount to USD. Entries are tried in order and the first currency
# whose marker appears in the string wins.
# NOTE(review): these are fixed rates; their source and date are not stated
# anywhere in the notebook -- confirm before reusing the converted amounts.
_FARE_CURRENCIES = (
    (("€",), 1.18),
    (("£",), 1.33),
    (("¥", "JPY"), 0.009),
    (("$",), 1.0),
)


def convert_to_usd(fare, currencies=_FARE_CURRENCIES) -> float:
    """Convert a single raw fare value to a USD amount.

    Parameters
    ----------
    fare : Any
        Raw fare: a number, or a string such as ``"13.7€"`` or ``"5.84$"``.
    currencies : iterable of ``(markers, usd_rate)`` pairs, optional
        ``markers`` is a tuple of substrings identifying a currency and
        ``usd_rate`` the multiplier to USD. Defaults to ``_FARE_CURRENCIES``.

    Returns
    -------
    float
        The amount in USD. ``np.nan`` is returned for missing values, blank
        strings and anything that cannot be parsed as a number.
    """
    try:
        if pd.isna(fare):
            return np.nan
        # Already numeric: assumed to be expressed in USD, returned unchanged.
        if isinstance(fare, (int, float, np.number)):
            return float(fare)
        fare_str = str(fare).strip()
        if fare_str == '':
            return np.nan
        for markers, usd_rate in currencies:
            if any(marker in fare_str for marker in markers):
                # Strip every marker of the detected currency before parsing.
                for marker in markers:
                    fare_str = fare_str.replace(marker, '')
                return float(fare_str.strip()) * usd_rate
        # No currency marker: plain numeric string.
        return float(fare_str)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are mapped to NaN; unrelated errors are no
        # longer silently swallowed.
        return np.nan


def normalize_fares(df: pd.DataFrame, col: str = "fare_amount", out_col: str = "fare_amount_usd"):
    """Add a numeric USD fare column to a copy of ``df`` and summarise the conversion.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the raw fare column passed value by value to ``convert_to_usd``.
    out_col : str
        Name of the column that receives the converted amounts.

    Returns
    -------
    tuple
        ``(frame, report)`` where ``frame`` is the copy with ``out_col`` added and
        ``report`` is a dict holding the row count, the number of converted
        (non-null) and unconverted (null) fares, and the number of fares <= 0.
    """
    result = df.copy()
    result[out_col] = result[col].apply(convert_to_usd)

    usd = result[out_col]
    missing = usd.isna()
    stats = {
        "total": len(result),
        "converted_notnull": int((~missing).sum()),
        "converted_null": int(missing.sum()),
        "non_positive": int(usd.le(0).sum()),
    }
    return result, stats

In [31]:
# Build the working frame with the USD fare column and display the conversion report
work_df, rep_fares = normalize_fares(df_clean_without_future_date, col="fare_amount", out_col="fare_amount_usd")
rep_fares

{'total': 772,
 'converted_notnull': 737,
 'converted_null': 35,
 'non_positive': 23}

In [32]:
# Preview the first rows to compare the raw fares with the converted USD amounts
work_df.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_distance_miles,fare_amount,passenger_count,payment_type,fare_amount_usd
0,2023-02-20 17:27:00,2023-02-20 17:49:00,40.808941,-73.914482,40.807336,-73.90527,1.03,5.84$,5.0,Credit Card,5.84
1,2023-02-28 19:41:00,2023-02-28 20:07:00,40.685842,-73.855449,40.663358,-73.826745,4.02,15.6£,4.0,Unknown,20.748
2,2023-02-14 08:37:00,2023-02-14 09:07:00,40.668055,-74.04505,40.642572,-74.055617,3.04,2192.65¥,5.0,Credit Card,19.73385
3,2023-01-16 23:54:00,2023-01-17 00:40:00,40.765394,-73.753324,40.775285,-73.775441,2.67,0,4.0,Credit Card,0.0
4,2023-02-11 01:16:00,2023-02-11 01:35:00,40.815841,-73.727328,40.836115,-73.746976,3.11,13.7€,3.0,Credit Card,16.166


Pour traiter les valeurs de fare_amount_usd nulles (35) et non positives (23), nous allons les remplacer par la médiane des tarifs positifs, car ces lignes représentent (35+23)/772 ≈ 7,5 % des données, soit plus de 5 %.

In [33]:
# Impute missing or non-positive fares with the median of the strictly positive fares.
# Vectorised with Series.mask (same approach as the outlier imputation further down)
# instead of a row-by-row apply.
median_fare = work_df.loc[work_df['fare_amount_usd'].gt(0), 'fare_amount_usd'].median()
work_df['fare_amount_usd'] = work_df['fare_amount_usd'].mask(
    work_df['fare_amount_usd'].isna() | work_df['fare_amount_usd'].le(0),
    median_fare,
)

In [34]:
# Sanity check: number of fares still missing OR non-positive after imputation.
# A plain `<= 0` comparison is False for NaN, so missing values are counted explicitly.
int((work_df['fare_amount_usd'].isna() | work_df['fare_amount_usd'].le(0)).sum())

0

#### Gestion des valeurs aberrantes constatées à l'étape 3

On avait noté, selon la méthode IQR, que 89.68% des valeurs se trouvaient dans l'intervalle de valeurs acceptables (avant, bien sûr, les suppressions de lignes faites plus haut).
Pour gérer les valeurs aberrantes, qui représentent ici plus de 10 % des données, nous allons procéder à l'imputation par la valeur médiane.

In [35]:
# Threshold values: fences at 1.5 * IQR below Q1 and above Q3
Q1, Q3 = work_df['fare_amount_usd'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Replacement value: the median of the current fares
median_fare = work_df['fare_amount_usd'].median()

outliers = work_df.loc[
    work_df['fare_amount_usd'].lt(lower_bound) | work_df['fare_amount_usd'].gt(upper_bound)
]
print(f"Nombre d'outliers avant : {len(outliers)}")

# Keep the fares inside the fences and overwrite the others with the median.
# The outlier mask is negated (rather than testing "inside") so NaN is left as is.
work_df['fare_amount_usd'] = work_df['fare_amount_usd'].where(
    ~(work_df['fare_amount_usd'].lt(lower_bound) | work_df['fare_amount_usd'].gt(upper_bound)),
    median_fare,
)

# Re-count against the same fences to confirm the replacement
outliers = work_df.loc[
    work_df['fare_amount_usd'].lt(lower_bound) | work_df['fare_amount_usd'].gt(upper_bound)
]
print(f"Nombre d'outliers après : {len(outliers)}")


Nombre d'outliers avant : 36
Nombre d'outliers après : 0


On constate que la suppression des lignes a fait baisser le nombre d'outliers de fare_amount_usd dans nos données. L'imputation ici par la valeur médiane reste acceptable.

#### Gestion des coordonnées hors NYC
Dans l'étape 03, on avait pu constater la présence de coordonnées géographiques hors de NYC


In [38]:
# Approximate bounding box of New York City
nyc_min_latitude, nyc_max_latitude = 40.5, 40.92
nyc_min_longitude, nyc_max_longitude = -74.26, -73.7

# Trips whose pick-up point falls outside the bounding box
invalid_pickup = work_df.loc[
    work_df['pickup_latitude'].lt(nyc_min_latitude)
    | work_df['pickup_latitude'].gt(nyc_max_latitude)
    | work_df['pickup_longitude'].lt(nyc_min_longitude)
    | work_df['pickup_longitude'].gt(nyc_max_longitude)
]

# Trips whose drop-off point falls outside the bounding box
invalid_dropoff = work_df.loc[
    work_df['dropoff_latitude'].lt(nyc_min_latitude)
    | work_df['dropoff_latitude'].gt(nyc_max_latitude)
    | work_df['dropoff_longitude'].lt(nyc_min_longitude)
    | work_df['dropoff_longitude'].gt(nyc_max_longitude)
]

# The total adds both counts, so a trip with both points invalid is counted twice
print(f"nombre de coordonnées(pickup ou dropoff) invalides: {invalid_pickup.shape[0] + invalid_dropoff.shape[0]} -- total lignes: {work_df.shape[0]}")

nombre de coordonnées(pickup ou dropoff) invalides: 95 -- total lignes: 772


On compte 95 coordonnées invalides (pickup et dropoff confondus) pour 772 lignes, soit environ 12 % : ces points ne se trouvent pas à NYC.

Pour l'imputation, nous allons remplacer les valeurs invalides par le mode des coordonnées valides.

In [39]:
# Replace every out-of-range coordinate with the mode of the in-range values of
# the same column, then re-count the invalid points.

def _impute_out_of_range_with_mode(values: pd.Series, lower: float, upper: float):
    """Replace values outside ``[lower, upper]`` with the mode of the in-range values.

    Returns ``(imputed, mode_value)``. NaN is neither below nor above a bound, so
    missing values are left untouched. ``Series.mode`` returns the modes sorted,
    so when several values tie the smallest one is used. The ``[0]`` lookup
    fails if no value lies inside the range.

    NOTE(review): latitude and longitude are imputed independently, so an
    imputed point need not match any real location in the data -- confirm this
    is acceptable for the downstream analysis.
    """
    mode_value = values[values.ge(lower) & values.le(upper)].mode()[0]
    out_of_range = values.lt(lower) | values.gt(upper)
    return values.mask(out_of_range, mode_value), mode_value


def _outside_nyc(latitude: pd.Series, longitude: pd.Series) -> pd.Series:
    """Boolean mask of points whose latitude or longitude is outside the NYC box."""
    return (
        latitude.lt(nyc_min_latitude) | latitude.gt(nyc_max_latitude)
        | longitude.lt(nyc_min_longitude) | longitude.gt(nyc_max_longitude)
    )


# Pick-up coordinates
work_df['pickup_latitude'], valid_pickup_lat_mode = _impute_out_of_range_with_mode(
    work_df['pickup_latitude'], nyc_min_latitude, nyc_max_latitude
)
work_df['pickup_longitude'], valid_pickup_lon_mode = _impute_out_of_range_with_mode(
    work_df['pickup_longitude'], nyc_min_longitude, nyc_max_longitude
)

# Drop-off coordinates
work_df['dropoff_latitude'], valid_dropoff_lat_mode = _impute_out_of_range_with_mode(
    work_df['dropoff_latitude'], nyc_min_latitude, nyc_max_latitude
)
work_df['dropoff_longitude'], valid_dropoff_lon_mode = _impute_out_of_range_with_mode(
    work_df['dropoff_longitude'], nyc_min_longitude, nyc_max_longitude
)

# Re-evaluate the invalid coordinates after imputation
invalid_pickup = work_df[_outside_nyc(work_df['pickup_latitude'], work_df['pickup_longitude'])]
invalid_dropoff = work_df[_outside_nyc(work_df['dropoff_latitude'], work_df['dropoff_longitude'])]
print(f"nombre de coordonnées(pickup ou dropoff) invalides après imputation: {invalid_pickup.shape[0] + invalid_dropoff.shape[0]} -- total lignes: {work_df.shape[0]}")



nombre de coordonnées(pickup ou dropoff) invalides après imputation: 0 -- total lignes: 772


Toutes les coordonnées GPS invalides ont été imputées.

#### Payment_type

En faisant le data profiling du dataset à l'étape 3, on avait pu constater la présence de valeurs ayant la même sémantique mais orthographiées différemment ou exprimées par des chaînes de caractères différentes.

In [43]:
# List the distinct payment_type spellings present before normalisation
work_df['payment_type'].unique()


array(['Credit Card', 'Unknown', 'Cash', ' CREDIT CARD ', 'credit card',
       'cash', 'Dispute', 'No Charge', 'CREDIT CARD', 'Disagreement'],
      dtype=object)

In [None]:
# Normalise whitespace and case, then map each spelling to a canonical label
work_df['payment_type'] = (
    work_df['payment_type']
    .str.strip()
    .str.lower()
    .replace({
        'credit card': 'credit_card',
        'cash': 'cash',
        'unknown': 'unknown',
        'dispute': 'dispute',
        'no charge': 'no_charge',
        'disagreement': 'dispute',  # grouped with "dispute"
    })
)

work_df['payment_type'].unique()


array(['credit_card', 'unknown', 'cash', 'dispute', 'no_charge'],
      dtype=object)