### Nettoyage des données et sauvegarde

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [44]:
from pathlib import Path

# On détecte proprement la racine du projet
BASE_DIR = Path().resolve().parent  # <- car on est dans Notebooks/
DATA_DIR = BASE_DIR / "Data"

print(f"BASE_DIR: {BASE_DIR}")
print(f"DATA_DIR: {DATA_DIR}")

BASE_DIR: C:\Users\NAZIFOU\Electric-vehicles-Market-Size-Analysis
DATA_DIR: C:\Users\NAZIFOU\Electric-vehicles-Market-Size-Analysis\Data


In [4]:
from pathlib import Path

# On détecte proprement la racine du projet
BASE_DIR = Path().resolve().parent  
DATA_DIR = BASE_DIR / "Data"

print(f"BASE_DIR: {BASE_DIR}")
print(f"DATA_DIR: {DATA_DIR}")

BASE_DIR: C:\Users\NAZIFOU\Electric-vehicles-Market-Size-Analysis
DATA_DIR: C:\Users\NAZIFOU\Electric-vehicles-Market-Size-Analysis\Data


In [45]:
df = pd.read_csv(DATA_DIR / "Electric_Vehicle_Data_2025.csv.gz")
df.head(3)  # Affiche les 3 premières lignes du DataFrame

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,5YJ3E1EBXK,King,Seattle,WA,98178.0,2019,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,220.0,0.0,37.0,477309682,POINT (-122.23825 47.49461),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033010000.0
1,5YJYGDEE3L,Kitsap,Poulsbo,WA,98370.0,2020,TESLA,MODEL Y,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,291.0,0.0,23.0,109705683,POINT (-122.64681 47.73689),PUGET SOUND ENERGY INC,53035090000.0
2,KM8KRDAF5P,Kitsap,Olalla,WA,98359.0,2023,HYUNDAI,IONIQ 5,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,0.0,0.0,26.0,230390492,POINT (-122.54729 47.42602),PUGET SOUND ENERGY INC,53035090000.0


In [46]:
df.info()  # Affiche les informations sur le DataFrame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235692 entries, 0 to 235691
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         235692 non-null  object 
 1   County                                             235689 non-null  object 
 2   City                                               235689 non-null  object 
 3   State                                              235692 non-null  object 
 4   Postal Code                                        235689 non-null  float64
 5   Model Year                                         235692 non-null  int64  
 6   Make                                               235692 non-null  object 
 7   Model                                              235692 non-null  object 
 8   Electric Vehicle Type                              235692 non-null  object

Notre dataset est composer de :
- 7 variables numériques
- 10 variables catégorielles 

In [47]:
df.isnull().sum().to_frame().T

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,0,3,3,0,3,0,0,0,0,0,36,36,494,0,10,3,3


## Nettoyage 

In [48]:
  # 1. Remplacer les valeurs "0.0" non valides par NaN
cols_to_check = ['Electric Range', 'Base MSRP']
df[cols_to_check] = df[cols_to_check].replace(0.0, np.nan) 

In [49]:
# 2. Nettoyer les champs de type texte long
df['Clean Alternative Fuel Vehicle (CAFV) Eligibility'] = df['Clean Alternative Fuel Vehicle (CAFV) Eligibility'].replace(
    r'Eligibility unknown.*', 'Unknown', regex=True)

In [50]:
# 3. Corriger les types de données
df['Postal Code'] = df['Postal Code'].astype('Int64')  # gère aussi les NaN
df['2020 Census Tract'] = df['2020 Census Tract'].astype('string')

In [51]:
# 4. Extraire latitude et longitude depuis "Vehicle Location"
df[['Longitude', 'Latitude']] = df['Vehicle Location'].str.extract(r'POINT \((-?\d+\.\d+) (-?\d+\.\d+)\)').astype(float)

In [52]:
# 5. Harmoniser les noms des colonnes (optionnel mais conseillé)
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_").str.replace(r"[()]", "", regex=True)

In [54]:
# Aperçu
df.head()

Unnamed: 0,vin_1-10,county,city,state,postal_code,model_year,make,model,electric_vehicle_type,clean_alternative_fuel_vehicle_cafv_eligibility,electric_range,base_msrp,legislative_district,dol_vehicle_id,vehicle_location,electric_utility,2020_census_tract,longitude,latitude
0,5YJ3E1EBXK,King,Seattle,WA,98178,2019,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,220.0,,37.0,477309682,POINT (-122.23825 47.49461),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033011902.0,-122.23825,47.49461
1,5YJYGDEE3L,Kitsap,Poulsbo,WA,98370,2020,TESLA,MODEL Y,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,291.0,,23.0,109705683,POINT (-122.64681 47.73689),PUGET SOUND ENERGY INC,53035091100.0,-122.64681,47.73689
2,KM8KRDAF5P,Kitsap,Olalla,WA,98359,2023,HYUNDAI,IONIQ 5,Battery Electric Vehicle (BEV),Unknown,,,26.0,230390492,POINT (-122.54729 47.42602),PUGET SOUND ENERGY INC,53035092802.0,-122.54729,47.42602
3,5UXTA6C0XM,Kitsap,Seabeck,WA,98380,2021,BMW,X5,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,30.0,,35.0,267929112,POINT (-122.81585 47.64509),PUGET SOUND ENERGY INC,53035091301.0,-122.81585,47.64509
4,JTMAB3FV7P,Thurston,Rainier,WA,98576,2023,TOYOTA,RAV4 PRIME,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,42.0,,2.0,236505139,POINT (-122.68993 46.88897),PUGET SOUND ENERGY INC,53067012530.0,-122.68993,46.88897


In [55]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235692 entries, 0 to 235691
Data columns (total 19 columns):
 #   Column                                           Non-Null Count   Dtype  
---  ------                                           --------------   -----  
 0   vin_1-10                                         235692 non-null  object 
 1   county                                           235689 non-null  object 
 2   city                                             235689 non-null  object 
 3   state                                            235692 non-null  object 
 4   postal_code                                      235689 non-null  Int64  
 5   model_year                                       235692 non-null  int64  
 6   make                                             235692 non-null  object 
 7   model                                            235692 non-null  object 
 8   electric_vehicle_type                            235692 non-null  object 
 9   clean_alternati

In [56]:
df.isnull().sum()

vin_1-10                                                0
county                                                  3
city                                                    3
state                                                   0
postal_code                                             3
model_year                                              0
make                                                    0
model                                                   0
electric_vehicle_type                                   0
clean_alternative_fuel_vehicle_cafv_eligibility         0
electric_range                                     139797
base_msrp                                          232439
legislative_district                                  494
dol_vehicle_id                                          0
vehicle_location                                       10
electric_utility                                        3
2020_census_tract                                       3
longitude     

In [57]:
(df[df["electric_range"].notna()]) 

Unnamed: 0,vin_1-10,county,city,state,postal_code,model_year,make,model,electric_vehicle_type,clean_alternative_fuel_vehicle_cafv_eligibility,electric_range,base_msrp,legislative_district,dol_vehicle_id,vehicle_location,electric_utility,2020_census_tract,longitude,latitude
0,5YJ3E1EBXK,King,Seattle,WA,98178,2019,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,220.0,,37.0,477309682,POINT (-122.23825 47.49461),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033011902.0,-122.23825,47.49461
1,5YJYGDEE3L,Kitsap,Poulsbo,WA,98370,2020,TESLA,MODEL Y,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,291.0,,23.0,109705683,POINT (-122.64681 47.73689),PUGET SOUND ENERGY INC,53035091100.0,-122.64681,47.73689
3,5UXTA6C0XM,Kitsap,Seabeck,WA,98380,2021,BMW,X5,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,30.0,,35.0,267929112,POINT (-122.81585 47.64509),PUGET SOUND ENERGY INC,53035091301.0,-122.81585,47.64509
4,JTMAB3FV7P,Thurston,Rainier,WA,98576,2023,TOYOTA,RAV4 PRIME,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,42.0,,2.0,236505139,POINT (-122.68993 46.88897),PUGET SOUND ENERGY INC,53067012530.0,-122.68993,46.88897
5,5YJSA1DN0C,Thurston,Olympia,WA,98502,2012,TESLA,MODEL S,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,265.0,59900.0,22.0,186637195,POINT (-122.92333 47.03779),PUGET SOUND ENERGY INC,53067010600.0,-122.92333,47.03779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235684,WA1F2AFY7M,Whatcom,Blaine,WA,98230,2021,AUDI,Q5 E,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,18.0,,42.0,171206003,POINT (-122.74888 48.99404),CITY OF BLAINE - (WA)||PUD NO 1 OF WHATCOM COUNTY,53073010409.0,-122.74888,48.99404
235685,1G1RD6E42E,Pierce,Graham,WA,98338,2014,CHEVROLET,VOLT,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,38.0,,2.0,106661334,POINT (-122.29477 47.05703),BONNEVILLE POWER ADMINISTRATION||CITY OF TACOM...,53053073124.0,-122.29477,47.05703
235687,1C4RJXN62R,Pierce,Tacoma,WA,98407,2024,JEEP,WRANGLER,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,21.0,,27.0,267004272,POINT (-122.51134 47.29238),BONNEVILLE POWER ADMINISTRATION||CITY OF TACOM...,53053060904.0,-122.51134,47.29238
235688,5YJSA1E28J,Snohomish,Stanwood,WA,98292,2018,TESLA,MODEL S,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,249.0,,10.0,198794410,POINT (-122.37265 48.24159),BONNEVILLE POWER ADMINISTRATION||PUD 1 OF SNOH...,53061053301.0,-122.37265,48.24159


## Sauvegarde 

In [None]:
# df.to_csv("vehicules_nettoyes.csv", index=False)

On note la présence des valeurs manquantes dans notre dataset.

In [35]:
df.shape

(235692, 17)

In [36]:
# suppression des NA 

df = df.dropna()

#print(f"Nombre de lignes après suppression des NA : {df.shape[0]}")
#print(f"Nombre de colonnes après suppression des NA : {df.shape[1]}")
# Affichage des types de données   

In [37]:
df.isnull().sum().to_frame().T

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [38]:
df.shape

(235155, 17)

apres suppression des valeurs manquants 

.. 

In [39]:
df['County'].value_counts()

County
King            118688
Snohomish        28810
Pierce           19256
Clark            14092
Thurston          8595
Kitsap            7901
Spokane           6579
Whatcom           5703
Benton            3040
Skagit            2676
Island            2551
Yakima            1499
Chelan            1433
Clallam           1403
Jefferson         1243
Cowlitz           1217
San Juan          1132
Mason             1118
Lewis             1064
Franklin           881
Grant              872
Kittitas           863
Grays Harbor       861
Walla Walla        634
Douglas            521
Whitman            463
Klickitat          411
Okanogan           360
Stevens            285
Pacific            283
Skamania           253
Asotin              96
Wahkiakum           83
Adams               80
Pend Oreille        79
Lincoln             72
Ferry               36
Columbia            19
Garfield             3
Name: count, dtype: int64

In [40]:
df['City'].value_counts()

City
Seattle              37403
Bellevue             11505
Vancouver             8509
Redmond               8210
Bothell               7738
                     ...  
Albion                   1
Gardiner                 1
Tumtum                   1
Northwestern Lake        1
Inchelium                1
Name: count, Length: 487, dtype: int64

In [41]:
df.head(10)

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,5YJ3E1EBXK,King,Seattle,WA,98178.0,2019,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,220.0,0.0,37.0,477309682,POINT (-122.23825 47.49461),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033010000.0
1,5YJYGDEE3L,Kitsap,Poulsbo,WA,98370.0,2020,TESLA,MODEL Y,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,291.0,0.0,23.0,109705683,POINT (-122.64681 47.73689),PUGET SOUND ENERGY INC,53035090000.0
2,KM8KRDAF5P,Kitsap,Olalla,WA,98359.0,2023,HYUNDAI,IONIQ 5,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,0.0,0.0,26.0,230390492,POINT (-122.54729 47.42602),PUGET SOUND ENERGY INC,53035090000.0
3,5UXTA6C0XM,Kitsap,Seabeck,WA,98380.0,2021,BMW,X5,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,30.0,0.0,35.0,267929112,POINT (-122.81585 47.64509),PUGET SOUND ENERGY INC,53035090000.0
4,JTMAB3FV7P,Thurston,Rainier,WA,98576.0,2023,TOYOTA,RAV4 PRIME,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,42.0,0.0,2.0,236505139,POINT (-122.68993 46.88897),PUGET SOUND ENERGY INC,53067010000.0
5,5YJSA1DN0C,Thurston,Olympia,WA,98502.0,2012,TESLA,MODEL S,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,265.0,59900.0,22.0,186637195,POINT (-122.92333 47.03779),PUGET SOUND ENERGY INC,53067010000.0
6,WBY1Z6C30H,King,Bellevue,WA,98004.0,2017,BMW,I3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,81.0,0.0,48.0,196789610,POINT (-122.1872 47.61001),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033020000.0
7,3MW5P9J0XN,Snohomish,Marysville,WA,98271.0,2022,BMW,330E,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,22.0,0.0,39.0,204822761,POINT (-122.1677 48.11026),PUGET SOUND ENERGY INC,53061050000.0
8,5YJ3E1EA6J,King,Kirkland,WA,98034.0,2018,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,215.0,0.0,45.0,2039222,POINT (-122.22901 47.72201),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033020000.0
9,5YJ3E1EA2J,King,Redmond,WA,98052.0,2018,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,215.0,0.0,45.0,474817283,POINT (-122.13158 47.67858),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033030000.0


In [42]:
df.isnull().sum().to_frame().T

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Analyse 

In [None]:
def load_data(filepath):
    try:
        df = pd.read_csv(filepath, compression='infer')
        df = df.dropna()
        df.columns = (
            df.columns
            .str.strip()
            .str.lower()
            .str.replace(" ", "_")
            .str.replace(r"[()]", "", regex=True)
        )
        
        if 'model_year' in df.columns:
            df['model_year'] = df['model_year'].astype("Int64")
        else:
            st.warning("⚠️ La colonne 'model_year' est absente.")

        if 'base_msrp' in df.columns:
            df = df[df['base_msrp'] > 0]
        else:
            st.warning("⚠️ La colonne 'base_msrp' est absente.")

        if 'electric_range' in df.columns:
            df = df[df['electric_range'] > 0]
        else:
            st.warning("⚠️ La colonne 'electric_range' est absente.")
        
        return df

    except FileNotFoundError:
        st.error(f"❌ Fichier non trouvé : {filepath}")
        return pd.DataFrame()
    except Exception as e:
        st.error(f"❌ Erreur lors du chargement : {e}")
        return pd.DataFrame()

# Chargement
df = load_data("Electric_Vehicle_Data_2025.csv.gz")