In [None]:
import os
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile

import requests
import pandas as pd
import seaborn as sns

# Raise pandas' display limits so wide/long frames are shown in full while exploring.
pd.set_option("display.max_seq_items", 2000)
pd.set_option("display.max_rows", 4000)
pd.set_option("display.max_columns", 4000)

In [None]:
# Name of the raw extract (without extension); used for both the remote .zip and the local .csv.
RAW_DATA_FILENAME = "CO2EmissionHDV_VehicleExtract_02062021"
RAW_DATA_DOWNLOAD_URL = f"https://discomap.eea.europa.eu/app/CO2HDV/{RAW_DATA_FILENAME}.zip"

# Local cache location for the extracted CSV.
DATA_DIR = Path("./data")
RAW_DATA_PATH = DATA_DIR / f"{RAW_DATA_FILENAME}.csv"


## Ensure data file is present
# The archive is only downloaded when the CSV is not already on disk, so
# re-running the notebook does not hit the network again.
if not RAW_DATA_PATH.exists():
    print("File not found. Fetching it now")
    # (connect, read) timeout so the cell cannot hang forever on a stalled connection.
    response = requests.get(RAW_DATA_DOWNLOAD_URL, timeout=(10, 60))
    # Fail here on an HTTP error instead of handing an error page to ZipFile,
    # which would surface as a misleading BadZipFile.
    response.raise_for_status()

    # The archive is unpacked from memory; only the CSV member is written to DATA_DIR.
    with ZipFile(BytesIO(response.content)) as archive:
        archive.extract(f"{RAW_DATA_FILENAME}.csv", path=DATA_DIR)

    print("File downloaded.")


## Notes

This dataset contains OEM data and MS (Member State) data that have been naively joined together.

The MS data has fewer columns; they are prefixed with `MS`.

#### Some of these are equivalent to the OEM fields:
- (MS_Mh, MS_Mh_msv, MS_Mk) => Make (Manufacturer????)
- MS_Electric => ??? (OEM equivalent not yet identified)
- MS_Hybrid => HybridElectricHDV
- MS_FT => Engine_FuelType (With assumptions) 
- MS_TechnPermMaxLadenMass (kg) => GrossVehicleMass_t (tonnes)

#### Needs more investigation
- MS_VehicleCategoryCode => LegislativeClass (????)
- Meta_MS_fileId => Meta_OEM_fileId (???)

#### Related Columns:
- MS_NumberOfAxles === AxleConfiguration


#### Can Drop???
- MS_StageOfCompletionCode
- MS_CryptHashManufacturerRecord


#### TODO:
- MS_Bw????
- MS_MaximumSpeed
- MS_SpecificCO2Emissions
- MS_AveragePayload
- MS_RegistrationDate



In [None]:
# Read the raw extract CSV into a single DataFrame.
df = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)

In [None]:
# Keep only the rows whose OEM_PK_Vehicle is missing
# (presumably records that exist only in the member-state data — TODO confirm).
ms_pks = df.loc[df["OEM_PK_Vehicle"].isna()]

In [None]:
# Number of rows for each distinct value of the "Match" column.
df.groupby(by="Match").size()

In [None]:
# Per-column count of missing values among the rows without an OEM vehicle key.
ms_pks.isna().sum(axis=0)