Climate Trace Dataset Investigation

Source: https://huggingface.co/datasets/tjhunter/climate-trace/tree/main/v3-2024-ct5

In [2]:
# Tools
!pip install dask




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Dependencies
import dask.dataframe as dd
import pandas as pd

In [4]:
# Parquet Files
parquet_files = {
    "2021_ch4.parquet" : 2021,
    "2021_co2.parquet" : 2021,
    "2021_n2o.parquet" : 2021,
    "2021_co2e_100yr.parquet" : 2021,
    "2022_ch4.parquet" : 2022,
    "2022_co2.parquet" : 2022,
    "2022_n2o.parquet" : 2022,
    "2022_co2e_100yr.parquet" : 2022,
    "2023_ch4.parquet" : 2023,
    "2023_co2.parquet" : 2023,
    "2023_n2o.parquet" : 2023,
    "2023_co2e_100yr.parquet" : 2023,
    "2024_ch4.parquet" : 2024,
    "2024_co2.parquet" : 2024,
    "2024_n2o.parquet" : 2024,
    "2024_co2e_100yr.parquet" : 2024
}
print("Parquet Files in the directory:\n")
for each in parquet_files:
    print(each)

target_parquet = input("Enter the target parquet file name to upload: ")
df = None

if target_parquet in parquet_files:
    parquet_file = f"../raw_data/{parquet_files[target_parquet]}/{target_parquet}"
    df = dd.read_parquet(parquet_file, npartitions=10)
    print("Parquet file loaded successfully.")
else:
    print("Invalid input. Please enter a valid parquet file name.")

Parquet Files in the directory:

2021_ch4.parquet
2021_co2.parquet
2021_n2o.parquet
2021_co2e_100yr.parquet
2022_ch4.parquet
2022_co2.parquet
2022_n2o.parquet
2022_co2e_100yr.parquet
2023_ch4.parquet
2023_co2.parquet
2023_n2o.parquet
2023_co2e_100yr.parquet
2024_ch4.parquet
2024_co2.parquet
2024_n2o.parquet
2024_co2e_100yr.parquet
Parquet file loaded successfully.


In [5]:
# Memory Usage Check
memory_usage = df.memory_usage(deep=True).compute()
memory_in_mb = memory_usage / (1024 * 1024)

print("Memory Usage (MB):")
for column, mb in memory_in_mb.items():
    print(f"{column}: {mb:.2f} MB")

print(f"\nTotal Memory: {memory_in_mb.sum():.2f} MB")

Memory Usage (MB):
Index: 0.00 MB
source_id: 115.85 MB
iso3_country: 28.98 MB
sector: 14.48 MB
subsector: 14.49 MB
original_inventory_sector: 117.66 MB
start_time: 115.85 MB
end_time: 115.85 MB
temporal_granularity: 14.48 MB
gas: 14.48 MB
emissions_quantity: 115.85 MB
emissions_factor: 115.85 MB
emissions_factor_units: 547.73 MB
capacity: 115.85 MB
capacity_units: 378.47 MB
capacity_factor: 115.85 MB
activity: 115.85 MB
activity_units: 359.47 MB
created_date: 115.85 MB
modified_date: 115.85 MB
source_name: 313.05 MB
source_type: 220.24 MB
lat: 115.85 MB
lon: 115.85 MB
other1: 209.93 MB
other2: 231.82 MB
other3: 168.31 MB
other4: 139.40 MB
other5: 140.56 MB
other6: 142.04 MB
other7: 139.30 MB
other8: 127.08 MB
other9: 142.04 MB
other10: 131.54 MB
other11: 117.66 MB
other12: 117.66 MB
other1_def: 464.17 MB
other2_def: 485.87 MB
other3_def: 561.83 MB
other4_def: 311.35 MB
other5_def: 326.02 MB
other6_def: 282.84 MB
other7_def: 392.46 MB
other8_def: 462.81 MB
other9_def: 424.51 MB
other10_

### <span style="color: white; font-weight: bold; text-decoration: underline;">source_id</span>
The source id is the unique identifier for the source of the data.

Data Type: `uint64`

**Status: <span style="color: red;">NOT READY</span>**

In [6]:
# Source ID Exploration
print("🔍 Source ID Description:")
source_id_desc = df['source_id'].describe().compute()
print(source_id_desc)

# Value Counts
print("\n📊 Source ID Value Counts:")
source_id_counts = df['source_id'].value_counts().compute()
print(source_id_counts)

# Null Values
null_count = df['source_id'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values
unique_count = df['source_id'].nunique().compute()
print(f"\n🔢 Number of Unique Values: {unique_count}")

# Data Type
print(f"\n📝 Data Type: {df['source_id'].dtype}")

unique_values = df['source_id'].unique().compute()[:10]
print("\n🔢 First 10 Unique Values:")
print(unique_values)

🔍 Source ID Description:
count           15184500
mean     14012631.997153
std      12906121.662454
min                  110
25%            3783541.0
50%            8259285.0
75%           25456836.0
max             38339430
Name: source_id, dtype: object

📊 Source ID Value Counts:
source_id
110         12
818         12
923         12
1193        12
1376        12
            ..
38089178    12
38089206    12
38089228    12
38089240    12
38339430    12
Name: count, Length: 1265375, dtype: int64

❌ Null Values: 0

🔢 Number of Unique Values: 1265375

📝 Data Type: uint64

🔢 First 10 Unique Values:
0    11082620
1    11082721
2    11082559
3    11082728
4    11082684
5    11082825
6    10735118
7    11082532
8    11082631
9    37215606
Name: source_id, dtype: uint64


  d = A - u
  diff_b_a = subtract(b, a)


### <span style="color: white; font-weight: bold; text-decoration: underline;">iso3_country</span>
The ISO 3 country code is a three-letter code that is used to identify countries.

Data Type: `category`

**Status: <span style="color: red;">NOT READY</span>**

In [7]:
# Value Counts with Percentage
print("🌍 Value Counts:")
value_counts = df['iso3_country'].value_counts().compute()
value_counts_percent = (df['iso3_country'].value_counts(normalize=True) * 100).compute()
value_counts_summary = pd.DataFrame({
    'Count': value_counts, 
    'Percentage (%)': value_counts_percent.round(2)
})
print(value_counts_summary)

# Null Values
null_count = df['iso3_country'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_values = df['iso3_country'].unique().compute()
unique_count = df['iso3_country'].nunique().compute()
print(f"\n🔢 Number of Unique Values: {unique_count}")

# Sort unique values for better readability
sorted_unique_values = sorted(unique_values)
print("🌐 All Unique Values:")
print(", ".join(map(str, sorted_unique_values)))

# Descriptive Statistics
print("\n📊 Descriptive Statistics:")
desc_stats = df['iso3_country'].describe().compute()
print(desc_stats)

# Data Type
print(f"\n📝 Data Type: {df['iso3_country'].dtypes}")

🌍 Value Counts:
                Count  Percentage (%)
iso3_country                         
USA           2301156           15.15
BRA           1288560            8.49
RUS            652572            4.30
ROU            618048            4.07
MEX            591804            3.90
...               ...             ...
BVT               120            0.00
HMD               120            0.00
UNK                24            0.00
SCG                 0            0.00
XAD                 0            0.00

[254 rows x 2 columns]

❌ Null Values: 0

🔢 Number of Unique Values: 252
🌐 All Unique Values:
ABW, AFG, AGO, AIA, ALA, ALB, AND, ARE, ARG, ARM, ASM, ATA, ATF, ATG, AUS, AUT, AZE, BDI, BEL, BEN, BES, BFA, BGD, BGR, BHR, BHS, BIH, BLM, BLR, BLZ, BMU, BOL, BRA, BRB, BRN, BTN, BVT, BWA, CAF, CAN, CCK, CHE, CHL, CHN, CIV, CMR, COD, COG, COK, COL, COM, CPV, CRI, CUB, CUW, CXR, CYM, CYP, CZE, DEU, DJI, DMA, DNK, DOM, DZA, ECU, EGY, ERI, ESH, ESP, EST, ETH, FIN, FJI, FLK, FRA, FRO, FSM, GAB, 

### <span style="color: white; font-weight: bold; text-decoration: underline;">sector</span>
The sector is the economic sector that the activity belongs to.

Data Type: `category`

**Status: <span style="color: green;">READY</span>**

In [8]:
# 🏭 Sector Exploration

# Descriptive Statistics
print("📊 Sector Description:")
sector_desc = df['sector'].describe().compute()
print(sector_desc)

# Value Counts with Percentage
print("\n📈 Sector Distribution:")
sector_counts = df['sector'].value_counts().compute()
sector_percent = (df['sector'].value_counts(normalize=True) * 100).compute()
sector_summary = pd.DataFrame({
    'Count': sector_counts, 
    'Percentage (%)': sector_percent.round(2)
})
print(sector_summary)

# Null Values
null_count = df['sector'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['sector'].nunique().compute()
print(f"\n🔢 Number of Unique Sectors: {unique_count}")

# Unique Sectors
unique_sectors = df['sector'].unique().compute()
sorted_unique_sectors = sorted(unique_sectors)
print("\n🌐 Unique Sectors:")
print(", ".join(map(str, sorted_unique_sectors)))

# Data Type
print(f"\n📝 Data Type: {df['sector'].dtype}")

📊 Sector Description:
unique                        9
count                  15184500
top       forestry-and-land-use
freq                    6189372
Name: sector, dtype: object

📈 Sector Distribution:
                          Count  Percentage (%)
sector                                         
forestry-and-land-use   6189372           40.76
agriculture             5213832           34.34
buildings               1371084            9.03
transportation          1022112            6.73
waste                    777072            5.12
manufacturing            423876            2.79
power                    106440            0.70
fossil-fuel-operations    60804            0.40
mineral-extraction        19908            0.13
fluorinated-gases             0            0.00

❌ Null Values: 0

🔢 Number of Unique Sectors: 9

🌐 Unique Sectors:
agriculture, buildings, forestry-and-land-use, fossil-fuel-operations, manufacturing, mineral-extraction, power, transportation, waste

📝 Data Type: categ

### <span style="color: white; font-weight: bold; text-decoration: underline;">subsector</span>
The subsector is the economic subsector that the activity belongs to.

Data Type: `category`

**Status: <span style="color: green;">READY</span>**

In [9]:
# 🔬 Subsector Exploration

# Descriptive Statistics
print("📊 Subsector Description:")
subsector_desc = df['subsector'].describe().compute()
print(subsector_desc)

# Value Counts with Percentage
print("\n📈 Subsector Distribution:")
subsector_counts = df['subsector'].value_counts().compute()
subsector_percent = (df['subsector'].value_counts(normalize=True) * 100).compute()
subsector_summary = pd.DataFrame({
    'Count': subsector_counts, 
    'Percentage (%)': subsector_percent.round(2)
})
print(subsector_summary)

# Null Values
null_count = df['subsector'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['subsector'].nunique().compute()
print(f"\n🔢 Number of Unique Subsectors: {unique_count}")

# Unique Subsectors
unique_subsectors = df['subsector'].unique().compute()
sorted_unique_subsectors = sorted(unique_subsectors)
print("\n🌐 Unique Subsectors:")
print(", ".join(map(str, sorted_unique_subsectors)))

# Data Type
print(f"\n📝 Data Type: {df['subsector'].dtype}")

📊 Subsector Description:
unique                                       48
count                                  15184500
top       enteric-fermentation-cattle-operation
freq                                     932736
Name: subsector, dtype: object

📈 Subsector Distribution:
                                        Count  Percentage (%)
subsector                                                    
enteric-fermentation-cattle-operation  932736            6.14
manure-management-cattle-operation     932736            6.14
rice-cultivation                       721896            4.75
removals                               721668            4.75
residential-onsite-fuel-usage          721152            4.75
...                                       ...             ...
rock-quarrying                              0            0.00
sand-quarrying                              0            0.00
solid-fuel-transformation                   0            0.00
soil-organic-carbon                        

### <span style="color: white; font-weight: bold; text-decoration: underline;">original_inventory_sector</span>
The original inventory sector is the economic sector that the activity belongs to.

Data Type: `object`

**Status: <span style="color: red;">NOT READY</span>**

In [10]:
# 🔬 Original Inventory Sector Exploration

# Descriptive Statistics
print("📊 Original Inventory Sector Description:")
original_inventory_sector_desc = df['original_inventory_sector'].describe().compute()
print(original_inventory_sector_desc)

# Value Counts with Percentage
print("\n📈 Original Inventory Sector Distribution:")   
original_inventory_sector_counts = df['original_inventory_sector'].value_counts().compute()
original_inventory_sector_percent = (df['original_inventory_sector'].value_counts(normalize=True) * 100).compute()
original_inventory_sector_summary = pd.DataFrame({
    'Count': original_inventory_sector_counts, 
    'Percentage (%)': original_inventory_sector_percent.round(2)
})
print(original_inventory_sector_summary)

# Null Values
null_count = df['original_inventory_sector'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['original_inventory_sector'].nunique().compute()
print(f"\n🔢 Number of Unique Original Inventory Sectors: {unique_count}")

# Unique Original Inventory Sectors
unique_original_inventory_sectors = df['original_inventory_sector'].unique().compute()
sorted_unique_original_inventory_sectors = sorted(unique_original_inventory_sectors)
print("\n🌐 Unique Original Inventory Sectors:")
print(", ".join(map(str, sorted_unique_original_inventory_sectors)))

# Data Type
print(f"\n📝 Data Type: {df['original_inventory_sector'].dtype}")

📊 Original Inventory Sector Description:
count       0
unique      0
top       NaN
freq      NaN
Name: original_inventory_sector, dtype: object

📈 Original Inventory Sector Distribution:
Empty DataFrame
Index: []

❌ Null Values: 15184500

🔢 Number of Unique Original Inventory Sectors: 0

🌐 Unique Original Inventory Sectors:
<NA>

📝 Data Type: string


### <span style="color: white; font-weight: bold; text-decoration: underline;">start_time</span>
The start time of the activity.

Data Type: `datetime64[ns]`

**Status: <span style="color: green;">READY</span>**

In [11]:
# 🔬 Start Time Exploration

# Descriptive Statistics
print("📊 Start Time Description:")
start_time_desc = df['start_time'].describe().compute()
print(start_time_desc)

# Value Counts with Percentage
print("\n📈 Start Time Distribution:")   
start_time_counts = df['start_time'].value_counts().compute()
start_time_percent = (df['start_time'].value_counts(normalize=True) * 100).compute()
start_time_summary = pd.DataFrame({
    'Count': start_time_counts, 
    'Percentage (%)': start_time_percent.round(2)
})
print(start_time_summary)

# Null Values
null_count = df['start_time'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['start_time'].nunique().compute()
print(f"\n🔢 Number of Unique Start Times: {unique_count}")

# Unique Start Times
unique_start_times = df['start_time'].unique().compute()
sorted_unique_start_times = sorted(unique_start_times)
print("\n🌐 Unique Start Times:")
print(", ".join(map(str, sorted_unique_start_times)))

# Data Type
print(f"\n📝 Data Type: {df['start_time'].dtype}")

📊 Start Time Description:
count               15184500
min      2021-01-01 00:00:00
25%      2021-03-24 06:00:00
50%      2021-06-16 00:00:00
75%      2021-09-08 12:00:00
max      2021-12-01 00:00:00
Name: start_time, dtype: object

📈 Start Time Distribution:
                             Count  Percentage (%)
start_time                                        
2021-01-01 00:00:00+00:00  1265375            8.33
2021-02-01 00:00:00+00:00  1265375            8.33
2021-03-01 00:00:00+00:00  1265375            8.33
2021-04-01 00:00:00+00:00  1265375            8.33
2021-05-01 00:00:00+00:00  1265375            8.33
2021-06-01 00:00:00+00:00  1265375            8.33
2021-07-01 00:00:00+00:00  1265375            8.33
2021-08-01 00:00:00+00:00  1265375            8.33
2021-09-01 00:00:00+00:00  1265375            8.33
2021-10-01 00:00:00+00:00  1265375            8.33
2021-11-01 00:00:00+00:00  1265375            8.33
2021-12-01 00:00:00+00:00  1265375            8.33

❌ Null Values: 0

🔢 Numbe

### <span style="color: white; font-weight: bold; text-decoration: underline;">end_time</span>
The end time of the activity.

Data Type: `datetime64[ns]`

**Status: <span style="color: green;">READY</span>**

In [12]:
# 🔬 End Time Exploration

# Descriptive Statistics
print("📊 End Time Description:")
end_time_desc = df['end_time'].describe().compute()
print(end_time_desc)

# Value Counts with Percentage
print("\n📈 End Time Distribution:")   
end_time_counts = df['end_time'].value_counts().compute()
end_time_percent = (df['end_time'].value_counts(normalize=True) * 100).compute()
end_time_summary = pd.DataFrame({
    'Count': end_time_counts, 
    'Percentage (%)': end_time_percent.round(2)
})
print(end_time_summary)

# Null Values
null_count = df['end_time'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['end_time'].nunique().compute()
print(f"\n🔢 Number of Unique End Times: {unique_count}")

# Unique End Times
unique_end_times = df['end_time'].unique().compute()
sorted_unique_end_times = sorted(unique_end_times)
print("\n🌐 Unique End Times:")
print(", ".join(map(str, sorted_unique_end_times)))

# Data Type
print(f"\n📝 Data Type: {df['end_time'].dtype}")

📊 End Time Description:
count               15184500
min      2021-01-31 00:00:00
25%      2021-04-22 12:00:00
50%      2021-07-15 12:00:00
75%      2021-10-07 18:00:00
max      2021-12-31 00:00:00
Name: end_time, dtype: object

📈 End Time Distribution:
                             Count  Percentage (%)
end_time                                          
2021-01-31 00:00:00+00:00  1265375            8.33
2021-02-28 00:00:00+00:00  1265375            8.33
2021-03-31 00:00:00+00:00  1265375            8.33
2021-04-30 00:00:00+00:00  1265375            8.33
2021-05-31 00:00:00+00:00  1265375            8.33
2021-06-30 00:00:00+00:00  1265375            8.33
2021-07-31 00:00:00+00:00  1265375            8.33
2021-08-31 00:00:00+00:00  1265375            8.33
2021-09-30 00:00:00+00:00  1265375            8.33
2021-10-31 00:00:00+00:00  1265375            8.33
2021-11-30 00:00:00+00:00  1265375            8.33
2021-12-31 00:00:00+00:00  1265375            8.33

❌ Null Values: 0

🔢 Number of U

### <span style="color: white; font-weight: bold; text-decoration: underline;">temporal_granularity</span>
The temporal granularity of the activity.

Data Type: `category`

**Status: <span style="color: green;">READY</span>**

In [13]:
# 🔬 Temporal Granularity Exploration

# Descriptive Statistics
print("📊 Temporal Granularity Description:")
temporal_granularity_desc = df['temporal_granularity'].describe().compute()
print(temporal_granularity_desc)

# Value Counts with Percentage
print("\n📈 Temporal Granularity Distribution:")   
temporal_granularity_counts = df['temporal_granularity'].value_counts().compute()
temporal_granularity_percent = (df['temporal_granularity'].value_counts(normalize=True) * 100).compute()
temporal_granularity_summary = pd.DataFrame({
    'Count': temporal_granularity_counts, 
    'Percentage (%)': temporal_granularity_percent.round(2)
})
print(temporal_granularity_summary)

# Null Values
null_count = df['temporal_granularity'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['temporal_granularity'].nunique().compute()
print(f"\n🔢 Number of Unique Temporal Granularities: {unique_count}")

# Unique Temporal Granularities
unique_temporal_granularities = df['temporal_granularity'].unique().compute()
sorted_unique_temporal_granularities = sorted(unique_temporal_granularities)
print("\n🌐 Unique Temporal Granularities:")
print(", ".join(map(str, sorted_unique_temporal_granularities)))

# Data Type
print(f"\n📝 Data Type: {df['temporal_granularity'].dtype}")

📊 Temporal Granularity Description:
unique           1
count     15184500
top          month
freq      15184500
Name: temporal_granularity, dtype: object

📈 Temporal Granularity Distribution:
                         Count  Percentage (%)
temporal_granularity                          
month                 15184500           100.0
annual                       0             0.0
other                        0             0.0
week                         0             0.0
day                          0             0.0
hour                         0             0.0

❌ Null Values: 0

🔢 Number of Unique Temporal Granularities: 1

🌐 Unique Temporal Granularities:
month

📝 Data Type: category


### <span style="color: white; font-weight: bold; text-decoration: underline;">gas</span>
The gas type observed in the activity.

Data Type: `category`

**Status: <span style="color: green;">READY</span>**

In [14]:
# 🔬 Gas Exploration

# Descriptive Statistics
print("📊 Gas Description:")
gas_desc = df['gas'].describe().compute()
print(gas_desc)

# Value Counts with Percentage
print("\n📈 Gas Distribution:")   
gas_counts = df['gas'].value_counts().compute()
gas_percent = (df['gas'].value_counts(normalize=True) * 100).compute()
gas_summary = pd.DataFrame({
    'Count': gas_counts, 
    'Percentage (%)': gas_percent.round(2)
})
print(gas_summary)

# Null Values
null_count = df['gas'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['gas'].nunique().compute()
print(f"\n🔢 Number of Unique Gases: {unique_count}")

# Unique Gases
unique_gases = df['gas'].unique().compute()
sorted_unique_gases = sorted(unique_gases)
print("\n🌐 Unique Gases:")
print(", ".join(map(str, sorted_unique_gases)))

# Data Type
print(f"\n📝 Data Type: {df['gas'].dtype}")

📊 Gas Description:
unique           1
count     15184500
top            ch4
freq      15184500
Name: gas, dtype: object

📈 Gas Distribution:
               Count  Percentage (%)
gas                                 
ch4         15184500           100.0
co2                0             0.0
n2o                0             0.0
co2e_100yr         0             0.0

❌ Null Values: 0

🔢 Number of Unique Gases: 1

🌐 Unique Gases:
ch4

📝 Data Type: category


### <span style="color: white; font-weight: bold; text-decoration: underline;">emissions_quantity</span>
The emissions quantity of the activity.

Data Type: `float`

**Status: <span style="color: green;">READY</span>**

In [15]:
# 🔬 Emissions Quantity Exploration

# Descriptive Statistics
print("📊 Emissions Quantity Description:")
emissions_quantity_desc = df['emissions_quantity'].describe().compute()
print(emissions_quantity_desc)

# Value Counts with Percentage
print("\n📈 Top 10 Emissions Quantity Distribution:")   
emissions_quantity_counts = df['emissions_quantity'].value_counts().compute().loc[0:9]
emissions_quantity_percent = (df['emissions_quantity'].value_counts(normalize=True) * 100).compute().loc[0:9]
emissions_quantity_summary = pd.DataFrame({
    'Count': emissions_quantity_counts, 
    'Percentage (%)': emissions_quantity_percent.round(2)
})
print(emissions_quantity_summary)

# Null Values
null_count = df['emissions_quantity'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Top 10 Unique Value Counts
top_10_unique_emissions_quantities = df['emissions_quantity'].value_counts().compute().loc[0:9]
print(f"\n🔢 Top 10 Unique Emissions Quantities (Count): {top_10_unique_emissions_quantities}")

# Data Type
print(f"\n📝 Data Type: {df['emissions_quantity'].dtype}")

📊 Emissions Quantity Description:
count    1.518450e+07
mean     2.723283e+01
std      1.462092e+03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      4.167300e-01
max      1.006126e+06
Name: emissions_quantity, dtype: float64

📈 Top 10 Emissions Quantity Distribution:
                      Count  Percentage (%)
emissions_quantity                         
0.000000e+00        7777599           51.22
1.957801e-23              1            0.00
3.175042e-23              1            0.00
5.413141e-23              1            0.00
5.568285e-23              1            0.00
...                     ...             ...
8.999660e+00             12            0.00
8.999690e+00              1            0.00
8.999789e+00              1            0.00
8.999900e+00              3            0.00
8.999940e+00              3            0.00

[2076951 rows x 2 columns]

❌ Null Values: 0

🔢 Top 10 Unique Emissions Quantities (Count): emissions_quantity
0.000000e+00    777759

### <span style="color: white; font-weight: bold; text-decoration: underline;">emissions_factor</span>
The emissions factor of the activity.

Data Type: `float`

**Status: <span style="color: green;">READY</span>**

In [16]:
# 🔬 Emissions Factor Exploration

# Descriptive Statistics
print("📊 Emissions Factor Description:")
emissions_factor_desc = df['emissions_factor'].describe().compute()
print(emissions_factor_desc)

# Value Counts with Percentage
print("\n📈 Emissions Factor Distribution:")   
emissions_factor_counts = df['emissions_factor'].value_counts().compute()
emissions_factor_percent = (df['emissions_factor'].value_counts(normalize=True) * 100).compute()
emissions_factor_summary = pd.DataFrame({
    'Count': emissions_factor_counts, 
    'Percentage (%)': emissions_factor_percent.round(2)
})
print(emissions_factor_summary)

# Null Values
null_count = df['emissions_factor'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['emissions_factor'].nunique().compute()
print(f"\n🔢 Number of Unique Emissions Factors: {unique_count}")

# Unique Emissions Factors
unique_emissions_factors = df['emissions_factor'].unique().compute()
sorted_unique_emissions_factors = sorted(unique_emissions_factors)
print("\n🌐 Unique Emissions Factors:")
print(", ".join(map(str, sorted_unique_emissions_factors[:10])))

# Data Type
print(f"\n📝 Data Type: {df['emissions_factor'].dtype}")

📊 Emissions Factor Description:
count    1.517764e+07
mean     1.273095e-02
std      5.296254e+00
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      8.333250e-05
max      5.261260e+03
Name: emissions_factor, dtype: float64

📈 Emissions Factor Distribution:
                    Count  Percentage (%)
emissions_factor                         
0.000000e+00      7733656           50.95
3.866398e-24            3            0.00
3.897226e-24            3            0.00
3.901347e-24            3            0.00
3.930660e-24            3            0.00
...                   ...             ...
6.752092e+02           12            0.00
8.055111e+02           12            0.00
1.615010e+03           12            0.00
2.005900e+03           12            0.00
5.261260e+03           12            0.00

[1149272 rows x 2 columns]

❌ Null Values: 6864

🔢 Number of Unique Emissions Factors: 1149272

🌐 Unique Emissions Factors:
0.0, 2.730378404315253e-13, 2.077933666887432e-1

### <span style="color: white; font-weight: bold; text-decoration: underline;">capacity</span>
The gas type observed in the activity.

Data Type: `float`

**Status: <span style="color: red;">NOT READY</span>**

In [17]:
# 🔬 Capacity Exploration

# Descriptive Statistics
print("📊 Capacity Description:")
capacity_desc = df['capacity'].describe().compute()
print(capacity_desc)

# Value Counts with Percentage
print("\n📈 Capacity Distribution:")   
capacity_counts = df['capacity'].value_counts().compute()
capacity_percent = (df['capacity'].value_counts(normalize=True) * 100).compute()
capacity_summary = pd.DataFrame({
    'Count': capacity_counts, 
    'Percentage (%)': capacity_percent.round(2)
})
print(capacity_summary)

# Null Values
null_count = df['capacity'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['capacity'].nunique().compute()
print(f"\n🔢 Number of Unique Capacities: {unique_count}")

# Unique Capacities
unique_capacities = df['capacity'].unique().compute()[:10]
sorted_unique_capacities = sorted(unique_capacities)
print("\n🌐 Unique Capacities:")
print(", ".join(map(str, sorted_unique_capacities)))

# Data Type
print(f"\n📝 Data Type: {df['capacity'].dtype}")

📊 Capacity Description:
count    1.517764e+07
mean              inf
std               inf
min      0.000000e+00
25%      6.600000e+01
50%      2.244684e+02
75%      2.335450e+03
max               inf
Name: capacity, dtype: float64

📈 Capacity Distribution:
               Count  Percentage (%)
capacity                            
0.000000e+00  798324            5.26
5.483465e-19      12            0.00
4.921627e-17      12            0.00
5.000000e-16      12            0.00
7.000000e-16      12            0.00
...              ...             ...
7.532654e+09       1            0.00
7.540384e+09       1            0.00
7.558545e+09       1            0.00
7.570077e+09       1            0.00
inf               84            0.00

[1576685 rows x 2 columns]

❌ Null Values: 6864

🔢 Number of Unique Capacities: 1576685

🌐 Unique Capacities:
749.5274770230001, 1542.3470421077782, 2216.7504552943524, 2369.8285726801346, 2714.794356830439, 3159.3613240245086, 5301.96148042613, 6030.1539673625

### <span style="color: white; font-weight: bold; text-decoration: underline;">activity</span>
The gas type observed in the activity.

Data Type: `float`

**Status: <span style="color: green;">READY</span>**

In [18]:
# 🔬 Activity Exploration

# Descriptive Statistics
print("📊 Activity Description:")
activity_desc = df['activity'].describe().compute()
print(activity_desc)

# Value Counts with Percentage
print("\n📈 Activity Distribution:")   
activity_counts = df['activity'].value_counts().compute()
activity_percent = (df['activity'].value_counts(normalize=True) * 100).compute()
activity_summary = pd.DataFrame({
    'Count': activity_counts, 
    'Percentage (%)': activity_percent.round(2)
})
print(activity_summary)

# Null Values
null_count = df['activity'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['activity'].nunique().compute()
print(f"\n🔢 Number of Unique Activities: {unique_count}")

# Unique Activities
unique_activities = df['activity'].unique().compute()[:10]
sorted_unique_activities = sorted(unique_activities)
print("\n🌐 Unique Activities (only 10):")
print(", ".join(map(str, sorted_unique_activities)))

# Data Type
print(f"\n📝 Data Type: {df['activity'].dtype}")

📊 Activity Description:
count    1.517764e+07
mean     1.608541e+07
std      8.360473e+08
min      0.000000e+00
25%      1.524351e+01
50%      5.044590e+02
75%      6.038511e+04
max      7.462960e+11
Name: activity, dtype: float64

📈 Activity Distribution:
                Count  Percentage (%)
activity                             
0.000000e+00  2771942           18.26
8.512177e-21        1            0.00
1.380453e-20        1            0.00
2.353540e-20        1            0.00
2.420994e-20        1            0.00
...               ...             ...
4.291049e+11        3            0.00
5.764809e+11        3            0.00
6.522362e+11        3            0.00
7.332167e+11        3            0.00
7.462960e+11        3            0.00

[2852320 rows x 2 columns]

❌ Null Values: 6864

🔢 Number of Unique Activities: 2852320

🌐 Unique Activities (only 10):
53.49401588571072, 103.32163171484656, 221.69517881508847, 334.5638404520915, 451.34261362176113, 538.3059282818226, 712.0794886

### <span style="color: white; font-weight: bold; text-decoration: underline;">source_name</span>
The name of the data source.

Data Type: `string`

**Status: <span style="color: green;">READY</span>**

In [19]:
# 🔬 Source Name Exploration

# Descriptive Statistics
print("📊 Source Name Description:")
source_name_desc = df['source_name'].describe().compute()
print(source_name_desc)

# Value Counts with Percentage
print("\n📈 Source Name Distribution:")   
source_name_counts = df['source_name'].value_counts().compute()
source_name_percent = (df['source_name'].value_counts(normalize=True) * 100).compute()
source_name_summary = pd.DataFrame({
    'Count': source_name_counts, 
    'Percentage (%)': source_name_percent.round(2)
})
print(source_name_summary)

# Null Values
null_count = df['source_name'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['source_name'].nunique().compute()
print(f"\n🔢 Number of Unique Source Names: {unique_count}")

# Unique Source Names
unique_source_names = df['source_name'].unique().compute()[:10]
sorted_unique_source_names = sorted(unique_source_names)
print("\n🌐 Unique Source Names:")
print(", ".join(map(str, sorted_unique_source_names)))

# Data Type
print(f"\n📝 Data Type: {df['source_name'].dtype}")

📊 Source Name Description:
unique                    244016
count                   15087732
top       OpenStreetMap Landfill
freq                       27972
Name: source_name, dtype: object

📈 Source Name Distribution:
                                                                         
source_name                                                              
                                                                         
DAI - Sociedade de Desenvolvimento Agro-Indust...   Count  Percentage (%)
                                                                         
SOPRAGOL - Sociedade de Industrialização de Pr...      12             0.0
                                                       12             0.0
UNICER Cervejas S.A. - Centro de Produção de S...      12             0.0
!Karas                                                204             0.0
"BEL POLSKA" SPÓŁKA Z OGRANICZONĄ ODPOWIEDZIALN...     12             0.0
...                                    

### <span style="color: white; font-weight: bold; text-decoration: underline;">source_type</span>
The industry or service type of source.

Data Type: `string`

**Status: <span style="color: green;">READY</span>**

In [20]:
# 🔬 Source Type Exploration

# Descriptive Statistics
print("📊 Source Type Description:")
source_type_desc = df['source_type'].describe().compute()
print(source_type_desc)

# Value Counts with Percentage
print("\n📈 Source Type Distribution:")   
source_type_counts = df['source_type'].value_counts().compute()
source_type_percent = (df['source_type'].value_counts(normalize=True) * 100).compute()
source_type_summary = pd.DataFrame({
    'Count': source_type_counts, 
    'Percentage (%)': source_type_percent.round(2)
})
print(source_type_summary)

# Null Values
null_count = df['source_type'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['source_type'].nunique().compute()
print(f"\n🔢 Number of Unique Source Types: {unique_count}")

# Unique Source Types
unique_source_types = df['source_type'].unique().compute()
sorted_unique_source_types = sorted(unique_source_types)
print("\n🌐 Unique Source Types:")
print(", ".join(map(str, sorted_unique_source_types)))

# Data Type
print(f"\n📝 Data Type: {df['source_type'].dtype}")

📊 Source Type Description:
unique                                      1777
count                                    3444792
top       enteric_fermentation_maturedairycattle
freq                                      511800
Name: source_type, dtype: object

📈 Source Type Distribution:
                                                    Count  Percentage (%)
source_type                                                              
 Asphalt production                                   240            0.01
 Assembly | Coating | Cutting | Finishing | Foa...     12             0.0
 Assembly | Cutting | Dyeing | Embroidery | Fin...     12             0.0
 Assembly | Cutting | Embroidery | Finishing | ...     12             0.0
 Assembly | Cutting | Embroidery | Finishing | ...     24             0.0
...                                                   ...             ...
waste, other_fossil                                    12             0.0
weaving|dyeing|printing|cutting|sewing|finishin..

TypeError: boolean value of NA is ambiguous

### <span style="color: white; font-weight: bold; text-decoration: underline;">geometry_ref</span>
Geometry ref of the area being observed

Data Type: `string`

**Status: <span style="color: green;">READY</span>**

In [21]:
# 🔬 Geometry Reference Exploration

# Descriptive Statistics
print("📊 Geometry Reference Description:")
geometry_ref_desc = df['geometry_ref'].describe().compute()
print(geometry_ref_desc)

# Value Counts with Percentage
print("\n📈 Geometry Reference Distribution:")   
geometry_ref_counts = df['geometry_ref'].value_counts().compute()
geometry_ref_percent = (df['geometry_ref'].value_counts(normalize=True) * 100).compute()
geometry_ref_summary = pd.DataFrame({
    'Count': geometry_ref_counts, 
    'Percentage (%)': geometry_ref_percent.round(2)
})
print(geometry_ref_summary)

# Null Values
null_count = df['geometry_ref'].isnull().sum().compute()
print(f"\n❌ Null Values: {null_count}")

# Unique Values Analysis
unique_count = df['geometry_ref'].nunique().compute()
print(f"\n🔢 Number of Unique Geometry References: {unique_count}")

# Unique Geometry References
unique_geometry_ref = df['geometry_ref'].unique().compute()
sorted_unique_geometry_ref = sorted(unique_geometry_ref)[:10]
print("\n🌐 First 10 Unique Geometry References:")
print(", ".join(map(str, sorted_unique_geometry_ref)))

# Data Type
print(f"\n📝 Data Type: {df['geometry_ref'].dtype}")

📊 Geometry Reference Description:
unique             470371
count            15184500
top       gadm_CHL.15.1_1
freq                  216
Name: geometry_ref, dtype: object

📈 Geometry Reference Distribution:
                 Count  Percentage (%)
geometry_ref                          
gadm_ABW           144             0.0
gadm_AFG           192             0.0
gadm_AFG.1.10_1    204             0.0
gadm_AFG.1.11_1    204             0.0
gadm_AFG.1.12_1    204             0.0
...                ...             ...
trace_8555          12             0.0
trace_8556          12             0.0
trace_8560          12             0.0
trace_8561          12             0.0
trace_923           12             0.0

[470371 rows x 2 columns]

❌ Null Values: 0

🔢 Number of Unique Geometry References: 470371

🌐 First 10 Unique Geometry References:
gadm_ABW, gadm_AFG, gadm_AFG.1.10_1, gadm_AFG.1.11_1, gadm_AFG.1.12_1, gadm_AFG.1.13_1, gadm_AFG.1.1_1, gadm_AFG.1.2_1, gadm_AFG.1.3_1, gadm_AFG.1.4_1



In [None]:
del df