## Training the model: XGBoost Integration

In [2]:
import pandas as pd

In [4]:
# Load the high-granularity emissions table; the relative path means the CSV must sit in the notebook's working directory
e_high = pd.read_csv('emissions_high_granularity.csv')

In [6]:
# Check out the basics of the dataset

In [8]:
# A bare expression is only rendered when it is the last line of a cell, so the
# shape has to be printed explicitly to appear alongside the preview below.
print(e_high.shape)  # 15797 rows, 16 columns
e_high.head()

Unnamed: 0,year,parent_entity,parent_type,reporting_entity,commodity,production_value,production_unit,product_emissions_MtCO2,flaring_emissions_MtCO2,venting_emissions_MtCO2,own_fuel_use_emissions_MtCO2,fugitive_methane_emissions_MtCO2e,fugitive_methane_emissions_MtCH4,total_operational_emissions_MtCO2e,total_emissions_MtCO2e,source
0,1962,Abu Dhabi National Oil Company,State-owned Entity,Abu Dhabi,Oil & NGL,0.9125,Million bbl/yr,0.338928,0.005404,0.001299,0.0,0.018254,0.000652,0.024957,0.363885,Abu Dhabi National Oil Company Annual Report 1...
1,1963,Abu Dhabi National Oil Company,State-owned Entity,Abu Dhabi,Oil & NGL,1.825,Million bbl/yr,0.677855,0.010808,0.002598,0.0,0.036508,0.001304,0.049914,0.72777,Abu Dhabi National Oil Company Annual Report 1...
2,1964,Abu Dhabi National Oil Company,State-owned Entity,Abu Dhabi,Oil & NGL,7.3,Million bbl/yr,2.711422,0.043233,0.010392,0.0,0.146033,0.005215,0.199657,2.911079,Abu Dhabi National Oil Company Annual Report 1...
3,1965,Abu Dhabi National Oil Company,State-owned Entity,Abu Dhabi,Oil & NGL,10.95,Million bbl/yr,4.067132,0.064849,0.015588,0.0,0.219049,0.007823,0.299486,4.366618,Abu Dhabi National Oil Company Annual Report 1...
4,1966,Abu Dhabi National Oil Company,State-owned Entity,Abu Dhabi,Oil & NGL,13.505,Million bbl/yr,5.01613,0.07998,0.019225,0.0,0.27016,0.009649,0.369366,5.385495,Abu Dhabi National Oil Company Annual Report 1...


In [10]:
# Discard fully identical rows, keeping the first occurrence of each
e_high = e_high.drop_duplicates(keep='first')

In [12]:
# Order the rows chronologically (earliest year first)
e_high = e_high.sort_values('year', ascending=True)

In [14]:
# The full parent names are too long for plot labels, so we build a lookup from each
# full `parent_entity` name to a shortened label that is used for plotting purposes only.
short_names = {
    'Westmoreland Mining': 'Westmoreland',
    'CONSOL Energy': 'CONSOL',
    'ExxonMobil': 'Exxon',
    'Shell': 'Shell',
    'Former Soviet Union': 'Soviet Union',
    'Anglo American': 'Anglo American',
    'Chevron': 'Chevron',
    'Poland': 'Poland',
    'BP': 'BP',
    'ConocoPhillips': 'ConocoPhillips',
    'National Iranian Oil Co.': 'NIOC',
    'China (Cement)': 'China Cement',
    'TotalEnergies': 'Total',
    'Czechoslovakia': 'Czech.',
    'Pemex': 'Pemex',
    'Saudi Aramco': 'Saudi Aramco',
    'Marathon Oil': 'Marathon Oil',
    'Kiewit Mining Group': 'Kiewit Mining',
    'China (Coal)': 'China Coal',
    'Peabody Coal Group': 'Peabody Coal',
    'Occidental Petroleum': 'Occidental',
    'Kuwait Petroleum Corp.': 'Kuwait Pet.',
    'Singareni Collieries': 'Singareni Collieries',
    'British Coal Corporation': 'British Coal',
    'North American Coal': 'NorthA Coal',
    'Eni': 'Eni',
    'Sasol': 'Sasol',
    'BASF': 'BASF',
    'Petrobras': 'Petrobras',
    'BHP': 'BHP',
    'ONGC India': 'ONGC',
    'Hess Corporation': 'Hess',
    'Pertamina': 'Pertamina',
    'Rio Tinto': 'Rio Tinto',
    'Egyptian General Petroleum': 'Egypt Pet.',
    'Sonangol': 'Sonangol',
    'Nigerian National Petroleum Corp.': 'NNPC',
    'Sonatrach': 'Sonatrach',
    'QatarEnergy': 'QatarEnergy',
    'Petroleos de Venezuela': 'PDVSA',
    'Iraq National Oil Company': 'Iraq NOC',
    'Libya National Oil Corp.': 'Libya NOC',
    'Abu Dhabi National Oil Company': 'ADNOC',
    'North Korea': 'North Korea',
    'Repsol': 'Repsol',
    'RWE': 'RWE',
    'Petroleum Development Oman': 'Oman Pet.',
    'Syrian Petroleum': 'Syrian Pet.',
    'Bapco Energies': 'Bapco',
    'Cyprus AMAX Minerals': 'Cyprus AMAX',
    'Woodside Energy': 'Woodside',
    'Equinor': 'Equinor',
    'Arch Resources': 'Arch Resources',
    'Coal India': 'Coal India',
    'Petronas': 'Petronas',
    'Taiheiyo Cement': 'Taiheiyo',
    'Vistra': 'Vistra',
    'Alpha Metallurgical Resources': 'Alpha Met.',
    'Murphy Oil': 'Murphy Oil',
    'APA Corporation': 'APA',
    'YPF': 'YPF',
    'Ovintiv': 'Ovintiv',
    'Ecopetrol': 'Ecopetrol',
    'Suncor Energy': 'Suncor',
    'CNOOC': 'CNOOC',
    'Cenovus Energy': 'Cenovus',
    'Devon Energy': 'Devon',
    'American Consolidated Natural Resources': 'ACNR',
    'Southwestern Energy': 'Southwestern',
    'CNPC': 'CNPC',
    'Gazprom': 'Gazprom',
    'Canadian Natural Resources': 'CNRL',
    'Heidelberg Materials': 'Heidelberg',
    'Cemex': 'Cemex',
    'Rosneft': 'Rosneft',
    'Bumi Resources': 'Bumi',
    'Holcim Group': 'Holcim',
    'Santos': 'Santos',
    'Coterra Energy': 'Coterra',
    'PetroEcuador': 'PetroEcuador',
    'PTTEP': 'PTTEP',
    'EOG Resources': 'EOG',
    'Kazakhstan': 'Kazakhstan',
    'Ukraine': 'Ukraine',
    'EQT Corporation': 'EQT',
    'TurkmenGaz': 'TurkmenGaz',
    'Russian Federation': 'Russia',
    'Adaro Energy': 'Adaro',
    'Czech Republic': 'Czech Republic',
    'Slovakia': 'Slovakia',
    'Chesapeake Energy': 'Chesapeake',
    'SM Energy': 'SM Energy',
    'Pioneer Natural Resources': 'Pioneer',
    'UK Coal': 'UK Coal',
    'Alliance Resource Partners': 'Alliance',
    'Obsidian Energy': 'Obsidian',
    'Lukoil': 'Lukoil',
    'OMV Group': 'OMV',
    'Banpu': 'Banpu',
    'Exxaro Resources Ltd': 'Exxaro',
    'Glencore': 'Glencore',
    'Teck Resources': 'Teck',
    'Orlen': 'Orlen',
    'Naftogaz': 'Naftogaz',
    'Sinopec': 'Sinopec',
    'Petoro': 'Petoro',
    'Surgutneftegas': 'Surgut',
    'Tullow Oil': 'Tullow',
    'Wolverine Fuels': 'Wolverine',
    'CRH': 'CRH',
    'Novatek': 'Novatek',
    'Inpex': 'Inpex',
    'Continental Resources': 'Continental',
    'Whitehaven Coal': 'Whitehaven',
    'Vale': 'Vale',
    'Cloud Peak': 'Cloud Peak',
    'Tourmaline Oil': 'Tourmaline',
    'Antero': 'Antero',
    'Adani Enterprises': 'Adani',
    'Navajo Transitional Energy Company': 'Navajo',
    'CNX Resources': 'CNX',
    'Seriti Resources': 'Seriti'
}


# Translate each parent to its short plotting label; any parent missing from
# the lookup is bucketed under 'Other'.
e_high['parent_short'] = e_high['parent_entity'].map(short_names).fillna('Other')
print(e_high[['parent_entity', 'parent_short']])


                    parent_entity    parent_short
15096         Westmoreland Mining    Westmoreland
15098         Westmoreland Mining    Westmoreland
15097         Westmoreland Mining    Westmoreland
15101         Westmoreland Mining    Westmoreland
15100         Westmoreland Mining    Westmoreland
...                           ...             ...
5543                 Devon Energy           Devon
5542                 Devon Energy           Devon
5341               Czech Republic  Czech Republic
205    Alliance Resource Partners        Alliance
15796                         YPF             YPF

[15797 rows x 2 columns]


### ML Model Pipeline starts here!

#### Feature Engineering with Scikit-Learn

In [74]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Regression target and the model inputs (column order kept as originally listed)
target = 'total_emissions_MtCO2e'
features = [
    'year', 'parent_entity',
    'parent_type', 'commodity',
    'product_emissions_MtCO2',
    'flaring_emissions_MtCO2', 'venting_emissions_MtCO2',
    'total_operational_emissions_MtCO2e',
]

# Sanity check: report any needed column that the frame lacks (comment out later)
required_columns = [*features, target]
missing_columns = [name for name in required_columns if name not in e_high.columns]
print(
    f"Missing columns: {missing_columns}"
    if missing_columns
    else "All required columns are present."
)

# Separate the predictors from the value being predicted
X, y = e_high[features], e_high[target]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column groups routed to each preprocessing branch (year is treated as numeric here)
categorical_features = ['parent_entity', 'parent_type', 'commodity']
numeric_features = ['year', 'product_emissions_MtCO2', 'flaring_emissions_MtCO2',
                    'venting_emissions_MtCO2', 'total_operational_emissions_MtCO2e']

# Numeric branch: standardize each column
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encode, ignoring categories not seen while fitting
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group through its branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit the transformers on the training rows only, then apply them to the test rows
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)


All required columns are present.


#### Build the Neural Network with Tensorflow Keras

In [66]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.16.2-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow)
  Using cached flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow)
  Downloading h5py-3.11.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.5 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting ml-dtypes~=0.3.1 (from tensorflow

In [221]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling repeat from run to run. Without this the
# test MAE moved between 0.837 and 5.77 on otherwise identical runs (see log below).
SEED = 42
set_random_seed(SEED)

# Ensure 'e_high' DataFrame is correctly loaded and processed
# For example: e_high = pd.read_csv('your_file.csv')

# Clean column names (strip stray leading/trailing whitespace)
e_high.columns = e_high.columns.str.strip()

# Print columns to check
print("Columns in DataFrame:", e_high.columns)

# Define features and target
# NOTE(review): in the rows shown by head(), total_emissions_MtCO2e equals
# product_emissions_MtCO2 + total_operational_emissions_MtCO2e, so the target may be
# an exact sum of two of the features -- confirm whether that is intended.
features = ['year', 'parent_entity', 'parent_type', 'commodity', 
            'product_emissions_MtCO2', 'flaring_emissions_MtCO2', 
            'venting_emissions_MtCO2', 'total_operational_emissions_MtCO2e']
target = 'total_emissions_MtCO2e'

# Check if all required columns are present
required_columns = features + [target]
missing_columns = [col for col in required_columns if col not in e_high.columns]
if missing_columns:
    print(f"Missing columns: {missing_columns}")
else:
    print("All required columns are present.")

# Drop rows with missing values in features or target
e_high = e_high.dropna(subset=required_columns)

# Split data into features and target
X = e_high[features]
y = e_high[target]

# Split data into training and testing sets (80/20, repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Define preprocessing for numeric and categorical features.
# 'year' is one-hot encoded here rather than scaled (see experiment log below).
numeric_features = ['product_emissions_MtCO2', 'flaring_emissions_MtCO2', 
                     'venting_emissions_MtCO2', 'total_operational_emissions_MtCO2e']
categorical_features = ['year', 'parent_entity', 'parent_type', 'commodity']

# Standardize numeric features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# One-hot encode categorical features; categories unseen at fit time are ignored
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the training and testing data (fit on train only, then apply to test)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Define the neural network model.
# An explicit Input layer replaces `input_dim=` on the first Dense layer, which
# Keras 3 warns against for Sequential models; the architecture is unchanged.
model = Sequential([
    Input(shape=(X_train_preprocessed.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='linear')  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model; 20% of the training rows are held back for validation each epoch
history = model.fit(X_train_preprocessed, y_train, epochs=100, batch_size=55, validation_split=0.2, verbose=1)

# Evaluate the model on the held-out test set
loss, mae = model.evaluate(X_test_preprocessed, y_test, verbose=0)
print(f"Mean Absolute Error on Test Set: {mae}")


# ---- Experiment log ----
# The MAE values below were recorded before the random seed was fixed, so they
# include run-to-run noise and will not be reproduced exactly.

# First run, MAE on Test Set is 0.837
# Second run, MAE on Test Set is 5.77
# Third run, MAE on Test Set is 1.19


# Variance is too high, so we will try to lower the amount of features used.
previous_features = ['year', 'parent_entity', 'parent_type', 'commodity', 
            'product_emissions_MtCO2', 'flaring_emissions_MtCO2', 
            'venting_emissions_MtCO2', 'total_operational_emissions_MtCO2e']
updated_features = ['year', 'parent_entity', 'parent_type', 'commodity']

# First run, MAE on Test Set is 36.8
# Second run, MAE on Test Set is 39.9
# Lowering the features made MAE worse. Reset features back to original ones.


# First run, MAE on Test Set is 68.66
# Second run, MAE on Test Set is -

# MAE has increased drastically. Change year to be categorical rather than numerical.
# First run, MAE on Test Set is 40.41
# Second run, MAE on Test Set is 39.62

# Try adding more layers to the neural network.
# First run, MAE on Test Set is 39.51
# Second run, MAE on Test Set is 37.39


# Lowered batch size from 32 to 16
# MAE on Test Set is 36.0

# Lower epochs to 50
# MAE on Test Set is 44.88

# Put layers back to 3 and batch size to 32 and epochs to 150
# MAE on Test Set is 39.14

# Reset to original model because yikes!
# First run, MAE on Test Set is 1.9
# Second run, MAE on Test Set is 1.62
# Third run, MAE on Test Set is 2.18

# Lower to two layers
# First run, MAE on Test Set is 1.44
# Second run, MAE on Test Set is 1.61
# Third run, MAE on Test Set is 1.11

# 150 epochs, batch size 48 -> MAE -> 1.24 -> 1.01 -> 1.03
# Adding an extra layer brought MAE to 1.42.

# 100 epochs, batch size 55 got us under 1.0! Hooray!


# Persist the trained network in the native Keras format.
# NOTE(review): only the network is written here; the fitted `preprocessor` is not
# saved in this cell, and any code that loads this file for inference will need the
# same fitted transformer -- confirm it is persisted elsewhere.
model.save('carbon_model.keras')

Columns in DataFrame: Index(['year', 'parent_entity', 'parent_type', 'reporting_entity', 'commodity',
       'production_value', 'production_unit', 'product_emissions_MtCO2',
       'flaring_emissions_MtCO2', 'venting_emissions_MtCO2',
       'own_fuel_use_emissions_MtCO2', 'fugitive_methane_emissions_MtCO2e',
       'fugitive_methane_emissions_MtCH4',
       'total_operational_emissions_MtCO2e', 'total_emissions_MtCO2e',
       'source', 'parent_short'],
      dtype='object')
All required columns are present.
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 974us/step - loss: 80184.4453 - mae: 81.6014 - val_loss: 39975.8398 - val_mae: 64.6910
Epoch 2/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 639us/step - loss: 24886.5078 - mae: 49.8249 - val_loss: 8233.5400 - val_mae: 29.1407
Epoch 3/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 618us/step - loss: 6148.9658 - mae: 25.9187 - val_loss: 1676.3706 - val_mae: 16.4281
Epoch 4/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 633us/step - loss: 1306.8154 - mae: 14.6957 - val_loss: 679.7532 - val_mae: 13.2569
Epoch 5/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 605us/step - loss: 532.7264 - mae: 11.9577 - val_loss: 449.0153 - val_mae: 10.8487
Epoch 6/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 608us/step - loss: 356.6356 - mae: 10.0139 - val_loss: 326.5858 - val_mae: 9.1961
Epoch 7/100
[1m184/184[0m [32m━━━━

#### Save the model for Django Implementation