In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Read Data Function
def read_data(filename):
    """Load one combined CSV and return it as a cleaned DataFrame.

    Steps, in order: read the file, discard the 'Unnamed: 0',
    'Unnamed: 0.1' and 'Date' columns, discard the final row, discard
    every row that still has a missing value, then relabel the remaining
    columns by position.

    Parameters
    ----------
    filename : str or file-like
        Anything ``pd.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year', 'Population', 'CH4', 'CO2', 'GHG', 'HFC', 'N2O',
        'PFC', 'SF6'. The row index is whatever survives the row drops;
        it is not renumbered.
    """
    # Columns present in the file that are not carried forward
    unwanted = ['Unnamed: 0', 'Unnamed: 0.1', 'Date']

    # Labels assigned, left to right, to the columns that remain
    labels = ['Year', 'Population', 'CH4', 'CO2', 'GHG', 'HFC', 'N2O', 'PFC', 'SF6']

    frame = (
        pd.read_csv(filename)
        .drop(columns=unwanted)
        .iloc[:-1]   # everything except the last row
        .dropna()    # only fully populated rows
    )
    frame.columns = labels

    return frame

In [3]:
# Run Model (Machine Learning)
def run_model(clean, country=None):
    """Fit a linear regression of the shifted GHG column for one country.

    The model is trained on rows with 1990 < Year < 2010 and scored on rows
    with Year >= 2010. Train/test scores and one sample prediction are
    printed; predictions for every row are returned.

    Parameters
    ----------
    clean : pandas.DataFrame
        Frame with columns 'Year', 'Population', 'CH4', 'CO2', 'GHG',
        'HFC', 'N2O', 'PFC', 'SF6' (the shape ``read_data`` returns).
        It is not modified.
    country : optional
        Label written to the 'Country' column. When omitted, the
        module-level name ``country`` is used if it exists (the original
        call style), otherwise the column holds None.

    Returns
    -------
    pandas.DataFrame
        One row per input row with columns 'Year', 'Predicted', 'Actual',
        'Error' (Predicted - Actual) and 'Country'. The first row has no
        target, so its 'Actual' and 'Error' are NaN.
    """
    # Kept function-local, as in the original cell
    from sklearn.linear_model import LinearRegression

    features = ['Population', 'CH4', 'CO2', 'GHG', 'HFC', 'N2O', 'PFC', 'SF6']

    # Backward compatibility: older callers set a global instead of passing it
    if country is None:
        country = globals().get('country')

    # Work on a copy so the caller's frame does not gain a 'nextGHG' column
    clean = clean.copy()

    # Create Lag Column
    # NOTE(review): shift(1) gives each row the GHG value of the row above it.
    # If rows are in ascending year order that is the *previous* year's value,
    # despite the column name -- confirm the intended direction.
    clean['nextGHG'] = clean['GHG'].shift(1)

    # Rows without a target cannot be used to fit or score; the first row
    # always lacks one after the shift.
    labelled = clean.dropna(subset=['nextGHG'])

    # Set Training and Testing Split
    training = labelled.loc[(labelled['Year'] < 2010) & (labelled['Year'] > 1990)]
    testing = labelled.loc[labelled['Year'] >= 2010]

    # Set Variables
    X_train = training[features]
    y_train = training['nextGHG']
    X_test = testing[features]
    y_test = testing['nextGHG']
    X = clean[features]
    y = clean['nextGHG']

    # Linear Regression Model
    model = LinearRegression()

    # Train Model
    model.fit(X_train, y_train)

    # Score for 1991-2009
    print(f"Train Score: {model.score(X_train, y_train)}")

    # Score for 2010+
    print(f"Test Score: {model.score(X_test, y_test)}")

    # Predictions vs Actual
    predictions = model.predict(X)

    # ``predictions`` is a positional array, so read ``y`` by position too.
    # A label lookup (y[1]) breaks or mismatches once the index has gaps.
    actual = y.iloc[1]
    print(f"Actual: {actual}")
    print(f"Predicted: {predictions[1]}")
    print(f"Error: {predictions[1]-actual}")

    return pd.DataFrame({"Year": clean["Year"],
                         "Predicted": predictions,
                         "Actual": y,
                         "Error": predictions - y,
                         "Country": country})

In [4]:
# Run the model on every country CSV and gather the results into ``final``
data_dir = "Cleaning/Combined"

# Per-country result frames; combined once after the loop instead of
# growing a DataFrame row-block by row-block (DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call).
country_outputs = []

# os.listdir returns entries in arbitrary order; sort for a reproducible run
for filename in sorted(os.listdir(data_dir)):
    # Match the extension only, not '.csv' anywhere in the name
    if not filename.endswith(".csv"):
        continue

    print()

    # Must stay a module-level name: run_model reads the global ``country``
    country = os.path.splitext(filename)[0]

    print(country)

    clean = read_data(os.path.join(data_dir, filename))

    try:
        output = run_model(clean)
        country_outputs.append(output)

    except Exception as exc:
        # Keep going with the next country, but say what went wrong
        print(f"Error: {type(exc).__name__}: {exc}")

if country_outputs:
    final = pd.concat(country_outputs)
else:
    # Nothing succeeded: keep the expected schema so the export still works
    final = pd.DataFrame(columns=["Year", "Predicted", "Actual", "Error", "Country"])


Australia
Train Score: 0.9947648150053119
Test Score: -2.6955417426754202
Actual: 424998.38093939
Predicted: 425918.4488608485
Error: 920.0679214585107

Canada
Train Score: 0.9622328932953712
Test Score: -156.9229802443247
Actual: 603221.866919033
Predicted: 601348.1069212733
Error: -1873.7599977597129

Japan
Train Score: 0.9568427559650884
Test Score: -6.519073224123232
Actual: 516051.74218015705
Predicted: 512918.3581822196
Error: -3133.3839979374316

New Zealand
Train Score: 0.9028441122014138
Test Score: -1.7059415188546883
Actual: 220740.86736172801
Predicted: 223554.90338190235
Error: 2814.0360201743315

Russia
Train Score: 0.8972530296224805
Test Score: -3.376495883498513
Actual: 247994.300007144
Predicted: 223798.47887264687
Error: -24195.82113449712

Spain
Train Score: 0.47647984367963514
Test Score: -1.3750449569858185
Actual: 53778.8971016929
Predicted: 54546.571800282814
Error: 767.6746985899154

Switzerland
Train Score: 0.864415672045828
Test Score: -1.2258234223593965
Ac

In [5]:
# Export
# to_csv does not create missing folders, so make sure the target exists
os.makedirs('Resources', exist_ok=True)
final.to_csv('Resources/ML_GHG.csv')