# Oil Consumption Ridge Machine Learning

In [1]:
# Import dependencies

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [2]:
# Read the csv files into pandas DataFrames and merge them on Year


def _us_world_columns(frame, label):
    """Keep Year plus the 'United States' and 'World' columns of `frame`,
    renamed to 'US <label>' and 'World <label>'."""
    return frame[["Year", "United States", "World"]].rename(
        columns={"United States": f"US {label}", "World": f"World {label}"}
    )


oil_cons = pd.read_csv('../data/clean_data_final/final_clean_data/Oil Consumption - Barrels-YearFixed-Python.csv')
oil_cons = oil_cons.rename(columns={"Total World": "World Barrels"})
pop = pd.read_csv('../data/clean_data/WorldPopulationbyYear.csv')
pop = pop.rename(columns={"World": "World Population"})
oil_pop = pop.merge(oil_cons, on="Year")

gdp = pd.read_csv('../data/clean_data_final/final_clean_data/GDP%-YearFixed-Python.csv', encoding = "ISO-8859-1")
filteredGDP = _us_world_columns(gdp, "GDP%")
inflation = pd.read_csv('../data/clean_data_final/final_clean_data/InflationAnnual%-YearFixed-Python.csv', encoding = "ISO-8859-1")
filteredInflation = _us_world_columns(inflation, "Inflation%")
goodsTax = pd.read_csv('../data/clean_data_final/final_clean_data/Taxes%-YearFixed-Python.csv', encoding = "ISO-8859-1")
filteredGoodsTax = _us_world_columns(goodsTax, "Goods Tax%")

# Merge dataframes on Year
gdp_oil_pop = oil_pop.merge(filteredGDP, on="Year")
gdp_oil_pop_infl = gdp_oil_pop.merge(filteredInflation, on="Year")
full_merge = gdp_oil_pop_infl.merge(filteredGoodsTax, on="Year")

# Drop the 2019 row - multiple features are set to 0 there; 2019 is predicted later.
# Filtering on Year (instead of the hard-coded row label 54) keeps this correct
# if the input files gain or lose rows. The remaining rows keep their labels.
full_merge = full_merge[full_merge["Year"] != 2019]

# Clean NaNs - change to 0 so columns are the same length.
# NOTE(review): the zero-filled cells are later fed to the scalers and the model
# as if they were real observations - confirm this is intended.
full_merge = full_merge.replace(np.nan, 0)
full_merge

Unnamed: 0,Year,World Population,Canada,Mexico,US,Total North America,Argentina,Brazil,Chile,Colombia,...,World Barrels,of which: OECD,Non-OECD,European Union #,US GDP%,World GDP%,US Inflation%,World Inflation%,US Goods Tax%,World Goods Tax%
0,1965,3322973367,1108,316,11522,12946,432,306,71,81,...,30771,23109,7662,7799,0.0,0.0,1.585169,0.0,0.0,0.0
1,1966,3393031801,1167,333,12100,13601,447,335,77,92,...,33137,24873,8264,8566,0.0,0.0,3.015075,0.0,0.0,0.0
2,1967,3462460201,1246,357,12567,14170,460,345,82,93,...,35503,26682,8821,9302,0.0,0.0,2.772786,0.0,0.0,0.0
3,1968,3532826854,1322,386,13405,15113,469,412,86,104,...,38420,28978,9442,10181,0.0,0.0,4.271796,0.0,0.0,0.0
4,1969,3607499991,1380,410,14153,15943,492,457,92,100,...,41727,31585,10142,11383,0.0,0.0,5.462386,0.0,0.0,0.0
5,1970,3682911039,1472,441,14710,16622,448,513,98,116,...,45313,34178,11135,12596,21.414736,26.911121,5.838255,0.0,0.0,0.0
6,1971,3760509002,1512,467,15223,17202,480,564,108,120,...,47886,35839,12047,13188,21.919818,26.526697,4.292767,0.0,0.0,0.0
7,1972,3836892580,1589,523,16381,18493,479,649,114,135,...,51430,38369,13061,14096,22.580622,26.164536,3.272278,0.0,7.143859,0.0
8,1973,3912347640,1682,564,17318,19564,483,797,110,131,...,55577,41284,14293,15129,23.331809,27.054172,6.17776,0.0,6.579487,0.0
9,1974,3988478324,1713,629,16631,18973,483,860,105,145,...,54790,39574,15217,14212,22.694942,27.896165,11.054805,0.0,5.990202,0.0


## Ridge

In [3]:
# Single Ridge estimator (L2 penalty strength alpha = 0.01); the forecast cells
# below refit this same object for every window.
model = Ridge(alpha=0.01)

## One-step Forecast

In [4]:
# Using 2000 - 2009 data to run historical prediction 2001 - 2010.
# For each target year the model is refit on the 21 rows before it and then
# asked to predict from the previous year's (scaled) features.

predict0110 = []

# Features, target and both scalers are identical for every target year, so
# they are built once here rather than on every loop iteration.
# X does not need .values.reshape(-1, 1): selecting 2+ columns is already 2-D.
hist_X = full_merge[["World Population", "World Inflation%", "World Goods Tax%", "World GDP%"]]
hist_y = full_merge["World Barrels"].values.reshape(-1, 1)
X_scaler = StandardScaler().fit(hist_X)
y_scaler = StandardScaler().fit(hist_y)
X_train_scaled = pd.DataFrame(X_scaler.transform(hist_X))
y_train_scaled = y_scaler.transform(hist_y)

for year in range(10):
    # Row position of the target year (row 36 holds 2001)
    i = 36 + year

    # Training window: the 21 rows immediately before the target row
    ridge = model.fit(X_train_scaled.iloc[(i-21):i], y_train_scaled[(i-21):i])

    # predict() expects a 2-D (1, n_features) input, hence reshape(1, -1)
    oil_predict = ridge.predict(X_train_scaled.iloc[i-1].values.reshape(1, -1))
    predict0110.append(oil_predict.flatten()[0])

# Invert predict0110 so it's not scaled for later comparison.
# inverse_transform is given a 2-D column because newer scikit-learn versions
# reject a 1-D list; ravel() restores the original 1-D result shape.
inv_predict0110 = y_scaler.inverse_transform(np.array(predict0110).reshape(-1, 1)).ravel()

print(inv_predict0110)

[75801.62159035 77038.68389848 78118.65238068 79554.1460939
 81806.87766185 83339.25129751 84942.31637668 86333.15602943
 85595.09849837 85509.7097041 ]


## Historical Prediction MSE and R-Square

In [5]:
# Score the Ridge model over every scaled row of the merged data.
# `ridge` is whatever the one-step forecast loop fitted last, and both metrics
# are in scaled units (the target is not inverse-transformed here).
# NOTE(review): this measures fit on the full table, not the error of the
# 2001 - 2010 one-step predictions - confirm that is the intended metric.

predicted = ridge.predict(X_train_scaled)

hist_mse = mean_squared_error(y_train_scaled, predicted)
# r2_score on the predictions is the same quantity ridge.score() reports
hist_r2 = r2_score(y_train_scaled, predicted)

print(f"Mean Squared Error (MSE): {hist_mse}")
print(f"R-squared (R2): {hist_r2}")

Mean Squared Error (MSE): 0.239359719222003
R-squared (R2): 0.760640280777997


## Historical Predictions

In [6]:
# Generate Historical Prediction table with difference to actual numbers

# .copy() makes this an independent frame, so the column assignments below
# neither write into a slice of full_merge nor raise SettingWithCopyWarning.
hist_pred_0110_df = full_merge.loc[full_merge['Year'].between(2001, 2010), ['Year', 'World Barrels']].copy()
hist_pred_0110_df["Prediction"] = inv_predict0110
# Signed error: positive means the model over-predicted consumption
hist_pred_0110_df["Difference"] = hist_pred_0110_df["Prediction"] - hist_pred_0110_df["World Barrels"]
# Same error expressed as a percentage of the actual value
hist_pred_0110_df["% Difference"] = (hist_pred_0110_df["Difference"]/hist_pred_0110_df["World Barrels"])*100
hist_pred_0110_df

Unnamed: 0,Year,World Barrels,Prediction,Difference,% Difference
36,2001,77366,75801.62159,-1564.37841,-2.022049
37,2002,78238,77038.683898,-1199.316102,-1.532907
38,2003,79908,78118.652381,-1789.347619,-2.23926
39,2004,82654,79554.146094,-3099.853906,-3.750398
40,2005,83891,81806.877662,-2084.122338,-2.484322
41,2006,84916,83339.251298,-1576.748702,-1.856833
42,2007,86100,84942.316377,-1157.683623,-1.34458
43,2008,85170,86333.156029,1163.156029,1.365687
44,2009,84083,85595.098498,1512.098498,1.79834
45,2010,86856,85509.709704,-1346.290296,-1.550026


## Save Historical Predictions to CSV

In [7]:
# Write the historical prediction table to CSV (row labels are not written)

hist_pred_csv = '../data/clean_data/oil_outputs/OilConsumption_Historical_Ridge_2001_2010.csv'
hist_pred_0110_df.to_csv(hist_pred_csv, index=False)

## Features' Rolling Average for 2019 - 2023

In [8]:
# Keep only the 2010 - 2018 rows and the columns used by the multi-step forecast

forecast_columns = ['Year',
                    'World Population',
                    'World Barrels',
                    'World Inflation%',
                    'World Goods Tax%',
                    'World GDP%']
in_2010_2018 = full_merge['Year'].between(2010, 2018)
multi_feat = full_merge.loc[in_2010_2018, forecast_columns]
multi_feat

Unnamed: 0,Year,World Population,World Barrels,World Inflation%,World Goods Tax%,World GDP%
45,2010,6921871614,86856,3.326345,31.87589,24.207113
46,2011,7002860604,87820,4.839403,33.264196,24.547417
47,2012,7085763408,88784,3.707818,33.271756,24.404915
48,2013,7169640142,90152,2.605818,32.787076,24.310278
49,2014,7254228377,90903,2.346269,33.191709,24.470283
50,2015,7338964960,92610,1.39333,33.724915,24.297531
51,2016,7424282488,94404,1.486007,34.248831,23.91364
52,2017,7509065705,96013,2.233522,33.333664,24.222791
53,2018,7591932907,97348,2.458142,34.011405,24.382773


In [9]:
# Iterate 5 times for 5 years (2019 - 2023) of rolling average of features.
# Each new year's features are the mean of the 5 rows before it, so from 2020
# onward the window includes previously generated rows.

# Start from the observed years only, so re-running this cell does not stack
# another set of generated rows on top of an earlier run.
multi_feat = multi_feat.loc[multi_feat['Year'] <= 2018]

rolling_columns = ['World Population', 'World Inflation%', 'World Goods Tax%', 'World GDP%']

for i in range(5):
    # With 9 observed rows (2010 - 2018), position 4 + i starts the 5-row
    # window that ends on the row just before the year being generated.
    starting_index = 4 + i
    new_year = 2019 + i

    window_means = {
        col: multi_feat[col].iloc[starting_index:starting_index+5].mean()
        for col in rolling_columns
    }

    # World Barrels is unknown for future years; 0 is a placeholder
    new_row = pd.DataFrame({"Year":[new_year],
                            "World Population":[window_means['World Population']],
                            "World Barrels":[0],
                            "World Inflation%":[window_means['World Inflation%']],
                            "World Goods Tax%":[window_means['World Goods Tax%']],
                            "World GDP%":[window_means['World GDP%']]})

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    multi_feat = pd.concat([multi_feat, new_row], ignore_index=True)

multi_feat

Unnamed: 0,Year,World Population,World Barrels,World Inflation%,World Goods Tax%,World GDP%
0,2010,6921872000.0,86856,3.326345,31.87589,24.207113
1,2011,7002861000.0,87820,4.839403,33.264196,24.547417
2,2012,7085763000.0,88784,3.707818,33.271756,24.404915
3,2013,7169640000.0,90152,2.605818,32.787076,24.310278
4,2014,7254228000.0,90903,2.346269,33.191709,24.470283
5,2015,7338965000.0,92610,1.39333,33.724915,24.297531
6,2016,7424282000.0,94404,1.486007,34.248831,23.91364
7,2017,7509066000.0,96013,2.233522,33.333664,24.222791
8,2018,7591933000.0,97348,2.458142,34.011405,24.382773
9,2019,7423695000.0,0,1.983454,33.702105,24.257404


## Multi-step Forecast

In [10]:
# Multi-step forecast for 2019 - 2023: every predicted (scaled) value is added
# to the target history and becomes training data for the next step.

future_predict = []
future_X = multi_feat[["World Population", "World Inflation%", "World Goods Tax%", "World GDP%"]]
future_y = multi_feat["World Barrels"].values.reshape(-1, 1)
X_scaler = StandardScaler().fit(future_X)
# We do not want to include the 0's after 2018, so the target scaler only
# sees rows [0:9] (end not inclusive), i.e. 2010 - 2018.
known_y = future_y[0:9]
y_scaler = StandardScaler().fit(known_y)
X_test_scaled = X_scaler.transform(future_X)
y_test_scaled = y_scaler.transform(known_y)
X_test_scaled_df = pd.DataFrame(X_test_scaled)
y_test_scaled_df = pd.DataFrame(y_test_scaled)

for year in range(5):
    i = 9 + year
    # y_test_scaled_df holds exactly i rows at this point (9 actuals plus one
    # prediction per earlier iteration), so [year:i] selects the same 9-row
    # window for X and y.
    ridge = model.fit(X_test_scaled_df.iloc[year:i], y_test_scaled_df.iloc[year:i])
    # Predict from the previous row's features; predict() expects 2-D input
    multi_predict2 = ridge.predict(X_test_scaled_df.iloc[i-1].values.reshape(1, -1))
    next_scaled = multi_predict2.flatten()[0]
    future_predict.append(next_scaled)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    y_test_scaled_df = pd.concat([y_test_scaled_df, pd.DataFrame([next_scaled])], ignore_index=True)

# Invert future_predict so it's not scaled for later comparison.
# inverse_transform is given a 2-D column because newer scikit-learn versions
# reject a 1-D list; ravel() restores the original 1-D result shape.
inv_future_predict = y_scaler.inverse_transform(np.array(future_predict).reshape(-1, 1)).ravel()

print(inv_future_predict)

[97149.36315256 94732.4187476  95284.78802541 95682.03839783
 95846.83906719]


In [11]:
# One-row summary table holding the historical MSE and R-Square for this model

summary_columns = ["Model", "Historical MSE", "Historical R-Square"]
data = [["Ridge", hist_mse, hist_r2]]

mse_r2_df = pd.DataFrame(data, columns=summary_columns)

mse_r2_df

Unnamed: 0,Model,Historical MSE,Historical R-Square
0,Ridge,0.23936,0.76064


In [12]:
# Write the MSE / R-Square summary table to CSV (row labels are not written)

mse_r2_csv = '../data/clean_data/oil_outputs/OilConsumption_MSE_R2_Ridge_Table.csv'
mse_r2_df.to_csv(mse_r2_csv, index=False)

## Future Prediction

In [13]:
# Generate Prediction table: one row per forecast year (2019 - 2023)

# .copy() makes this an independent frame, so adding the Prediction column
# neither writes into a slice of multi_feat nor raises SettingWithCopyWarning.
prediction_19_23 = multi_feat.loc[multi_feat['Year'].between(2019, 2023), ['Year']].copy()
# inv_future_predict holds the unscaled forecasts in year order
prediction_19_23["Prediction"] = inv_future_predict

prediction_19_23

Unnamed: 0,Year,Prediction
9,2019,97149.363153
10,2020,94732.418748
11,2021,95284.788025
12,2022,95682.038398
13,2023,95846.839067


## Push Future Predictions to CSV

In [14]:
# Write the 2019 - 2023 prediction table to CSV (row labels are not written)

future_pred_csv = '../data/clean_data/oil_outputs/OilConsumption_Future_Ridge_2019_2023.csv'
prediction_19_23.to_csv(future_pred_csv, index=False)