# Oil Production Lasso Machine Learning

In [1]:
# Import dependencies

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

In [2]:
# Load the source CSVs and merge them on Year into one modelling table, `full_merge`.

# World oil production (barrels) and world population
oil_prod = pd.read_csv('../data/clean_data_final/final_clean_data/Oil Production - Barrels-YearFixed-Python.csv')
oil_prod = oil_prod.rename(columns={"Total World": "World Barrels"})
pop = pd.read_csv('../data/clean_data/WorldPopulationbyYear.csv')
pop = pop.rename(columns={"World": "World Population"})
oil_pop = pop.merge(oil_prod, on="Year")

# GDP%, inflation% and goods-tax% files: keep only the US and World series,
# renamed so the columns stay distinguishable after merging
gdp = pd.read_csv('../data/clean_data_final/final_clean_data/GDP%-YearFixed-Python.csv', encoding = "ISO-8859-1")
filteredGDP = gdp[["Year", "United States", "World"]]
filteredGDP = filteredGDP.rename(columns={"United States": "US GDP%", "World": "World GDP%"})
inflation = pd.read_csv('../data/clean_data_final/final_clean_data/InflationAnnual%-YearFixed-Python.csv', encoding = "ISO-8859-1")
filteredInflation = inflation[["Year", "United States", "World"]]
filteredInflation = filteredInflation.rename(columns={"United States": "US Inflation%", "World": "World Inflation%"})
goodsTax = pd.read_csv('../data/clean_data_final/final_clean_data/Taxes%-YearFixed-Python.csv', encoding = "ISO-8859-1")
filteredGoodsTax = goodsTax[["Year", "United States", "World"]]
filteredGoodsTax = filteredGoodsTax.rename(columns={"United States": "US Goods Tax%", "World": "World Goods Tax%"})

# Merge dataframes on Year (default inner join: only years present in every file survive)
gdp_oil_pop = oil_pop.merge(filteredGDP, on="Year")
gdp_oil_pop_infl = gdp_oil_pop.merge(filteredInflation, on="Year")
full_merge = gdp_oil_pop_infl.merge(filteredGoodsTax, on="Year")

# Drop the 2019 row - multiple features are set to 0 there; 2019 is predicted later.
# Selecting by Year instead of the hard-coded index label 54 keeps this correct
# even if the merged row order or row count changes. Index labels are preserved.
full_merge = full_merge[full_merge["Year"] != 2019]

# Clean NaNs - change to 0 so every column has a value for every year.
# NOTE(review): the zero-filled gaps are later fed to the scalers/models as if
# they were real observations - confirm this is acceptable for the features used.
full_merge = full_merge.fillna(0)
full_merge

Unnamed: 0,Year,World Population,Canada,Mexico,US,Total North America,Argentina,Brazil,Colombia,Ecuador,...,Non-OECD,OPEC,Non-OPEC,European Union #,US GDP%,World GDP%,US Inflation%,World Inflation%,US Goods Tax%,World Goods Tax%
0,1965,3322973367,920,362,9014,10296,276,96,203,8,...,20977,13709,18083,699,0.0,0.0,1.585169,0.0,0.0,0.0
1,1966,3393031801,1012,370,9579,10961,293,117,199,7,...,23083,15118,19443,697,0.0,0.0,3.015075,0.0,0.0,0.0
2,1967,3462460201,1106,411,10219,11736,319,147,192,6,...,24831,16118,20986,705,0.0,0.0,2.772786,0.0,0.0,0.0
3,1968,3532826854,1194,439,10600,12233,348,161,176,5,...,27623,17992,22426,702,0.0,0.0,4.271796,0.0,0.0,0.0
4,1969,3607499991,1306,461,10828,12595,362,176,214,4,...,30479,20010,23639,698,0.0,0.0,5.462386,0.0,0.0,0.0
5,1970,3682911039,1473,487,11297,13257,399,167,226,4,...,34123,22527,25548,694,21.414736,26.911121,5.838255,0.0,0.0,0.0
6,1971,3760509002,1582,486,11156,13224,432,175,224,4,...,36741,24335,26450,677,21.919818,26.526697,4.292767,0.0,0.0,0.0
7,1972,3836892580,1829,506,11185,13520,444,171,203,78,...,39177,25936,27618,668,22.580622,26.164536,3.272278,0.0,7.143859,0.0
8,1973,3912347640,2114,525,10946,13585,434,174,192,209,...,44022,29648,28904,674,23.331809,27.054172,6.17776,0.0,6.579487,0.0
9,1974,3988478324,1993,653,10461,13107,423,182,175,177,...,44616,29458,29213,686,22.694942,27.896165,11.054805,0.0,5.990202,0.0


## Lasso

In [3]:
# Single Lasso regressor (L1 penalty weight alpha = 0.01); the forecast loops
# further down call .fit() on this same instance for every window.
model = Lasso(alpha=0.01)

## One-step Forecast

In [4]:
# Rolling one-step forecasts of World Barrels for 2001 - 2010.
# For each target year the Lasso is refit on the 21 rows that precede it
# (1980 - 2000 for the 2001 forecast) and then asked to predict from the
# feature row of the last year in that window.
# NOTE(review): the prediction input (row i-1) is part of the training window,
# so each "forecast" is the fitted value for the previous year - confirm that
# this is the intended one-step scheme.

feature_cols = ["World Population", "World Inflation%", "World Goods Tax%", "World GDP%"]
first_forecast_year = 2001
n_forecasts = 10      # 2001 .. 2010
train_window = 21     # number of yearly rows used for each fit

# Scaling does not depend on the loop variable, so do it once up front.
# hist_X already has 2 dimensions (several feature columns); only y needs a reshape.
hist_X = full_merge[feature_cols]
hist_y = full_merge["World Barrels"].values.reshape(-1, 1)
X_scaler = StandardScaler().fit(hist_X)
y_scaler = StandardScaler().fit(hist_y)
X_train_scaled = pd.DataFrame(X_scaler.transform(hist_X))
y_train_scaled = y_scaler.transform(hist_y)

# Positional row of the first forecast year (36 for the data shown in this notebook)
first_pos = int(np.flatnonzero(full_merge["Year"].to_numpy() == first_forecast_year)[0])
assert first_pos >= train_window, "not enough history before the first forecast year"

predict0110 = []

for year in range(n_forecasts):
    i = first_pos + year

    lasso = model.fit(X_train_scaled.iloc[(i - train_window):i], y_train_scaled[(i - train_window):i])

    # predict() expects a 2-D (1, n_features) input for a single sample
    oil_predict = lasso.predict(X_train_scaled.iloc[i - 1].values.reshape(1, -1))
    predict0110.append(oil_predict.flatten()[0])

# Undo the y scaling for later comparison with actual barrels.
# inverse_transform requires a 2-D column on current scikit-learn; ravel() returns
# the result to the 1-D shape the later cells expect.
inv_predict0110 = y_scaler.inverse_transform(np.array(predict0110).reshape(-1, 1)).ravel()

print(inv_predict0110)

[72673.27300527 74393.7572528  74925.50013368 76198.89504011
 77899.66861117 79328.39321496 80403.77425169 81434.97519581
 82241.50241531 82773.61954644]


## Historical Prediction MSE and R-Square

In [5]:
# Score the Lasso left over from the one-step forecast cell (i.e. the fit from its
# final loop iteration) against every row of the scaled data. Both metrics are
# therefore in standardized units, not barrels.
# NOTE(review): this measures that single fit over all rows rather than the
# accuracy of the ten rolling forecasts - confirm this is the metric wanted.

predicted = lasso.predict(X_train_scaled)

hist_mse = mean_squared_error(y_train_scaled, predicted)
# r2_score on the predictions above is the same R^2 that lasso.score() reports
hist_r2 = r2_score(y_train_scaled, predicted)

print(f"Mean Squared Error (MSE): {hist_mse}")
print(f"R-squared (R2): {hist_r2}")

Mean Squared Error (MSE): 0.08711018419467867
R-squared (R2): 0.9128898158053215


## Historical Predictions

In [6]:
# Build the 2001 - 2010 comparison table: actual barrels, forecast, and the
# absolute / percentage gap between them.

# .copy() makes the selection an independent frame, so adding columns below
# does not operate on a slice of full_merge (avoids SettingWithCopyWarning).
hist_pred_0110_df = full_merge.loc[full_merge['Year'].between(2001, 2010), ['Year', 'World Barrels']].copy()

# np.ravel accepts the forecasts whether they arrive 1-D or as an (n, 1) column
hist_pred_0110_df["Prediction"] = np.ravel(inv_predict0110)
hist_pred_0110_df["Difference"] = hist_pred_0110_df["Prediction"] - hist_pred_0110_df["World Barrels"]
# Percentage error relative to actual production, reusing the Difference column
hist_pred_0110_df["% Difference"] = (hist_pred_0110_df["Difference"] / hist_pred_0110_df["World Barrels"]) * 100
hist_pred_0110_df

Unnamed: 0,Year,World Barrels,Prediction,Difference,% Difference
36,2001,74943,72673.273005,-2269.726995,-3.028604
37,2002,74145,74393.757253,248.757253,0.335501
38,2003,77305,74925.500134,-2379.499866,-3.078067
39,2004,80979,76198.89504,-4780.10496,-5.902895
40,2005,81952,77899.668611,-4052.331389,-4.944762
41,2006,82711,79328.393215,-3382.606785,-4.08967
42,2007,82570,80403.774252,-2166.225748,-2.623502
43,2008,83234,81434.975196,-1799.024804,-2.161406
44,2009,81578,82241.502415,663.502415,0.813335
45,2010,83409,82773.619546,-635.380454,-0.761765


## Save Historical Predictions to CSV

In [7]:
# Write the historical prediction table to CSV (row index omitted)

hist_pred_csv = '../data/clean_data/oil_outputs/OilProduction_Historical_LassoModel_2001_2010.csv'
hist_pred_0110_df.to_csv(hist_pred_csv, index=False)

## Features' Rolling Average for 2019 - 2023

In [8]:
# Keep only the 2010 - 2018 rows and the columns the multi-step forecast uses

forecast_columns = [
    'Year',
    'World Population',
    'World Barrels',
    'World Inflation%',
    'World Goods Tax%',
    'World GDP%',
]
in_2010_2018 = full_merge['Year'].between(2010, 2018)
multi_feat = full_merge.loc[in_2010_2018, forecast_columns]
multi_feat

Unnamed: 0,Year,World Population,World Barrels,World Inflation%,World Goods Tax%,World GDP%
45,2010,6921871614,83409,3.326345,31.87589,24.207113
46,2011,7002860604,84157,4.839403,33.264196,24.547417
47,2012,7085763408,86366,3.707818,33.271756,24.404915
48,2013,7169640142,86794,2.605818,32.787076,24.310278
49,2014,7254228377,88910,2.346269,33.191709,24.470283
50,2015,7338964960,91733,1.39333,33.724915,24.297531
51,2016,7424282488,92072,1.486007,34.248831,23.91364
52,2017,7509065705,92798,2.233522,33.333664,24.222791
53,2018,7591932907,95254,2.458142,34.011405,24.382773


In [9]:
# Extend multi_feat with 5 synthetic rows (2019 - 2023). Each new row's features
# are the mean of the previous 5 rows (so later years average over earlier
# synthetic rows too); World Barrels is set to 0 as a placeholder to be predicted.

rolling_feature_cols = ["World Population", "World Inflation%", "World Goods Tax%", "World GDP%"]
last_observed_year = 2018
n_future_years = 5
rolling_window = 5

# Start from the observed rows only, so re-running this cell rebuilds the same
# 5 rows instead of stacking further years on top of a previous run.
multi_feat = multi_feat.loc[multi_feat["Year"] <= last_observed_year].reset_index(drop=True)

for _ in range(n_future_years):
    # Mean of each feature over the most recent `rolling_window` rows
    window_means = multi_feat[rolling_feature_cols].tail(rolling_window).mean()

    next_row = {"Year": int(multi_feat["Year"].iloc[-1]) + 1, "World Barrels": 0}
    next_row.update(window_means.to_dict())

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
    multi_feat = pd.concat([multi_feat, pd.DataFrame([next_row])], ignore_index=True)

multi_feat

Unnamed: 0,Year,World Population,World Barrels,World Inflation%,World Goods Tax%,World GDP%
0,2010,6921872000.0,83409,3.326345,31.87589,24.207113
1,2011,7002861000.0,84157,4.839403,33.264196,24.547417
2,2012,7085763000.0,86366,3.707818,33.271756,24.404915
3,2013,7169640000.0,86794,2.605818,32.787076,24.310278
4,2014,7254228000.0,88910,2.346269,33.191709,24.470283
5,2015,7338965000.0,91733,1.39333,33.724915,24.297531
6,2016,7424282000.0,92072,1.486007,34.248831,23.91364
7,2017,7509066000.0,92798,2.233522,33.333664,24.222791
8,2018,7591933000.0,95254,2.458142,34.011405,24.382773
9,2019,7423695000.0,0,1.983454,33.702105,24.257404


## Multi-step Forecast

In [10]:
# Multi-step forecast of World Barrels for the 5 synthetic years in multi_feat.
# Each step refits the Lasso on a sliding 9-row window and appends its own
# prediction to the targets, so later steps train partly on earlier predictions.
# NOTE(review): the prediction input (row i-1) lies inside the training window,
# so each value is the fitted target for the previous row - confirm intended.

future_feature_cols = ["World Population", "World Inflation%", "World Goods Tax%", "World GDP%"]
n_observed = 9    # rows 2010 - 2018 carry real World Barrels values
n_future = 5      # synthetic rows with placeholder World Barrels = 0
assert len(multi_feat) >= n_observed + n_future, "multi_feat is missing the future feature rows"

future_predict = []
future_X = multi_feat[future_feature_cols]
future_y = multi_feat["World Barrels"].values.reshape(-1, 1)
X_scaler = StandardScaler().fit(future_X)
# Exclude the placeholder 0's after 2018 from the target scaling: rows [0:9] only.
y_scaler = StandardScaler().fit(future_y[0:n_observed])
X_test_scaled = X_scaler.transform(future_X)
y_test_scaled = y_scaler.transform(future_y[0:n_observed])
X_test_scaled_df = pd.DataFrame(X_test_scaled)
y_test_scaled_df = pd.DataFrame(y_test_scaled)

for year in range(n_future):
    i = n_observed + year
    # y_test_scaled_df holds exactly i rows here, so [year:i] selects the same
    # 9 targets that line up with the 9 feature rows.
    lasso = model.fit(X_test_scaled_df.iloc[year:i], y_test_scaled_df.iloc[year:i])
    multi_predict2 = lasso.predict(X_test_scaled_df.iloc[i-1].values.reshape(1, -1))
    step_prediction = multi_predict2.flatten()[0]
    future_predict.append(step_prediction)

    # Feed the prediction back in as the target for the next window.
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    y_test_scaled_df = pd.concat([y_test_scaled_df, pd.DataFrame([step_prediction])], ignore_index=True)

# Undo the y scaling for later comparison. inverse_transform requires a 2-D column
# on current scikit-learn; ravel() restores the 1-D shape later cells expect.
inv_future_predict = y_scaler.inverse_transform(np.array(future_predict).reshape(-1, 1)).ravel()

print(inv_future_predict)

[94662.10192845 92541.40067285 93031.92323377 93277.6119894
 93399.02393145]


In [11]:
# One-row summary table holding the historical MSE and R-Square of the Lasso model

summary_columns = ["Model", "Historical MSE", "Historical R-Square"]
data = [("Lasso", hist_mse, hist_r2)]

mse_r2_df = pd.DataFrame.from_records(data, columns=summary_columns)

mse_r2_df

Unnamed: 0,Model,Historical MSE,Historical R-Square
0,Lasso,0.08711,0.91289


In [12]:
# Write the MSE / R-Square summary table to CSV (row index omitted)

mse_r2_csv = '../data/clean_data/oil_outputs/OilProduction_MSE_R2_Lasso_Table.csv'
mse_r2_df.to_csv(mse_r2_csv, index=False)

## Future Prediction

In [13]:
# Build the future prediction table. Despite the variable name, the rows cover
# 2019 - 2023 (the name is kept because the export cell refers to it).

# .copy() makes the selection an independent frame, so adding the column below
# does not operate on a slice of multi_feat (avoids SettingWithCopyWarning).
prediction_20_24 = multi_feat.loc[multi_feat['Year'].between(2019, 2023), ['Year']].copy()
# np.ravel accepts the forecasts whether they arrive 1-D or as an (n, 1) column
prediction_20_24["Prediction"] = np.ravel(inv_future_predict)

prediction_20_24

Unnamed: 0,Year,Prediction
9,2019,94662.101928
10,2020,92541.400673
11,2021,93031.923234
12,2022,93277.611989
13,2023,93399.023931


## Push Future Predictions to CSV

In [14]:
# Write the future prediction table to CSV (row index omitted)

future_pred_csv = '../data/clean_data/oil_outputs/OilProduction_Future_LassoModel_2019_2023.csv'
prediction_20_24.to_csv(future_pred_csv, index=False)