# Oil Production Lasso Machine Learning

In [1]:
# Import dependencies

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

In [2]:
# Read the csv files into a pandas DataFrame

oil_prod = pd.read_csv('../data/clean_data/Oil Production - Barrels-YearFixed-Python.csv')
oil_prod = oil_prod.rename(columns={"Total World": "World Barrels"})
pop = pd.read_csv('../data/clean_data/WorldPopulationbyYear.csv')
pop = pop.rename(columns={"World": "World Population"})
oil_pop = pop.merge(oil_prod, on="Year")
gdp = pd.read_csv('../data/clean_data/GDP%-YearFixed-Python.csv', encoding = "ISO-8859-1")
filteredGDP = gdp[["Year", "United States", "World"]]
filteredGDP = filteredGDP.rename(columns={"United States": "US GDP%", "World": "World GDP%"})
inflation = pd.read_csv('../data/clean_data/InflationAnnual%-YearFixed-Python.csv', encoding = "ISO-8859-1")
filteredInflation = inflation[["Year", "United States", "World"]]
filteredInflation = filteredInflation.rename(columns={"United States": "US Inflation%", "World": "World Inflation%"})
goodsTax = pd.read_csv('../data/clean_data/Taxes%-YearFixed-Python.csv', encoding = "ISO-8859-1")
filteredGoodsTax = goodsTax[["Year", "United States", "World"]]
filteredGoodsTax = filteredGoodsTax.rename(columns={"United States": "US Goods Tax%", "World": "World Goods Tax%"})

# Merge dataframes on Year
gdp_oil_pop = oil_pop.merge(filteredGDP, on="Year")
gdp_oil_pop_infl = gdp_oil_pop.merge(filteredInflation, on="Year")
full_merge = gdp_oil_pop_infl.merge(filteredGoodsTax, on="Year")

# Drop 2019 row - There's multiple features set to 0.  Will later run prediction on 2019
full_merge = full_merge.drop([54])

# Clean NaNs - change to 0 so columns are same length
full_merge = full_merge.replace(np.nan, 0)
full_merge

Unnamed: 0,Year,World Population,Algeria,Angola,Argentina,Australia,Azerbaijan,Brazil,Brunei,Canada,...,Uzbekistan,Venezuela,Vietnam,Yemen,US GDP%,World GDP%,US Inflation%,World Inflation%,US Goods Tax%,World Goods Tax%
0,1965,3322973367,569.94,13.3,275.67,7.0,0.0,96.05,80.0,920.0,...,0.0,3503.0,0.0,0.0,0.0,0.0,1.585169,0.0,0.0,0.0
1,1966,3393031801,733.59,12.81,292.86,9.0,0.0,117.05,96.0,1012.0,...,0.0,3402.0,0.0,0.0,0.0,0.0,3.015075,0.0,0.0,0.0
2,1967,3462460201,842.66,10.9,318.63,21.0,0.0,147.05,104.0,1106.0,...,0.0,3576.0,0.0,0.0,0.0,0.0,2.772786,0.0,0.0,0.0
3,1968,3532826854,922.5,15.18,348.16,39.0,0.0,161.05,122.0,1194.0,...,0.0,3639.0,0.0,0.0,0.0,0.0,4.271796,0.0,0.0,0.0
4,1969,3607499991,969.33,49.9,361.53,45.0,0.0,176.05,124.0,1306.0,...,0.0,3631.0,0.0,0.0,0.0,0.0,5.462386,0.0,0.0,0.0
5,1970,3682911039,1053.93,102.83,399.46,176.0,0.0,166.68,136.0,1473.0,...,0.0,3754.0,0.0,0.0,21.414736,26.911121,5.838255,0.0,0.0,0.0
6,1971,3760509002,802.59,116.14,431.54,315.0,0.0,174.85,129.0,1582.0,...,0.0,3615.0,0.0,0.0,21.919818,26.526697,4.292767,0.0,0.0,0.0
7,1972,3836892580,1081.05,142.88,444.12,337.0,0.0,171.19,182.0,1829.0,...,0.0,3301.0,0.0,0.0,22.580622,26.164536,3.272278,0.0,7.143859,0.0
8,1973,3912347640,1128.74,165.96,433.64,424.0,0.0,173.99,231.0,2114.0,...,0.0,3455.0,0.0,0.0,23.331809,27.054172,6.17776,0.0,6.579487,0.0
9,1974,3988478324,1045.98,173.39,423.39,420.0,0.0,181.76,198.0,1993.0,...,0.0,3060.0,0.0,0.0,22.694942,27.896165,11.054805,0.0,5.990202,0.0


## Lasso

In [3]:
model = Lasso(alpha=.01)

## One-step Forecast

In [4]:
# Using 2000 - 2009 data to run historical prediction 2001 - 2010

predict0110=[]

for year in range(10):
    i = 36 + year

    # Does not need .value.reshape(-1, 1) as there's dimension now with 2+ features
    hist_X = full_merge[["World Population", "World Inflation%", "World Goods Tax%", "World GDP%"]]
    hist_y = full_merge["World Barrels"].values.reshape(-1, 1)
    X_scaler = StandardScaler().fit(hist_X)
    y_scaler = StandardScaler().fit(hist_y)
    X_train_scaled = X_scaler.transform(hist_X)
    y_train_scaled = y_scaler.transform(hist_y)
    X_train_scaled = pd.DataFrame(X_train_scaled)
    lasso = model.fit(X_train_scaled.iloc[(i-21):i], y_train_scaled[(i-21):i])
    
    # changed reshape to (1, -1)
    oil_predict = lasso.predict(X_train_scaled.iloc[i-1].values.reshape(1, -1))
    predict0110.append(oil_predict.flatten()[0])
    
# Invert predict0110 so it's not scaled for later comoparison
inv_predict0110 = y_scaler.inverse_transform(predict0110)

print(inv_predict0110)

[72673.25845985 74393.90989276 74925.57912641 76198.9647683
 77899.76223636 79328.38397564 80403.77163664 81434.87262008
 82241.41195535 82773.49015069]


## Historical Prediction MSE and R2

In [5]:
# Use our model to make predictions
predicted = lasso.predict(X_train_scaled)

# inv_predicted = y_scaler.inverse_transform(predicted)

hist_mse = mean_squared_error(y_train_scaled, predicted)
hist_r2 = lasso.score(X_train_scaled, y_train_scaled)

print(f"Mean Squared Error (MSE): {hist_mse}")
print(f"R-squared (R2): {hist_r2}")

Mean Squared Error (MSE): 0.08711347932952405
R-squared (R2): 0.9128865206704759


## Historical Predictions

In [6]:
hist_pred_0110_df = full_merge.loc[full_merge['Year'].between(2001, 2010), ['Year', 'World Barrels']]
hist_pred_0110_df["Prediction"] = inv_predict0110
hist_pred_0110_df["Difference"] = hist_pred_0110_df["Prediction"] - hist_pred_0110_df["World Barrels"]
hist_pred_0110_df["% Difference"] = ((hist_pred_0110_df["Prediction"] - hist_pred_0110_df["World Barrels"])/hist_pred_0110_df["World Barrels"])*100
hist_pred_0110_df

Unnamed: 0,Year,World Barrels,Prediction,Difference,% Difference
36,2001,74942.96,72673.25846,-2269.70154,-3.028572
37,2002,74144.97,74393.909893,248.939893,0.335748
38,2003,77305.05,74925.579126,-2379.470874,-3.078028
39,2004,80979.16,76198.964768,-4780.195232,-5.902994
40,2005,81951.57,77899.762236,-4051.807764,-4.944149
41,2006,82711.27,79328.383976,-3382.886024,-4.089994
42,2007,82569.68,80403.771637,-2165.908363,-2.623128
43,2008,83234.06,81434.87262,-1799.18738,-2.1616
44,2009,81577.58,82241.411955,663.831955,0.813743
45,2010,83409.28,82773.490151,-635.789849,-0.762253


## Save Historical Predictions to CSV

In [7]:
hist_pred_0110_df.to_csv('../data/clean_data/oil_outputs/OilProduction_Historical_LassoModel_2001_2010.csv', index=False)

## Features' Rolling Average for 2019 - 2023

In [8]:
# Narrow down data frame to the specific year range of 2010 - 2018

multi_feat = full_merge.loc[full_merge['Year'].between(2010, 2018), ['Year',
                                                                     'World Population', 
                                                                   'World Barrels', 
                                                                   'World Inflation%', 
                                                                   'World Goods Tax%', 
                                                                   'World GDP%']]
multi_feat

Unnamed: 0,Year,World Population,World Barrels,World Inflation%,World Goods Tax%,World GDP%
45,2010,6921871614,83409.28,3.326345,31.87589,24.207113
46,2011,7002860604,84156.76,4.839403,33.264196,24.547417
47,2012,7085763408,86366.04,3.707818,33.271756,24.404915
48,2013,7169640142,86794.17,2.605818,32.787076,24.310278
49,2014,7254228377,88909.98,2.346269,33.191709,24.470283
50,2015,7338964960,91732.64,1.39333,33.724915,24.297531
51,2016,7424282488,92072.43,1.486007,34.248831,23.91364
52,2017,7509065705,92797.83,2.233522,33.333664,24.222791
53,2018,7591932907,95254.09,2.458142,34.011405,24.382773


In [9]:
# Iterate 5 times for 5 years (2019 - 2023) of rolling average of features

for i in range(5):
    starting_index = 4 + i
    year_inc = 2018 + i
    new_year = year_inc + 1

    pop_mean = multi_feat['World Population'].iloc[starting_index:starting_index+5].mean()
    infl_mean = multi_feat['World Inflation%'].iloc[starting_index:starting_index+5].mean()
    gtax_mean = multi_feat['World Goods Tax%'].iloc[starting_index:starting_index+5].mean()
    wgdp_mean = multi_feat['World GDP%'].iloc[starting_index:starting_index+5].mean()

    df = pd.DataFrame({"Year":[new_year],
                       "World Population":[pop_mean],
                       "World Barrels":0,
                       "World Inflation%":[infl_mean], 
                       "World Goods Tax%":[gtax_mean],
                      "World GDP%":[wgdp_mean]})
    
    multi_feat = multi_feat.append(df, ignore_index=True)
    del df
    
multi_feat

Unnamed: 0,Year,World Population,World Barrels,World Inflation%,World Goods Tax%,World GDP%
0,2010,6921872000.0,83409.28,3.326345,31.87589,24.207113
1,2011,7002861000.0,84156.76,4.839403,33.264196,24.547417
2,2012,7085763000.0,86366.04,3.707818,33.271756,24.404915
3,2013,7169640000.0,86794.17,2.605818,32.787076,24.310278
4,2014,7254228000.0,88909.98,2.346269,33.191709,24.470283
5,2015,7338965000.0,91732.64,1.39333,33.724915,24.297531
6,2016,7424282000.0,92072.43,1.486007,34.248831,23.91364
7,2017,7509066000.0,92797.83,2.233522,33.333664,24.222791
8,2018,7591933000.0,95254.09,2.458142,34.011405,24.382773
9,2019,7423695000.0,0.0,1.983454,33.702105,24.257404


## Multi-step Forecast

In [10]:
future_predict=[]

for year in range(5):
    i = 9 + year

    future_X = multi_feat[["World Population", "World Inflation%", "World Goods Tax%", "World GDP%"]]
    future_y = multi_feat["World Barrels"].values.reshape(-1, 1)
    X_scaler = StandardScaler().fit(future_X)
    y_scaler = StandardScaler().fit(future_y)
    X_test_scaled = X_scaler.transform(future_X)
    y_test_scaled = y_scaler.transform(future_y)
    X_test_scaled = pd.DataFrame(X_test_scaled)
    lasso = model.fit(X_test_scaled.iloc[year:i], y_test_scaled[year:i])

    multi_predict2 = lasso.predict(X_test_scaled.iloc[i].values.reshape(1, -1))
    future_predict.append(multi_predict2.flatten()[0])


# Invert future_predict so it's not scaled for later comoparison
inv_future_predict = y_scaler.inverse_transform(future_predict)

print(inv_future_predict)

[91766.32644499 77792.27577286 65175.83250728 55417.33269027
 47761.45683441]


## Future Oil Production Lasso Prediction MSE and R2

In [11]:
# Use our model to make predictions
predicted = lasso.predict(X_test_scaled)

# inv_f_predicted = y_scaler.inverse_transform(predicted)

future_mse = mean_squared_error(y_test_scaled, predicted)
future_r2 = lasso.score(X_test_scaled, y_test_scaled)

print(f"Mean Squared Error (MSE): {future_mse}")
print(f"R-squared (R2): {future_r2}")

Mean Squared Error (MSE): 0.8115050981522074
R-squared (R2): 0.18849490184779238


In [12]:
# Create df for historical and future mse and r2
data = [["Oil Production Lasso", hist_mse, future_mse, hist_r2, future_r2]]

mse_r2_df = pd.DataFrame(data, columns = ["Type", "Historical MSE", "Future MSE", "Historical R2", "Future R2"])

mse_r2_df

Unnamed: 0,Type,Historical MSE,Future MSE,Historical R2,Future R2
0,Oil Production Lasso,0.087113,0.811505,0.912887,0.188495


## Future Prediction Compared to Actual

In [13]:
prediction_20_24 = multi_feat.loc[multi_feat['Year'].between(2019, 2023), ['Year', 'World Barrels']]
prediction_20_24["Prediction"] = inv_future_predict

prediction_20_24

Unnamed: 0,Year,World Barrels,Prediction
9,2019,0.0,91766.326445
10,2020,0.0,77792.275773
11,2021,0.0,65175.832507
12,2022,0.0,55417.33269
13,2023,0.0,47761.456834


## Push Future Predictions to CSV

In [14]:
prediction_20_24.to_csv('../data/clean_data/oil_outputs/OilProduction_Future_LassoModel_2019_2023.csv', index=False)