# Modeling using Time Series

---

### Import Libraries

In [2]:
import os
import sys
# Resolve the parent of the current working directory to an absolute path and
# register it on the import search path (only once, so re-running this cell
# does not add duplicates). Presumably this is what lets the project-local
# `tools` package be imported below -- confirm against the repository layout.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# imported modules before each cell execution so edits to local modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [20]:
from tools.tools import read_json, get_current_time

In [21]:
# Apply matplotlib's 'ggplot' style sheet to the figures created from here on.
plt.style.use('ggplot')

In [22]:
# Project settings are read from the JSON file one directory above the notebook.
config = read_json('../config.json')

# String returned by the project helper for the 'yyyymmdd' format code.
current_date = get_current_time('yyyymmdd')

# Data directory: the "data" folder under the configured workspace directory.
workspace_directory = config['workspace_directory']
wd = "{}/data".format(workspace_directory)

In [52]:
# Load the modeling table from the processing area of the data directory.
modeling_csv_path = f"{wd}/processing/basin_int_viirs_for_modeling.csv"
df = pd.read_csv(modeling_csv_path)

In [53]:
# Parse `year_month`, keep only its calendar-date part, use it as the row
# index, and order the rows by it.
# Note: `.dt.date` produces plain Python `date` objects, so the resulting index
# holds objects rather than being a pandas DatetimeIndex.
year_month_dates = pd.to_datetime(df['year_month']).dt.date
df = (
    df
    .assign(year_month=year_month_dates)
    .set_index('year_month')
    .sort_index()  # sort by date just to be safe
)

In [54]:
# Sanity check: dimensions of the modeling table as (rows, columns).
df.shape

(679, 83)

In [55]:
# Preview the date-indexed table; the notebook renders a truncated view
# (first and last rows, with the middle columns elided).
df

Unnamed: 0_level_0,region,latest_day_in_month,obs_day_cnt_avg,obs_day_cnt_med,obs_day_cnt_sum,obs_day_cnt_min,obs_day_cnt_max,qf_fit_day_avg_avg,qf_fit_day_avg_med,qf_fit_day_avg_sum,...,obs_day_cnt_avg_over_pct_month_completed,obs_day_cnt_med_over_pct_month_completed,obs_day_cnt_sum_over_pct_month_completed,obs_day_cnt_min_over_pct_month_completed,obs_day_cnt_max_over_pct_month_completed,obs_day_cnt_avg_per_squaremeters_over_pct_month_completed,obs_day_cnt_med_per_squaremeters_over_pct_month_completed,obs_day_cnt_sum_per_squaremeters_over_pct_month_completed,obs_day_cnt_min_per_squaremeters_over_pct_month_completed,obs_day_cnt_max_per_squaremeters_over_pct_month_completed
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-03-01,Anadarko Region,2012-03-31,13.041667,12.5,313,3,33,4.786164,4.291667,114.867926,...,13.041667,12.500000,313.000000,3.000000,33.000000,1.728202e-10,1.656423e-10,4.147684e-09,3.975416e-11,4.372958e-10
2012-03-01,Appalachia Region,2012-03-31,31.071429,32.0,870,1,86,9.502222,8.966184,266.062205,...,31.071429,32.000000,870.000000,1.000000,86.000000,1.616772e-10,1.665089e-10,4.526961e-09,5.203404e-12,4.474927e-10
2012-03-01,Bakken Region,2012-03-31,260.321429,249.0,7289,2,571,1.497039,1.500962,41.917086,...,260.321429,249.000000,7289.000000,2.000000,571.000000,2.852491e-09,2.728436e-09,7.986976e-08,2.191515e-11,6.256775e-09
2012-03-01,Eagle Ford Region,2012-03-31,48.538462,30.0,1262,1,209,1.046799,0.374157,27.216779,...,48.538462,30.000000,1262.000000,1.000000,209.000000,7.517042e-10,4.646032e-10,1.954431e-08,1.548677e-11,3.236736e-09
2012-03-01,Haynesville Region,2012-03-31,4.476190,2.0,94,1,15,8.188339,0.000000,171.955128,...,4.476190,2.000000,94.000000,1.000000,15.000000,9.172062e-11,4.098155e-11,1.926133e-09,2.049078e-11,3.073617e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-01,Bakken Region,2020-03-08,213.875000,221.5,1711,51,371,6.692439,6.807222,53.539510,...,916.607143,949.285714,7332.857143,218.571429,1590.000000,1.004379e-08,1.040187e-08,8.035032e-08,2.395013e-09,1.742254e-08
2020-03-01,Eagle Ford Region,2020-03-08,58.375000,31.0,467,1,172,1.953489,1.357558,15.627909,...,250.178571,132.857143,2001.428571,4.285714,737.142857,3.874459e-09,2.057529e-09,3.099567e-08,6.637189e-11,1.141597e-08
2020-03-01,Haynesville Region,2020-03-06,3.333333,1.0,10,1,8,3.000000,0.000000,9.000000,...,20.000000,6.000000,60.000000,6.000000,48.000000,4.098155e-10,1.229447e-10,1.229447e-09,1.229447e-10,9.835573e-10
2020-03-01,Niobrara Region,2020-03-08,13.250000,12.5,106,4,28,5.852251,6.335714,46.818010,...,56.785714,53.571429,454.285714,17.142857,120.000000,2.446352e-10,2.307879e-10,1.957081e-09,7.385213e-11,5.169649e-10


In [59]:
# Target: total oil production.
y = df['oil_bbl_d_total_production']

# Features: every remaining column except the target itself and the three
# per-month date columns. get_dummies then one-hot encodes whatever
# non-numeric columns are left (e.g. `region`).
excluded_columns = [
    'oil_bbl_d_total_production',
    'latest_day_in_month',
    'last_day_of_month',
    'first_day_of_month',
]
X = pd.get_dummies(df.drop(columns=excluded_columns))

# Hold out the final 5% of rows as the test set. shuffle=False keeps the
# existing row order, so the test rows are the last ones in `df`
# (assumes `df` is already ordered by date -- verify before re-running).
# NOTE(review): the cut is made by row position, not by date, so it may fall
# inside a month (some regions of that month in train, the rest in test) --
# confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    shuffle=False,
)

In [60]:
# Inspect the training features (earlier rows of the ordered table).
X_train

Unnamed: 0_level_0,obs_day_cnt_avg,obs_day_cnt_med,obs_day_cnt_sum,obs_day_cnt_min,obs_day_cnt_max,qf_fit_day_avg_avg,qf_fit_day_avg_med,qf_fit_day_avg_sum,qf_fit_day_avg_min,qf_fit_day_avg_max,...,obs_day_cnt_sum_per_squaremeters_over_pct_month_completed,obs_day_cnt_min_per_squaremeters_over_pct_month_completed,obs_day_cnt_max_per_squaremeters_over_pct_month_completed,region_Anadarko Region,region_Appalachia Region,region_Bakken Region,region_Eagle Ford Region,region_Haynesville Region,region_Niobrara Region,region_Permian Region
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-03-01,13.041667,12.5,313,3,33,4.786164,4.291667,114.867926,0.000000,15.444444,...,4.147684e-09,3.975416e-11,4.372958e-10,1,0,0,0,0,0,0
2012-03-01,31.071429,32.0,870,1,86,9.502222,8.966184,266.062205,0.000000,42.400000,...,4.526961e-09,5.203404e-12,4.474927e-10,0,1,0,0,0,0,0
2012-03-01,260.321429,249.0,7289,2,571,1.497039,1.500962,41.917086,0.000000,4.189956,...,7.986976e-08,2.191515e-11,6.256775e-09,0,0,1,0,0,0,0
2012-03-01,48.538462,30.0,1262,1,209,1.046799,0.374157,27.216779,0.000000,5.384615,...,1.954431e-08,1.548677e-11,3.236736e-09,0,0,0,1,0,0,0
2012-03-01,4.476190,2.0,94,1,15,8.188339,0.000000,171.955128,0.000000,34.000000,...,1.926133e-09,2.049078e-11,3.073617e-10,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-10-01,114.678571,124.5,3211,1,219,2.412223,2.402281,67.542234,0.000000,6.248000,...,5.144279e-08,1.602080e-11,3.508556e-09,0,0,0,1,0,0,0
2019-10-01,4.400000,4.5,88,2,8,8.919167,9.979167,178.383333,0.000000,20.600000,...,1.931988e-09,4.390881e-11,1.756352e-10,0,0,0,0,1,0,0
2019-10-01,20.464286,22.5,573,1,35,5.157673,3.522727,144.414848,0.200000,34.000000,...,2.553628e-09,4.456594e-12,1.559808e-10,0,0,0,0,0,1,0
2019-10-01,327.566667,329.0,9827,19,709,4.313043,4.541641,129.391291,0.421053,6.941463,...,4.798034e-08,9.276753e-11,3.461694e-09,0,0,0,0,0,0,1


In [61]:
 # Inspect the held-out test features (final rows of the ordered table).
 X_test

Unnamed: 0_level_0,obs_day_cnt_avg,obs_day_cnt_med,obs_day_cnt_sum,obs_day_cnt_min,obs_day_cnt_max,qf_fit_day_avg_avg,qf_fit_day_avg_med,qf_fit_day_avg_sum,qf_fit_day_avg_min,qf_fit_day_avg_max,...,obs_day_cnt_sum_per_squaremeters_over_pct_month_completed,obs_day_cnt_min_per_squaremeters_over_pct_month_completed,obs_day_cnt_max_per_squaremeters_over_pct_month_completed,region_Anadarko Region,region_Appalachia Region,region_Bakken Region,region_Eagle Ford Region,region_Haynesville Region,region_Niobrara Region,region_Permian Region
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-01,12.958333,11.5,311,1,36,8.939064,8.8125,214.537542,0.0,22.666667,...,1.618259e-09,5.203404e-12,1.873225e-10,0,1,0,0,0,0,0
2019-11-01,227.740741,261.0,6149,3,534,3.902569,4.434944,105.369363,0.2,10.504931,...,6.737812e-08,3.287272e-11,5.851344e-09,0,0,1,0,0,0,0
2019-11-01,98.481481,118.0,2659,1,215,1.957059,1.890909,52.840597,0.0,4.446809,...,4.117933e-08,1.548677e-11,3.329657e-09,0,0,0,1,0,0,0
2019-11-01,3.904762,4.0,82,1,9,11.200529,13.6,235.211111,0.0,34.0,...,1.874118e-09,2.28551e-11,2.056959e-10,0,0,0,0,1,0,0
2019-11-01,17.16,16.0,429,1,37,3.358018,2.925926,83.950455,0.0,7.3,...,1.914155e-09,4.461899e-12,1.650903e-10,0,0,0,0,0,1,0
2019-11-01,331.607143,326.5,9285,2,659,3.951521,4.254208,110.642586,0.5,6.508929,...,4.382289e-08,9.439503e-12,3.110316e-09,0,0,0,0,0,0,1
2019-12-01,14.75,15.0,413,2,32,5.446697,3.318947,152.50752,0.166667,34.0,...,5.472823e-09,2.650277e-11,4.240444e-10,1,0,0,0,0,0,0
2019-12-01,15.47619,13.0,325,2,41,6.547675,7.2,137.501175,0.0,19.52381,...,1.8119e-09,1.115015e-11,2.285781e-10,0,1,0,0,0,0,0
2019-12-01,244.645161,241.0,7584,1,501,4.750184,4.045307,147.255713,0.0,12.224939,...,8.310224e-08,1.095757e-11,5.489744e-09,0,0,1,0,0,0,0
2019-12-01,128.866667,160.5,3866,3,246,2.224707,1.780882,66.741214,0.6,12.0,...,5.987187e-08,4.646032e-11,3.809747e-09,0,0,0,1,0,0,0
