# In this notebook, the gas production volume dataset is cleaned so that it can be used in the time series analysis and model training

In [1]:
# Import 3rd party libraries
import pandas as pd
import os

In [3]:
# Load the 2019 production file to inspect which columns it contains.
# The raw data folder is resolved relative to the parent of the current
# working directory, so the notebook must be started from its own folder.
working_path = os.path.abspath(os.getcwd())
path_one_level_back = os.path.split(working_path)[0]
path_2019 = os.path.join(path_one_level_back, '0-Raw_data', 'prod_2019.csv')
# The first row of the file holds the column names.
prod_2019 = pd.read_csv(path_2019, header=[0])
prod_2019

Unnamed: 0,wa_num,compltn_event_seq,prod_period,UWI,area_code,formtn_code,Pool_seq,gas_prod_vol_m3,oil_prod_vol_m3,water_prod_vol_m3,cond_prod_vol_m3,prod_days,gas_prod_cum_m3,oil_prod_cum_m3,water_prod_cum_m3,cond_prod_cum_m3,project_code
0,70,0,201901,200C032D094A1300,1400,2900,B,8.3,0.0,0.8,0.1,2.0,2467.0,0.0,251.3,30.9,2
1,70,0,201904,200C032D094A1300,1400,2900,B,57.8,0.0,12.0,0.5,13.0,2524.8,0.0,263.3,31.4,2
2,70,0,201905,200C032D094A1300,1400,2900,B,95.1,0.0,33.6,1.6,24.0,2619.9,0.0,296.9,33.0,2
3,70,0,201906,200C032D094A1300,1400,2900,B,123.4,0.0,13.8,2.0,30.0,2743.3,0.0,310.7,35.0,2
4,70,0,201907,200C032D094A1300,1400,2900,B,113.6,0.0,19.2,1.9,29.0,2856.9,0.0,329.9,36.9,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99821,39286,0,201912,102052608016W600,9021,5000,A,1779.3,0.0,953.8,0.8,9.5,1779.3,0.0,953.8,0.8,0
99822,39287,0,201912,103122608016W600,9021,5000,A,1869.8,0.0,860.4,0.7,13.5,1869.8,0.0,860.4,0.7,0
99823,39288,0,201912,102132608016W600,9021,5000,A,442.3,0.0,45.1,0.0,3.9,442.3,0.0,45.1,0.0,0
99824,39289,0,201912,103123008015W600,9021,5000,A,273.3,0.0,0.0,0.0,0.3,273.3,0.0,0.0,0.0,0


In [6]:
# Add a 'Date' column parsed from the YYYYMM production period codes
# (e.g. 201901 -> 2019-01-01). The values are cast to str first so that
# they can be matched against the strptime pattern.
# The pattern is kept under its own name so the builtin format() is not shadowed.
prod_period_format = '%Y%m'

prod_2019['Date'] = pd.to_datetime(prod_2019['prod_period'].astype('str'), format=prod_period_format)
prod_2019

Unnamed: 0,wa_num,compltn_event_seq,prod_period,UWI,area_code,formtn_code,Pool_seq,gas_prod_vol_m3,oil_prod_vol_m3,water_prod_vol_m3,cond_prod_vol_m3,prod_days,gas_prod_cum_m3,oil_prod_cum_m3,water_prod_cum_m3,cond_prod_cum_m3,project_code,Date
0,70,0,201901,200C032D094A1300,1400,2900,B,8.3,0.0,0.8,0.1,2.0,2467.0,0.0,251.3,30.9,2,2019-01-01
1,70,0,201904,200C032D094A1300,1400,2900,B,57.8,0.0,12.0,0.5,13.0,2524.8,0.0,263.3,31.4,2,2019-04-01
2,70,0,201905,200C032D094A1300,1400,2900,B,95.1,0.0,33.6,1.6,24.0,2619.9,0.0,296.9,33.0,2,2019-05-01
3,70,0,201906,200C032D094A1300,1400,2900,B,123.4,0.0,13.8,2.0,30.0,2743.3,0.0,310.7,35.0,2,2019-06-01
4,70,0,201907,200C032D094A1300,1400,2900,B,113.6,0.0,19.2,1.9,29.0,2856.9,0.0,329.9,36.9,2,2019-07-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99821,39286,0,201912,102052608016W600,9021,5000,A,1779.3,0.0,953.8,0.8,9.5,1779.3,0.0,953.8,0.8,0,2019-12-01
99822,39287,0,201912,103122608016W600,9021,5000,A,1869.8,0.0,860.4,0.7,13.5,1869.8,0.0,860.4,0.7,0,2019-12-01
99823,39288,0,201912,102132608016W600,9021,5000,A,442.3,0.0,45.1,0.0,3.9,442.3,0.0,45.1,0.0,0,2019-12-01
99824,39289,0,201912,103123008015W600,9021,5000,A,273.3,0.0,0.0,0.0,0.3,273.3,0.0,0.0,0.0,0,2019-12-01


In the entire dataset, only the Date and the gas production volume (gas_prod_vol_m3) are needed, so we will create a new dataframe that contains only these two columns

In [7]:
# Total gas production per month for 2019:
# keep only the date and gas volume columns, then add up all rows sharing a date.
df_filter = prod_2019.loc[:, ['Date', 'gas_prod_vol_m3']]
production_monthly_2019 = df_filter.groupby('Date').sum()
production_monthly_2019

Unnamed: 0_level_0,gas_prod_vol_m3
Date,Unnamed: 1_level_1
2019-01-01,5213817.1
2019-02-01,4704358.5
2019-03-01,5223571.0
2019-04-01,4807638.7
2019-05-01,4561579.5
2019-06-01,4605265.3
2019-07-01,4771919.2
2019-08-01,4732956.6
2019-09-01,4660372.9
2019-10-01,4838847.5


Now we will repeat the same steps for the production years of 2020 and 2021

In [4]:
# Load the 2020 production file.
# The raw data folder is resolved relative to the parent of the current
# working directory, so the notebook must be started from its own folder.
working_path = os.path.abspath(os.getcwd())
path_one_level_back = os.path.split(working_path)[0]
path_2020 = os.path.join(path_one_level_back, '0-Raw_data', 'prod_2020.csv')
# The first row of the file holds the column names.
prod_2020 = pd.read_csv(path_2020, header=[0])
prod_2020

Unnamed: 0,wa_num,compltn_event_seq,prod_period,UWI,area_code,formtn_code,Pool_seq,gas_prod_vol_m3,oil_prod_vol_m3,water_prod_vol_m3,cond_prod_vol_m3,prod_days,gas_prod_cum_m3,oil_prod_cum_m3,water_prod_cum_m3,cond_prod_cum_m3,project_code
0,70,0,202001,200C032D094A1300,1400,2900,B,57.7,0.0,10.9,3.2,26.4,3338.2,0.0,427.6,53.2,2
1,70,0,202002,200C032D094A1300,1400,2900,B,67.0,0.0,11.3,2.0,20.6,3405.2,0.0,438.9,55.2,2
2,70,0,202003,200C032D094A1300,1400,2900,B,54.7,0.0,8.9,0.7,28.9,3459.9,0.0,447.8,55.9,2
3,70,0,202004,200C032D094A1300,1400,2900,B,35.1,0.0,1.7,0.6,3.3,3495.0,0.0,449.5,56.5,2
4,101,0,202001,100060608613W600,2000,4535,A,4.4,73.8,1260.8,0.0,30.1,355.6,3762.6,55602.8,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101499,40987,0,202012,100141108219W600,9021,5000,A,5078.7,0.0,2812.6,0.0,31.0,7500.9,0.0,7242.6,41.8,0
101500,41054,0,202011,103140208018W600,9021,5000,A,1658.9,0.0,3444.7,317.9,5.6,1658.9,0.0,3444.7,317.9,0
101501,41054,0,202012,103140208018W600,9021,5000,A,19143.3,0.0,4204.7,1691.9,30.8,20802.2,0.0,7649.4,2009.8,0
101502,41055,0,202011,100130208018W600,9021,5000,A,1293.3,0.0,3396.8,110.8,5.5,1293.3,0.0,3396.8,110.8,0


In [10]:
# Add a 'Date' column parsed from the YYYYMM production period codes
# (e.g. 202001 -> 2020-01-01). The values are cast to str first so that
# they can be matched against the strptime pattern.
# The pattern is kept under its own name so the builtin format() is not shadowed.
prod_period_format = '%Y%m'

prod_2020['Date'] = pd.to_datetime(prod_2020['prod_period'].astype('str'), format=prod_period_format)
prod_2020

Unnamed: 0,wa_num,compltn_event_seq,prod_period,UWI,area_code,formtn_code,Pool_seq,gas_prod_vol_m3,oil_prod_vol_m3,water_prod_vol_m3,cond_prod_vol_m3,prod_days,gas_prod_cum_m3,oil_prod_cum_m3,water_prod_cum_m3,cond_prod_cum_m3,project_code,Date
0,70,0,202001,200C032D094A1300,1400,2900,B,57.7,0.0,10.9,3.2,26.4,3338.2,0.0,427.6,53.2,2,2020-01-01
1,70,0,202002,200C032D094A1300,1400,2900,B,67.0,0.0,11.3,2.0,20.6,3405.2,0.0,438.9,55.2,2,2020-02-01
2,70,0,202003,200C032D094A1300,1400,2900,B,54.7,0.0,8.9,0.7,28.9,3459.9,0.0,447.8,55.9,2,2020-03-01
3,70,0,202004,200C032D094A1300,1400,2900,B,35.1,0.0,1.7,0.6,3.3,3495.0,0.0,449.5,56.5,2,2020-04-01
4,101,0,202001,100060608613W600,2000,4535,A,4.4,73.8,1260.8,0.0,30.1,355.6,3762.6,55602.8,0.0,3,2020-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101499,40987,0,202012,100141108219W600,9021,5000,A,5078.7,0.0,2812.6,0.0,31.0,7500.9,0.0,7242.6,41.8,0,2020-12-01
101500,41054,0,202011,103140208018W600,9021,5000,A,1658.9,0.0,3444.7,317.9,5.6,1658.9,0.0,3444.7,317.9,0,2020-11-01
101501,41054,0,202012,103140208018W600,9021,5000,A,19143.3,0.0,4204.7,1691.9,30.8,20802.2,0.0,7649.4,2009.8,0,2020-12-01
101502,41055,0,202011,100130208018W600,9021,5000,A,1293.3,0.0,3396.8,110.8,5.5,1293.3,0.0,3396.8,110.8,0,2020-11-01


In [11]:
# Total gas production per month for 2020:
# keep only the date and gas volume columns, then add up all rows sharing a date.
df_filter = prod_2020.loc[:, ['Date', 'gas_prod_vol_m3']]
production_monthly_2020 = df_filter.groupby('Date').sum()
production_monthly_2020

Unnamed: 0_level_0,gas_prod_vol_m3
Date,Unnamed: 1_level_1
2020-01-01,5421631.6
2020-02-01,5099580.5
2020-03-01,5471040.4
2020-04-01,5193531.9
2020-05-01,5282204.8
2020-06-01,4998401.2
2020-07-01,4954881.5
2020-08-01,4954800.7
2020-09-01,4500421.1
2020-10-01,5045550.0


In [5]:
# Load the 2021 production file.
# The raw data folder is resolved relative to the parent of the current
# working directory, so the notebook must be started from its own folder.
working_path = os.path.abspath(os.getcwd())
path_one_level_back = os.path.split(working_path)[0]
path_2021 = os.path.join(path_one_level_back, '0-Raw_data', 'prod_2021.csv')
# The first row of the file holds the column names.
prod_2021 = pd.read_csv(path_2021, header=[0])
prod_2021

Unnamed: 0,wa_num,compltn_event_seq,prod_period,UWI,area_code,formtn_code,Pool_seq,gas_prod_vol_m3,oil_prod_vol_m3,water_prod_vol_m3,cond_prod_vol_m3,prod_days,gas_prod_cum_m3,oil_prod_cum_m3,water_prod_cum_m3,cond_prod_cum_m3,project_code
0,101,0,202101,100060608613W600,2000,4535,A,6.5,47.8,1148.1,0.0,28.5,423.1,4280.6,70330.5,0.0,3
1,101,0,202102,100060608613W600,2000,4535,A,6.1,36.7,1173.0,0.0,27.9,429.2,4317.3,71503.5,0.0,3
2,101,0,202103,100060608613W600,2000,4535,A,6.4,38.5,1260.3,0.0,31.0,435.6,4355.8,72763.8,0.0,3
3,101,0,202104,100060608613W600,2000,4535,A,7.0,39.8,1218.2,0.0,29.8,442.6,4395.6,73982.0,0.0,3
4,101,0,202105,100060608613W600,2000,4535,A,6.7,38.3,1209.3,0.0,30.2,449.3,4433.9,75191.3,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105999,42374,0,202110,100100508214W600,9021,5000,A,524.7,0.0,284.5,728.8,5.7,524.7,0.0,284.5,728.8,0
106000,42374,0,202111,100100508214W600,9021,5000,A,381.7,0.0,2499.5,1547.1,11.0,906.4,0.0,2784.0,2275.9,0
106001,42374,0,202112,100100508214W600,9021,5000,A,1962.2,0.0,1044.4,1272.2,31.0,2868.6,0.0,3828.4,3548.1,0
106002,42375,0,202111,102070508214W600,9021,5000,A,494.9,0.0,2499.5,601.8,13.0,494.9,0.0,2499.5,601.8,62


In [14]:
# Add a 'Date' column parsed from the YYYYMM production period codes
# (e.g. 202101 -> 2021-01-01). The values are cast to str first so that
# they can be matched against the strptime pattern.
# The pattern is kept under its own name so the builtin format() is not shadowed.
prod_period_format = '%Y%m'

prod_2021['Date'] = pd.to_datetime(prod_2021['prod_period'].astype('str'), format=prod_period_format)
prod_2021

Unnamed: 0,wa_num,compltn_event_seq,prod_period,UWI,area_code,formtn_code,Pool_seq,gas_prod_vol_m3,oil_prod_vol_m3,water_prod_vol_m3,cond_prod_vol_m3,prod_days,gas_prod_cum_m3,oil_prod_cum_m3,water_prod_cum_m3,cond_prod_cum_m3,project_code,Date
0,101,0,202101,100060608613W600,2000,4535,A,6.5,47.8,1148.1,0.0,28.5,423.1,4280.6,70330.5,0.0,3,2021-01-01
1,101,0,202102,100060608613W600,2000,4535,A,6.1,36.7,1173.0,0.0,27.9,429.2,4317.3,71503.5,0.0,3,2021-02-01
2,101,0,202103,100060608613W600,2000,4535,A,6.4,38.5,1260.3,0.0,31.0,435.6,4355.8,72763.8,0.0,3,2021-03-01
3,101,0,202104,100060608613W600,2000,4535,A,7.0,39.8,1218.2,0.0,29.8,442.6,4395.6,73982.0,0.0,3,2021-04-01
4,101,0,202105,100060608613W600,2000,4535,A,6.7,38.3,1209.3,0.0,30.2,449.3,4433.9,75191.3,0.0,3,2021-05-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105999,42374,0,202110,100100508214W600,9021,5000,A,524.7,0.0,284.5,728.8,5.7,524.7,0.0,284.5,728.8,0,2021-10-01
106000,42374,0,202111,100100508214W600,9021,5000,A,381.7,0.0,2499.5,1547.1,11.0,906.4,0.0,2784.0,2275.9,0,2021-11-01
106001,42374,0,202112,100100508214W600,9021,5000,A,1962.2,0.0,1044.4,1272.2,31.0,2868.6,0.0,3828.4,3548.1,0,2021-12-01
106002,42375,0,202111,102070508214W600,9021,5000,A,494.9,0.0,2499.5,601.8,13.0,494.9,0.0,2499.5,601.8,62,2021-11-01


In [15]:
# Total gas production per month for 2021:
# keep only the date and gas volume columns, then add up all rows sharing a date.
df_filter = prod_2021.loc[:, ['Date', 'gas_prod_vol_m3']]
production_monthly_2021 = df_filter.groupby('Date').sum()
production_monthly_2021

Unnamed: 0_level_0,gas_prod_vol_m3
Date,Unnamed: 1_level_1
2021-01-01,5591040.3
2021-02-01,5064619.2
2021-03-01,5588412.9
2021-04-01,5274400.0
2021-05-01,5246173.5
2021-06-01,5099474.2
2021-07-01,5313956.1
2021-08-01,5543320.2
2021-09-01,5477704.9
2021-10-01,5493284.6


After cleaning each dataset separately and keeping only the date and production volume columns, we can now merge the three production years together

In [74]:
# Stack the three yearly monthly-total frames on top of each other (row-wise)
# and switch the datetime index to a year-month period index in one step.
yearly_frames = [
    production_monthly_2019,
    production_monthly_2020,
    production_monthly_2021,
]
production_monthly = pd.concat(yearly_frames).to_period('M')

# Preview the first rows of the combined frame
production_monthly.head()

Unnamed: 0_level_0,gas_prod_vol_m3
Date,Unnamed: 1_level_1
2019-01,5213817.1
2019-02,4704358.5
2019-03,5223571.0
2019-04,4807638.7
2019-05,4561579.5


In [89]:
# Create a daily df: upsample the monthly periods to days and carry each
# month's total forward onto every day of that month.
# ffill() is used instead of pad(): Resampler.pad was deprecated in
# pandas 1.4 and removed in 2.0, and ffill() does the same forward fill.
production_daily = production_monthly.resample('D').ffill()

# Divide the monthly gas produced evenly over the number of days in the month.
# The day counts are taken straight from the index, so no temporary column
# has to be added and dropped again.
days_in_month = production_daily.index.days_in_month.to_numpy()
production_daily['gas_prod_vol_m3'] = production_daily['gas_prod_vol_m3'] / days_in_month

# view df
production_daily.head()

Unnamed: 0_level_0,gas_prod_vol_m3
Date,Unnamed: 1_level_1
2019-01-01,168187.648387
2019-01-02,168187.648387
2019-01-03,168187.648387
2019-01-04,168187.648387
2019-01-05,168187.648387


In [91]:
# Save the daily and monthly production volume as csv.
# The destination must exist on any machine that runs the notebook, so it is
# not tied to one user's profile folder: use the folder named by the
# PRODUCTION_OUTPUT_DIR environment variable when it is set, otherwise the
# current working directory (the input files are located relative to it too).
output_dir = os.environ.get('PRODUCTION_OUTPUT_DIR') or os.getcwd()
os.makedirs(output_dir, exist_ok=True)
production_monthly.to_csv(os.path.join(output_dir, 'production_monthly.csv'))
production_daily.to_csv(os.path.join(output_dir, 'production_daily.csv'))

# Now that all the production data has been cleaned, it can be used in the time series analysis and model training