In [1]:
import pandas as pd
import datetime as dt

# Widen pandas' display limits so wide production frames are shown in full
# (up to 1000 columns) while long frames are truncated after 100 rows.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 100)

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Run the shared configuration notebook.
# NOTE(review): presumably this is where raw_data_folder and processed_data_folder
# come from, since later cells use them without defining them here — confirm.
%run configs.ipynb

In [2]:
# parameter cell
# Default name of the monthly CSV to process. The next cell reassigns file_name,
# so this value only takes effect when that override is absent.
file_name = 'oct_21.csv'

In [3]:
# Parameters
# Overrides the default file_name assigned in the previous cell for this run.
# NOTE(review): this looks like a parameters cell injected by a notebook runner
# (e.g. papermill) — confirm before editing it by hand.
file_name = "nov_21.csv"


# RRC Prod

## Ingest

In [4]:
# Load the monthly production report. header=5 takes the column names from the
# line at zero-based index 5 and skips the lines above it.
production = pd.read_csv(
    f'{raw_data_folder}/rrc_prod_blackbeard/2021/{file_name}',
    header=5,
)

In [5]:
# Preview the first five rows of the freshly loaded frame.
production.head()

Unnamed: 0,Multiple Reports,Lease Type,District,RRC Identifier,Field Name,Lease Name,Gas Well ID,Lease Total,Commingle Permit No.,On Hand Beginning of Month,Production,Volume,Code,On Hand End of Month,Formation Production,Volume.1,Code.1
0,,Oil,8.0,32907.0,ARMER (TUBB),"MCCAMEY, G. B. ""A"" (NCT-B)",,,7024.0,166,38,,,204,0,,
1,,Oil,8.0,20550.0,ARMER (6350),"MCCAMEY, G. B., -A- /NCT-A/",,,7024.0,18,0,,,18,0,,
2,,Oil,8.0,45314.0,ARMER (6350),RAYDEN MCCAMEY,,,,165,85,,,250,447,447.0,2.0
3,,Oil,8.0,48393.0,ARMER (6350),LANDLUBBER,,,8026.0,1452,6098,6499.0,0.0,1051,18593,18593.0,2.0
4,,Oil,8.0,49425.0,ARMER (6350),6 POUNDER NE,,,8026.0,193,866,869.0,0.0,190,4149,4149.0,2.0


## Transform RRC

### Filter to District

In [6]:
# Compare District as a string because its inferred dtype varies by month:
# depending on the rows pandas samples, the column may be read as float (8.0)
# or as text ('08' / '8'), so all three spellings are accepted.
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments in later cells do not trigger SettingWithCopyWarning.
production = production.loc[
    production['District'].astype(str).isin(['8.0', '08', '8'])
].copy()

In [7]:
# Display the frame after the district filter (long frames are truncated by display.max_rows).
production

Unnamed: 0,Multiple Reports,Lease Type,District,RRC Identifier,Field Name,Lease Name,Gas Well ID,Lease Total,Commingle Permit No.,On Hand Beginning of Month,Production,Volume,Code,On Hand End of Month,Formation Production,Volume.1,Code.1
0,,Oil,8.0,32907.0,ARMER (TUBB),"MCCAMEY, G. B. ""A"" (NCT-B)",,,7024.0,166,38,,,204,0,,
1,,Oil,8.0,20550.0,ARMER (6350),"MCCAMEY, G. B., -A- /NCT-A/",,,7024.0,18,0,,,18,0,,
2,,Oil,8.0,45314.0,ARMER (6350),RAYDEN MCCAMEY,,,,165,85,,,250,447,447,2.0
3,,Oil,8.0,48393.0,ARMER (6350),LANDLUBBER,,,8026.0,1452,6098,6499,0.0,1051,18593,18593,2.0
4,,Oil,8.0,49425.0,ARMER (6350),6 POUNDER NE,,,8026.0,193,866,869,0.0,190,4149,4149,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459,Y,Gas,8.0,147187.0,WADDELL (GRAYBURG),"WADDELL, W.N.",859,,8988.0,2,28,28,1.0,2,554,554,2.0
460,,Gas,8.0,190864.0,WADDELL (GRAYBURG),"WADDELL, W.N.",890,,8988.0,0,0,,,0,0,,
461,Y,Gas,8.0,190926.0,WADDELL (GRAYBURG),"WADDELL, W.N.",293,,3501.0,2,28,28,1.0,2,362,362,2.0
462,Y,Gas,8.0,205775.0,WADDELL (GRAYBURG),"WADDELL, W. N.",1200,,8092.0,2,28,28,1.0,2,532,532,2.0


In [8]:
# Summarize the column dtypes and non-null counts of the filtered frame.
production.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 460 entries, 0 to 463
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Multiple Reports            233 non-null    object 
 1   Lease Type                  406 non-null    object 
 2   District                    460 non-null    float64
 3   RRC Identifier              460 non-null    float64
 4   Field Name                  460 non-null    object 
 5   Lease Name                  460 non-null    object 
 6   Gas Well ID                 232 non-null    object 
 7   Lease Total                 13 non-null     object 
 8   Commingle Permit No.        270 non-null    float64
 9   On Hand Beginning of Month  407 non-null    object 
 10  Production                  460 non-null    object 
 11  Volume                      183 non-null    object 
 12  Code                        183 non-null    float64
 13  On Hand End of Month        460 non

### Clean Oil and Gas Volume Columns and Convert to Int

In [9]:
# Convert both volume columns to integers.
# - Missing values are treated as 0.
# - Values may be text with thousands separators (e.g. '1,234'); the commas
#   are removed before parsing.
# - When a month's column is inferred as float, values such as 6499.0 are
#   parsed numerically instead of failing in int('6499.0').
# assign() builds a new frame rather than writing into a filtered slice.
production = production.assign(**{
    volume_column: pd.to_numeric(
        production[volume_column]
        .fillna(0)
        .astype(str)
        .str.replace(',', '', regex=False)
    ).astype(int)
    for volume_column in ('Volume', 'Volume.1')
})

### Rename Volume Columns to rrc_oil_volume and rrc_gas_volume

In [10]:
# Give the two volume columns descriptive names with a single rename that
# returns a new frame instead of mutating the existing one in place.
production = production.rename(
    columns={'Volume': 'rrc_oil_volume', 'Volume.1': 'rrc_gas_volume'}
)

In [11]:
# Total oil and gas volumes as a quick sanity check. Both values are displayed
# because ast_node_interactivity is set to "all" in the first cell.
production['rrc_oil_volume'].sum()
production['rrc_gas_volume'].sum()

362868

1415945

### Normalize Field and Reservoir Names with Text Cleaning

In [12]:
# Field part of 'Field Name': the text before the first '(' and before the
# first ',', with surrounding whitespace removed. Null field names are dropped
# before mapping, so they come back as NaN when the result is aligned on assignment.
production["Normalized_Field_Name"] = (
    production['Field Name']
    .dropna()
    .map(lambda raw_name: str(raw_name).partition('(')[0].partition(',')[0].strip())
)

In [13]:
# Reservoir part of 'Field Name': the text after the last '(', trimmed, with
# every ')' removed. A name without '(' is kept whole. Null field names are
# dropped before mapping, so they come back as NaN after assignment.
production["Normalized_Reservoir_Name"] = (
    production['Field Name']
    .dropna()
    .map(lambda raw_name: str(raw_name).rpartition('(')[2].strip().replace(')', ''))
)

### Get Date from Production File

In [14]:
def get_date_from_csv(filepath, col_number, row_number):
    """Return the text of a single CSV cell with surrounding whitespace removed.

    The file is read with pandas' default header handling, so the first line
    of the file is consumed as the header and ``row_number`` counts the rows
    that follow it.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file.
    col_number : int
        Zero-based position of the column to read.
    row_number : int
        Zero-based position of the row within that column. Negative values
        count from the end of the file.

    Returns
    -------
    str
        The cell content, stripped.

    Raises
    ------
    IndexError
        If ``row_number`` lies outside the rows available in the file.
    ValueError
        If the addressed cell is empty.
    """
    # dtype=str keeps the cell as text even when the column looks numeric,
    # so .strip() is always valid.
    read_kwargs = {'usecols': [col_number], 'dtype': str}
    if row_number >= 0:
        # Only the rows up to the requested one are needed; skip parsing the rest.
        read_kwargs['nrows'] = row_number + 1

    column = pd.read_csv(filepath, **read_kwargs)
    value = column.iloc[row_number, 0]
    if pd.isna(value):
        raise ValueError(
            f'No value at column {col_number}, row {row_number} of {filepath}'
        )
    return str(value).strip()

In [15]:
# Sanity check: show the value read from column index 3, row index 1 of the raw file.
get_date_from_csv(f'{raw_data_folder}/rrc_prod_blackbeard/2021/{file_name}', 3, 1)

'Nov 2021'

### Add Production Date to Production Data Frame

In [16]:
# Prepend the report date as the first column. The guard keeps the cell
# re-runnable: DataFrame.insert raises ValueError when the column already exists.
if 'Date' not in production.columns:
    production.insert(
        0,
        'Date',
        get_date_from_csv(f'{raw_data_folder}/rrc_prod_blackbeard/2021/{file_name}', 3, 1),
    )

In [17]:
# Display the frame, now with Date as its leading column.
production

Unnamed: 0,Date,Multiple Reports,Lease Type,District,RRC Identifier,Field Name,Lease Name,Gas Well ID,Lease Total,Commingle Permit No.,On Hand Beginning of Month,Production,rrc_oil_volume,Code,On Hand End of Month,Formation Production,rrc_gas_volume,Code.1,Normalized_Field_Name,Normalized_Reservoir_Name
0,Nov 2021,,Oil,8.0,32907.0,ARMER (TUBB),"MCCAMEY, G. B. ""A"" (NCT-B)",,,7024.0,166,38,0,,204,0,0,,ARMER,TUBB
1,Nov 2021,,Oil,8.0,20550.0,ARMER (6350),"MCCAMEY, G. B., -A- /NCT-A/",,,7024.0,18,0,0,,18,0,0,,ARMER,6350
2,Nov 2021,,Oil,8.0,45314.0,ARMER (6350),RAYDEN MCCAMEY,,,,165,85,0,,250,447,447,2.0,ARMER,6350
3,Nov 2021,,Oil,8.0,48393.0,ARMER (6350),LANDLUBBER,,,8026.0,1452,6098,6499,0.0,1051,18593,18593,2.0,ARMER,6350
4,Nov 2021,,Oil,8.0,49425.0,ARMER (6350),6 POUNDER NE,,,8026.0,193,866,869,0.0,190,4149,4149,2.0,ARMER,6350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459,Nov 2021,Y,Gas,8.0,147187.0,WADDELL (GRAYBURG),"WADDELL, W.N.",859,,8988.0,2,28,28,1.0,2,554,554,2.0,WADDELL,GRAYBURG
460,Nov 2021,,Gas,8.0,190864.0,WADDELL (GRAYBURG),"WADDELL, W.N.",890,,8988.0,0,0,0,,0,0,0,,WADDELL,GRAYBURG
461,Nov 2021,Y,Gas,8.0,190926.0,WADDELL (GRAYBURG),"WADDELL, W.N.",293,,3501.0,2,28,28,1.0,2,362,362,2.0,WADDELL,GRAYBURG
462,Nov 2021,Y,Gas,8.0,205775.0,WADDELL (GRAYBURG),"WADDELL, W. N.",1200,,8092.0,2,28,28,1.0,2,532,532,2.0,WADDELL,GRAYBURG


### Add columns for net production to Trust

In [18]:
# Net oil volume is 50% of the RRC oil volume; net sales volume is 75% of that.
# NOTE(review): the 0.50 and 0.75 factors are not explained in this notebook —
# presumably the Trust's net share and a sales fraction; confirm and name them.
production['Net Oil Volume to Trust (RRC)'] = production['rrc_oil_volume'].mul(0.50)
production['Net Oil Sales Volume to Trust (RRC)'] = (
    production['Net Oil Volume to Trust (RRC)'].mul(0.75)
)

In [19]:
# Net gas volume is 50% of the RRC gas volume; net sales volume is 75% of that.
# NOTE(review): the 0.50 and 0.75 factors are not explained in this notebook —
# presumably the Trust's net share and a sales fraction; confirm and name them.
production['Net Gas Volume to Trust (RRC)'] = production['rrc_gas_volume'].mul(0.50)
production['Net Gas Sales Volume to Trust (RRC)'] = (
    production['Net Gas Volume to Trust (RRC)'].mul(0.75)
)

In [20]:
# Display the frame including the four net-to-Trust columns.
production

Unnamed: 0,Date,Multiple Reports,Lease Type,District,RRC Identifier,Field Name,Lease Name,Gas Well ID,Lease Total,Commingle Permit No.,On Hand Beginning of Month,Production,rrc_oil_volume,Code,On Hand End of Month,Formation Production,rrc_gas_volume,Code.1,Normalized_Field_Name,Normalized_Reservoir_Name,Net Oil Volume to Trust (RRC),Net Oil Sales Volume to Trust (RRC),Net Gas Volume to Trust (RRC),Net Gas Sales Volume to Trust (RRC)
0,Nov 2021,,Oil,8.0,32907.0,ARMER (TUBB),"MCCAMEY, G. B. ""A"" (NCT-B)",,,7024.0,166,38,0,,204,0,0,,ARMER,TUBB,0.0,0.000,0.0,0.000
1,Nov 2021,,Oil,8.0,20550.0,ARMER (6350),"MCCAMEY, G. B., -A- /NCT-A/",,,7024.0,18,0,0,,18,0,0,,ARMER,6350,0.0,0.000,0.0,0.000
2,Nov 2021,,Oil,8.0,45314.0,ARMER (6350),RAYDEN MCCAMEY,,,,165,85,0,,250,447,447,2.0,ARMER,6350,0.0,0.000,223.5,167.625
3,Nov 2021,,Oil,8.0,48393.0,ARMER (6350),LANDLUBBER,,,8026.0,1452,6098,6499,0.0,1051,18593,18593,2.0,ARMER,6350,3249.5,2437.125,9296.5,6972.375
4,Nov 2021,,Oil,8.0,49425.0,ARMER (6350),6 POUNDER NE,,,8026.0,193,866,869,0.0,190,4149,4149,2.0,ARMER,6350,434.5,325.875,2074.5,1555.875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459,Nov 2021,Y,Gas,8.0,147187.0,WADDELL (GRAYBURG),"WADDELL, W.N.",859,,8988.0,2,28,28,1.0,2,554,554,2.0,WADDELL,GRAYBURG,14.0,10.500,277.0,207.750
460,Nov 2021,,Gas,8.0,190864.0,WADDELL (GRAYBURG),"WADDELL, W.N.",890,,8988.0,0,0,0,,0,0,0,,WADDELL,GRAYBURG,0.0,0.000,0.0,0.000
461,Nov 2021,Y,Gas,8.0,190926.0,WADDELL (GRAYBURG),"WADDELL, W.N.",293,,3501.0,2,28,28,1.0,2,362,362,2.0,WADDELL,GRAYBURG,14.0,10.500,181.0,135.750
462,Nov 2021,Y,Gas,8.0,205775.0,WADDELL (GRAYBURG),"WADDELL, W. N.",1200,,8092.0,2,28,28,1.0,2,532,532,2.0,WADDELL,GRAYBURG,14.0,10.500,266.0,199.500


### Write to File

In [21]:
# Write the transformed frame under the processed-data folder, reusing the
# input file name; the row index is not written.
production.to_csv(
    f'{processed_data_folder}/rrc_prod/{file_name}',
    index=False,
)

## Aggregated EDA

In [22]:
# Total oil volume per normalized field, largest first, limited to 15 fields.
(
    production
    .groupby('Normalized_Field_Name')[['rrc_oil_volume']]
    .sum()
    .sort_values(by='rrc_oil_volume', ascending=False)
    .head(15)
)

Unnamed: 0_level_0,rrc_oil_volume
Normalized_Field_Name,Unnamed: 1_level_1
SAND HILLS,230210
MONAHANS,62167
ARMER,29074
DUNE,16874
MARSTON RANCH,8422
RUNNING W,5604
WADDELL,4307
UNIVERSITY WADDELL,2511
CORDONA LAKE,1907
LEA,1454


In [23]:
# Total oil volume per (reservoir, field) pair, ordered by the index keys.
# No sort by volume is applied here: any such ordering would be replaced by
# sort_index(), and the group keys are unique, so the result is fully
# determined by the index order.
(
    production
    .groupby(['Normalized_Reservoir_Name', 'Normalized_Field_Name'])
    .agg({'rrc_oil_volume': 'sum'})
    .sort_index()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,rrc_oil_volume
Normalized_Reservoir_Name,Normalized_Field_Name,Unnamed: 2_level_1
6350,ARMER,29074
7900,EDWARDS -04-,0
CLEAR FORK,LEA,342
CLEAR FORK,MONAHANS,0
CLEAR FORK,SAND HILLS,32761
CLEAR FORK 4070,SAND HILLS,670
"CLEAR FORK, LOWER",MCKEE,0
"CLEAR FORK, MID.",MCKEE,0
CLEARFORK,MARSTON RANCH,8422
CLEARFORK,MONAHANS,56623
