In [1]:
import pandas as pd
import datetime as dt

# Raise pandas' display limits so wide/long report frames are not elided.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 100)

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one;
# cells that list several bare expressions (e.g. the two volume sums) rely on this.
InteractiveShell.ast_node_interactivity = "all"

# NOTE(review): configs.ipynb is not visible here; presumably it defines
# raw_data_folder and processed_data_folder, which later cells read — confirm.
%run configs.ipynb

In [2]:
# parameter cell
# Name of the monthly report CSV to process; it selects the raw input file and
# is reused as the name of the processed output file.
# NOTE(review): presumably overridden per run by a parameter-injection tool
# (see the "# Parameters" cell that follows) — confirm.
file_name = 'oct_21.csv'

In [3]:
# Parameters
file_name = "august_21.csv"


# RRC Prod

## Ingest

In [4]:
# header=5: the column names are taken from row index 5 of the file and the
# lines above it are skipped.
raw_report_path = f'{raw_data_folder}/rrc_prod_blackbeard/2021/{file_name}'
production = pd.read_csv(raw_report_path, header=5)

In [5]:
# Spot-check the first rows to confirm that header=5 landed on the column-name row.
production.head()

Unnamed: 0,Multiple Reports,Lease Type,District,RRC Identifier,Field Name,Lease Name,Gas Well ID,Lease Total,Commingle Permit No.,On Hand Beginning of Month,Production,Volume,Code,On Hand End of Month,Formation Production,Volume.1,Code.1
0,,Oil,8.0,32907.0,ARMER (TUBB),"MCCAMEY, G. B. ""A"" (NCT-B)",,,7024.0,222,38,,,260,0,,
1,,Oil,8.0,20550.0,ARMER (6350),"MCCAMEY, G. B., -A- /NCT-A/",,,7024.0,39,0,,,39,0,,
2,,Oil,8.0,45314.0,ARMER (6350),RAYDEN MCCAMEY,,,,183,89,,,272,2064,2064.0,2.0
3,,Oil,8.0,48393.0,ARMER (6350),LANDLUBBER,,,8026.0,1882,8215,8165.0,0.0,1932,28894,28894.0,2.0
4,,Oil,8.0,49425.0,ARMER (6350),6 POUNDER NE,,,8026.0,485,1253,1344.0,0.0,394,6773,6773.0,2.0


## Transform RRC

### Filter to District

In [6]:
# Compare District as a string because its inferred dtype varies by month:
# depending on the rows pandas samples, the column may be read as float (8.0)
# or as text ('08' / '8'), so every spelling of district 8 is accepted.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning on a filtered slice.
in_district_8 = production['District'].astype(str).isin(['8.0', '08', '8'])
production = production.loc[in_district_8].copy()

In [7]:
# Inspect the district-filtered frame.
production

Unnamed: 0,Multiple Reports,Lease Type,District,RRC Identifier,Field Name,Lease Name,Gas Well ID,Lease Total,Commingle Permit No.,On Hand Beginning of Month,Production,Volume,Code,On Hand End of Month,Formation Production,Volume.1,Code.1
0,,Oil,8.0,32907.0,ARMER (TUBB),"MCCAMEY, G. B. ""A"" (NCT-B)",,,7024.0,222,38,,,260,0,,
1,,Oil,8.0,20550.0,ARMER (6350),"MCCAMEY, G. B., -A- /NCT-A/",,,7024.0,39,0,,,39,0,,
2,,Oil,8.0,45314.0,ARMER (6350),RAYDEN MCCAMEY,,,,183,89,,,272,2064,2064,2.0
3,,Oil,8.0,48393.0,ARMER (6350),LANDLUBBER,,,8026.0,1882,8215,8165,0.0,1932,28894,28894,2.0
4,,Oil,8.0,49425.0,ARMER (6350),6 POUNDER NE,,,8026.0,485,1253,1344,0.0,394,6773,6773,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,,Gas,8.0,147187.0,WADDELL (GRAYBURG),"WADDELL, W.N.",859,,8988.0,7,16,16,1.0,7,800,800,2.0
463,,Gas,8.0,190864.0,WADDELL (GRAYBURG),"WADDELL, W.N.",890,,8988.0,0,0,,,0,0,,
464,,Gas,8.0,190926.0,WADDELL (GRAYBURG),"WADDELL, W.N.",293,,3501.0,7,34,27,1.0,14,649,649,2.0
465,,Gas,8.0,205775.0,WADDELL (GRAYBURG),"WADDELL, W. N.",1200,,8092.0,7,34,27,1.0,14,891,891,2.0


In [8]:
# Review the inferred dtypes and non-null counts before the volume columns are
# cleaned and converted to integers.
production.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 464 entries, 0 to 466
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Multiple Reports            92 non-null     object 
 1   Lease Type                  409 non-null    object 
 2   District                    464 non-null    float64
 3   RRC Identifier              464 non-null    float64
 4   Field Name                  464 non-null    object 
 5   Lease Name                  464 non-null    object 
 6   Gas Well ID                 232 non-null    object 
 7   Lease Total                 14 non-null     object 
 8   Commingle Permit No.        274 non-null    float64
 9   On Hand Beginning of Month  414 non-null    object 
 10  Production                  464 non-null    object 
 11  Volume                      196 non-null    object 
 12  Code                        196 non-null    float64
 13  On Hand End of Month        464 non

### Clean Oil and Gas Volume Columns and Convert to Int

In [9]:
# Convert both volume columns to integers. Missing entries count as 0 and
# thousands separators are removed before parsing. Values are stringified first
# so the conversion works whether pandas inferred the column as text ('8,165'),
# float (8165.0) or int; the earlier int(str(x)) form raised ValueError on
# float-typed columns because int('8165.0') is not a valid literal.
# Unparseable text still raises ValueError (pd.to_numeric default); fractional
# values, if any, are truncated by the final cast.
for volume_col in ('Volume', 'Volume.1'):
    volume_text = (production[volume_col]
                   .fillna(0)
                   .astype(str)
                   .str.replace(',', '', regex=False)
                   .str.strip())
    production[volume_col] = pd.to_numeric(volume_text).astype('int64')

### Rename Volume Columns to rrc_oil_volume and rrc_gas_volume

In [10]:
# Give the two generic volume columns descriptive names in a single pass.
production.rename(
    columns={
        'Volume': 'rrc_oil_volume',
        'Volume.1': 'rrc_gas_volume',
    },
    inplace=True,
)

In [11]:
# Sanity-check totals for the cleaned oil and gas volume columns. Both values
# are shown because ast_node_interactivity is set to "all" in the first cell.
production['rrc_oil_volume'].sum()
production['rrc_gas_volume'].sum()

384725

1423678

### Normalize Field and Reservoir Names with Text Cleaning

In [12]:
# Field name = the text before the first '(' and before the first ',', trimmed.
# Rows with a missing 'Field Name' are dropped before mapping and therefore stay
# NaN once the result is aligned back onto the frame's index.
production["Normalized_Field_Name"] = (
    production['Field Name']
    .dropna()
    .map(lambda raw_name: str(raw_name).partition('(')[0].partition(',')[0].strip())
)

In [13]:
# Reservoir name = the text after the last '(', trimmed, with every ')' removed.
# A field name that contains no '(' is returned whole. Rows with a missing
# 'Field Name' are dropped before mapping and therefore stay NaN after alignment.
production["Normalized_Reservoir_Name"] = (
    production['Field Name']
    .dropna()
    .map(lambda raw_name: str(raw_name).rpartition('(')[2].strip().replace(')', ''))
)

### Get Date from Production File

In [14]:
def get_date_from_csv(filepath, col_number, row_number):
    """Return one cell of a CSV file as a whitespace-trimmed string.

    The file is parsed with pandas' default header handling, so its first line
    is treated as the header and ``row_number`` counts the rows below it.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    col_number : int
        Zero-based position of the column to read.
    row_number : int
        Zero-based position of the row below the header line. Negative values
        count from the end of the file.

    Returns
    -------
    str
        The cell's text with leading and trailing whitespace removed.

    Raises
    ------
    IndexError
        If ``row_number`` is outside the rows that were read.
    ValueError
        If the addressed cell is empty.
    """
    # Only the rows up to the requested one are needed. A negative position is
    # relative to the end of the file, so the whole file has to be read then.
    rows_needed = row_number + 1 if row_number >= 0 else None
    # dtype=str keeps the cell as text even when the column looks numeric, so
    # .strip() below is always valid.
    column = pd.read_csv(filepath, usecols=[col_number], dtype=str, nrows=rows_needed)
    cell = column.iloc[row_number, 0]
    if pd.isna(cell):
        raise ValueError(
            f'No value at row {row_number}, column {col_number} of {filepath}'
        )
    return cell.strip()

In [15]:
# Preview the reporting month stored at column 3, row 1 (below the header line)
# of the raw report.
get_date_from_csv(f'{raw_data_folder}/rrc_prod_blackbeard/2021/{file_name}', 3, 1)

'Aug 2021'

### Add Production Date to Production Data Frame

In [16]:
# Stamp every row with the reporting month read from the raw report (column 3,
# row 1 below the header line), keeping 'Date' as the first column.
# DataFrame.insert raises ValueError when the column already exists, so on a
# re-run of this cell the existing column is overwritten instead.
report_date = get_date_from_csv(f'{raw_data_folder}/rrc_prod_blackbeard/2021/{file_name}', 3, 1)
if 'Date' in production.columns:
    production['Date'] = report_date
else:
    production.insert(0, 'Date', report_date)

In [17]:
# Confirm that 'Date' is now the first column.
production

Unnamed: 0,Date,Multiple Reports,Lease Type,District,RRC Identifier,Field Name,Lease Name,Gas Well ID,Lease Total,Commingle Permit No.,On Hand Beginning of Month,Production,rrc_oil_volume,Code,On Hand End of Month,Formation Production,rrc_gas_volume,Code.1,Normalized_Field_Name,Normalized_Reservoir_Name
0,Aug 2021,,Oil,8.0,32907.0,ARMER (TUBB),"MCCAMEY, G. B. ""A"" (NCT-B)",,,7024.0,222,38,0,,260,0,0,,ARMER,TUBB
1,Aug 2021,,Oil,8.0,20550.0,ARMER (6350),"MCCAMEY, G. B., -A- /NCT-A/",,,7024.0,39,0,0,,39,0,0,,ARMER,6350
2,Aug 2021,,Oil,8.0,45314.0,ARMER (6350),RAYDEN MCCAMEY,,,,183,89,0,,272,2064,2064,2.0,ARMER,6350
3,Aug 2021,,Oil,8.0,48393.0,ARMER (6350),LANDLUBBER,,,8026.0,1882,8215,8165,0.0,1932,28894,28894,2.0,ARMER,6350
4,Aug 2021,,Oil,8.0,49425.0,ARMER (6350),6 POUNDER NE,,,8026.0,485,1253,1344,0.0,394,6773,6773,2.0,ARMER,6350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,Aug 2021,,Gas,8.0,147187.0,WADDELL (GRAYBURG),"WADDELL, W.N.",859,,8988.0,7,16,16,1.0,7,800,800,2.0,WADDELL,GRAYBURG
463,Aug 2021,,Gas,8.0,190864.0,WADDELL (GRAYBURG),"WADDELL, W.N.",890,,8988.0,0,0,0,,0,0,0,,WADDELL,GRAYBURG
464,Aug 2021,,Gas,8.0,190926.0,WADDELL (GRAYBURG),"WADDELL, W.N.",293,,3501.0,7,34,27,1.0,14,649,649,2.0,WADDELL,GRAYBURG
465,Aug 2021,,Gas,8.0,205775.0,WADDELL (GRAYBURG),"WADDELL, W. N.",1200,,8092.0,7,34,27,1.0,14,891,891,2.0,WADDELL,GRAYBURG


### Add columns for net production to Trust

In [18]:
# Net oil columns: the net volume is the RRC oil volume scaled by 0.50, and the
# net sales volume scales that result by a further 0.75.
# NOTE(review): presumably the Trust's share and a sales fraction — confirm the
# meaning of both factors.
oil_net_to_trust = production['rrc_oil_volume'].mul(0.50)
production['Net Oil Volume to Trust (RRC)'] = oil_net_to_trust
production['Net Oil Sales Volume to Trust (RRC)'] = oil_net_to_trust.mul(0.75)

In [19]:
# Net gas columns: the net volume is the RRC gas volume scaled by 0.50, and the
# net sales volume scales that result by a further 0.75.
# NOTE(review): presumably the Trust's share and a sales fraction — confirm the
# meaning of both factors.
gas_net_to_trust = production['rrc_gas_volume'].mul(0.50)
production['Net Gas Volume to Trust (RRC)'] = gas_net_to_trust
production['Net Gas Sales Volume to Trust (RRC)'] = gas_net_to_trust.mul(0.75)

In [20]:
# Review the four net-to-Trust columns appended at the end of the frame.
production

Unnamed: 0,Date,Multiple Reports,Lease Type,District,RRC Identifier,Field Name,Lease Name,Gas Well ID,Lease Total,Commingle Permit No.,On Hand Beginning of Month,Production,rrc_oil_volume,Code,On Hand End of Month,Formation Production,rrc_gas_volume,Code.1,Normalized_Field_Name,Normalized_Reservoir_Name,Net Oil Volume to Trust (RRC),Net Oil Sales Volume to Trust (RRC),Net Gas Volume to Trust (RRC),Net Gas Sales Volume to Trust (RRC)
0,Aug 2021,,Oil,8.0,32907.0,ARMER (TUBB),"MCCAMEY, G. B. ""A"" (NCT-B)",,,7024.0,222,38,0,,260,0,0,,ARMER,TUBB,0.0,0.000,0.0,0.000
1,Aug 2021,,Oil,8.0,20550.0,ARMER (6350),"MCCAMEY, G. B., -A- /NCT-A/",,,7024.0,39,0,0,,39,0,0,,ARMER,6350,0.0,0.000,0.0,0.000
2,Aug 2021,,Oil,8.0,45314.0,ARMER (6350),RAYDEN MCCAMEY,,,,183,89,0,,272,2064,2064,2.0,ARMER,6350,0.0,0.000,1032.0,774.000
3,Aug 2021,,Oil,8.0,48393.0,ARMER (6350),LANDLUBBER,,,8026.0,1882,8215,8165,0.0,1932,28894,28894,2.0,ARMER,6350,4082.5,3061.875,14447.0,10835.250
4,Aug 2021,,Oil,8.0,49425.0,ARMER (6350),6 POUNDER NE,,,8026.0,485,1253,1344,0.0,394,6773,6773,2.0,ARMER,6350,672.0,504.000,3386.5,2539.875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,Aug 2021,,Gas,8.0,147187.0,WADDELL (GRAYBURG),"WADDELL, W.N.",859,,8988.0,7,16,16,1.0,7,800,800,2.0,WADDELL,GRAYBURG,8.0,6.000,400.0,300.000
463,Aug 2021,,Gas,8.0,190864.0,WADDELL (GRAYBURG),"WADDELL, W.N.",890,,8988.0,0,0,0,,0,0,0,,WADDELL,GRAYBURG,0.0,0.000,0.0,0.000
464,Aug 2021,,Gas,8.0,190926.0,WADDELL (GRAYBURG),"WADDELL, W.N.",293,,3501.0,7,34,27,1.0,14,649,649,2.0,WADDELL,GRAYBURG,13.5,10.125,324.5,243.375
465,Aug 2021,,Gas,8.0,205775.0,WADDELL (GRAYBURG),"WADDELL, W. N.",1200,,8092.0,7,34,27,1.0,14,891,891,2.0,WADDELL,GRAYBURG,13.5,10.125,445.5,334.125


### Write to File

In [21]:
# Persist the transformed frame under the same file name as the raw input;
# the row index is not written.
processed_path = f'{processed_data_folder}/rrc_prod/{file_name}'
production.to_csv(processed_path, index=False)

## Aggregated EDA

In [22]:
# Top 15 normalized fields ranked by total RRC oil volume.
oil_by_field = production.groupby(['Normalized_Field_Name'])[['rrc_oil_volume']].sum()
oil_by_field.sort_values(by='rrc_oil_volume', ascending=False).head(15)

Unnamed: 0_level_0,rrc_oil_volume
Normalized_Field_Name,Unnamed: 1_level_1
SAND HILLS,220704
MONAHANS,76707
ARMER,35633
DUNE,18193
UNIVERSITY WADDELL,9698
MARSTON RANCH,8876
WADDELL,5737
RUNNING W,5005
CORDONA LAKE,2800
LEA,1023


In [23]:
# Total RRC oil volume per (reservoir, field) pair, listed in index order.
# The earlier sort_values(...) step was removed: the sort_index() that followed
# it re-ordered the rows by their unique group keys, so sorting by volume first
# never affected the displayed result.
(production
 .groupby(['Normalized_Reservoir_Name', 'Normalized_Field_Name'])
 .agg({'rrc_oil_volume': 'sum'})
 .sort_index())

Unnamed: 0_level_0,Unnamed: 1_level_0,rrc_oil_volume
Normalized_Reservoir_Name,Normalized_Field_Name,Unnamed: 2_level_1
6350,ARMER,35633
7900,EDWARDS -04-,0
CLEAR FORK,LEA,299
CLEAR FORK,MONAHANS,0
CLEAR FORK,SAND HILLS,38060
CLEAR FORK 4070,SAND HILLS,301
"CLEAR FORK, LOWER",MCKEE,0
"CLEAR FORK, MID.",MCKEE,0
CLEARFORK,MARSTON RANCH,8876
CLEARFORK,MONAHANS,69081
