-
Notifications
You must be signed in to change notification settings - Fork 18
/
WorkingFileFor_dbf900.py
410 lines (337 loc) · 21.5 KB
/
WorkingFileFor_dbf900.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 4 14:27:23 2020
@author: MBelobraydic
"""
import argparse
import os
import sys
import pandas as pd
import json
from ebcdic_main import yield_blocks, parse_record
from layouts_wells_dbf900 import dbf900_layout
from ebcdic_formats import pic_yyyymmdd, pic_numeric, pic_any
def main():
file_path, out_dir = parse_args()
block_size = 247 ##block size for each record in the file
##Unknown if this holds true for all versions of this file or for other files on TXRRC
print('opening',file_path,'...')
file = open(file_path, 'rb') ##Opens the .ebc file and reads it as bytes
##Use limiting counter for testing formatting
Limiting_Counter = True
"""
##Section for testing the outputs
"""
API = None ##this needs to be inplace incase the random part of the array selected does start on an 01 record
ct = 0 ##counter for number of records
wellct = 0 ##counter for number of wells
check_stop = 100 ##number of loop runs to complete before stopping
"""
DataFrame and JSON items for each sections
https://www.rrc.texas.gov/media/41906/wba091_well-bore-database.pdf
"""
wbroot_df = pd.DataFrame() ##01 Not Recurring dataframe
wbcompl_df = pd.DataFrame() ##02 Recurring dataframe most likely linked to 1 with error in .pdf manual
wbdate_df = pd.DataFrame() ##03 Recurring dataframe linked to 02
wbrmks_df = pd.DataFrame() ##04 Recurring JSON best linked to 03
wbtube_df = pd.DataFrame() ##05 Recurring linked to 03
wbcase_df = pd.DataFrame() ##06 Recurring linked to 03
wbperf_df = pd.DataFrame() ##07 Recurring linked to 03
wbline_df = pd.DataFrame() ##08 Recurring linked to 03
wbform_df = pd.DataFrame() ##09 Recurring linked to 03
wbsqeze_df = pd.DataFrame() ##10 Recurring linked to 03
wbfresh_df = pd.DataFrame() ##11 Recurring linked to 03
wboldloc_df = pd.DataFrame() ##12 Not Recurring linked to 01
wbnewloc_df = pd.DataFrame() ##13 Not Recurring linked to 01
wbplug_df = pd.DataFrame() ##14 Recurring linked to 01
wbplrmks_df = pd.DataFrame() ##15 Recurring JSON best linked to 14
wbplrec_df = pd.DataFrame() ##16 Not Recurring linked to 14
wbplcase_df = pd.DataFrame() ##17 Recurring linked to 14
wbplperf_df = pd.DataFrame() ##18 Recurring linked to 14
wbplname_df = pd.DataFrame() ##19 Not Recurring JSON? linked to 14
wbdrill_df = pd.DataFrame() ##20 Recurring linked to 01
wbwellid_df = pd.DataFrame() ##21 Recurring linked to 20
wb14b2_df = pd.DataFrame() ##22 Recurring linked to 01
wbh15_df = pd.DataFrame() ##23 Recurring linked to 01
## now linked to 23 wbh15rmk_df = pd.DataFrame() ##24 Recurring JSON best linked to 23
wbsb126_df = pd.DataFrame() ##25 Recurring linked to 01
wbdastat_df = pd.DataFrame() ##26 Recurring linked to 01
wbw3c_df = pd.DataFrame() ##27 Recurring linked to 01
## now linked to 22 wb14b2rm_df = pd.DataFrame() ##28 Recurring JSON best linked to 22
"""Loop section for all records or partial set"""
for block in yield_blocks(file, block_size): ##for each block in file
##For testing script
if Limiting_Counter == True and wellct > check_stop: ##Stops the loop once a set number of wells has been complete
break
startval = pic_any(block[0:2]) ## first two characters of a block
"""
Holding unique key values for the current record structure.
Reset when new record "01" is found.
"""
if startval == '01': ##captures the API number for databasing
API = API = '42'+ pic_any(block[2:10]) ##api value in records '01'
wellct+=1
##Might be helpful to find the next occurance of a record staring with "01"
##This way all the associated items are added to the right sections with a check of completion
wbcplkey = None ## defined in '02' cleared upon new API
fluid_code_02 = None ## defined in '02' cleared upon new API
wbfilekey = None ## defined in '03' cleared upon new API
wbfiledt = None ## defined in '14' cleared upon new API
wbplugkey = None ## defined in '14' cleared upon new API
fluid_code_14 = None ## defined in '14' cleared upon new API
wbpmtnum = None ## defined in '20' cleared upon new API
wb14b2ky = None ## defined in '22' cleared upon new API
fluid_code_22 = None ## defined in '22' cleared upon new API
h15_key = None ## defined in '23' cleared upon new API
if startval == '02': ##captures wbcplkey unique key for databasing
wbcplkey = pic_any(block[2:15]) ##key for 03 ->04 05 06 07 08 09 10 11 ##similar to wbplugkey and wbplugkey
fluid_code_02 = pic_any(block[2:3])
if startval == '03': ##captures wbfilekey unique key for databasing
wbfilekey = pic_numeric(block[2:10]) ##key for 04 05 06 07 08 09 10 11
if startval == '14': ##captures wbfiledt unique key for databasing
wbfiledt = pic_yyyymmdd(block[2:10]) ##key for 15 16 17 18 19
wbplugkey = pic_any(block[177:190]).strip() ##similar to wbcplkey and wb14b2ky
fluid_code_14 = pic_any(block[177:178]) ##Unknown if this needs to be captured since it is simlar to fluid_code in 02
if startval == '20': ##captures wbpmtnum unique key for databasing
wbpmtnum = pic_any(block[2:8]) ##key for 21
if startval == '22': ##captures wb14b2ky unique key for databasing
wb14b2ky = pic_any(block[2:15]).strip() ##key for 28 ##similar to wbcplkey and wbplugkey
fluid_code_22 = pic_any(block[2:3]) ##Unknown if this needs to be captured since it is simlar to fluid_code in 02
if startval == '23':
h15_key = pic_numeric(block[2:10]) ##key for 24 WB-H15-DATE-KEY Derived by subtracting the mailing date for the H-15 Listing from 999999999. This provides us with a date key
"""
Selecting layout based on leading startval
and parsing record based on the selected layout
"""
layout = dbf900_layout(startval)['layout'] ##identifies layout based on record start values
parsed_vals = parse_record(block, layout) ##formats the record and returns a formated {dict}
temp_df = pd.DataFrame([parsed_vals], columns=parsed_vals.keys()) ##convert {dict} to dataframe
temp_df['api10'] = API ##adds API number to record (might need to move this to first position)
"""Dataframe loading and correcting (as necessary)"""
if startval =='01': ##
wbroot_df = wbroot_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='02':
##add unique keys
temp_df['wbcplkey'] = wbcplkey ##adds wbcplkey to record for link to 03 04 05 06 07 08 09 10 11
## break up 'WB-OIL-GAS-INFO' based on fluid_code_02 'o' vs 'g'
wbcompl_df = wbcompl_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='03':
##add unique keys
temp_df['wbcplkey'] = wbcplkey ##adds wbcplkey to record for link to 03 04 05 06 07 08 09 10 11
temp_df['wbfilekey'] = wbfilekey ##adds wbfilekey to record for link to 04 05 06 07 08 09 10 11
wbdate_df = wbdate_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='04': ###temporary
"""Should be appended to 03 by JSON"""
##add unique keys
temp_df['wbcplkey'] = wbcplkey ##adds wbcplkey to record for link to 03 04 05 06 07 08 09 10 11
temp_df['wbfilekey'] = wbfilekey ##adds wbfilekey to record for link to 04 05 06 07 08 09 10 11
wbrmks_df = wbrmks_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='05':
##add unique keys
temp_df['wbcplkey'] = wbcplkey ##adds wbcplkey to record for link to 03 04 05 06 07 08 09 10 11
temp_df['wbfilekey'] = wbfilekey ##adds wbfilekey to record for link to 04 05 06 07 08 09 10 11
wbtube_df = wbtube_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='06':
##add unique keys
temp_df['wbcplkey'] = wbcplkey ##adds wbcplkey to record for link to 03 04 05 06 07 08 09 10 11
temp_df['wbfilekey'] = wbfilekey ##adds wbfilekey to record for link to 04 05 06 07 08 09 10 11
wbcase_df = wbcase_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='07':
##add unique keys
temp_df['wbcplkey'] = wbcplkey ##adds wbcplkey to record for link to 03 04 05 06 07 08 09 10 11
temp_df['wbfilekey'] = wbfilekey ##adds wbfilekey to record for link to 04 05 06 07 08 09 10 11
wbperf_df = wbperf_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='08':
##add unique keys
temp_df['wbcplkey'] = wbcplkey ##adds wbcplkey to record for link to 03 04 05 06 07 08 09 10 11
temp_df['wbfilekey'] = wbfilekey ##adds wbfilekey to record for link to 04 05 06 07 08 09 10 11
wbline_df = wbline_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='09':
##add unique keys
temp_df['wbcplkey'] = wbcplkey ##adds wbcplkey to record for link to 03 04 05 06 07 08 09 10 11
temp_df['wbfilekey'] = wbfilekey ##adds wbfilekey to record for link to 04 05 06 07 08 09 10 11
wbform_df = wbform_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='10':
##add unique keys
temp_df['wbcplkey'] = wbcplkey ##adds wbcplkey to record for link to 03 04 05 06 07 08 09 10 11
temp_df['wbfilekey'] = wbfilekey ##adds wbfilekey to record for link to 04 05 06 07 08 09 10 11
wbsqeze_df = wbsqeze_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='11':
##add unique keys
temp_df['wbcplkey'] = wbcplkey ##adds wbcplkey to record for link to 03 04 05 06 07 08 09 10 11
temp_df['wbfilekey'] = wbfilekey ##adds wbfilekey to record for link to 04 05 06 07 08 09 10 11
wbfresh_df = wbfresh_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='12':
wboldloc_df = wboldloc_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='13':
wbnewloc_df = wbnewloc_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='14':
##add unique keys
temp_df['wbfiledt'] = wbfiledt ##adds wbfiledt to record for 15 16 17 18 19
temp_df['wbplugkey'] = wbplugkey ##adds wbplugkey to record for 15 16 17 18 19
## break up 'WB-OIL-GAS-INFO' based on fluid_code_14 'o' vs 'g'
wbplug_df = wbplug_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='15':
##add unique keys
temp_df['wbfiledt'] = wbfiledt ##adds wbfiledt to record for 15 16 17 18 19
temp_df['wbplugkey'] = wbplugkey ##adds wbplugkey to record for 15 16 17 18 19
wbplrmks_df = wbplrmks_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='16':
##add unique keys
temp_df['wbfiledt'] = wbfiledt ##adds wbfiledt to record for 15 16 17 18 19
temp_df['wbplugkey'] = wbplugkey ##adds wbplugkey to record for 15 16 17 18 19
wbplrec_df = wbplrec_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='17':
##add unique keys
temp_df['wbfiledt'] = wbfiledt ##adds wbfiledt to record for 15 16 17 18 19
temp_df['wbplugkey'] = wbplugkey ##adds wbplugkey to record for 15 16 17 18 19
wbplcase_df = wbplcase_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='18':
##add unique keys
temp_df['wbfiledt'] = wbfiledt ##adds wbfiledt to record for 15 16 17 18 19
temp_df['wbplugkey'] = wbplugkey ##adds wbplugkey to record for 15 16 17 18 19
wbplperf_df = wbplperf_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='19':
##add unique keys
temp_df['wbfiledt'] = wbfiledt ##adds wbfiledt to record for 15 16 17 18 19
temp_df['wbplugkey'] = wbplugkey ##adds wbplugkey to record for 15 16 17 18 19
wbplname_df = wbplname_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='20':
##add unique keys
temp_df['wbpmtnum'] = wbpmtnum ##adds unique wbpmtnum key for record 21
wbdrill_df = wbdrill_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='21':
##add unique keys
temp_df['wbpmtnum'] = wbpmtnum ##adds unique wbpmtnum key from record 20
wbwellid_df = wbwellid_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='22':
##add unique keys
temp_df['wb14b2ky'] = wb14b2ky ##adds wb14b2ky unique key to record
temp_df['wb14b2rm'] = None ## To be used for any following 28 sections
## break up 'WB-OIL-GAS-INFO' based on fluid_code_22 'o' vs 'g'
wb14b2_df = wb14b2_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='23':
##add unique keys
temp_df['h15_key'] = h15_key ##adds h15_key to record
temp_df['h15_remark'] = None ## To be used for any following 24 sections
wbh15_df = wbh15_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='24':###Loads any section 24 as JSON in associated 23
##add unique keys
temp_df['h15_key'] = h15_key ##adds the h15_key from previous 23 record
"""grab existing 24 JSON 'h15_remark' field from current section 23 ##None if first"""
wbh15rmk_json_24 = wbh15_df.loc[(wbh15_df['api10'] == API) & (wbh15_df['h15_key'] == h15_key), ['h15_remark']].values[0][0]
""" Adds JSON record of 24 to any previous values """
if wbh15rmk_json_24: ## verifies if the value is not null
wbh15rmk_json_24 = json.dumps(json.loads(wbh15rmk_json_24) + temp_df.to_json(orient="records"))
else: ## if null (for first 24 record for a given 23 record)
wbh15rmk_json_24 = json.dumps(temp_df.to_json(orient="records"))
"""writes record back to the correct position in 23 wbh15_df """
wbh15_df.loc[(wbh15_df['api10'] == API) & (wbh15_df['h15_key'] == h15_key), ['h15_remark']] = wbh15rmk_json_24
## Previous version had section 24 output as its own dataframe <to be deleted if all tests ok in new JSON version>
##wbh15rmk_df = wbh15rmk_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='25':
wbsb126_df = wbsb126_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='26':
wbdastat_df = wbdastat_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='27':
wbw3c_df = wbw3c_df.append(temp_df, ignore_index=True) ##appends results to dataframe
elif startval =='28':### Loads section 28 as JSON into associated 22
##add unique keys
temp_df['wb14b2ky'] = wb14b2ky ##unique key from 22
""" grabs exising 28 JSON field in 22 wb14b2_df ## None if first """
wb14b2rm_json_28 = wb14b2_df.loc[(wb14b2_df['api10'] == API) & (wb14b2_df['wb14b2ky'] == wb14b2ky), ['wb14b2rm']].values[0][0]
""" Adds JSON record of 28 to any previous values """
if wb14b2rm_json_28: ## verifies if the value is not null
wb14b2rm_json_28 = json.dumps(json.loads(wb14b2rm_json_28) + temp_df.to_json(orient="records"))
else: ## if null (for first 28 record for a given 22 record)
wb14b2rm_json_28 = json.dumps(temp_df.to_json(orient="records"))
"""writes record back to the correct position in 22 wb14b2_df """
wb14b2_df.loc[(wb14b2_df['api10'] == API) & (wb14b2_df['wb14b2ky'] == wb14b2ky), ['wb14b2rm']] = wb14b2rm_json_28
## previous version 28 to seperate dataframe <delete when above works correctly>
## wb14b2rm_df = wb14b2rm_df.append(temp_df, ignore_index=True) ##appends results to dataframe
ct+=1 ## count for number of records being reviewed by script
"""printable counter and percent to keep track in console"""
##the counter isn't necessary, but it helps to determine if it is still running.
use_counter = True
if use_counter:
sys.stdout.write("\r record:{0} well#:{1}".format(ct,wellct))
sys.stdout.flush()
print('\n Writing results to disk...')
##Currently writing to CSV
## Could be changed to XLS or written to SQL
## Need to determine how all the different sections link prior to decision
##for local storage
base_path = out_dir + os.sep + r'dbf900'
path_ext = r'.csv'
wbroot_df.to_csv(base_path+'01_wbroot'+path_ext, index=False) ##01
wbcompl_df.to_csv(base_path+'02_wbcompl'+path_ext, index=False) ##02
wbdate_df.to_csv(base_path+'03_wbdate'+path_ext, index=False) ##03
##04 is temporary here and will need to be handeled differently
wbrmks_df.to_csv(base_path+'04_wbrmks'+path_ext, index=False) ##04
wbtube_df.to_csv(base_path+'05_wbtube'+path_ext, index=False) ##05
wbcase_df.to_csv(base_path+'06_wbcase'+path_ext, index=False) ##06
wbperf_df.to_csv(base_path+'07_wbperf'+path_ext, index=False) ##07
wbline_df.to_csv(base_path+'08_wbline'+path_ext, index=False) ##08
wbform_df.to_csv(base_path+'09_wbform'+path_ext, index=False) ##09
wbsqeze_df.to_csv(base_path+'10_wbsqeze'+path_ext, index=False) ##10
wbfresh_df.to_csv(base_path+'11_wbfresh'+path_ext, index=False) ##11
wboldloc_df.to_csv(base_path+'12_wboldloc'+path_ext, index=False) ##12
wbnewloc_df.to_csv(base_path+'13_wbnewloc'+path_ext, index=False) ##13
wbplug_df.to_csv(base_path+'14_wbplug'+path_ext, index=False) ##14
##15 is temporary here and will need to be handeled differently
wbplrmks_df.to_csv(base_path+'15_wbplrmks'+path_ext, index=False) ##15
wbplrec_df.to_csv(base_path+'16_wbplrec'+path_ext, index=False) ##16
wbplcase_df.to_csv(base_path+'17_wbplcase'+path_ext, index=False) ##17
wbplperf_df.to_csv(base_path+'18_wbplperf'+path_ext, index=False) ##18
wbplname_df.to_csv(base_path+'19_wbplname'+path_ext, index=False) ##19
wbdrill_df.to_csv(base_path+'20_wbdrill'+path_ext, index=False) ##20
wbwellid_df.to_csv(base_path+'21_wbwellid'+path_ext, index=False) ##21
wb14b2_df.to_csv(base_path+'22_wb14b2'+path_ext, index=False) ##22
wbh15_df.to_csv(base_path+'23_wbh15'+path_ext, index=False) ##23
##24 is commented out since the code now combines 24 into 23
##wbh15rmk_df.to_csv(base_path+'24_wbh15rmk'+path_ext, index=False) ##24
wbsb126_df.to_csv(base_path+'25_wbsb126'+path_ext, index=False) ##25
wbdastat_df.to_csv(base_path+'26_wbdastat'+path_ext, index=False) ##26
wbw3c_df.to_csv(base_path+'27_wbw3c'+path_ext, index=False) ##27
##28 is commented out since the code now combines 28 into 22
##wb14b2rm_df.to_csv(base_path+'28_wb14b2rm'+path_ext, index=False) ##28
file.close()
def get_parser():
desc = "Process oil and gas well data from the Texas Railroad Commission"
parser = argparse.ArgumentParser(
description=desc,
)
parser.add_argument("--filepath", required=False, help="path to source data file")
parser.add_argument(
"--outdir", required=False, help="directory path to write the processed data"
)
return parser
def parse_args():
parser = get_parser()
args = parser.parse_args(sys.argv[1:])
if args.filepath:
# python WorkingFileForTesting.py --filepath data/dbf900.ebc
file_path = args.filepath
else:
## Default input data source
file_path = r"C:\PublicData\Texas\TXRRC\index\dbf900.ebc"
##file origin: ftp://ftpe.rrc.texas.gov/shfwba/dbf900.ebc.gz
##file size: 1.96MB-ish
if not os.path.isfile(file_path):
print("File Error: {} is not a file\n".format(file_path))
parser.print_help(sys.stderr)
parser.exit(1)
if args.outdir:
# python WorkingFileForTesting.py --outdir C:\mydatabase
out_dir = args.outdir
else:
## Default local storage location
out_dir = r"C:\PublicData\Texas\TXRRC\database"
if not os.path.isdir(out_dir):
print("Directory Error: {} is not a directory\n".format(out_dir))
parser.print_help(sys.stderr)
parser.exit(1)
return file_path, out_dir
if __name__ == '__main__':
main()
print('WorkingFileForTesting.py complete.')