In [1]:
# SCRIPT TO SCRAPE BAD SHOTS FROM OBSERVER LOGS IN PDF FORMAT

# NOTES:
# The code assumes a 3-digit number format for channels and a 4-digit number format for shots

#____________________________________________________________IMPORT LIBRARIES
import re
import os
import PyPDF2
import openpyxl
from openpyxl import Workbook
from time import sleep
from tqdm import tqdm

#____________________________________________________________INPUT DATA
# Location of the PDF files (Observer Logs)
pdf_path = r'./Observer_Logs/'

# PDF files inside the folder. os.listdir returns every entry in arbitrary order,
# so keep only the PDFs (anything else would fail in the PDF reader) and sort them
# so the selection menu and the processing order are stable between runs.
pdf_list = sorted(
    name for name in os.listdir(pdf_path)
    if name.lower().endswith('.pdf')
)

#____________________________________________________________VARIABLES (REQUIRE USER INTERVENTION)
rcvmin = 1     #Survey Parameter: Minimum Channel
rcvmax = 320   #Survey Parameter: Maximum Channel

# Search for turn noise between these patterns.
# Both markers are interpolated into a regular expression as-is.
pattern1_top = 'Line-segments'
pattern1_bot = 'Tape-list'

# Search for bad records between these patterns.
# Both markers are interpolated into a regular expression as-is.
pattern2_top = 'Seismic Data Annotations'
pattern2_bot = 'Comment'

#____________________________________________________________CHOOSE TO PROCESS ONE OR MULTIPLE FILES
def let_user_pick(options):
    """Print a numbered menu of `options` and ask the user to pick one entry.

    Parameters
    ----------
    options : sequence
        Items to list; they are numbered starting at 1 in the menu.

    Returns
    -------
    int or None
        Zero-based index of the chosen item, or None when the answer is not
        an integer or falls outside 1..len(options).
    """
    print("     Choose a number to select a file from the list: ")

    for idx, element in enumerate(options, start=1):
        print("{}) {}".format(idx, element))

    answer = input("Entered number: ")
    try:
        number = int(answer)
    except ValueError:
        # Only a non-numeric answer is expected here; anything else should surface
        return None

    if 0 < number <= len(options):
        return number - 1
    return None
    
# The execution stage decides between single-file and all-files mode by checking
# whether `single_file` exists, so drop any value left over from a previous run
# of this cell before asking again.
globals().pop('single_file', None)

question = input('Do you want to process one or multiple files (single/multiple): ')
mode = question.strip().lower()

if mode == 'single':
    response = let_user_pick(pdf_list)
    if response is None:
        # let_user_pick returns None for an invalid answer; indexing with it would fail
        raise SystemExit("     Invalid file selection... exiting")
    single_file = pdf_path + pdf_list[response]
    print(f"          Processing file {pdf_list[response]}")
elif mode == 'multiple':
    print('     Processing all files')
else:
    # Actually stop here instead of silently falling through to "process all files"
    raise SystemExit("     Please type 'single' or 'multiple'... exiting")

# Single PDF for testing (COMMENT the single_file row if you want to process all files):
#one_pdf = '0258-5065_ROB.pdf'
#single_file = pdf_path + one_pdf  

#____________________________________________________________OUTPUT DATA
# Directory to contain the output files.
out_dir = "Output"

# Create it if it does not exist; exist_ok avoids a separate check-then-create step.
os.makedirs(out_dir, exist_ok=True)

print("The 'Output' directory is created!")

# Output Excel file: one sheet for the bad records, one for the turn-noise lines.
xlsx = os.path.join('.', out_dir, 'Bad_Records.xlsx')
wb = Workbook()

ws = wb.active
ws.title = "Bad Records"
ws.append(['Line_Number', 'Shot_Min', 'Shot_Max', 'Rcv_Min', 'Rcv_Max', 'Comment'])

# `ws` is left pointing at the 'Turn Lines' sheet after initialization.
ws = wb.create_sheet('Turn Lines')
ws.append(['Lines with reported turn noise shots'])

# Write the (header-only) workbook to disk right away.
wb.save(xlsx)

print("The output files were initialized... proceeding to execute the function")

#____________________________________________________________EXECUTE THE FUNCTION
def scrape_pdf(pdf):
    """Scrape one Observer Log PDF for turn noise and bad shot/receiver records.

    Reads the module-level settings `out_dir`, `xlsx`, `wb`, `rcvmin`, `rcvmax`
    and the section markers `pattern1_top`/`pattern1_bot` (turn noise) and
    `pattern2_top`/`pattern2_bot` (bad records).

    Side effects:
      * writes ./<out_dir>/TRACE_SELECT_line_<line_number>.txt; the file is
        removed again when no bad record was written to it,
      * appends rows to the 'Turn Lines' and 'Bad Records' sheets of `wb` and
        saves the workbook to `xlsx` once at the end if anything was appended.

    Parameters
    ----------
    pdf : str
        Path of the Observer Log PDF. The line number is taken from the file
        name: the text between its first '-' and its last '_'.

    Returns
    -------
    None
    """
    #____________________________________________________________READ PDFs
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the legacy
    # PyPDF2 names; newer releases use PdfReader / pages / extract_text. Confirm
    # against the installed PyPDF2 version before upgrading the package.
    # All pages are extracted inside the `with` so the file handle is always closed.
    with open(pdf, 'rb') as pdfFileObject:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObject, strict=False)
        pages = [pdfReader.getPage(i).extractText() for i in range(pdfReader.numPages)]

    # Merge all pages into a single string
    text = ''.join(pages)

    #____________________________________________________________INITIALIZE OUTPUT FILE(s)
    # Define current acquisition line number. Only the file name is inspected so
    # that '-' or '_' characters in the directory part cannot leak into it.
    line_number = ''.join(re.findall(r"[-](.+)[_]", os.path.basename(pdf)))

    # Define the output file that will be read by TRACE_SELECT in Omega
    template_file = "./" + out_dir + "/TRACE_SELECT_line_" + line_number + ".txt"

    header = "'Key' 'Use Absolute Value of Key' 'First' 'Last' 'Increment' 'Tolerance' 'Operation to Perform' 'Exclude' 'And/Or'"
    with open(template_file, "w") as template_txt:
        template_txt.write(header + '\n')

    #____________________________________________________________NESTED FUNCTIONS TO APPEND TO OUTPUT
    def write(file, shot_first, shot_last, rcv_first, rcv_last, comment):
        """Append one shot/receiver selection to the TRACE_SELECT file and the workbook.

        `shot_first`/`shot_last` are the shot numbers as matched in the text; they
        are written verbatim to the text file and as integers to the workbook.
        """
        text = (
            f"'IDENT_NUM' 'NO' {shot_first} {shot_last} NOT_USED 0 NOT_USED 'NO' 'AND'\n"
            f"'TRACE_NUM' 'NO' {rcv_first} {rcv_last} NOT_USED 0 NOT_USED 'NO' 'OR'"
        )
        with open(file, "a") as template_txt:
            template_txt.write(text + '\n')

        wb['Bad Records'].append(
            [line_number, int(shot_first), int(shot_last), rcv_first, rcv_last, comment]
        )

    def parse_int(inp):
        """Return the first value of `inp` inside [rcvmin, rcvmax] as an int, else None."""
        for candidate in inp:
            value = int(candidate)
            if rcvmin <= value <= rcvmax:
                return value
        return None

    workbook_changed = False

    #____________________________________________________________IDENTIFY TURN LINES
    # Extract text to identify if there were shots during vessel turn
    turn_text = re.findall(rf"{pattern1_top}.*?{pattern1_bot}", text, re.DOTALL)

    for section in turn_text:
        if re.search(r"[tT]urn|[bB]en[dt]", section):
            # Write lines with turn noise
            wb['Turn Lines'].append([line_number])
            workbook_changed = True

    #____________________________________________________________IDENTIFY NOISY RECORDS
    # Extract lines between patterns. Bad shot/channels are listed within these patterns
    extracted_text = re.findall(rf"{pattern2_top}.*?{pattern2_bot}", text, re.DOTALL)

    # Split every matched section at the line breaks. Starting from an empty list
    # keeps a log without an annotations section from failing, and no matched
    # section is dropped when the markers occur more than once.
    text_row = []
    for section in extracted_text:
        text_row.extend(section.split('\n'))

    # Search for specific noise keywords
    noise_keywords = r"[tT]urn.*[nN]oise|[bB]en[dt]|[cC]urrent.*[nN]oise|[sS]hip.*[nN]oise|[sS]pik[ey]|[bB]ad.*[sS]hot"
    matched_text_list = [row for row in text_row if re.search(noise_keywords, row)]

    # Regular Expressions to Match Bad Shots and Receivers
    records_written = 0

    for row in matched_text_list:
        # Keep only the letters of the row as the free-text comment
        comment = re.sub(r'([^a-zA-Z])', '', row).replace('ALL', '')

        # Shot range at the start of the row, or a single shot followed by whitespace
        spmin = re.findall(r"(^\d{4})[-—–](?:\d{4})", row)
        spmax = re.findall(r"(?:^\d{4})[-—–](\d{4})", row)
        spsin = re.findall(r"(^\d{4})\s", row)

        # Receiver range: first in-range lower and upper bound found in the row
        rmi = re.findall(r"(0{0,2}[1-9]|0?[1-9][0-9]|[1-9][0-9][0-9]\b)[-—–](?:0{0,2}[1-9]|0?[1-9][0-9]|[1-9][0-9][0-9])\b", row)
        rma = re.findall(r"(?:0{0,2}[1-9]|0?[1-9][0-9]|[1-9][0-9][0-9]\b)[-—–](0{0,2}[1-9]|0?[1-9][0-9]|[1-9][0-9][0-9])\b", row)
        rcmin = parse_int(rmi)
        rcmax = parse_int(rma)
        # A receiver range only counts when both bounds lie inside the survey
        # limits; otherwise None would be written into the output files.
        has_rcv_range = rcmin is not None and rcmax is not None

        # The word ALL means every receiver of the survey
        rcall = re.findall(r"\s[aA][lL][lL]\s", row)

        if spmin and spmax and has_rcv_range:
            selection = (spmin[0], spmax[0], rcmin, rcmax)
        elif spmin and spmax and rcall:
            selection = (spmin[0], spmax[0], rcvmin, rcvmax)
        elif spsin and rcall:
            selection = (spsin[0], spsin[0], rcvmin, rcvmax)
        elif spsin and has_rcv_range:
            selection = (spsin[0], spsin[0], rcmin, rcmax)
        else:
            continue

        write(template_file, *selection, comment)
        records_written += 1
        workbook_changed = True

    # Save once per PDF instead of after every appended row
    if workbook_changed:
        wb.save(xlsx)

    # Remove empty files (lines without bad records)
    if records_written == 0:
        os.remove(template_file)

#____________________________________________________________EXECUTION STAGE
# `single_file` is only defined when a single file was selected; a NameError
# therefore means every PDF in the folder has to be processed.
try:
    single_file
except NameError:
    pbar = tqdm(pdf_list)
    for i in pbar:
        pbar.set_description(f'Processing file {i}')
        file = pdf_path + i
        scrape_pdf(file)
else:
    scrape_pdf(single_file)
    # Report success only after the file has actually been processed
    print(f'Single file {pdf_list[response]} successfully processed')

Do you want to process one or multiple files (single/multiple): multiple
     Processing all files
The 'Output' directory is created!
The output files were initialized... proceeding to execute the function


Processing file 0258-5996_ROB.pdf: 100%|███████████████████████████████████████████████| 79/79 [00:10<00:00,  7.20it/s]
