In [1]:
# SCRIPT TO SCRAPE BAD SHOTS FROM OBSERVER LOGS IN PDF FORMAT

# NOTES:
# The code assumes channel numbers of up to 3 digits and 4-digit shot numbers

#____________________________________________________________IMPORT LIBRARIES
import re
import os
import PyPDF2
import openpyxl
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font
from openpyxl.styles import Alignment
from time import sleep
from tqdm import tqdm

#____________________________________________________________INPUT DATA
# Location of the PDF files (Observer Logs)
pdf_path = r'./Observer_Logs/'

# Observer logs found inside the folder.
# Only *.pdf entries are kept, because every name in this list is later opened
# with the PDF reader; the list is sorted so that the numbered selection menu
# is the same on every run and platform (os.listdir order is arbitrary).
pdf_list = sorted(
    name for name in os.listdir(pdf_path) if name.lower().endswith('.pdf')
)

#____________________________________________________________VARIABLES (REQUIRE USER INTERVENTION)
rcvmin = 1     # Survey Parameter: Minimum Channel
rcvmax = 320   # Survey Parameter: Maximum Channel

# NOTE: the four patterns below are interpolated as-is into regular
# expressions by scrape_pdf, so regex metacharacters in them are active.

# Search for turn noise between these patterns
pattern1_top = 'Line-segments'
pattern1_bot = 'Tape-list'

# Search for bad records between these patterns
pattern2_top = 'Seismic Data Annotations'
pattern2_bot = 'Comment'

#____________________________________________________________FUNCTION TO FORMAT OUTPUT EXCEL FILE  
def format_ws(sheet):
    """Apply the common layout to an openpyxl worksheet, in place.

    Every column yielded by ``sheet.columns`` is flagged for auto-sizing,
    column F is given a fixed width of 100, the cells of row 1 are set to
    bold, and every cell of the used range is centred horizontally and
    vertically.

    Args:
        sheet: The worksheet to format.
    """
    # Column dimensions are addressed by letter; positions are counted from 1.
    for position, _column in enumerate(sheet.columns, start=1):
        letter = get_column_letter(position)
        sheet.column_dimensions[letter].auto_size = True

    # Column F is the 'Comment' column of the Bad Records sheet.
    sheet.column_dimensions['F'].width = 100

    # Header row in bold.
    for header_cell in sheet["1:1"]:
        header_cell.font = Font(bold=True)

    # Centre every cell of the used range.
    for row_idx in range(1, sheet.max_row + 1):
        for col_idx in range(1, sheet.max_column + 1):
            target = sheet.cell(row=row_idx, column=col_idx)
            target.alignment = Alignment(horizontal='center', vertical='center')
            
#____________________________________________________________CHOOSE TO PROCESS ONE OR MULTIPLE FILES
def one_or_all(options):
    """Ask the user to pick one entry of *options* by its 1-based number.

    Prints a numbered menu, reads one line from standard input and converts
    it to a zero-based index.

    Args:
        options: Sequence of items to list (here, the PDF file names).

    Returns:
        The zero-based index of the chosen entry, or ``None`` when the
        answer is not an integer between 1 and ``len(options)``.
    """
    print("     Choose a number to select a file from the list: ")

    for number, element in enumerate(options, start=1):
        print(f"{number}) {element}")

    answer = input("Enter a number: ")
    try:
        choice = int(answer)
    except ValueError:
        # Non-numeric input is reported the same way as an out-of-range one.
        return None

    if 0 < choice <= len(options):
        return choice - 1
    return None
    
# Ask whether a single log or the whole folder should be processed.
# Surrounding whitespace in the answer is ignored.
question = input('Do you want to process one or all the files (one/all): ').strip()

if question.lower() == 'one':
    response = one_or_all(pdf_list)
    if response is None:
        # one_or_all returns None for a non-numeric or out-of-range answer;
        # indexing pdf_list with None would fail with a TypeError.
        print("     Invalid file number, please re-run the script... exiting")
        raise SystemExit(1)
    one_file = pdf_path + pdf_list[response]
    print(f"          Processing file {pdf_list[response]}")
elif question.lower() == 'all':
    print('     Processing all files')
else:
    print("     Please type 'one' or 'all' and re-run the script... exiting")
    # Actually stop here: without this the rest of the script keeps running.
    raise SystemExit(1)
    
#____________________________________________________________OUTPUT DATA
# Directory to contain the output files.
out_dir = "Output"

# Create it if it does not exist.
isExist = os.path.exists(out_dir)
if not isExist:
    os.makedirs(out_dir)

print("The Output directory is created!")

# Output Excel File: a fresh workbook with one header row per sheet.
xlsx = f'./{out_dir}/Bad_Records.xlsx'
wb = Workbook()

# Sheet 1: one row per bad shot/channel range.
ws = wb.active
ws.title = "Bad Records"
ws.append(['Line_Number', 'Shot_Min', 'Shot_Max', 'Rcv_Min', 'Rcv_Max', 'Comment'])
format_ws(ws)

# Sheet 2: line numbers flagged for turn noise.
ws = wb.create_sheet('Turn Lines')
ws.append(['Line_Number'])
format_ws(ws)

wb.save(xlsx)

print("The output files were initialized... proceeding to execute the function")

#____________________________________________________________DEFINE THE SCRAPING FUNCTION
def scrape_pdf(pdf):
    """Extract turn-noise flags and bad-record ranges from one observer log.

    The text of all pages of *pdf* is concatenated and searched twice:

    * between ``pattern1_top`` and ``pattern1_bot`` for a mention of a turn
      or bend; if found, the line number is appended (once) to the
      'Turn Lines' sheet;
    * between ``pattern2_top`` and ``pattern2_bot`` for rows carrying a
      noise keyword. A row becomes a record when it starts with a 4-digit
      shot or shot range and names either a channel range or ``ALL``.
      Rows from every matched section are considered.

    Records are appended to the 'Bad Records' sheet of the module-level
    workbook ``wb``, which is saved to ``xlsx``, and written to
    ``TRACE_SELECT_line_<line>.txt`` inside ``out_dir``. A log without
    records leaves no text file behind.

    A channel range with no value inside ``rcvmin``..``rcvmax`` is treated
    as if no channel range had been given, so ``None`` is never written.

    Args:
        pdf: Path of the observer-log PDF. The line number is the part of
            the file name between its first '-' and its last '_'.
    """

    def first_in_survey(candidates):
        """Return the first candidate within rcvmin..rcvmax as int, else None."""
        for candidate in candidates:
            value = int(candidate)
            if rcvmin <= value <= rcvmax:
                return value
        return None

    #____________________________________________________________READ PDF
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names; confirm they exist in the installed version.
    # The file is closed as soon as all page text has been extracted.
    with open(pdf, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file, strict=False)
        pages = [pdf_reader.getPage(page_no).extractText()
                 for page_no in range(pdf_reader.numPages)]

    # Merge all pages into a single string
    text = ''.join(pages)

    #____________________________________________________________OUTPUT NAMES
    # Use the file name only, so a '-' or '_' in a directory name cannot
    # leak into the line number.
    line_number = ''.join(re.findall(r"[-](.+)[_]", os.path.basename(pdf)))

    # Output file that will be read by TRACE_SELECT in Omega
    template_file = "./" + out_dir + "/TRACE_SELECT_line_" + line_number + ".txt"
    header = "'Key' 'Use Absolute Value of Key' 'First' 'Last' 'Increment' 'Tolerance' 'Operation to Perform' 'Exclude' 'And/Or'"

    #____________________________________________________________IDENTIFY TURN LINES
    # Was a turn/bend mentioned in any line-segment section?
    turn_sections = re.findall(rf"{pattern1_top}.*?{pattern1_bot}", text, re.DOTALL)
    has_turn = any(re.search(r"[tT]urn|[bB]en[dt]", section)
                   for section in turn_sections)

    #____________________________________________________________IDENTIFY NOISY RECORDS
    # Bad shots/channels are listed between these patterns. Collect the rows
    # of every matched section; there may be none at all.
    rows = []
    for section in re.findall(rf"{pattern2_top}.*?{pattern2_bot}", text, re.DOTALL):
        rows.extend(section.split('\n'))

    noise_keywords = re.compile(
        r"[tT]urn.*[nN]oise|[bB]en[dt]|[cC]urrent.*[nN]oise"
        r"|[sS]hip.*[nN]oise|[sS]pik[ey]|[bB]ad.*[sS]hot")
    # Channel range "first-last"; both ends are captured in a single pass.
    channel_range = re.compile(
        r"(0{0,2}[1-9]|0?[1-9][0-9]|[1-9][0-9][0-9]\b)[-—–]"
        r"(0{0,2}[1-9]|0?[1-9][0-9]|[1-9][0-9][0-9])\b")

    # Each record: (shot_first, shot_last, rcv_first, rcv_last, comment).
    # Shots stay as the strings found in the log so the text file keeps
    # their original 4-digit form.
    records = []
    for row in rows:
        if not noise_keywords.search(row):
            continue

        # Comment = the letters of the row, without the 'ALL' channel marker.
        comment = re.sub(r'([^a-zA-Z])', '', row).replace('ALL', '')

        shot_range = re.match(r"(\d{4})[-—–](\d{4})", row)
        shot_single = re.match(r"(\d{4})\s", row)
        all_channels = re.search(r"\s[aA][lL][lL]\s", row)

        channel_pairs = channel_range.findall(row)
        rcv_first = first_in_survey(pair[0] for pair in channel_pairs)
        rcv_last = first_in_survey(pair[1] for pair in channel_pairs)
        has_channels = rcv_first is not None and rcv_last is not None

        # Same precedence as before: for a shot range an explicit channel
        # range wins over ALL; for a single shot ALL wins.
        if shot_range and has_channels:
            record = (shot_range.group(1), shot_range.group(2), rcv_first, rcv_last)
        elif shot_range and all_channels:
            record = (shot_range.group(1), shot_range.group(2), rcvmin, rcvmax)
        elif shot_single and all_channels:
            record = (shot_single.group(1), shot_single.group(1), rcvmin, rcvmax)
        elif shot_single and has_channels:
            record = (shot_single.group(1), shot_single.group(1), rcv_first, rcv_last)
        else:
            continue
        records.append(record + (comment,))

    #____________________________________________________________WRITE OUTPUTS
    if records:
        with open(template_file, "w") as template_txt:
            template_txt.write(header + '\n')
            for shot_lo, shot_hi, rcv_lo, rcv_hi, _comment in records:
                template_txt.write(
                    f"'IDENT_NUM' 'NO' {shot_lo} {shot_hi} NOT_USED 0 NOT_USED 'NO' 'AND'\n"
                    f"'TRACE_NUM' 'NO' {rcv_lo} {rcv_hi} NOT_USED 0 NOT_USED 'NO' 'OR'\n")

        bad_sheet = wb['Bad Records']
        for shot_lo, shot_hi, rcv_lo, rcv_hi, comment in records:
            bad_sheet.append([line_number, int(shot_lo), int(shot_hi),
                              rcv_lo, rcv_hi, comment])
        format_ws(bad_sheet)
    elif os.path.exists(template_file):
        # No bad records for this line: do not keep a file from an earlier run.
        os.remove(template_file)

    if has_turn:
        turn_sheet = wb['Turn Lines']
        turn_sheet.append([line_number])
        format_ws(turn_sheet)

    # Format and save once per log rather than once per appended row.
    if records or has_turn:
        wb.save(xlsx)

#____________________________________________________________EXECUTION STAGE
# Dispatch on the answer given to the one/all question. Any other answer
# processes nothing.
mode = question.lower()
if mode == 'one':
    scrape_pdf(one_file)
    # Report success only after the file has actually been processed.
    print(f'File {pdf_list[response]} successfully processed')
elif mode == 'all':
    pbar = tqdm(pdf_list)
    for i in pbar:
        pbar.set_description(f'Processing file {i}')
        file = pdf_path + i
        scrape_pdf(file)

Do you want to process one or all the files (one/all): one
     Choose a number to select a file from the list: 
1) 0258-5001_ROB.pdf
2) 0258-5005_ROB.pdf
3) 0258-5009_ROB.pdf
4) 0258-5013_ROB.pdf
5) 0258-5017_ROB.pdf
6) 0258-5021_ROB.pdf
7) 0258-5025_ROB.pdf
8) 0258-5029_ROB.pdf
9) 0258-5033_ROB.pdf
10) 0258-5037_ROB.pdf
11) 0258-5041A_ROB.pdf
12) 0258-5041_ROB.pdf
13) 0258-5045_ROB.pdf
14) 0258-5049_ROB.pdf
15) 0258-5053_ROB.pdf
16) 0258-5057A_ROB.pdf
17) 0258-5057_ROB.pdf
18) 0258-5061_ROB.pdf
19) 0258-5065_ROB.pdf
20) 0258-5069_ROB.pdf
21) 0258-5073_ROB.pdf
22) 0258-5077_ROB.pdf
23) 0258-5081_ROB.pdf
24) 0258-5085_ROB.pdf
25) 0258-5089_ROB.pdf
26) 0258-5093_ROB.pdf
27) 0258-5097_ROB.pdf
28) 0258-5101_ROB.pdf
29) 0258-5105_ROB.pdf
30) 0258-5109_ROB.pdf
31) 0258-5113_ROB.pdf
32) 0258-5117_ROB.pdf
33) 0258-5121_ROB.pdf
34) 0258-5125_ROB.pdf
35) 0258-5129_ROB.pdf
36) 0258-5133_ROB.pdf
37) 0258-5136A_ROB.pdf
38) 0258-5136B_ROB.pdf
39) 0258-5136_ROB.pdf
40) 0258-5137_ROB.pdf
41) 0258-514