In [1]:
# SCRIPT TO SCRAPE BAD SHOTS FROM OBSERVER LOGS IN PDF FORMAT

# Import Libraries
import pandas as pd
import PyPDF2
import re
import os

# Define Variables:

# Survey Parameters: Min/Max channel numbers.
# parse_pdf uses this pair as the receiver range when a log entry flags
# "ALL" channels instead of an explicit channel range.
rcvmin = 1
rcvmax = 320

# Location of the PDF files (Observer Logs)
pdf_path = r'./Observer_Logs/'

# List the Observer Logs inside the folder.
# os.listdir returns every directory entry (sub-folders and stray non-PDF
# files included) in arbitrary order, so keep only the *.pdf names and sort
# them to get a reproducible processing order.
pdf_list = sorted(
    name for name in os.listdir(pdf_path) if name.lower().endswith('.pdf')
)


# Regular expressions used to decode the text extracted from an observer log.
# Keywords that indicate shots were acquired during a vessel turn.
_TURN_RE = re.compile(r"[tT]urn|[bB]en[dt]")
# Noise keywords that mark an annotation row as a bad shot / bad channel entry.
_NOISE_RE = re.compile(
    r"[tT]urn.[nN]oise|[bB]en[dt]|[cC]urrent.[nN]oise|[sS]hip.[nN]oise|[sS]pik[ey]|[bB]ad.[sS]hot"
)
# Row starting with a shot range, e.g. "1234-1250": groups are (first, last).
_SHOT_RANGE_RE = re.compile(r"(^\d{4})[-—–](\d{4})")
# Row starting with a single four-digit shot followed by whitespace.
_SINGLE_SHOT_RE = re.compile(r"(^\d{4})\s")
# Channel range "first-last"; groups are (first, last).
_CHANNEL_RANGE_RE = re.compile(
    r"([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-9][0-9]|3[0-1][0-9]|320\B)[-—–]"
    r"([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-9][0-9]|3[0-1][0-9]|320)\b"
)
# The word "ALL" (any case) surrounded by whitespace: every channel is affected.
_ALL_CHANNELS_RE = re.compile(r"\s[aA][lL][lL]\s")


def _read_pdf_text(pdf):
    """Return the text of every page of the PDF at *pdf* as a single string.

    Prints a progress message with the page count. The file handle is closed
    as soon as the text has been extracted.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names; newer releases rename them (PdfReader,
    # len(reader.pages), page.extract_text()) - confirm the installed version
    # before upgrading the package.
    with open(pdf, 'rb') as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file, strict=False)
        page_count = reader.numPages
        print(f'Working in file {pdf}... ({page_count} pages.)')
        return ''.join(
            reader.getPage(page_index).extractText()
            for page_index in range(page_count)
        )


def _decode_annotation(row):
    """Decode one annotation row into shot and channel limits.

    Returns a tuple ``(first_shot, last_shot, first_channel, last_channel)``
    or ``None`` when the row carries no usable shot/channel combination.
    Shots and explicit channels are returned as the strings found in the row;
    when the row says "ALL", the module-level ``rcvmin``/``rcvmax`` are used.
    """
    shot_range = _SHOT_RANGE_RE.search(row)
    single_shot = _SINGLE_SHOT_RE.search(row)
    channel_range = _CHANNEL_RANGE_RE.search(row)
    all_channels = _ALL_CHANNELS_RE.search(row)

    # The order of the checks matters: for a shot range an explicit channel
    # range wins over "ALL", for a single shot "ALL" wins over a channel range.
    if shot_range and channel_range:
        return (shot_range.group(1), shot_range.group(2),
                channel_range.group(1), channel_range.group(2))
    if shot_range and all_channels:
        return (shot_range.group(1), shot_range.group(2), rcvmin, rcvmax)
    if single_shot and all_channels:
        shot = single_shot.group(1)
        return (shot, shot, rcvmin, rcvmax)
    if single_shot and channel_range:
        shot = single_shot.group(1)
        return (shot, shot, channel_range.group(1), channel_range.group(2))
    return None


def parse_pdf(pdf):
    """Scrape bad shots/channels from one Observer Log PDF.

    Parameters
    ----------
    pdf : str
        Path of the Observer Log PDF. The acquisition line number is taken
        from the four-digit token (optionally followed by a capital letter)
        that sits between dash/underscore delimiters in this path.

    Side effects
    ------------
    * Creates the ``Output`` folder (and ``turn_list.txt`` with its title
      line) when the folder does not exist yet.
    * Appends the line number to ``Output/turn_list.txt`` when the
      "Line-segments ... Tape-list" part of the log mentions a turn.
    * Writes ``Output/TRACE_SELECT_line_<line>.txt`` (the file to be read by
      TRACE_SELECT) and ``Output/line_<line>.txt`` (same entries plus the
      comment, sorted by first shot) when bad shots/channels are found.
    * Prints progress messages; returns ``None``.
    """
    out_dir = "Output"
    out_file = "./" + out_dir + "/turn_list.txt"

    # Create the output folder and the turn list on first use
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
        with open(out_file, "w") as turn_txt:
            turn_txt.write('List of lines with shots acquired during a vessel turn' + '\n')
        print("The Output directory is created!")

    # Define current acquisition line number
    line_number = ''.join(re.findall(r"[-—–_](\d{4}|\d{4}[A-Z])[-—–_)]", pdf))

    # Define the output file that will be read by TRACE_SELECT
    template_file = "./" + out_dir + "/TRACE_SELECT_line_" + line_number + ".txt"

    text = _read_pdf_text(pdf)

    # Report the line once if any line-segment section mentions a vessel turn
    # (the line number used to be appended once per matching section).
    turn_sections = re.findall("Line-segments.*?Tape-list", text, re.DOTALL)
    if any(_TURN_RE.search(section) for section in turn_sections):
        with open(out_file, "a") as turn_txt:
            turn_txt.write(line_number + '\n')

    # Bad shots/channels are listed between these patterns. Rows of every
    # matching section are collected; a log without such a section simply
    # yields no rows instead of raising a NameError.
    rows = []
    for section in re.findall("Seismic Data Annotations.*?Comment", text, re.DOTALL):
        rows.extend(section.split('\n'))

    # Keep the rows that carry a noise keyword and a decodable shot/channel pair
    output_list = []
    for row in rows:
        if not _NOISE_RE.search(row):
            continue
        selection = _decode_annotation(row)
        if selection is None:
            continue
        # The comment is the row reduced to its letters, without the "ALL" flag
        comment = re.sub(r'([^a-zA-Z])', '', row).replace('ALL', '')
        output_list.append(selection + (comment,))

    if not output_list:
        print(f'NO bad shots or channels were identified in {pdf}')
        # Do not leave a stale template from a previous run behind
        if os.path.exists(template_file):
            os.remove(template_file)
        return

    # Write the TRACE_SELECT template in a single pass
    header = "'Key' 'Use Absolute Value of Key' 'First' 'Last' 'Increment' 'Tolerance' 'Operation to Perform' 'Exclude' 'And/Or'"
    with open(template_file, "w") as template_txt:
        template_txt.write(header + '\n')
        for first_shot, last_shot, first_channel, last_channel, _ in output_list:
            template_txt.write(
                f"'IDENT_NUM' 'NO' {first_shot} {last_shot} NOT_USED 0 NOT_USED 'NO' 'AND'\n"
                f"'TRACE_NUM' 'NO' {first_channel} {last_channel} NOT_USED 0 NOT_USED 'NO' 'OR'\n"
            )

    # Convert results to a pandas DataFrame
    pd.set_option('display.max_colwidth', None)
    df = pd.DataFrame(
        output_list,
        columns=['spmin', 'spmax', 'rcmin', 'rcmax', 'Comment'],
    )
    # sort_values returns a new frame, so the result must be kept; the stable
    # sort preserves the log order of entries that share the same first shot.
    df = df.sort_values(by=['spmin'], kind='stable')

    # Save outputs to text file
    df.to_csv(f'./{out_dir}/line_{line_number}.txt', sep=' ', index=False, header=True)
        
# Iterate through every PDF file and execute the function.
# os.path.join keeps the path valid whether or not pdf_path ends with a
# path separator (plain concatenation only works with the trailing "/").
for pdf_name in pdf_list:
    parse_pdf(os.path.join(pdf_path, pdf_name))

The Output directory is created!
Working in file ./Observer_Logs/0258-5001_ROB.pdf... (4 pages.)
NO bad shots or channels were identified in ./Observer_Logs/0258-5001_ROB.pdf
Working in file ./Observer_Logs/0258-5005_ROB.pdf... (4 pages.)
Working in file ./Observer_Logs/0258-5009_ROB.pdf... (4 pages.)
NO bad shots or channels were identified in ./Observer_Logs/0258-5009_ROB.pdf
Working in file ./Observer_Logs/0258-5013_ROB.pdf... (5 pages.)
Working in file ./Observer_Logs/0258-5017_ROB.pdf... (4 pages.)
Working in file ./Observer_Logs/0258-5021_ROB.pdf... (6 pages.)
Working in file ./Observer_Logs/0258-5025_ROB.pdf... (4 pages.)
Working in file ./Observer_Logs/0258-5029_ROB.pdf... (5 pages.)
Working in file ./Observer_Logs/0258-5033_ROB.pdf... (4 pages.)
NO bad shots or channels were identified in ./Observer_Logs/0258-5033_ROB.pdf
Working in file ./Observer_Logs/0258-5037_ROB.pdf... (5 pages.)
NO bad shots or channels were identified in ./Observer_Logs/0258-5037_ROB.pdf
Working in file