In [1]:
# SCRIPT TO SCRAPE BAD SHOTS FROM OBSERVER LOGS IN PDF FORMAT

#____________________________________________________________IMPORT LIBRARIES
import pandas as pd
import re
import os
import PyPDF2
import openpyxl
from openpyxl import Workbook

#____________________________________________________________INPUT DATA
# Location of the PDF files (Observer Logs)
pdf_path = r'./Observer_Logs/'
# List every entry inside the folder
pdf_list = os.listdir(pdf_path)

#____________________________________________________________OUTPUT DATA
# Directory to contain the output files. Create it if it does not exist.
out_dir = "Output"

isExist = os.path.exists(out_dir)
if not isExist:
    # exist_ok covers the case where the directory appears between the
    # check above and this call.
    os.makedirs(out_dir, exist_ok=True)
    # Only announce the creation when it actually happened.
    print("The Output directory is created!")

# Output Excel File: one sheet for the bad records, one for the turn lines.
xlsx = './' + out_dir + '/Bad_Records.xlsx'
wb = Workbook()
ws = wb.active
ws.title = "Bad Records"
ws.append(['Line_Number', 'Shot_Min', 'Shot_Max', 'Rcv_Min', 'Rcv_Max', 'Comment'])
wb.create_sheet('Turn Lines')
ws = wb['Turn Lines']
ws.append(['Lines with reported turn noise shots'])
# Write the (still empty) workbook so the file exists before any PDF is parsed.
wb.save(xlsx)

#____________________________________________________________DEFINE VARIABLES
rcvmin = 1     #Survey Parameter: Minimum Channel
rcvmax = 320   #Survey Parameter: Maximum Channel

# Keywords that mark an annotation row as a noisy record.
_NOISE_KEYWORDS = re.compile(
    r"[tT]urn.[nN]oise|[bB]en[dt]|[cC]urrent.[nN]oise|[sS]hip.[nN]oise|[sS]pik[ey]|[bB]ad.[sS]hot"
)
# Keywords that mark a line-segment section as shot during a vessel turn.
_TURN_KEYWORDS = re.compile(r"[tT]urn|[bB]en[dt]")
# "NNNN-NNNN" shot range at the very start of a row.
_SHOT_RANGE = re.compile(r"^(\d{4})[-—–](\d{4})")
# Single "NNNN" shot at the very start of a row, followed by whitespace.
_SHOT_SINGLE = re.compile(r"(^\d{4})\s")
# "A-B" channel range of up to three digits each. The look-behind stops the
# match from starting in the middle of a longer number (e.g. "20-320" inside
# "320-320"); the numeric limits are checked against the survey parameters.
_CHANNEL_RANGE = re.compile(r"(?<!\d)(\d{1,3})[-—–](\d{1,3})\b")
# The word ALL (any case) surrounded by whitespace: every channel is affected.
_ALL_CHANNELS = re.compile(r"\s[aA][lL][lL]\s")


def _first_channel_range(line, first_channel, last_channel):
    """Return the first (min, max) channel pair in *line* within the survey limits, or None."""
    for match in _CHANNEL_RANGE.finditer(line):
        low, high = int(match.group(1)), int(match.group(2))
        if first_channel <= low <= last_channel and first_channel <= high <= last_channel:
            return low, high
    return None


def _parse_bad_record(line, first_channel, last_channel):
    """Return (shot_min, shot_max, rcv_min, rcv_max) described by *line*, or None.

    Shots are returned as the matched 4-digit strings, channels as ints.
    """
    shot_range = _SHOT_RANGE.search(line)
    shot_single = _SHOT_SINGLE.search(line)
    channels = _first_channel_range(line, first_channel, last_channel)
    all_channels = _ALL_CHANNELS.search(line) is not None

    # Same precedence as the four cases of the original if/elif chain.
    if shot_range and channels:
        return shot_range.group(1), shot_range.group(2), channels[0], channels[1]
    if shot_range and all_channels:
        return shot_range.group(1), shot_range.group(2), first_channel, last_channel
    if shot_single and all_channels:
        return shot_single.group(1), shot_single.group(1), first_channel, last_channel
    if shot_single and channels:
        return shot_single.group(1), shot_single.group(1), channels[0], channels[1]
    return None


def parse_pdf(pdf):
    """Scrape the bad shots/channels reported in one observer-log PDF.

    Reads the module-level ``wb``, ``xlsx``, ``out_dir``, ``rcvmin`` and
    ``rcvmax``.  Side effects:

    * appends the line number to the 'Turn Lines' sheet when the
      "Line-segments ... Tape-list" text mentions a turn/bend;
    * appends one row per bad record to the 'Bad Records' sheet;
    * writes ``TRACE_SELECT_line_<line>.txt`` (to be read by TRACE_SELECT in
      Omega) inside ``out_dir`` when at least one bad record was found, and
      removes any such file left from a previous run otherwise;
    * saves the workbook to ``xlsx`` when any sheet was modified;
    * rebinds the module-level ``ws`` to the 'Bad Records' sheet.

    Args:
        pdf: Path of the observer-log PDF file.
    """
    global ws

    #____________________________________________________________READ PDFs
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names - confirm the installed release still provides them.
    # The reader pulls data from the file on demand, so all the text is
    # extracted before the handle is closed.
    with open(pdf, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file, strict=False)
        num_pages = pdf_reader.numPages
        print(f'Working in file {pdf}... ({num_pages} pages.)')
        # Merge all pages into a single string
        text = ''.join(pdf_reader.getPage(page_index).extractText()
                       for page_index in range(num_pages))

    #____________________________________________________________INITIALIZE OUTPUT FILE(s)
    # Acquisition line number: taken from the file name only, so that digits
    # in the directory part of the path cannot leak into it.
    line_number = ''.join(
        re.findall(r"[-—–_](\d{4}|\d{4}[A-Z])[-—–_)]", os.path.basename(pdf))
    )

    # Output file that will be read by TRACE_SELECT in Omega
    template_file = "./" + out_dir + "/TRACE_SELECT_line_" + line_number + ".txt"
    header = "'Key' 'Use Absolute Value of Key' 'First' 'Last' 'Increment' 'Tolerance' 'Operation to Perform' 'Exclude' 'And/Or'"

    #____________________________________________________________IDENTIFY TURN LINES
    turn_sections = re.findall("Line-segments.*?Tape-list", text, re.DOTALL)
    # Report the line once, however many sections mention a turn.
    turn_found = any(_TURN_KEYWORDS.search(section) for section in turn_sections)
    if turn_found:
        wb['Turn Lines'].append([line_number])

    #____________________________________________________________IDENTIFY NOISY RECORDS
    # Bad shots/channels are listed between these two patterns. Collect the
    # rows of every matching section (there may be none, or several).
    annotation_rows = []
    for section in re.findall("Seismic Data Annotations.*?Comment", text, re.DOTALL):
        annotation_rows.extend(section.split('\n'))

    ws = wb['Bad Records']
    template_entries = []

    for row in annotation_rows:
        if not _NOISE_KEYWORDS.search(row):
            continue
        record = _parse_bad_record(row, rcvmin, rcvmax)
        if record is None:
            continue
        shot_min, shot_max, rcv_min, rcv_max = record
        # Keep only the letters of the row as the comment; drop the ALL keyword.
        comment = re.sub(r'([^a-zA-Z])', '', row).replace('ALL', '')
        template_entries.append(
            f"'IDENT_NUM' 'NO' {shot_min} {shot_max} NOT_USED 0 NOT_USED 'NO' 'AND'\n"
            f"'TRACE_NUM' 'NO' {rcv_min} {rcv_max} NOT_USED 0 NOT_USED 'NO' 'OR'"
        )
        ws.append([line_number, int(shot_min), int(shot_max), int(rcv_min), int(rcv_max), comment])

    if template_entries:
        with open(template_file, "w") as template_txt:
            template_txt.write('\n'.join([header] + template_entries) + '\n')
    else:
        print(f'NO bad shots or channels were identified in {pdf}')
        # Do not leave behind a template written by an earlier run.
        if os.path.exists(template_file):
            os.remove(template_file)

    # One save per PDF instead of one per appended row.
    if turn_found or template_entries:
        wb.save(xlsx)

#____________________________________________________________EXECUTE THE FUNCTION FOR ALL PDF FILES
for entry in pdf_list:
    # The folder listing may contain other files; only PDFs can be parsed.
    if not entry.lower().endswith('.pdf'):
        continue
    # os.path.join copes with pdf_path given with or without a trailing slash.
    file = os.path.join(pdf_path, entry)
    parse_pdf(file)

The Output directory is created!
Working in file ./Observer_Logs/0258-5001_ROB.pdf... (4 pages.)
NO bad shots or channels were identified in ./Observer_Logs/0258-5001_ROB.pdf
Working in file ./Observer_Logs/0258-5005_ROB.pdf... (4 pages.)
Working in file ./Observer_Logs/0258-5009_ROB.pdf... (4 pages.)
NO bad shots or channels were identified in ./Observer_Logs/0258-5009_ROB.pdf
Working in file ./Observer_Logs/0258-5013_ROB.pdf... (5 pages.)
Working in file ./Observer_Logs/0258-5017_ROB.pdf... (4 pages.)
Working in file ./Observer_Logs/0258-5021_ROB.pdf... (6 pages.)
Working in file ./Observer_Logs/0258-5025_ROB.pdf... (4 pages.)
Working in file ./Observer_Logs/0258-5029_ROB.pdf... (5 pages.)
Working in file ./Observer_Logs/0258-5033_ROB.pdf... (4 pages.)
NO bad shots or channels were identified in ./Observer_Logs/0258-5033_ROB.pdf
Working in file ./Observer_Logs/0258-5037_ROB.pdf... (5 pages.)
NO bad shots or channels were identified in ./Observer_Logs/0258-5037_ROB.pdf
Working in file

In [7]:
# SCRIPT TO SCRAPE BAD SHOTS FROM OBSERVER LOGS IN PDF FORMAT

#____________________________________________________________IMPORT LIBRARIES
import pandas as pd
import re
import os
import PyPDF2
import openpyxl
from openpyxl import Workbook

#____________________________________________________________INPUT DATA
# Observer log (PDF) processed by this cell
file = '0258-5065_ROB.pdf'

#____________________________________________________________DEFINE VARIABLES
# Survey parameters: first and last channel of the spread
rcvmin, rcvmax = 1, 320

# Turn noise is searched between these two patterns
pattern1_top, pattern1_bot = 'Line-segments', 'Tape-list'

# Bad records are searched between these two patterns
pattern2_top, pattern2_bot = 'Seismic Data Annotations', 'Comment'

#____________________________________________________________OUTPUT DATA
# Excel workbook: the default sheet holds the bad records, a second sheet
# holds the lines with turn noise.
xlsx = 'Bad_Records.xlsx'
wb = Workbook()
wb.active.title = "Bad Records"
wb.active.append(['Line_Number', 'Shot_Min', 'Shot_Max', 'Rcv_Min', 'Rcv_Max', 'Comment'])
# create_sheet returns the new worksheet, which becomes the current `ws`
ws = wb.create_sheet('Turn Lines')
ws.append(['Lines with reported turn noise shots'])
wb.save(xlsx)


# Keywords that mark an annotation row as a noisy record.
_NOISE_KEYWORDS = re.compile(
    r"[tT]urn.[nN]oise|[bB]en[dt]|[cC]urrent.[nN]oise|[sS]hip.[nN]oise|[sS]pik[ey]|[bB]ad.[sS]hot"
)
# Keywords that mark a line-segment section as shot during a vessel turn.
_TURN_KEYWORDS = re.compile(r"[tT]urn|[bB]en[dt]")
# "NNNN-NNNN" shot range at the very start of a row.
_SHOT_RANGE = re.compile(r"^(\d{4})[-—–](\d{4})")
# Single "NNNN" shot at the very start of a row, followed by whitespace.
_SHOT_SINGLE = re.compile(r"(^\d{4})\s")
# "A-B" channel range of up to three digits each. The look-behind stops the
# match from starting in the middle of a longer number (e.g. "20-320" inside
# "320-320"); the numeric limits are checked against the survey parameters.
_CHANNEL_RANGE = re.compile(r"(?<!\d)(\d{1,3})[-—–](\d{1,3})\b")
# The word ALL (any case) surrounded by whitespace: every channel is affected.
_ALL_CHANNELS = re.compile(r"\s[aA][lL][lL]\s")


def _first_channel_range(line, first_channel, last_channel):
    """Return the first (min, max) channel pair in *line* within the survey limits, or None."""
    for match in _CHANNEL_RANGE.finditer(line):
        low, high = int(match.group(1)), int(match.group(2))
        if first_channel <= low <= last_channel and first_channel <= high <= last_channel:
            return low, high
    return None


def _parse_bad_record(line, first_channel, last_channel):
    """Return (shot_min, shot_max, rcv_min, rcv_max) described by *line*, or None.

    Shots are returned as the matched 4-digit strings, channels as ints.
    """
    shot_range = _SHOT_RANGE.search(line)
    shot_single = _SHOT_SINGLE.search(line)
    channels = _first_channel_range(line, first_channel, last_channel)
    all_channels = _ALL_CHANNELS.search(line) is not None

    # Same precedence as the four cases of the original if/elif chain.
    if shot_range and channels:
        return shot_range.group(1), shot_range.group(2), channels[0], channels[1]
    if shot_range and all_channels:
        return shot_range.group(1), shot_range.group(2), first_channel, last_channel
    if shot_single and all_channels:
        return shot_single.group(1), shot_single.group(1), first_channel, last_channel
    if shot_single and channels:
        return shot_single.group(1), shot_single.group(1), channels[0], channels[1]
    return None


def parse_pdf(pdf):
    """Scrape the bad shots/channels reported in one observer-log PDF.

    Reads the module-level ``wb``, ``xlsx``, ``rcvmin``, ``rcvmax`` and the
    section delimiters ``pattern1_top``/``pattern1_bot`` (turn noise) and
    ``pattern2_top``/``pattern2_bot`` (bad records).  Side effects:

    * appends the line number to the 'Turn Lines' sheet when the text between
      the first pair of patterns mentions a turn/bend;
    * appends one row per bad record to the 'Bad Records' sheet;
    * writes ``TRACE_SELECT_line_<line>.txt`` (to be read by TRACE_SELECT in
      Omega) in the working directory when at least one bad record was found,
      and removes any such file left from a previous run otherwise;
    * saves the workbook to ``xlsx`` when any sheet was modified;
    * rebinds the module-level ``ws`` to the 'Bad Records' sheet.

    Args:
        pdf: Path of the observer-log PDF file.
    """
    global ws

    #____________________________________________________________READ PDFs
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names - confirm the installed release still provides them.
    # The reader pulls data from the file on demand, so all the text is
    # extracted before the handle is closed.
    with open(pdf, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file, strict=False)
        num_pages = pdf_reader.numPages
        print(f'Working in file {pdf}... ({num_pages} pages.)')
        # Merge all pages into a single string
        text = ''.join(pdf_reader.getPage(page_index).extractText()
                       for page_index in range(num_pages))

    #____________________________________________________________INITIALIZE OUTPUT FILE(s)
    # Acquisition line number: taken from the file name only, so that digits
    # in the directory part of the path cannot leak into it.
    line_number = ''.join(
        re.findall(r"[-—–_](\d{4}|\d{4}[A-Z])[-—–_)]", os.path.basename(pdf))
    )

    # Output file that will be read by TRACE_SELECT in Omega
    template_file = "TRACE_SELECT_line_" + line_number + ".txt"
    header = "'Key' 'Use Absolute Value of Key' 'First' 'Last' 'Increment' 'Tolerance' 'Operation to Perform' 'Exclude' 'And/Or'"

    #____________________________________________________________IDENTIFY TURN LINES
    turn_sections = re.findall(rf"{pattern1_top}.*?{pattern1_bot}", text, re.DOTALL)
    # Report the line once, however many sections mention a turn.
    turn_found = any(_TURN_KEYWORDS.search(section) for section in turn_sections)
    if turn_found:
        wb['Turn Lines'].append([line_number])

    #____________________________________________________________IDENTIFY NOISY RECORDS
    # Bad shots/channels are listed between these two patterns. Collect the
    # rows of every matching section (there may be none, or several).
    annotation_rows = []
    for section in re.findall(rf"{pattern2_top}.*?{pattern2_bot}", text, re.DOTALL):
        annotation_rows.extend(section.split('\n'))

    ws = wb['Bad Records']
    template_entries = []

    for row in annotation_rows:
        if not _NOISE_KEYWORDS.search(row):
            continue
        record = _parse_bad_record(row, rcvmin, rcvmax)
        if record is None:
            continue
        shot_min, shot_max, rcv_min, rcv_max = record
        # Keep only the letters of the row as the comment; drop the ALL keyword.
        comment = re.sub(r'([^a-zA-Z])', '', row).replace('ALL', '')
        template_entries.append(
            f"'IDENT_NUM' 'NO' {shot_min} {shot_max} NOT_USED 0 NOT_USED 'NO' 'AND'\n"
            f"'TRACE_NUM' 'NO' {rcv_min} {rcv_max} NOT_USED 0 NOT_USED 'NO' 'OR'"
        )
        ws.append([line_number, int(shot_min), int(shot_max), int(rcv_min), int(rcv_max), comment])

    if template_entries:
        with open(template_file, "w") as template_txt:
            template_txt.write('\n'.join([header] + template_entries) + '\n')
    else:
        print(f'NO bad shots or channels were identified in {pdf}')
        # Do not leave behind a template written by an earlier run.
        if os.path.exists(template_file):
            os.remove(template_file)

    # One save per PDF instead of one per appended row.
    if turn_found or template_entries:
        wb.save(xlsx)

#____________________________________________________________EXECUTE THE FUNCTION FOR THE SINGLE PDF FILE
# `file` is the one observer log named in the INPUT DATA section of this cell.
parse_pdf(file)

Working in file 0258-5065_ROB.pdf... (4 pages.)
