# In [4]:
# SCRIPT TO SCRAPE BAD SHOTS FROM OBSERVER LOGS IN PDF FORMAT

# Import Libraries
import pandas as pd
import PyPDF2
import re
import os

# Script configuration

# Survey parameters: lowest and highest receiver channel numbers. parse_pdf
# substitutes these when an observer-log entry flags "ALL" channels.
rcvmin, rcvmax = 1, 320

# Folder that holds the observer-log PDF files.
pdf_path = r'./Observer_Logs/'

# Scratch text file that collects the bad records from every line before
# they are loaded into a DataFrame.
tmp_file = "tmp.txt"

# Names of all entries found inside the observer-log folder.
pdf_list = os.listdir(pdf_path)

def parse_pdf(pdf):
    """Scrape bad shot/receiver records from one observer-log PDF.

    The text of every page is concatenated, each span running from
    "Seismic Data Annotations" to the next "Comment" is split into rows, and
    rows mentioning a noise keyword (turn noise, bend/bent, current noise,
    ship noise, spike/spiky, bad shot) are parsed for a shot-point range and
    a receiver-channel range.  Each parsed row is appended to the module-level
    ``tmp_file`` as::

        <line_number> <sp_min> <sp_max> <rcv_min> <rcv_max> <comment>

    Args:
        pdf: Path of the PDF file.  The line number is taken from the path
            itself: four digits (optionally followed by one capital letter)
            enclosed by dash/underscore separators.

    Returns:
        None.  The only output is the rows appended to ``tmp_file``; nothing
        is written (and the file is not created) when no row qualifies.
    """
    line_number = ''.join(re.findall(r"[-—–_](\d{4}|\d{4}[A-Z])[-—–_)]", pdf))

    # NOTE(review): PdfFileReader/numPages/getPage/extractText are the legacy
    # PyPDF2 names; confirm the installed PyPDF2 version still provides them.
    # The context manager guarantees the handle is closed even if PyPDF2 raises.
    with open(pdf, 'rb') as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file, strict=False)
        text = ''.join(
            reader.getPage(page_index).extractText()
            for page_index in range(reader.numPages)
        )

    # Bad shots/channels are listed between these two markers.  Rows from
    # every matching section are kept; a PDF without any such section simply
    # yields no rows instead of raising.
    sections = re.findall("Seismic Data Annotations.*?Comment", text, re.DOTALL)
    annotation_rows = []
    for section in sections:
        annotation_rows.extend(section.split('\n'))

    noise_re = re.compile(
        r"[tT]urn.[nN]oise|[bB]en[dt]|[cC]urrent.[nN]oise|[sS]hip.[nN]oise|[sS]pik[ey]|[bB]ad.[sS]hot"
    )
    # Shot points: "1234-1240" (range) or "1234 " (single) at the row start.
    sp_range_re = re.compile(r"^(\d{4})[-—–](\d{4})")
    sp_single_re = re.compile(r"^(\d{4})\s")
    # Receiver channels 1-320 written as "min-max".  The look-behind stops the
    # match from starting in the middle of a longer number (e.g. reading
    # "320-320" as 20-320 or "330-20" as 30-20).
    # NOTE: the 1-320 limit is hard-coded here and does not follow rcvmax.
    channel = r"[1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-9][0-9]|3[0-1][0-9]|320"
    rc_range_re = re.compile(rf"(?<!\d)({channel})[-—–]({channel})\b")
    rc_all_re = re.compile(r"\s[aA][lL][lL]\s")

    records = []
    for row in annotation_rows:
        if not noise_re.search(row):
            continue

        # Comment = the letters of the row, minus the "ALL" channel flag.
        comment = re.sub(r'([^a-zA-Z])', '', row).replace('ALL', '')

        sp_range = sp_range_re.search(row)
        sp_single = sp_single_re.search(row)
        rc_range = rc_range_re.search(row)
        rc_all = rc_all_re.search(row)

        # Branch order is significant: a shot range prefers an explicit
        # channel range over "ALL", a single shot prefers "ALL".
        if sp_range and rc_range:
            shots, channels = sp_range.groups(), rc_range.groups()
        elif sp_range and rc_all:
            shots, channels = sp_range.groups(), (rcvmin, rcvmax)
        elif sp_single and rc_all:
            shots, channels = (sp_single.group(1),) * 2, (rcvmin, rcvmax)
        elif sp_single and rc_range:
            shots, channels = (sp_single.group(1),) * 2, rc_range.groups()
        else:
            continue

        records.append(
            f"{line_number} {shots[0]} {shots[1]} {channels[0]} {channels[1]} {comment}\n"
        )

    # Open the temp file once per PDF, and only when there is something to add.
    if records:
        with open(tmp_file, "a") as tmp_txt:
            tmp_txt.writelines(records)
        
# Column layout of the rows that parse_pdf appends to the temp file
columns = ['Line_Number', 'Sp_min', 'Sp_max', 'Rcv_min', 'Rcv_max', 'Comment']

# A temp file left behind by an interrupted run would be appended to again,
# duplicating its rows in the spreadsheet, so start from a clean slate.
if os.path.exists(tmp_file):
    os.remove(tmp_file)

try:
    # Iterate through every PDF file and execute the function; other
    # directory entries (hidden files, sub-folders, ...) are skipped.
    for name in pdf_list:
        if not name.lower().endswith('.pdf'):
            continue
        parse_pdf(os.path.join(pdf_path, name))

    # Load the collected records; no temp file means no bad record was found.
    if os.path.exists(tmp_file):
        df = pd.read_csv(tmp_file, sep=" ", names=columns).sort_values(by=['Line_Number'])
    else:
        df = pd.DataFrame(columns=columns)

    # Write Output Excel spreadsheet. set_properties returns a new Styler and
    # leaves df untouched, so the export has to go through that Styler for
    # the left alignment to reach the file.
    df.style.set_properties(**{'text-align': 'left'}).to_excel('Bad_Records.xlsx', index=False)
finally:
    # Remove temp file, also when parsing or the export failed
    if os.path.exists(tmp_file):
        os.remove(tmp_file)