In [8]:
import boto3
import pdfplumber
import io
import csv
import re
import pandas as pd
from pdf2image import convert_from_bytes
import pytesseract

print("Starting")
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\mquig\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"
#print(pytesseract.get_tesseract_version())

client = boto3.client('s3');

bucket_name = 'music-industry-data-lake'
obj_key = "raw/pollstar_dir/pdf/reports_pack/top-tours/2019/north-american/Top200NorthAmericanTours_791.pdf"

non_musician_tours = []
non_musicians_path = "../data_cleaning/non_musicians.txt"

with open(non_musicians_path, 'r') as file:
    for line in file:
        non_musician_tours.append(line.strip())

noisy_symbols = []
noisy_symbols_path = "../data_cleaning/noisy_symbols.txt"

with open(noisy_symbols_path, 'r') as file:
    for line in file:
        noisy_symbols.append(line.strip())

def extract_text_ocr(pdf_bytes, page_num):
    pdf_images = convert_from_bytes(
        pdf_bytes,
        first_page=page_num,
        last_page=page_num
    )

    texts = []
    
    for img in pdf_images:
        texts.append(pytesseract.image_to_string(img))

    return "\n".join(texts)

def clean_text(text):
    text = text.encode("latin1", errors="ignore").decode("utf-8", errors="ignore")

    for symbol in noisy_symbols:
        text = text.replace(symbol, "")

    text = text.strip()
    #text = re.sub(r"\s*", "\s", text)
    fields = text.split()
    text = " ".join(fields)
    
    return text

try:
    file = client.get_object(Bucket=bucket_name, Key=obj_key)
    pdf_bytes = file['Body'].read()
    pdf_file = io.BytesIO(pdf_bytes)

    with pdfplumber.open(pdf_file) as pdf:
        top_tours_musicians = []
        top_tours_all = []
        print(len(pdf.pages))

        for page_num, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()

            if not page_text:
                page_text = extract_text_ocr(pdf_bytes, page_num)
                #print(page_text)

            if not page_text:
                print("Did not find text with pdfplumber or OCR. Skipping page.")
                continue
            
            lines = page_text.splitlines()

            for line in lines:
                        if re.search(r"^\d", line) and len(line) >= 40:
                            try:
                                line = clean_text(line)
                                print(line)
                                
                                tour_data = re.split(r"\s", line, 2);
                                rank = tour_data[0]
                                gross_millions = tour_data[1]
                                tour_data = tour_data[2]
        
                                end_of_artist_name_idx = re.search(r"\d*\.\d*", tour_data)
                                artist = tour_data[0:end_of_artist_name_idx.start()-2]
                                tour_data = tour_data[end_of_artist_name_idx.start():]

                                view_remaining_columns = re.split(r"\s", tour_data, 5)
        
                                if "/" in view_remaining_columns[4]:
                                    tour_data = re.split(r"\s", tour_data, 5)
                                    avg_ticket_price = tour_data[0]
                                    avg_num_tickets = tour_data[1]
                                    total_num_tickets = tour_data[2]
                                    avg_gross = tour_data[3]
                                    cities_shows = tour_data[4]
                                    agency = tour_data[5]

                                    tour_dict = {
                                        "Rank": rank,
                                        "Gross Millions": gross_millions,
                                        "Artist": artist,
                                        "Average Ticket Price": avg_ticket_price,
                                        "Average Tickets": avg_num_tickets,
                                        "Total Tickets": total_num_tickets,
                                        "Average Gross": avg_gross,
                                        "Cities Shows": cities_shows,
                                        "Agency": agency
                                    }
                                            
                                    #print(tour_dict)
            
                                    top_tours_all.append(tour_dict)

                                    if artist not in non_musicians:
                                        top_tours_musicians.append(tour_dict)
                            except Exception as e:
                                print("Error parsing tour record: ", e)

except client.exceptions.NoSuchKey:
    print(f"Error: Object '{object_key}' not found in bucket '{bucket_name}'")
    exit()
except Exception as e:
    print(f"Error retrieving object: {e}")
    exit()

'''
FOR: View files in bucket

bucket_name = "music-industry-data-lake"

resp = client.list_objects_v2(
    Bucket=bucket_name,
    Prefix="processed/pollstar_dir/reports_pack/musician-tours/"
)

for obj in resp.get("Contents", []):
    print(obj["Key"])
    response = client.delete_object(
        Bucket=bucket_name,
        Key=obj["Key"]
    )
'''

Starting
4
1 177.8 The Rolling Stones 226.61 49,041 784,652 11,112,923 14/16 Concerts West
Error parsing tour record:  name 'non_musicians' is not defined
2 157.4 Elton John 36.25 14,266 1,155,510 1,943,702 57/81 The Howard Rose Agency
Error parsing tour record:  name 'non_musicians' is not defined
3 97.0 Bob Seger & The Silver Bullet Band 111.72 12,224 867,925 1,365,694 1/71 ICM Partners
Error parsing tour record:  name 'non_musicians' is not defined
4 87.8 Pink 40.76 14,867 624,419 2,092,626 37/42 Marshall Arts
Error parsing tour record:  name 'non_musicians' is not defined
5 82.6 Ariana Grande 17.96 13,727 700,053 1,619,209 46/51 Creative Artists Agency
Error parsing tour record:  name 'non_musicians' is not defined
6 81.7 Jonas Brothers 106.14 14,246 769,310 1,512,119 48/54 United Talent Agency
Error parsing tour record:  name 'non_musicians' is not defined
7 81.6 KISS 01.94 11,945 800,302 1,217,650 67/67 Creative Artists Agency
Error parsing tour record:  name 'non_musicians' is n

  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


101 11.4 Alan Jackson 71.76 8,795 158,312 631,132 18/18 WME
Error parsing tour record:  name 'non_musicians' is not defined
102.-11.2 Hillsong United 37.17 7,333 300,634 272,582 39/41 Premier Productions
Error parsing tour record:  name 'non_musicians' is not defined
103. 11.0 Old Dominion 48.85 4,707 225,940 229,931 48/48 Paradigm Talent Agency
Error parsing tour record:  name 'non_musicians' is not defined
104 10.8 James Taylor 90.60 7,913 118,699 716,917 15/15 Creative Artists Agency
Error parsing tour record:  name 'non_musicians' is not defined
105 610.7 Lizzo 45.03 3,777 237,934 170,052 48/63 WME
Error parsing tour record:  name 'non_musicians' is not defined
106 10.7. Anuel AA 72.19 6,444 148,218 465,181 22/23 MN2S
Error parsing tour record:  name 'non_musicians' is not defined
107. 10.6 For King & Country 36.51 4,100 291,065 149,675 71/71 Jeff Roberts & Associates
Error parsing tour record:  name 'non_musicians' is not defined
109 10.4 s Bon Iver 60.37 5,939 172,229 358,532 26/

'\nFOR: View files in bucket\n\nbucket_name = "music-industry-data-lake"\n\nresp = client.list_objects_v2(\n    Bucket=bucket_name,\n    Prefix="processed/pollstar/reports/musician-tours/"\n)\n\nfor obj in resp.get("Contents", []):\n    print(obj["Key"])\n    response = client.delete_object(\n        Bucket=bucket_name,\n        Key=obj["Key"]\n    )\n'