In [1]:
import boto3
import pdfplumber
import io
import csv
import re
import pandas as pd
from pdf2image import convert_from_bytes
import pytesseract

def clean_text(text):
    text = text.encode("latin1", errors="ignore").decode("utf-8", errors="ignore")

    for symbol in noisy_symbols:
        text = text.replace(symbol, "")

    text = text.strip()
    #text = re.sub(r"\s*", "\s", text)
    fields = text.split()
    text = " ".join(fields)
    
    return text

pytesseract.pytesseract.tesseract_cmd = r"C:\Users\mquig\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

non_musicians = []
non_musicians_path = "../data_cleaning/non_musicians.txt"

with open(non_musicians_path, 'r') as file:
    for line in file:
        non_musicians.append(line.strip())

noisy_symbols = []
noisy_symbols_path = "../data_cleaning/noisy_symbols.txt"

with open(noisy_symbols_path, 'r') as file:
    for line in file:
        noisy_symbols.append(line.strip())

client = boto3.client('s3');

paginator = client.get_paginator('list_objects_v2')

bucket_name = 'music-industry-data-lake'
directory_prefix = "raw/pollstar_dir/pdf/reports_pack/top-tours/"

pages = paginator.paginate(Bucket=bucket_name, Prefix=directory_prefix)

def extract_text_ocr(pdf_bytes, page_num):
    pdf_images = convert_from_bytes(
        pdf_bytes,
        first_page=page_num,
        last_page=page_num
    )

    texts = []
    
    for img in pdf_images:
        texts.append(pytesseract.image_to_string(img))

    return "\n".join(texts)

for page in pages:
    if 'Contents' in page:
        for obj in page['Contents']:
            try:
                file = client.get_object(Bucket=bucket_name, Key=obj['Key'])
                
                pdf_bytes = file['Body'].read()
                pdf_file = io.BytesIO(pdf_bytes)
                
                with pdfplumber.open(pdf_file) as pdf:
                    #print(pdf.pages[0].extract_text())
                    top_tours_all = []
                    top_tours_musicians = []
                    
                    for page_num, page in enumerate(pdf.pages, start=1):
                        page_text = page.extract_text()

                        if not page_text:
                            print("PDF Plumber did not find text. Attempting to extract text with pytesseract OCR")
                            page_text = extract_text_ocr(pdf_bytes, page_num)

                        if not page_text:
                            print("No text found with pdfplumber or OCR, skipping page")
                            continue
                            
                        lines = page_text.splitlines()
            
                        for line in lines:
                            if re.search(r"^\d", line) and len(line) >= 40:
                                try:
                                    line = clean_text(line)
                                    #print(line)
                                    tour_data = re.split(r"\s", line, 2);
                                    rank = tour_data[0]
                                    gross_millions = tour_data[1]
                                    tour_data = tour_data[2]
            
                                    end_of_artist_name_idx = re.search(r"\d*\.\d*", tour_data)
                                    artist = tour_data[0:end_of_artist_name_idx.start()-1]
                                    tour_data = tour_data[end_of_artist_name_idx.start():]

                                    view_remaining_columns = re.split(r"\s", tour_data, 5)
            
                                    if "/" in view_remaining_columns[4]:
                                        tour_data = re.split(r"\s", tour_data, 5)
                                        avg_ticket_price = tour_data[0]
                                        avg_num_tickets = tour_data[1]
                                        total_num_tickets = tour_data[2]
                                        avg_gross = tour_data[3]
                                        cities_shows = tour_data[4]
                                        agency = tour_data[5]

                                        tour_dict = {
                                            "Rank": rank,
                                            "Gross Millions": gross_millions,
                                            "Artist": artist,
                                            "Average Ticket Price": avg_ticket_price,
                                            "Average Tickets": avg_num_tickets,
                                            "Total Tickets": total_num_tickets,
                                            "Average Gross": avg_gross,
                                            "Cities Shows": cities_shows,
                                            "Agency": agency
                                        }

                                        #print(tour_dict)
            
                                        top_tours_all.append(tour_dict)

                                        if artist not in non_musicians:
                                            top_tours_musicians.append(tour_dict)

                                    else:
                                        tour_data = re.split(r"\s", tour_data, 5)
                                        avg_ticket_price = tour_data[0]
                                        avg_num_tickets = tour_data[1]
                                        avg_gross = tour_data[2]
                                        rank2 = tour_data[3]
                                        gross_millions2 = tour_data[4]
            
                                        tour_data = tour_data[5]
                                        end_of_artist_name_idx = re.search(r"\s\d*\.\d*", tour_data)
                                        artist2 = tour_data[0:end_of_artist_name_idx.start()-1]
                                        tour_data = tour_data[end_of_artist_name_idx.start():]
                                        tour_data = re.split(r"\s", tour_data)
            
                                        avg_ticket_price2 = tour_data[0]
                                        avg_num_tickets2 = tour_data[1]
                                        avg_gross2 = tour_data[2]
            
                                        tour_dict1 = {
                                            "Rank": rank,
                                            "Gross Millions": gross_millions,
                                            "Artist": artist,
                                            "Average Ticket Price": avg_ticket_price,
                                            "Average Tickets": avg_num_tickets,
                                            "Total Tickets": None,
                                            "Average Gross": avg_gross,
                                            "Cities Shows": None,
                                            "Agency": None
                                        }

                                        tour_dict2 = {
                                            "Rank": rank2,
                                            "Gross Millions": gross_millions2,
                                            "Artist": artist2,
                                            "Average Ticket Price": avg_ticket_price2,
                                            "Average Tickets": avg_num_tickets2,
                                            "Total Tickets": None,
                                            "Average Gross": avg_gross2,
                                            "Cities Shows": None,
                                            "Agency": None
                                        }
            
                                        top_tours_all.append(tour_dict1)
                                        top_tours_all.append(tour_dict2)

                                        if artist not in non_musicians:
                                            top_tours_musicians.append(tour_dict1)

                                        if artist2 not in non_musicians:
                                            top_tours_musicians.append(tour_dict2)
                                            
                                except Exception as e:
                                    print("Error parsing tour record: ", e)
                    
                    df_all_tours = pd.DataFrame(top_tours_all)
                    df_musician_tours = pd.DataFrame(top_tours_musicians)

                    # get name of the file and add .csv instead of .pdf
                    file_path = obj['Key']
                    file_name = file_path.split('/')[-1]
                    csv_file_name = file_name.replace('.pdf', '.csv')
                    #print(csv_file_name)

                    #df_all_tours.to_csv("./csv/" + csv_file_name)

                    # save all tour df as csv in all-tours folder
                    
                    csv_buffer = io.StringIO()
                    df_all_tours.to_csv(csv_buffer, index=False)

                    try:
                        client.put_object(
                            Bucket="music-industry-data-lake",
                            Key="processed/pollstar_dir/reports_pack/top-tours/all-tours/" + csv_file_name,
                            Body=csv_buffer.getvalue(),
                        )
                        print("Saved all tours report")
                    except Exception as e:
                        print(f"Error uploading file: {e}")
                    
                    df_musician_tours["OriginalRank"] = df_musician_tours["Rank"]

                    df_musician_tours = df_musician_tours.reset_index(drop=True)
                    df_musician_tours.index = df_musician_tours.index+1
                    df_musician_tours.index.name = "MusicianRank"
                    
                    csv_buffer = io.StringIO()
                    df_musician_tours.to_csv(csv_buffer, index=True)
                    
                    try:  
                        client.put_object(
                            Bucket="music-industry-data-lake",
                            Key="processed/pollstar_dir/reports_pack/top-tours/musician-tours/" + csv_file_name,
                            Body=csv_buffer.getvalue()
                        )

                        print(f"Uploaded file: {csv_file_name}")
                    except Exception as e:
                        print(f"Error uploading file {csv_file_name}: {e}")
                    
            except client.exceptions.NoSuchKey:
                print(f"Error: Object '{object_key}' not found in bucket '{bucket_name}'")
                exit()
            except Exception as e:
                print(f"Error retrieving object: {e}")
                exit()

Error retrieving object: No /Root object! - Is this really a PDF?
Error retrieving object: No /Root object! - Is this really a PDF?
Error retrieving object: No /Root object! - Is this really a PDF?


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


Error parsing tour record:  'NoneType' object has no attribute 'start'
Error parsing tour record:  'NoneType' object has no attribute 'start'
Error parsing tour record:  'NoneType' object has no attribute 'start'
Error parsing tour record:  'NoneType' object has no attribute 'start'
Saved all tours report
Uploaded file: 2004yearendtop100tours_485.csv
Error retrieving object: No /Root object! - Is this really a PDF?
Error retrieving object: No /Root object! - Is this really a PDF?


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


Error parsing tour record:  'NoneType' object has no attribute 'start'
Error parsing tour record:  'NoneType' object has no attribute 'start'
Error parsing tour record:  'NoneType' object has no attribute 'start'
Saved all tours report
Uploaded file: 2007YearEndTop100Tours_361.csv
Error retrieving object: No /Root object! - Is this really a PDF?
Error retrieving object: No /Root object! - Is this really a PDF?


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


Saved all tours report
Uploaded file: 2009YearEndTop50WorldwideConcertTours_271.csv
Error retrieving object: No /Root object! - Is this really a PDF?
Error retrieving object: No /Root object! - Is this really a PDF?


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


Error parsing tour record:  'NoneType' object has no attribute 'start'
Saved all tours report
Uploaded file: 2011YearEndTop200NorthAmericanTours_173.csv
Error retrieving object: No /Root object! - Is this really a PDF?


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


Error parsing tour record:  'NoneType' object has no attribute 'start'
Saved all tours report
Uploaded file: 2011YearEndTop100WorldwideTours_172.csv
Error retrieving object: No /Root object! - Is this really a PDF?
Error retrieving object: No /Root object! - Is this really a PDF?


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


Error parsing tour record:  'NoneType' object has no attribute 'start'
Saved all tours report
Uploaded file: 2015YearEndTop100WorldwideTours_chartItem_26.csv
Error retrieving object: No /Root object! - Is this really a PDF?
Error retrieving object: No /Root object! - Is this really a PDF?


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


Error parsing tour record:  'NoneType' object has no attribute 'start'
Saved all tours report
Uploaded file: 2017YearEndTop100WorldwideTours_621.csv
Error retrieving object: No /Root object! - Is this really a PDF?
Error retrieving object: No /Root object! - Is this really a PDF?


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


Error parsing tour record:  'NoneType' object has no attribute 'start'
Saved all tours report
Uploaded file: 2018YearEndTop200NorthAmericanTours_698.csv
Error retrieving object: No /Root object! - Is this really a PDF?


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


Error parsing tour record:  'NoneType' object has no attribute 'start'
Error parsing tour record:  'NoneType' object has no attribute 'start'
Saved all tours report
Uploaded file: 2018YearEndTop100WorldwideTours_697.csv
Error retrieving object: No /Root object! - Is this really a PDF?
Error retrieving object: No /Root object! - Is this really a PDF?


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


Saved all tours report
Uploaded file: 2019MidYearTop100NorthAmericanTours_748.csv
PDF Plumber did not find text. Attempting to extract text with pytesseract OCR


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


PDF Plumber did not find text. Attempting to extract text with pytesseract OCR
PDF Plumber did not find text. Attempting to extract text with pytesseract OCR


  tour_data = re.split(r"\s", tour_data, 5)


Error parsing tour record:  'NoneType' object has no attribute 'start'
PDF Plumber did not find text. Attempting to extract text with pytesseract OCR
Error parsing tour record:  'NoneType' object has no attribute 'start'
Saved all tours report
Uploaded file: Top200NorthAmericanTours_791.csv
Error retrieving object: No /Root object! - Is this really a PDF?
Error retrieving object: No /Root object! - Is this really a PDF?


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


Saved all tours report
Uploaded file: top100worldwidetours_912.csv
Error retrieving object: No /Root object! - Is this really a PDF?
Error retrieving object: No /Root object! - Is this really a PDF?


  tour_data = re.split(r"\s", line, 2);
  view_remaining_columns = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)
  tour_data = re.split(r"\s", tour_data, 5)


Error parsing tour record:  'NoneType' object has no attribute 'start'
Error parsing tour record:  'NoneType' object has no attribute 'start'
Error parsing tour record:  list index out of range
Error parsing tour record:  'NoneType' object has no attribute 'start'
Error parsing tour record:  'NoneType' object has no attribute 'start'
Error parsing tour record:  list index out of range
Saved all tours report
Uploaded file: Top200NorthAmericanTours_984.csv
