In [1]:
import os
import time
import json
import requests 
from datetime import datetime


def save_request(request_data, page_number, file_path='events_data.json'):
# def save_request(request_data, page_number, file_path='events_data_231101-241101'):
    # Load existing requests if the file exists
    if os.path.exists('datasets/'+ file_path):
        with open('datasets/'+ file_path, 'r') as file:
            requests = json.load(file)
    else:
        requests = []
    
    # Add timestamp to each event 
    new_request = []
    tmp_timestamp = datetime.now().isoformat()


    # add a timestamp to every object being processed
    for tm_object in request_data:

        tm_object['db_stamp'] = tmp_timestamp
        requests.append(tm_object)
    
    
    # Save updated requests back to the file
    
    with open('datasets/' + file_path, 'w') as file:
        json.dump(requests, file, indent=4)

    print(page_number)

In [2]:
# pipeline to download information from an endpoint

error_case = None
def ticketmaster_download_data(object_to_retrieve,start= '2023-11-01T00:00:00Z',end ='2024-11-01T00:00:00Z',page_size = '80'):
    global error_case

    print(f'Object to extract: {object_to_retrieve}')
    #default values
    i = 0               # to start in the first page
    next_page = ''      # default page value

    consumer_key = 'p5lYbtnQB3foTsOFRN7Jc7jpzeox9EMN'
    country_code = 'US'


    base_url = 'https://app.ticketmaster.com'

    url0 = f'https://app.ticketmaster.com/discovery/v2/{object_to_retrieve}.json?'
    url0 += 'countryCode=' + country_code
    url0 += '&startDateTime=' + start 
    url0 += '&endDateTime=' + end
    # url0 += '&classificationName=' + 'music'
    url0 += '&size=' + page_size + '&apikey=' + consumer_key

    total_pages = 1
    total_elements = 0

    while i < total_pages:

        # check if this is the first page to prepare
        if i == 0:
            
            # get the first batch of information and retrieve the amount of pages to process
            events_list = requests.request('GET', url0 )
            total_pages = events_list.json()['page']['totalPages']
            total_elements = events_list.json()['page']['totalElements']

            # save the data requested
            if total_pages==0:
                print('There are no elements to retrieve.')
                break
            save_request(
                events_list.json()['_embedded'][object_to_retrieve]
                , str(events_list.json()['page']['number'])
                , f'{object_to_retrieve}_data.json'
                )
            
            # increase the page
            i += 1
            print(f'Total pages: {total_pages}')
            if total_pages > 1000:
                # break
                print('it will break')

        else:
            # proceed in case there is a next page in the request data
            if events_list.json().get('_links',{}).get('next','') != '':
                # request the 'next' page in the link in case there are more data
                events_list = requests.request('GET', base_url + events_list.json()['_links']['next']['href']+ '&apikey=' + consumer_key)
                try:
                    error_case = events_list
                    save_request(
                        events_list.json()['_embedded'][object_to_retrieve]
                        , str(events_list.json()['page']['number'])
                        , f'{object_to_retrieve}_data.json'
                        )
                    i += 1 
                except Exception as e:
                    print(e)
            # stop downloading more 
            else:
                break
        
        # in order to prevent the api_key to be throttled
        time.sleep(1)

    # unit test for validating the downloaded data

    # load recently created json
    with open(f'datasets/{object_to_retrieve}_data.json', 'r') as file:
        full_data = json.load(file)

    if len(full_data) == total_elements:
        print(f'The download of the object {object_to_retrieve} was successful.')
        print(f'Total elements downloaded: {total_elements}')
    else:
        print('There was an issue in the pipeline')
        print('Here is the last request''s response ')
        print('VVVVVVVV')
        print(error_case.text)
        print('')
        print(f'Rows extracted: {len(full_data)}' )

    # the json file needs to be formatted in the proper formatting for GCP
    print('Prepare data to have BigQuery necessary formatting.')
    with open(f'datasets/{object_to_retrieve}_data_f.json', "w") as new_file:
        for row in full_data:
            new_file.write(json.dumps(row))
            new_file.write("\n")

    # delete unformatted version of the data
    os.remove(f'datasets/{object_to_retrieve}_data.json')

In [9]:
# function to upload raw data into BigQuery

from google.cloud import bigquery

# settup global variables for service-account connexion 

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'service_key.json'
client = bigquery.Client(project='ticketmaster-demo-argo')


def upload_data_to_bigquer(object_of_interest):
    global client

    filename = f'datasets/{object_of_interest}_data_f.json'
    dataset_id = 'stage_db'
    table_id = f'{object_of_interest}_tb'

    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.autodetect = True

    with open(filename, "rb") as source_file:
        job = client.load_table_from_file(
            source_file,
            table_ref,
            location="us-east4",  # Must match the destination dataset location.
            job_config=job_config,
        )  # API request

    job.result()  # Waits for table load to complete.

    print("Loaded {} rows for object {} into {}:{}.".format(job.output_rows, object_of_interest, dataset_id, table_id))

In [4]:
ticketmaster_download_data('events','2023-11-01T00:00:00Z','2024-11-01T00:00:00Z','50')
print('')
ticketmaster_download_data('attractions','2024-10-15T00:00:00Z','2024-11-01T00:00:00Z','80') # this is pending
print('')
ticketmaster_download_data('venues','2024-10-01T00:00:00Z','2024-11-01T00:00:00Z','80') # got limited to only 1000 records per deep-page request

Object to extract: events
0
Total pages: 10
1
2
3
4
5
6
7
8
9
The download of the object events was successful.
Total elements downloaded: 494
Prepare data to have BigQuery necessary formatting.

Object to extract: attractions
0
Total pages: 4118
it will break
1
2
3
4
5
6
7
8
9
10
11
12
'_embedded'
There was an issue in the pipeline
Here is the last requests response 
VVVVVVVV
{"errors":[{"code":"DIS1035","detail":"API Limits Exceeded: Max paging depth exceeded. (page * size) must be less than 1,000","status":"400","_links":{"about":{"href":"/discovery/v2/errors.html#DIS1035"}}}]}

Rows extracted: 1040
Prepare data to have BigQuery necessary formatting.

Object to extract: venues
0
Total pages: 433
1
2
3
4
5
6
7
8
9
10
11
12
'_embedded'
There was an issue in the pipeline
Here is the last requests response 
VVVVVVVV
{"errors":[{"code":"DIS1035","detail":"API Limits Exceeded: Max paging depth exceeded. (page * size) must be less than 1,000","status":"400","_links":{"about":{"href":"/disc

In [6]:
for object in ['events','attractions','venues']:

    # 3 , for deleting the data
    query_string = f"""TRUNCATE TABLE `ticketmaster-demo-argo.stage_db.{object}_tb`;"""
    results = client.query_and_wait(query_string)

    print(f'The table {object} has been cleaned.')

The table events has been cleaned.
The table attractions has been cleaned.
The table venues has been cleaned.


In [11]:
upload_data_to_bigquer('events')
upload_data_to_bigquer('attractions')
upload_data_to_bigquer('venues')


Loaded 494 rows for object events into stage_db:events_tb.
Loaded 1040 rows for object attractions into stage_db:attractions_tb.
Loaded 1040 rows for object venues into stage_db:venues_tb.


### TEST ENV

In [None]:
# queries to test connection
# 1

# query_string = """SELECT name, SUM(number) as total
# FROM `bigquery-public-data.usa_names.usa_1910_current`
# WHERE name = 'William'
# GROUP BY name;
# """
# results = client.query_and_wait(query_string)

# # Print the results.
# for row in results:  # Wait for the job to complete.
#     print("{}: {}".format(row["name"], row["total"]))



# 2
# query_string = """SELECT *
# FROM `bigquery-public-data.usa_names.usa_1910_current`
# WHERE name = 'William'
# ;
# """
# results = client.query_and_wait(query_string)

# for row in results:
#     print(row)


# 3 , for deleting the data
# query_string = """TRUNCATE TABLE `ticketmaster-demo-argo.stage_db.events_tb`;
# """
# results = client.query_and_wait(query_string)

# for i in results:
#     print(i)