## Get URLs for all Jerusalem images

In [2]:
import os
import pandas as pd
import geopandas as gpd
from dbfread import DBF
import requests
from urllib.parse import urlparse
import json
import re
import random

In [None]:
'''
Adapted from the LOC github.
This will allow us to store the web address for each item in a list, working through the search page by page.

'''
def get_image_urls(url, items=None, session=None):
    '''
    Collect the item ids from every page of a loc.gov search.

    Results whose "original_format" mentions "collection" or "web page" are
    skipped, because they describe the collection as a whole rather than a
    single item. Pagination is followed until the API reports no next page.

    Parameters:
        url: search URL to start from.
        items: optional list to append the ids to. A new list is created
            for each call when omitted, so separate calls never share results.
        session: optional object exposing a requests-style ``get(url, params=...)``
            (for example a ``requests.Session`` to reuse the connection across
            pages). The module-level ``requests`` is used when omitted.

    Returns:
        The list holding the collected ids (``items`` itself when it was given).

    Raises:
        Whatever ``raise_for_status()`` raises for a 4xx/5xx answer, instead of
        failing later while decoding an error page as JSON.
    '''
    if items is None:
        items = []
    http = requests if session is None else session

    # request pages of 100 results at a time
    params = {"fo": "json", "c": 100, "at": "results,pagination"}
    skipped_formats = ("collection", "web page")

    next_url = url
    while next_url:
        response = http.get(next_url, params=params)
        response.raise_for_status()
        data = response.json()

        for result in data.get("results") or []:
            # a result without "original_format" cannot be identified as a
            # collection/web page entry, so it is kept
            formats = result.get("original_format") or []
            if any(skipped in formats for skipped in skipped_formats):
                continue
            items.append(result.get("id"))

        # a missing/empty "next" means we have reached the last page
        next_url = (data.get("pagination") or {}).get("next")

    return items

In [None]:
# Search URL for every online image matching "jerusalem"
jerusalem_search_url = "https://www.loc.gov/photos/?fa=online-format:image&q=jerusalem"
# Walk all result pages, collecting into a fresh list
jerusalem = get_image_urls(jerusalem_search_url, items=[])
# Show how many results were collected
len(jerusalem)

In [None]:
# Destination of the full Jerusalem URL list
file_path = './jerusalem_urls.csv'
# One row per collected URL, in a single 'URLs' column
df_jerusalem = pd.DataFrame(jerusalem, columns=['URLs'])
# Write the table without the index column
df_jerusalem.to_csv(file_path, index=False)

In [None]:
# Select up to 300 random urls from the list for testing
# (capped at the list length so a short result list does not raise ValueError)
random_jerusalem = random.sample(jerusalem, min(300, len(jerusalem)))
# Save the sample -- not the full df_jerusalem table -- in a csv file
df_random = pd.DataFrame(random_jerusalem, columns=['URLs'])
file_path = './random_jerusalem.csv'
df_random.to_csv(file_path, index=False)

## Get URLs for images of each landmark

In [3]:
# Location of the landmark-names DBF table
file_path = "./landmarks_names/landmarks_names/landmarks_names.dbf"

# Encoding used to decode the table's text fields
encoding = 'utf-8'

# Iterate over the table once and keep every record in a list
with DBF(file_path, encoding=encoding) as dbf:
    records = [record for record in dbf]

In [None]:
# One DataFrame row per DBF record
df = pd.DataFrame(records)

# Flatten all columns into one long Series and renumber it from 0
stacked_series = df.stack().reset_index(drop=True)

# Mark empty strings as missing, then discard every missing entry
stacked_series = stacked_series.replace('', pd.NA).dropna()

# Wrap the remaining values in a single-column DataFrame
df_single_column = pd.DataFrame(stacked_series, columns=['Names'])

#### Data Cleaning

In [None]:
#function to remove tabs
def remove_quadruple_tabs_if_exists(s):
    """Remove every run of four or more consecutive tab characters from s.

    Matching the whole run at once means a five-tab run no longer leaves a
    single stray tab behind. Shorter runs (one to three tabs) are kept.
    Values that are not strings are returned unchanged instead of raising.
    """
    if not isinstance(s, str):
        return s
    return re.sub(r'\t{4,}', '', s)

# Run the tab-stripping helper over every landmark name
df_single_column['Names'] = df_single_column['Names'].map(remove_quadruple_tabs_if_exists)

# Drop apostrophes from the 'Names' column
names_without_apostrophes = df_single_column['Names'].str.replace("'", "")
df_single_column['Names'] = names_without_apostrophes

In [None]:
# Encode each name for use as a query-string value: spaces become '+', and
# characters that would otherwise break the query (such as '&', '#', '+')
# are percent-encoded. Missing values are left untouched.
from urllib.parse import quote_plus

df_single_column['formatted_names'] = df_single_column['Names'].map(quote_plus, na_action='ignore')

# Create the base URL
base_url = 'https://www.loc.gov/photos/?fa=location:jerusalem&q='

# Concatenate the base URL with the encoded names and assign it to the new 'url' column
df_single_column['url'] = base_url + df_single_column['formatted_names']

#### Getting URLs for each landmark in the list

In [None]:
# Give each landmark its own result list explicitly, so results from one query
# can never accumulate into another's (a shared default list would make every
# row point at the same growing list and inflate the counts).
df_single_column['image_urls'] = df_single_column['url'].apply(
    lambda landmark_url: get_image_urls(landmark_url, items=[])
)

In [None]:
# Number of collected results per landmark (length of each row's list)
df_single_column['count'] = df_single_column['image_urls'].map(len)

In [None]:
# Add up the per-landmark result counts. An item returned for several
# landmarks is counted once per landmark in this total.
total_count = df_single_column['count'].sum()

# Report the grand total
print("Total number:", total_count)

In [None]:
# One row per (landmark, result) pair, renumbered from 0
df_total = df_single_column.explode('image_urls', ignore_index=True)

## Get unique URLs

In [None]:
# Everything returned by the general 'jerusalem' search
jerusalem_set = set(df_jerusalem['URLs'])
# Everything returned by at least one landmark search
total_set = set(df_total['image_urls'])
# Keep only what the 'jerusalem' search found and no landmark search did
unique_urls = jerusalem_set.difference(total_set)

In [None]:
# How many 'jerusalem' results were not returned by any landmark query
unique_count = len(unique_urls)

# That number as a percentage of the full 'jerusalem' result list
unique_per = (unique_count / len(jerusalem)) * 100

In [None]:
# Destination of the unique-URL list
file_path = './unique_urls.csv'
# One row per unique URL; row order follows the set's iteration order
df_unique_urls = pd.DataFrame(list(unique_urls), columns=['URLs'])
# Write the table without the index column
df_unique_urls.to_csv(file_path, index=False)

In [None]:
# Select up to 300 random urls from the unique set for testing.
# random.sample needs a sequence (sets raise TypeError on Python 3.11+), so
# the set is sorted into a list first; the size is capped at what is available.
unique_url_pool = sorted(unique_urls)
random_unique_urls = random.sample(unique_url_pool, min(300, len(unique_url_pool)))
df_random_unique_urls = pd.DataFrame(random_unique_urls, columns=['URLs'])
file_path = './random_unique_urls.csv'
df_random_unique_urls.to_csv(file_path, index=False)