# Archive.org API 

Find the items in each Archive.org collection, then retrieve and normalize each item's date.

In [71]:
import re
import time
from datetime import datetime

import pandas as pd
from dateutil.parser import parse
from internetarchive import get_item, search_items
from tqdm import tqdm

In [48]:
# Load the input table; its 'id_archive' column is used below as the collection identifier.
df = pd.read_csv('archive_to_add.csv')

In [49]:
# Preview the first rows of the input table.
df.head()

Unnamed: 0,Titre,URL,source,id,id_archive,present
0,The Small Pond Magazine of Literature,https://archive.org/details/pub_small-pond-mag...,thomas.csv,https://iiif.archive.org/iiif/pub_small-pond-m...,pub_small-pond-magazine-of-literature,False
1,The English Review Magazine,https://archive.org/details/pub_english-review...,thomas.csv,https://iiif.archive.org/iiif/pub_english-revi...,pub_english-review-magazine,False
2,ATA (Alberta Teachers' Association),https://archive.org/details/pub_ata-magazine,thomas.csv,https://iiif.archive.org/iiif/pub_ata-magazine...,pub_ata-magazine,False
3,HLM : The Howard League Magazine,https://archive.org/details/pub_hlm-the-howard...,thomas.csv,https://iiif.archive.org/iiif/pub_hlm-the-howa...,pub_hlm-the-howard-league-magazine,False
4,Clavier,https://archive.org/details/pub_clavier-a-maga...,thomas.csv,https://iiif.archive.org/iiif/pub_clavier-a-ma...,pub_clavier-a-magazine-for-pianists-and-organists,False


In [50]:
# Function to get items for a collection
def get_items_for_collection(collection_id):
    """Return the identifiers of all items that belong to a collection.

    Parameters
    ----------
    collection_id : str
        Archive.org collection identifier; it is interpolated into the
        search query ``collection:<collection_id>``.

    Returns
    -------
    list
        The ``'identifier'`` value of every search result, in the order the
        search yields them (empty when the search yields nothing).
    """
    # Only the name `search_items` is imported at the top of the notebook;
    # the `internetarchive` module itself is not, so call the function directly.
    search_results = search_items(f'collection:{collection_id}')
    return [result['identifier'] for result in search_results]

In [53]:
# One record per (collection, item) pair; tqdm reports progress per collection.
items_data = [
    {'id_archive': archive_id, 'item': item_identifier}
    for archive_id in tqdm(df['id_archive'], desc="Processing collections")
    for item_identifier in get_items_for_collection(archive_id)
]


Processing collections: 100%|██████████| 113/113 [02:29<00:00,  1.33s/it]


In [54]:
# Build a DataFrame from the collected records (columns 'id_archive' and 'item',
# one row per item found).
items_df = pd.DataFrame(items_data)

In [55]:
# Left-join the items onto the input table on 'id_archive': each collection row is
# repeated once per item, and a collection with no items keeps one row with a missing 'item'.
merged_df = df.merge(items_df, on='id_archive', how='left')

In [None]:
# Checkpoint the collection/item table to CSV (index not written).
merged_df.to_csv('updated_with_items.csv', index=False)

In [None]:
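# Optional shortcut: reload the checkpoint written above instead of re-running the search.
# NOTE: the following cells read `merged_df`, so assign the result to that name when enabling this.
#df = pd.read_csv('updated_with_items.csv')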

In [61]:
query_counter = 0

# Function to get the date for an item
def get_date_for_item(item_id):
    """Return the ``date`` metadata field of an Archive.org item.

    Parameters
    ----------
    item_id : str
        Identifier of the item to look up. Missing values (``None``/NaN, such
        as the 'item' of a collection without items after a left merge) are
        not queried.

    Returns
    -------
    The item's ``date`` metadata value; ``'No date available'`` when the
    identifier is missing or the item has no ``date`` field; ``'Error'`` when
    the lookup raised an exception (the error is printed).

    Raises
    ------
    NameError
        If ``get_item`` or ``time`` is not in scope. That is a notebook setup
        bug, not a per-item failure, so it is not converted to ``'Error'``.

    Notes
    -----
    The module-level ``query_counter`` is incremented after each successful
    lookup, and the function sleeps 30 seconds on every 200th one.
    """
    global query_counter

    # Nothing to look up: avoid a pointless request for a missing identifier.
    if item_id is None or pd.isna(item_id):
        return 'No date available'

    try:
        item = get_item(item_id)
        query_counter += 1

        # If query_counter hits a multiple of 200, pause for 30 seconds
        if query_counter % 200 == 0:
            print("Pausing for 30 seconds...")
            time.sleep(30)

        return item.metadata.get('date', 'No date available')
    except NameError:
        # A missing import would otherwise be swallowed below and fill the
        # whole column with 'Error'; fail loudly instead.
        raise
    except Exception as e:
        print(f"Error retrieving item {item_id}: {e}")
        return 'Error'

In [None]:
# Register tqdm's pandas integration so progress_apply shows a progress bar, then
# call get_date_for_item once per value of the 'item' column and store the results in 'Date'.
tqdm.pandas(desc="Retrieving Dates")
merged_df['Date'] = merged_df['item'].progress_apply(get_date_for_item)

In [57]:
# Checkpoint the table, now including the raw 'Date' column, to CSV (index not written).
merged_df.to_csv('updated_with_items_metadata.csv', index=False)

In [77]:
# Optional shortcut: reload the saved item metadata instead of repeating the API lookups above.
#merged_df = pd.read_csv('updated_with_items_metadata.csv')

In [113]:
def get_season_start_date(season, year):
    """Return an approximate ``YYYY-MM-DD`` start date for a season of a year.

    Parameters
    ----------
    season : str
        Season name; matched case-insensitively, ignoring surrounding
        whitespace. 'Fall' and 'Autumn' are equivalent.
    year : str or int
        Year, used verbatim as the prefix of the result.

    Returns
    -------
    str
        ``year`` followed by the month/day suffix of the season, or by
        ``'-01-01'`` when the season is not recognised.
    """
    season_starts = {
        'spring': '-03-01',  # Approximate start of spring
        'summer': '-06-01',  # Approximate start of summer
        'autumn': '-09-01',  # Approximate start of autumn
        'fall': '-09-01',    # Fall is treated the same as Autumn
        # December, in line with the three-month spacing of the other seasons
        # (the previous '-11-01' put winter in November).
        'winter': '-12-01',  # Approximate start of winter
    }
    season_key = str(season).strip().lower()
    # Default to January 1st if season not found
    return f"{year}{season_starts.get(season_key, '-01-01')}"

In [114]:
def transform_date(date_str):
    """Normalize a raw date value to a ``YYYY-MM-DD`` string.

    Parameters
    ----------
    date_str : str, int, float or None
        Raw date. ``None``/NaN count as missing; numeric years (``1948`` or
        ``1948.0``, as produced by a CSV round trip) are accepted; any other
        value is converted with ``str()`` and stripped of surrounding
        whitespace.

    Returns
    -------
    str
        * ``''`` for a missing value, ``''`` or ``'No date available'``;
        * the date itself for ``YYYY-MM-DD``, and the date part of a string
          that starts with ``YYYY-MM-DD`` followed by ``T`` or a space;
        * the first year plus ``-01-01`` for a year range such as
          ``'1909 - 1910'``;
        * ``YYYY-MM-01`` for ``YYYY-MM`` and ``YYYY-01-01`` for ``YYYY``;
        * the result of ``get_season_start_date`` when the start of the
          string names a season and a four-digit year is present;
        * otherwise whatever ``dateutil`` parses, with missing month/day
          filled in as January / the 1st;
        * ``'Unrecognized format'`` when nothing can be parsed.
    """
    # Missing values (None, or NaN from pandas) carry no date.
    if date_str is None or (isinstance(date_str, float) and date_str != date_str):
        return ''

    # A year read back from CSV may be a float such as 1948.0.
    if isinstance(date_str, float) and date_str.is_integer():
        date_str = int(date_str)
    text = str(date_str).strip()

    if text in ('No date available', ''):
        return ''

    # Check for full date format first (YYYY-MM-DD)
    if re.fullmatch(r'\d{4}-\d{2}-\d{2}', text):
        return text

    # A full date followed by a time or more text (e.g. an ISO timestamp):
    # keep the date part instead of cutting the string at its first hyphen.
    dated_prefix = re.match(r'(\d{4}-\d{2}-\d{2})[T ]', text)
    if dated_prefix:
        return dated_prefix.group(1)

    # Handle year range format like '1909 - 1910' (hyphen, en dash or em dash)
    year_range = re.fullmatch(r'(\d{4})\s*[-\u2013\u2014]\s*\d{4}', text)
    if year_range:
        return year_range.group(1) + '-01-01'

    # Handle YYYY-MM and YYYY formats
    if re.fullmatch(r'\d{4}-\d{2}', text):
        return text + '-01'
    if re.fullmatch(r'\d{4}', text):
        return text + '-01-01'

    # Handle ranges or complex formats: work from the part before the first dash.
    start_date = re.split(r'[-\u2013\u2014]', text, maxsplit=1)[0].strip()
    # The year may only appear after the dash ('Spring - Summer 1985').
    year_match = re.search(r'\d{4}', start_date) or re.search(r'\d{4}', text)

    # Check for season names (whole words, any letter case)
    season_match = re.search(
        r'(?<![A-Za-z])(spring|summer|autumn|fall|winter)(?![A-Za-z])',
        start_date,
        re.IGNORECASE,
    )
    if season_match and year_match:
        # Capitalised form is what get_season_start_date's lookup table uses.
        return get_season_start_date(season_match.group(1).capitalize(), year_match.group())

    # For other cases, use dateutil.parser. Without an explicit default it fills
    # missing fields from today's date, so pin month and day to January 1st.
    jan_first = datetime(datetime.now().year, 1, 1)
    if re.search(r'\s[-\u2013\u2014]\s', text):
        # A dash surrounded by spaces separates the two ends of a range.
        candidates = [start_date]
    else:
        # Otherwise the hyphens may belong to a single date ('03-15-1985'):
        # try the whole string first, then fall back to its first part.
        candidates = [text, start_date]

    for candidate in candidates:
        if year_match and not re.search(r'\d{4}', candidate):
            candidate = f'{candidate} {year_match.group()}'
        try:
            return parse(candidate, default=jan_first).strftime('%Y-%m-%d')
        except (ValueError, OverflowError):
            continue

    return 'Unrecognized format'

In [115]:
# Apply transform_date to every raw 'Date' value and store the result in 'normalized_date'.
merged_df['normalized_date'] = merged_df['Date'].apply(transform_date)

In [116]:
# Preview the first rows, including the new 'Date' and 'normalized_date' columns.
merged_df.head()

Unnamed: 0,Titre,URL,source,id,id_archive,present,item,Date,normalized_date
0,The Small Pond Magazine of Literature,https://archive.org/details/pub_small-pond-mag...,thomas.csv,https://iiif.archive.org/iiif/pub_small-pond-m...,pub_small-pond-magazine-of-literature,False,sim_small-pond-magazine-of-literature_1985-198...,1985 - 1989,1985-01-01
1,The English Review Magazine,https://archive.org/details/pub_english-review...,thomas.csv,https://iiif.archive.org/iiif/pub_english-revi...,pub_english-review-magazine,False,sim_english-review-magazine_1948_1_index,1948,1948-01-01
2,The English Review Magazine,https://archive.org/details/pub_english-review...,thomas.csv,https://iiif.archive.org/iiif/pub_english-revi...,pub_english-review-magazine,False,sim_english-review-magazine_1949_2_index,1949,1949-01-01
3,The English Review Magazine,https://archive.org/details/pub_english-review...,thomas.csv,https://iiif.archive.org/iiif/pub_english-revi...,pub_english-review-magazine,False,sim_english-review-magazine_1949_3_index,1949,1949-01-01
4,The English Review Magazine,https://archive.org/details/pub_english-review...,thomas.csv,https://iiif.archive.org/iiif/pub_english-revi...,pub_english-review-magazine,False,sim_english-review-magazine_1950_4_index,1950,1950-01-01


In [117]:
# Write the final table, with raw and normalized dates, to CSV (index not written).
merged_df.to_csv('updated_with_items_metadata_date.csv', index=False)