# Extract manifest link for creating datasets in Explore

This notebook is used to create lists of manifests to ingest into Explore. The manifest lists are created from one or more CSV files downloaded from OpenRefine. It extracts the "Media URL" values and concatenates them, creating a txt file.

In [None]:
import pandas as pd
import os
import glob

In [None]:
# Columns kept from each input CSV: only the one holding the manifest link.
keep_col = ['Media URL']

## Combined datasets to a single txt file

In [None]:
# Folder searched for the input CSV files (relative to the working directory).
folder_path = 'combined_datasets_explore'

In [None]:
# Collect every CSV file in the input folder. glob returns matches in arbitrary
# order, so sort them to make the row order of the merged table reproducible.
all_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

# Fail early with an explicit message; pd.concat on an empty list would
# otherwise raise a less helpful "No objects to concatenate" ValueError.
if not all_files:
    raise ValueError(f"No CSV files found in '{folder_path}'")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, delimiter=',', low_memory=False)
    # Copy the selection so that adding a column below modifies an independent
    # frame (avoids pandas' SettingWithCopyWarning).
    df = df[keep_col].copy()
    # Record which file each row came from. os.path.basename strips the
    # directory with either path separator, unlike splitting on "/".
    df['source'] = os.path.basename(filename)
    li.append(df)

# Stack all per-file frames into one table with a fresh 0..n-1 index.
df_merged = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Preview the first rows of the merged table.
df_merged.head()

In [None]:
# Save the merged table as a CSV file, without writing the row index.
df_merged.to_csv('merged_combined_manifests.csv', index=False)

In [None]:
# df_merged = df_merged.dropna()

In [None]:
# Build one space-separated string with every manifest URL.
# Missing values are dropped first: astype(str) would otherwise turn them into
# the literal text "nan", which would end up in the list as a bogus link.
url_string = ' '.join(df_merged['Media URL'].dropna().astype(str))

# Write the string to a text file (explicit UTF-8, so the output does not
# depend on the platform's default encoding)
with open('full_manifests.txt', 'w', encoding='utf-8') as file:
    file.write(url_string)

print("Text file 'full_manifests.txt' created.")

## Combined datasets to multiple files (based on the source)

The following performs the same operation as the one above, but the results are produced per original CSV file (source). If multiple CSV files are given as input, multiple lists of manifests are produced.

In [None]:
# For each source file, build one space-separated string of its manifest URLs.
# Missing values are dropped inside each group: astype(str) would otherwise
# turn them into the literal text "nan" and put it in the list.
grouped = df_merged.groupby('source')['Media URL'].apply(
    lambda x: ' '.join(x.dropna().astype(str))
)

for source, urls in grouped.items():
    # The source name becomes part of the output file name, so any path
    # separators in it are replaced with underscores.
    filename = source.replace('/', '_').replace('\\', '_') + '_manifest.txt'

    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(urls)

    print(f"Text file '{filename}' created.")

## Combined datasets to multiple files (based on the size)

The following operations combine multiple CSV files and extract the manifests from the 'Media URL' column. The result is then split into multiple files based on size. Currently, the script is set to use 20 MB as the maximum size for a manifest list. Bigger txt files have caused errors in Explore.

In [None]:
def write_to_file(file_number, content):
    """Write *content* to ``manifests_<file_number>.txt`` in the working directory.

    The file is written as UTF-8, so its size on disk equals
    ``len(content.encode('utf-8'))`` regardless of the platform's default
    encoding, and non-ASCII characters cannot fail to encode.

    Args:
        file_number: Number used as the suffix of the output file name.
        content: Text to write; an existing file with the same name is
            overwritten.
    """
    filename = f'manifests_{file_number}.txt'
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)
    print(f"Text file '{filename}' created.")

# Manifest URLs as strings. Missing values are dropped first: astype(str)
# would otherwise turn them into the literal text "nan". The column is
# iterated directly instead of joining and re-splitting on spaces, so a URL
# that itself contains a space is never cut in two.
urls = df_merged['Media URL'].dropna().astype(str)

# Define max size for each file (in bytes) - 20MB
max_size = 20 * 1024 * 1024

file_number = 1
current_size = 0
current_urls = []  # URLs of the file being filled; joined once when written

for url in urls:
    # Size added by the next URL, measured in UTF-8 bytes
    added_size = len(url.encode('utf-8')) + 1  # +1 for the space

    # Start a new file when adding the next URL would exceed the max size.
    # The `current_urls` check avoids writing an empty file when a single URL
    # is larger than the limit on its own.
    if current_urls and current_size + added_size > max_size:
        # Each URL is followed by one space, matching the size accounting.
        write_to_file(file_number, ' '.join(current_urls) + ' ')
        file_number += 1
        current_urls = []
        current_size = 0

    current_urls.append(url)
    current_size += added_size

# Write any remaining content to a file
if current_urls:
    write_to_file(file_number, ' '.join(current_urls) + ' ')


## Single CSV file to manifest list

In [None]:
# Load a single export into a DataFrame.
# NOTE(review): the path is absolute and specific to one machine, and the file
# has a .tsv extension but is read with read_csv's default comma delimiter.
# Confirm the file is really comma-separated (otherwise sep='\t' is needed).
df_single = pd.read_csv('/Users/carboni/Downloads/Slovenia-magazines-with-iiif-manifest-csv.tsv', low_memory=False)

In [None]:
# Build one space-separated string of the manifest URLs. Missing values are
# dropped first: astype(str) would otherwise turn them into the literal text
# "nan", which would end up in the list as a bogus link.
url_string = ' '.join(df_single['Media URL'].dropna().astype(str))

In [None]:
# Write the string to a text file (explicit UTF-8, so the output does not
# depend on the platform's default encoding)
with open('manifests_slovenia.txt', 'w', encoding='utf-8') as file:
    file.write(url_string)

# Report the name of the file that was actually written.
print("Text file 'manifests_slovenia.txt' created.")