In [1]:
import pandas as pd
import dateparser
import requests
from tqdm import tqdm
import traceback
import xml.etree.ElementTree as ET
import re
import sparql_dataframe
from SPARQLWrapper import SPARQLWrapper, JSON
import time

In [2]:
def process_xml(xml_data):
    """Flatten a Gallica ``Issues`` XML response into one delimited string.

    Every ``<issue>`` element found below the root contributes one
    ``"<ark>:::<text>"`` entry, where ``<ark>`` is the element's ``ark``
    attribute and ``<text>`` its stripped text content. Entries are joined
    with ``"||"``.

    Args:
        xml_data: The XML document, as ``str`` or ``bytes``.

    Returns:
        The joined entries, or an empty string when there is no ``<issue>``.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is not well-formed.
    """
    root = ET.fromstring(xml_data)
    issues_extract = []
    for issue_elem in root.findall('.//issue'):
        ark = issue_elem.get('ark')
        # ``.text`` is None for an empty element such as <issue ark="..."/>;
        # treat it as empty text instead of failing on ``None.strip()``.
        content = (issue_elem.text or "").strip()
        issues_extract.append(f"{ark}:::{content}")
    return "||".join(issues_extract)

def get_info(row, timeout=30):
    """Fetch the issue list of one periodical for one year and flatten it.

    Args:
        row: Mapping (e.g. a DataFrame row) with a ``'URL'`` entry holding the
            periodical ark and a ``'year'`` entry; both are concatenated
            verbatim into the request URL.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the whole run.

    Returns:
        The string built by ``process_xml`` from the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    url = "https://gallica.bnf.fr/services/Issues?ark=" + row['URL'] + "&date=" + str(row['year'])
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an exception if the request was unsuccessful
    return process_xml(response.content)

## From a DataFrame (CSV table)

In [3]:
# Load the journal table when one is available.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
journals_bnf = pd.read_csv('/Users/carboni/Downloads/2023 Gallica fashion.csv')

In [4]:
journals_bnf.head()

Unnamed: 0,List,OK VC,VC_Title,VC_Journal Type,VC_City,VC_Country,ark
0,Perfume Drinker list,Pas dans VC,Adam : revue des modes masculines en France et...,Men,Paris,France,ark:/12148/cb32682663c/date
1,Perfume Drinker list,Pas dans VC,Adam chemisier : illustration de la chemiserie...,Men,Paris,France,ark:/12148/cb34541274z/date


In [5]:
# Names of the journal-type and title columns in the table; both are optional
# and only copied to the output when the table actually contains them.
col_type = 'VC_Journal Type'
col_title = 'VC_Title'

In [6]:
# Record, for each optional metadata column, whether the loaded table has it.
col_type_exists, col_title_exists = [
    column_name in journals_bnf.columns
    for column_name in (col_type, col_title)
]

In [7]:
# One {"ark": ...} record per table row, in table order.
journals = [{"ark": ark_value} for ark_value in journals_bnf["ark"]]

## From a hand-written list (alternative to the table)

In [None]:
# Hand-written journal list; assigning it replaces any `journals` built earlier.
# Only the "ark" entry is read by the processing loop; "title" is informational.
journals = [
    {"title": "Action francaise", "ark": "ark:/12148/cb326819451/date"},
    {"title": "L'Aurore", "ark": "ark:/12148/cb32706846t/date"},
    {"title": "L'Auto", "ark": "ark:/12148/cb327071375/date"}
]

In [None]:
journals

# Process with API

In [8]:
# Collects one DataFrame per journal. The processing loop appends to this list,
# so re-run this cell before re-running the loop to avoid duplicated rows.
dfs = []

In [11]:
# Seconds to wait for Gallica before giving up on a request.
REQUEST_TIMEOUT = 30
ERROR_LOG_PATH = 'error_log.txt'

# A "full" date contains a day, a month word and a four-digit year
# (e.g. "15 décembre 1925").
FULL_DATE_PATTERN = re.compile(r'\d{1,2} \w+ \d{4}')
YEAR_PATTERN = re.compile(r'\d{4}')


def log_error(message):
    """Append ``message`` to the error log file."""
    with open(ERROR_LOG_PATH, 'a') as log_file:
        log_file.write(message)


def is_full_date(date_str):
    """Return True when ``date_str`` contains a day, a month word and a year."""
    return bool(FULL_DATE_PATTERN.search(date_str))


def update_date(date_str):
    """Complete a partial date to "01 Janvier <year>".

    Full dates are returned unchanged. A string without any four-digit year is
    also returned unchanged (it is left to the date parser) instead of raising,
    so one odd label does not discard the whole journal.
    """
    if is_full_date(date_str):
        return date_str
    year_match = YEAR_PATTERN.search(date_str)
    if year_match is None:
        return date_str
    return f"01 Janvier {year_match.group()}"


# Register ``progress_apply`` once instead of on every loop iteration.
tqdm.pandas(desc="Processing rows")

# For every journal: list its publication years, fetch the issues of each year,
# and build one row per issue (parsed date, notice URL, IIIF manifest URL).
# Failures are appended to the error log and the journal is skipped.
with requests.Session() as session:
    for journal in journals:
        ark = journal["ark"]
        url = f"https://gallica.bnf.fr/services/Issues?ark={ark}"

        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)

            if response.status_code != 200:
                log_error(f"Error for {ark}: Status code {response.status_code}\n")
                continue

            root = ET.fromstring(response.content)
            years = [element.text for element in root.findall(".//year")]

            # One row per year; get_info returns that year's issues as a
            # single "ark:::label||ark:::label||..." string.
            df = pd.DataFrame({'URL': [ark] * len(years), 'year': years})
            df['issues_extract'] = df.progress_apply(get_info, axis=1)
            df['issues_extract'] = df['issues_extract'].str.strip()

            # Raw string: "\|" is not a valid escape in a normal string literal.
            split_issues = df['issues_extract'].str.split(r'\|\|', expand=True)
            split_issues.columns = [
                f'issues_extract_{i+1}' for i in range(split_issues.shape[1])
            ]

            # Wide (one column per issue) -> long (one row per issue);
            # stack() drops the padding cells of years with fewer issues.
            table = split_issues.stack().reset_index()
            table.columns = ['numberIssue', 'issues_extract', 'date']
            new_df = table[['issues_extract', 'date']].copy()

            # Split "ark:::label" into the issue ark and its date label.
            # n=1 keeps any further ":::" inside the label.
            new_df[['ark', 'date']] = new_df['date'].str.split(":::", n=1, expand=True)

            # Complete partial dates, then parse them.
            new_df['date'] = new_df['date'].apply(update_date).apply(dateparser.parse)

            # Structuring dataset
            new_df["Notice"] = "https://gallica.bnf.fr/" + ark
            new_df = new_df.rename(columns={'date': 'normalized_date', 'ark': 'issueArk'})
            new_df["Media URL"] = 'https://gallica.bnf.fr/iiif/ark:/12148/' + new_df['issueArk'].astype(str) + '/manifest.json'
            new_df = new_df.drop(columns=['issues_extract', 'issueArk'])

            # Add col_type and col_title to new_df if they exist
            if col_type_exists:
                new_df[col_type] = journals_bnf.loc[journals_bnf['ark'] == ark, col_type].iloc[0]
            if col_title_exists:
                new_df[col_title] = journals_bnf.loc[journals_bnf['ark'] == ark, col_title].iloc[0]

            dfs.append(new_df)

        except ET.ParseError as e:
            log_error(f"Malformed XML for {ark}: {str(e)}\n")

        except Exception as e:
            log_error(f"Error for {ark}: {str(e)}\n{traceback.format_exc()}\n")

Processing rows: 100%|██████████| 20/20 [00:12<00:00,  1.61it/s]
Processing rows: 100%|██████████| 5/5 [00:01<00:00,  2.76it/s]


In [12]:
# Every per-journal frame carries its own 0..n-1 index; ignore_index gives the
# combined frame one unique running index instead of repeated labels.
combined_df = pd.concat(dfs, ignore_index=True)

In [13]:
combined_df.head()

Unnamed: 0,normalized_date,Notice,Media URL,VC_Journal Type,VC_Title
0,1925-12-15,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...
1,1926-01-25,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...
2,1926-02-25,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...
3,1926-03-25,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...
4,1926-05-01,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...


In [14]:
# Save the Gallica results (without the index column) before the SPARQL step.
combined_df.to_csv('combined_journal_data.csv', index=False)

## SPARQL Metadata

In [42]:
# Optional restart point: uncomment to reload the saved Gallica results
# instead of re-running the API cells above.
#combined_df = pd.read_csv('combined_journal_data.csv')

In [43]:
# SPARQL template. execute_query textually replaces the ?journal variable with
# a concrete <URI> before sending; the Wikidata match is optional.
query = """
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?Title ?Revue_wd ?City WHERE {
  ?journal skos:prefLabel ?Title ;
           foaf:focus ?focus . 
  ?focus <http://rdvocab.info/Elements/placeOfPublication> ?City .
  
  OPTIONAL {
    ?journal skos:exactMatch ?Revue_wd .
    FILTER (contains(str(?Revue_wd), "wikidata.org")) .
  }
}
"""

In [44]:
# Client for the data.bnf.fr SPARQL endpoint, configured to return JSON.
# The query set here is the raw template; execute_query sets the per-journal
# query before each request.
sparql = SPARQLWrapper("https://data.bnf.fr/sparql")
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

In [45]:
# Results keyed by journal URI. Lookups that return no bindings are stored as
# None, so the remaining rows of that journal do not each trigger a new request.
cache = {}


def execute_query(journal, col_title_exists):
    """Look up title, Wikidata match and city for one journal URI.

    The first call for a URI waits one second (to throttle requests), sends the
    query template with ``?journal`` replaced by ``<journal>``, and stores the
    outcome in ``cache``. Later calls for the same URI return the cached value,
    whatever ``col_title_exists`` is.

    Args:
        journal: URI of the journal, inserted as ``<journal>`` into the query.
        col_title_exists: When true, the ``"Title"`` key is left out of the
            result because the caller already has a title column.

    Returns:
        A dict with ``"Revue_wd"`` and ``"City"`` (and ``"Title"`` unless
        ``col_title_exists``), each taken from the first binding or ``None``
        when absent; ``None`` when the endpoint returned no bindings.
    """
    if journal in cache:
        return cache[journal]

    time.sleep(1)  # throttle: at most one request per second
    sparql.setQuery(query.replace("?journal", "<" + journal + ">"))
    results = sparql.query().convert()

    bindings = []
    if "results" in results and "bindings" in results["results"]:
        bindings = results["results"]["bindings"]

    if len(bindings) == 0:
        # Covers both a missing "bindings" section and an empty one; remember
        # the miss so the same URI is not queried again for every row.
        print(f"No SPARQL results for: {journal}")
        cache[journal] = None
        return None

    first = bindings[0]
    result = {}

    # Handle 'Title' only if col_title does not exist
    if not col_title_exists:
        result["Title"] = first["Title"]["value"] if "Title" in first else None

    if "Revue_wd" in first:
        result["Revue_wd"] = first["Revue_wd"]["value"]
    else:
        result["Revue_wd"] = None
        print(f"No Revue_wd present for {journal}")

    result["City"] = first["City"]["value"] if "City" in first else None

    cache[journal] = result
    print(f"Successful query for SPARQL: {journal}")
    return result

In [46]:
# Re-check against combined_df: when the title column is already present,
# execute_query leaves "Title" out of its result.
col_title_exists = col_title in combined_df.columns

In [47]:
# Build each row's data.bnf.fr URI from its Gallica notice URL:
# swap the host prefix, then remove "/date".
combined_df["SPARQL"] = (
    combined_df["Notice"]
    .str.replace('https://gallica.bnf.fr/', 'http://data.bnf.fr/', regex=False)
    .str.replace('/date', '', regex=False)
)

In [48]:
combined_df.head()

Unnamed: 0,normalized_date,Notice,Media URL,VC_Journal Type,VC_Title,SPARQL
0,1925-12-15,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...,http://data.bnf.fr/ark:/12148/cb32682663c
1,1926-01-25,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...,http://data.bnf.fr/ark:/12148/cb32682663c
2,1926-02-25,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...,http://data.bnf.fr/ark:/12148/cb32682663c
3,1926-03-25,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...,http://data.bnf.fr/ark:/12148/cb32682663c
4,1926-05-01,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...,http://data.bnf.fr/ark:/12148/cb32682663c


In [49]:
# One lookup per row. execute_query caches successful lookups per URI, so
# repeated rows of an already-resolved journal do not hit the endpoint again.
tqdm.pandas(desc="Processing SPARQL queries")
result = combined_df["SPARQL"].progress_apply(lambda x: execute_query(x, col_title_exists))

Processing SPARQL queries:   1%|          | 2/218 [00:01<02:11,  1.64it/s]

No Revue_wd present for http://data.bnf.fr/ark:/12148/cb32682663c
Successful query for SPARQL: http://data.bnf.fr/ark:/12148/cb32682663c


Processing SPARQL queries: 100%|██████████| 218/218 [00:02<00:00, 93.06it/s]

No Revue_wd present for http://data.bnf.fr/ark:/12148/cb34541274z
Successful query for SPARQL: http://data.bnf.fr/ark:/12148/cb34541274z





In [50]:
# Expand the per-row result dicts into one column per key.
# NOTE(review): execute_query can return None for a row — confirm how the
# DataFrame constructor handles that case.
result_df = pd.DataFrame(result.tolist())

In [51]:
# Give both frames the same fresh 0..n-1 index so that the column-wise concat
# that follows pairs rows by position.
result_df = result_df.reset_index(drop=True)
combined_df = combined_df.reset_index(drop=True)

In [52]:
# Append the SPARQL result columns to the right of the existing columns.
combined_df = pd.concat([combined_df, result_df], axis=1)

In [53]:
# The helper URI column was only needed for the lookups.
combined_df = combined_df.drop(columns=['SPARQL'])

In [54]:
# Strip the entity URI prefix, presumably leaving only the Wikidata identifier.
# NOTE(review): confirm the endpoint returns exactly this prefix (no "www.").
combined_df["Revue_wd"] = combined_df["Revue_wd"].str.replace('http://wikidata.org/entity/', '', regex=False)

In [55]:
combined_df.head()

Unnamed: 0,normalized_date,Notice,Media URL,VC_Journal Type,VC_Title,Revue_wd,City
0,1925-12-15,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...,,Paris
1,1926-01-25,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...,,Paris
2,1926-02-25,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...,,Paris
3,1926-03-25,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...,,Paris
4,1926-05-01,https://gallica.bnf.fr/ark:/12148/cb32682663c/...,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,Men,Adam : revue des modes masculines en France et...,,Paris


In [56]:
# Save the enriched table (without the index column).
combined_df.to_csv('combined_journal_data_sparql.csv', index=False)