In [1]:
from pathlib import Path
import xml.etree.ElementTree as ET

import dateparser
import pandas as pd
import requests
from tqdm import tqdm

# BNF to Explore

## Set the ARK of the journal to download

In [2]:
# ARK identifier of the periodical; it is passed as-is to the Gallica Issues service below
ark = "ark:/12148/cb32810629m/date"

In [3]:
# Ask the Gallica Issues service which years have issues for this periodical
url = f"https://gallica.bnf.fr/services/Issues?ark={ark}"
# A timeout keeps a stalled connection from hanging the cell forever
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of trying to parse an error page as XML
response.raise_for_status()

# Parse the XML response and collect the text of every <year> element
root = ET.fromstring(response.content)
years = [element.text for element in root.findall(".//year")]

# One row per year; the ARK is repeated so that each row carries everything
# needed to build the per-year request later on
df = pd.DataFrame({'URL': [ark] * len(years), 'year': years})

In [4]:
# Sanity check: expect one row per year, each carrying the periodical ARK
df.head()

Unnamed: 0,URL,year
0,ark:/12148/cb32810629m/date,1833
1,ark:/12148/cb32810629m/date,1834
2,ark:/12148/cb32810629m/date,1835
3,ark:/12148/cb32810629m/date,1836
4,ark:/12148/cb32810629m/date,1837


## Make an API call to Gallica to retrieve the issues

For each year, the call retrieves the published issues (and hence their number), their ids and their dates of publication.

In [5]:
def process_xml(xml_data):
    """Flatten the <issue> elements of an XML document into a single string.

    Parameters
    ----------
    xml_data : str or bytes
        XML document to parse.

    Returns
    -------
    str
        One ``"<ark>:::<text>"`` entry per ``<issue>`` element, where ``<ark>``
        is the element's ``ark`` attribute and ``<text>`` its stripped text,
        joined with ``"||"``. Empty string when there is no ``<issue>``.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If ``xml_data`` is not well-formed XML.
    """
    root = ET.fromstring(xml_data)
    issues_extract = []
    for issue_elem in root.findall('.//issue'):
        ark = issue_elem.get('ark')
        # .text is None for an empty or self-closing element; treat it as ""
        # instead of crashing on None.strip()
        content = (issue_elem.text or "").strip()
        issues_extract.append(f"{ark}:::{content}")
    return "||".join(issues_extract)

In [6]:
def get_info(row):
    """Fetch the issues of one year and return them as a flattened string.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'URL'`` (the ARK sent as the ``ark`` query parameter)
        and ``'year'`` (sent as the ``date`` query parameter).

    Returns
    -------
    str
        The output of ``process_xml`` for the response body.

    Raises
    ------
    requests.HTTPError
        If the service answers with an error status.
    requests.Timeout
        If the service does not answer within the timeout.
    """
    url = "https://gallica.bnf.fr/services/Issues?ark=" + row['URL'] + "&date=" + str(row['year'])
    # Without a timeout a single stalled connection would block the whole run
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an exception if the request was unsuccessful
    return process_xml(response.content)

In [7]:
# Register progress_apply on pandas objects, then call get_info for every row
# (get_info performs one HTTP request per row)
tqdm.pandas(desc="Processing rows")
df['issues_extract'] = df.progress_apply(get_info, axis=1)

Processing rows: 100%|██████████| 86/86 [00:21<00:00,  4.07it/s]


## Data wrangling

The data retrieved from the API is organised as pairs of `issue_id:::date_of_publication` separated by `||`. We need to split the retrieved values and reorganise them so that each row holds a single value.

In [8]:
# Inspect the raw extraction: each row holds the "||"-joined "<ark>:::<text>" pairs of its year
df.head()

Unnamed: 0,URL,year,issues_extract
0,ark:/12148/cb32810629m/date,1833,bpt6k314169:::1833||bpt6k1197695m:::1833
1,ark:/12148/cb32810629m/date,1834,bpt6k31417n:::1834
2,ark:/12148/cb32810629m/date,1835,bpt6k314180:::1835
3,ark:/12148/cb32810629m/date,1836,bpt6k31419b:::1836
4,ark:/12148/cb32810629m/date,1837,bpt6k31420j:::1837


In [9]:
# Remove leading and trailing spaces
df['issues_extract'] = df['issues_extract'].str.strip()

# Split on the literal "||" separator, one issue per resulting column.
# The pattern is a raw string: in a normal string "\|" is an invalid escape
# sequence, which Python warns about and will eventually reject.
split_issues = df['issues_extract'].str.split(r'\|\|', expand=True)

# Name the columns issues_extract_1 ... issues_extract_N
new_column_names = [f'issues_extract_{i+1}' for i in range(split_issues.shape[1])]
split_issues.columns = new_column_names

# Put the per-issue columns next to the original ones (same row index)
df1 = pd.concat([df, split_issues], axis=1)

In [10]:
# The joined string has been expanded into issues_extract_N columns; drop the original
df1 = df1.drop(columns=['issues_extract'])

In [11]:
# Each issue of a year now sits in its own issues_extract_N column
df1.head()

Unnamed: 0,URL,year,issues_extract_1,issues_extract_2,issues_extract_3,issues_extract_4,issues_extract_5,issues_extract_6,issues_extract_7,issues_extract_8,issues_extract_9,issues_extract_10,issues_extract_11,issues_extract_12
0,ark:/12148/cb32810629m/date,1833,bpt6k314169:::1833,bpt6k1197695m:::1833,,,,,,,,,,
1,ark:/12148/cb32810629m/date,1834,bpt6k31417n:::1834,,,,,,,,,,,
2,ark:/12148/cb32810629m/date,1835,bpt6k314180:::1835,,,,,,,,,,,
3,ark:/12148/cb32810629m/date,1836,bpt6k31419b:::1836,,,,,,,,,,,
4,ark:/12148/cb32810629m/date,1837,bpt6k31420j:::1837,,,,,,,,,,,


In [12]:
# Keep only the issues_extract_N columns so that stacking yields one value per issue
df1 = df1.drop(columns=['URL','year'])

In [13]:
# Go from wide (one column per issue) to long (one row per issue).
# The explicit dropna() removes the empty cells of years with fewer issues:
# the legacy stack() dropped them implicitly, but the newer implementation
# (future_stack=True) keeps missing values.
table = df1.stack().dropna()

In [14]:
# Inspect the stacked Series: (row number, column name) -> "<ark>:::<text>"
table

0   issues_extract_1                   bpt6k314169:::1833
    issues_extract_2                 bpt6k1197695m:::1833
1   issues_extract_1                   bpt6k31417n:::1834
2   issues_extract_1                   bpt6k314180:::1835
3   issues_extract_1                   bpt6k31419b:::1836
                                      ...                
84  issues_extract_6      bpt6k11977044:::01 juillet 1937
    issues_extract_7    bpt6k1197705j:::01 septembre 1937
    issues_extract_8     bpt6k1197706z:::01 novembre 1937
85  issues_extract_1         bpt6k1197707c:::01 mars 1938
    issues_extract_2      bpt6k1197708s:::01 juillet 1938
Length: 109, dtype: object

In [15]:
# Turn the stacked Series into a flat frame: both index levels become columns
table = table.reset_index()
table.columns = ['numberIssue', 'issues_extract', 'date']

# Only the column label and the value are needed from here on.
# NOTE: at this stage "date" still holds the combined "<ark>:::<text>" string.
new_df = table.loc[:, ['issues_extract', 'date']]

In [16]:
# Note: the "date" column still holds the combined "<ark>:::<text>" string here
new_df.head()

Unnamed: 0,issues_extract,date
0,issues_extract_1,bpt6k314169:::1833
1,issues_extract_2,bpt6k1197695m:::1833
2,issues_extract_1,bpt6k31417n:::1834
3,issues_extract_1,bpt6k314180:::1835
4,issues_extract_1,bpt6k31419b:::1836


In [17]:
# Work on an independent copy so that later column assignments do not act on a selection of `table`
new_df = new_df.copy()

### Full Date extractions

In [18]:
# Split "<ark>:::<text>" once into its two parts with the vectorised string
# accessor instead of building one pd.Series per row. n=1 guarantees at most
# two columns, so the assignment to ['ark', 'date'] always lines up.
new_df[['ark', 'date']] = new_df['date'].astype(str).str.split(':::', n=1, expand=True)

In [19]:
# Check the split: "ark" holds the issue identifier, "date" the publication date text
new_df.head()

Unnamed: 0,issues_extract,date,ark
0,issues_extract_1,1833,bpt6k314169
1,issues_extract_2,1833,bpt6k1197695m
2,issues_extract_1,1834,bpt6k31417n
3,issues_extract_1,1835,bpt6k314180
4,issues_extract_1,1836,bpt6k31419b


#### Check Errors

In the next step we check that all the dates are full dates. In order to recognise and transform dates from the form "01 août 1881" to "1881-08-01", the dates need to be consistent.
If some entries only have the year and no month, they must be completed first. The next cell lists the entries that are not full dates.

In [20]:
import re

def is_full_date(date_str):
    """Tell whether a string contains a "day month year" date.

    A match is one or two digits, a space, a run of word characters, a space
    and four digits, found anywhere in ``date_str`` (e.g. "01 août 1881").

    Parameters
    ----------
    date_str : str
        Date text to inspect.

    Returns
    -------
    bool
        True when the pattern is found, False otherwise.
    """
    return re.search(r'\d{1,2} \w+ \d{4}', date_str) is not None

# Flag every row whose date text already is a complete "day month year" date
new_df['is_full_date'] = new_df['date'].apply(is_full_date)

# Rows that are not full dates are the ones that need to be completed
incomplete_dates = new_df.loc[~new_df['is_full_date']]

print(incomplete_dates)

      issues_extract  date            ark  is_full_date
0   issues_extract_1  1833    bpt6k314169         False
1   issues_extract_2  1833  bpt6k1197695m         False
2   issues_extract_1  1834    bpt6k31417n         False
3   issues_extract_1  1835    bpt6k314180         False
4   issues_extract_1  1836    bpt6k31419b         False
..               ...   ...            ...           ...
91  issues_extract_1  1912    bpt6k34986k         False
92  issues_extract_1  1913  bpt6k55683827         False
93  issues_extract_1  1916  bpt6k55854915         False
94  issues_extract_1  1917  bpt6k5587487w         False
98  issues_extract_1  1923  bpt6k1197698v         False

[84 rows x 4 columns]


If one or more values need to be modified, you can change the function below to assign the appropriate date. By default, entries that only have a year are set to 1 January of that year.

In [24]:
def update_date(date_str):
    """Complete a partial date so that it reads "day month year".

    Parameters
    ----------
    date_str : str
        Date text, e.g. "01 août 1881", "mars 1938" or "1833".

    Returns
    -------
    str
        ``date_str`` unchanged when ``is_full_date`` accepts it;
        "01 <month> <year>" when only the day is missing;
        "01 Janvier <year>" when only a four-digit year can be found.

    Raises
    ------
    ValueError
        If no four-digit year can be found in ``date_str``.
    """
    if is_full_date(date_str):
        return date_str

    # "mars 1938": month and year are known, so keep the month and only add
    # the missing day instead of pushing the issue back to January.
    if re.fullmatch(r'\s*[^\W\d_]+ \d{4}\s*', date_str):
        return f"01 {date_str.strip()}"

    year_match = re.search(r'\d{4}', date_str)
    if year_match is None:
        # Name the offending value instead of failing with
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(f"no four-digit year found in date {date_str!r}")
    return f"01 Janvier {year_match.group()}"

In [25]:
# Complete every partial date so that all rows share the same "day month year" shape
new_df['date'] = new_df['date'].apply(update_date)

In [None]:
# Manual override template: uncomment and adapt the filter and the value to force a date on selected rows
#new_df.loc[new_df['issues_extract'] =='issues_extract_1', 'date'] = '01 Janvier 1881'

### Parsing date

In [28]:
# Parse the French date strings. Naming the language spares dateparser from
# guessing it for every value.
parsed_dates = new_df['date'].apply(lambda x: dateparser.parse(x, languages=['fr']))

# dateparser returns None for text it cannot parse; stop here rather than
# silently exporting rows with an empty date.
unparsed = new_df.loc[parsed_dates.isna(), 'date']
if not unparsed.empty:
    raise ValueError(f"Could not parse these dates: {unparsed.unique().tolist()}")

new_df['date'] = parsed_dates

In [29]:
# Check the parsed dates; is_full_date still reflects the text as it was before completion
new_df.head()

Unnamed: 0,issues_extract,date,ark,is_full_date
0,issues_extract_1,1833-01-01,bpt6k314169,False
1,issues_extract_2,1833-01-01,bpt6k1197695m,False
2,issues_extract_1,1834-01-01,bpt6k31417n,False
3,issues_extract_1,1835-01-01,bpt6k314180,False
4,issues_extract_1,1836-01-01,bpt6k31419b,False


In [30]:
# The helper flag was only needed for the check above
new_df = new_df.drop(columns=['is_full_date'])

In [31]:
# Same link on every row: the Gallica address built from the periodical ARK
new_df["Notice"] = f"https://gallica.bnf.fr/{ark}"

In [32]:
# Every row should show the same Notice link
new_df.head()

Unnamed: 0,issues_extract,date,ark,Notice
0,issues_extract_1,1833-01-01,bpt6k314169,https://gallica.bnf.fr/ark:/12148/cb32810629m/...
1,issues_extract_2,1833-01-01,bpt6k1197695m,https://gallica.bnf.fr/ark:/12148/cb32810629m/...
2,issues_extract_1,1834-01-01,bpt6k31417n,https://gallica.bnf.fr/ark:/12148/cb32810629m/...
3,issues_extract_1,1835-01-01,bpt6k314180,https://gallica.bnf.fr/ark:/12148/cb32810629m/...
4,issues_extract_1,1836-01-01,bpt6k31419b,https://gallica.bnf.fr/ark:/12148/cb32810629m/...


## Metadata

This step is manual and depends on the journal. Each cell adds a new column to the dataframe with the title, the city of publication, and the other metadata required by the project. Change the values according to your publication.

In [33]:
# Title written to every row (no trailing space, so the exported value
# compares equal to the plain title)
new_df["Title"] = "Le Magasin pittoresque"

In [34]:
# Constant metadata applied to every row; "Journal Type" is left empty
new_df["City"] = "Paris"
new_df["Country"] = "France"
new_df["Journal Type"] = ""

In [35]:
# issues_extract only holds the stacked column label (issues_extract_N); it is not needed in the export
new_df = new_df.drop(columns=['issues_extract'])

In [36]:
# Give the date and identifier columns their export names
new_df = new_df.rename(columns={'date': 'normalized_date', 'ark':'issueArk'})

In [37]:
# Per-issue IIIF manifest URL, built from each row's issue identifier
new_df["Media URL"] = 'https://gallica.bnf.fr/iiif/ark:/12148/' + new_df['issueArk'].astype(str) + '/manifest.json'

In [38]:
# Same WKT point on every row; the coordinates look like lon/lat for Paris — TODO confirm
new_df["wkt"] = 'POINT(2.3513888888889 48.856944444444)'

In [39]:
# Presumably the Wikidata item ids of the city and country above — TODO confirm
new_df["City_wd"] = 'Q90'
new_df["Country_wd"] = 'Q142'

In [40]:
# Write to the current user's Downloads folder instead of one machine's
# absolute path, so the notebook runs unchanged for other users.
output_path = Path.home() / "Downloads" / "pittoresque.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
new_df.to_csv(output_path, index=False)