In [1]:
import pandas as pd
import dateparser
import requests
from tqdm import tqdm
import xml.etree.ElementTree as ET

# BNF to Explore

## Set ARK of the revue to download

In [2]:
ark = "ark:/12148/cb327024729/date"

In [3]:
# Make the API call
url = f"https://gallica.bnf.fr/services/Issues?ark={ark}"
response = requests.get(url)

# Parse the XML response
root = ET.fromstring(response.content)
years = [element.text for element in root.findall(".//year")]

# Create the DataFrame
df = pd.DataFrame({'URL': [ark] * len(years), 'year': years})

In [4]:
df.head()

Unnamed: 0,URL,year
0,ark:/12148/cb327024729/date,1881
1,ark:/12148/cb327024729/date,1883
2,ark:/12148/cb327024729/date,1885
3,ark:/12148/cb327024729/date,1886
4,ark:/12148/cb327024729/date,1887


## Make an API call to Gallica to retrieve the issues

the call will extract, for each year, the number of published issues, their ids and their date of publication

In [5]:
def process_xml(xml_data):
    root = ET.fromstring(xml_data)
    issues_extract = []
    for issue_elem in root.findall('.//issue'):
        ark = issue_elem.get('ark')
        content = issue_elem.text.strip()
        issues_extract.append(f"{ark}:::{content}")
    return "||".join(issues_extract)

In [6]:
def get_info(row):
    url = "https://gallica.bnf.fr/services/Issues?ark=" + row['URL'] + "&date=" + str(row['year'])
    response = requests.get(url)
    response.raise_for_status()  # Raise an exception if the request was unsuccessful
    return process_xml(response.content)

In [7]:
tqdm.pandas(desc="Processing rows")
df['issues_extract'] = df.progress_apply(get_info, axis=1)

Processing rows: 100%|██████████| 38/38 [00:47<00:00,  1.24s/it]


## Data wrangling

the data retrieved from the API is organised as pairs of issueid::::year_of_publication divided by "||". We need to splits the retrieved values and reorganise them in order to have for each line only a single value

In [8]:
df.head()

Unnamed: 0,URL,year,issues_extract
0,ark:/12148/cb327024729/date,1881,bd6t5343084n:::1881||bpt6k1207922h:::01 févrie...
1,ark:/12148/cb327024729/date,1883,bd6t53347823:::22 décembre 1883
2,ark:/12148/cb327024729/date,1885,bd6t5335101c:::17 janvier 1885||bd6t5335102r::...
3,ark:/12148/cb327024729/date,1886,bd6t53353960:::02 janvier 1886||bd6t5335397c::...
4,ark:/12148/cb327024729/date,1887,bd6t5334472t:::01 janvier 1887||bd6t5337188b::...


In [9]:
# Remove leading and trailing spaces
df['issues_extract'] = df['issues_extract'].str.strip()

# Split the 'issues_extract' column and expand it into separate columns
split_issues = df['issues_extract'].str.split('\|\|', expand=True)

# Create new column names
new_column_names = [f'issues_extract_{i+1}' for i in range(split_issues.shape[1])]

# Assign the split values to the new columns
split_issues.columns = new_column_names

df1 = pd.concat([df, split_issues], axis=1)

In [10]:
df1.drop(columns=['issues_extract'], inplace=True)

In [11]:
df1.head()

Unnamed: 0,URL,year,issues_extract_1,issues_extract_2,issues_extract_3,issues_extract_4,issues_extract_5,issues_extract_6,issues_extract_7,issues_extract_8,...,issues_extract_67,issues_extract_68,issues_extract_69,issues_extract_70,issues_extract_71,issues_extract_72,issues_extract_73,issues_extract_74,issues_extract_75,issues_extract_76
0,ark:/12148/cb327024729/date,1881,bd6t5343084n:::1881,bpt6k1207922h:::01 février 1881,bd6t53357331:::01 août 1881,bd6t53357331:::01 août 1881,,,,,...,,,,,,,,,,
1,ark:/12148/cb327024729/date,1883,bd6t53347823:::22 décembre 1883,,,,,,,,...,,,,,,,,,,
2,ark:/12148/cb327024729/date,1885,bd6t5335101c:::17 janvier 1885,bd6t5335102r:::24 janvier 1885,bd6t53351034:::31 janvier 1885,bd6t5335104h:::07 février 1885,bd6t5335105w:::14 février 1885,bd6t53351068:::21 février 1885,bd6t5335107n:::28 février 1885,bd6t53351081:::07 mars 1885,...,,,,,,,,,,
3,ark:/12148/cb327024729/date,1886,bd6t53353960:::02 janvier 1886,bd6t5335397c:::09 janvier 1886,bd6t5335398r:::16 janvier 1886,bd6t53353994:::23 janvier 1886,bd6t5335400x:::30 janvier 1886,bd6t53354019:::06 février 1886,bd6t5335402p:::13 février 1886,bd6t53354032:::20 février 1886,...,,,,,,,,,,
4,ark:/12148/cb327024729/date,1887,bd6t5334472t:::01 janvier 1887,bd6t5337188b:::01 janvier 1887,bd6t5337189q:::07 janvier 1887,bd6t5337190n:::28 janvier 1887,bd6t53371911:::04 février 1887,bd6t5337192d:::11 février 1887,bd6t5337193s:::18 février 1887,bd6t5337193s:::18 février 1887,...,bd6t53396079:::02 décembre 1887,bd6t5339608p:::09 décembre 1887,bd6t5339608p:::09 décembre 1887,bd6t53396092:::16 décembre 1887,bd6t53396092:::16 décembre 1887,bd6t53396100:::23 décembre 1887,bd6t5339611c:::30 décembre 1887,,,


In [13]:
df1 = df1.drop(columns=['URL','year'])

In [14]:
table = df1.stack()

In [15]:
table

0   issues_extract_1                 bd6t5343084n:::1881
    issues_extract_2     bpt6k1207922h:::01 février 1881
    issues_extract_3         bd6t53357331:::01 août 1881
    issues_extract_4         bd6t53357331:::01 août 1881
1   issues_extract_1     bd6t53347823:::22 décembre 1883
                                      ...               
37  issues_extract_45    bd6t5347492k:::27 novembre 1920
    issues_extract_46    bd6t5347493z:::04 décembre 1920
    issues_extract_47    bd6t5347494b:::11 décembre 1920
    issues_extract_48    bd6t5347495q:::18 décembre 1920
    issues_extract_49    bd6t53474963:::25 décembre 1920
Length: 1579, dtype: object

In [16]:
table = table.reset_index()

table.columns = ['numberIssue', 'issues_extract', 'date']

new_df = table[['issues_extract', 'date']]

In [17]:
new_df.head()

Unnamed: 0,issues_extract,date
0,issues_extract_1,bd6t5343084n:::1881
1,issues_extract_2,bpt6k1207922h:::01 février 1881
2,issues_extract_3,bd6t53357331:::01 août 1881
3,issues_extract_4,bd6t53357331:::01 août 1881
4,issues_extract_1,bd6t53347823:::22 décembre 1883


In [18]:
new_df = new_df.copy()

### Full Date extractions

In [19]:
new_df[['ark', 'date']] = new_df["date"].apply(lambda x: pd.Series(str(x).split(":::")))

In [20]:
new_df.head()

Unnamed: 0,issues_extract,date,ark
0,issues_extract_1,1881,bd6t5343084n
1,issues_extract_2,01 février 1881,bpt6k1207922h
2,issues_extract_3,01 août 1881,bd6t53357331
3,issues_extract_4,01 août 1881,bd6t53357331
4,issues_extract_1,22 décembre 1883,bd6t53347823


#### Check Errors

in the next step we check that all the dates are full dates. In order to recognize and transform the dates from the form "01 août 1881" to "1981-08-01" we need to have consistent dates. 
In case some entries does only have the year and no month, it is important to modify it. The next script will check if something is wrong.

In [21]:
import re

def is_full_date(date_str):
    pattern = r'\d{1,2} \w+ \d{4}'
    
    return bool(re.search(pattern, date_str))

new_df['is_full_date'] = new_df['date'].apply(is_full_date)

incomplete_dates = new_df[~new_df['is_full_date']]

print(incomplete_dates)

     issues_extract  date           ark  is_full_date
0  issues_extract_1  1881  bd6t5343084n         False


if one or more values are to be modified, you can change the instruction below to assign the appropriate date

In [22]:
new_df.loc[new_df['issues_extract'] =='issues_extract_1', 'date'] = '01 Janvier 1881'

### Parsing date

In [23]:
new_df.date = new_df.date.apply(lambda x: dateparser.parse(x))

In [24]:
new_df.head()

Unnamed: 0,issues_extract,date,ark,is_full_date
0,issues_extract_1,1881-01-01,bd6t5343084n,False
1,issues_extract_2,1881-02-01,bpt6k1207922h,True
2,issues_extract_3,1881-08-01,bd6t53357331,True
3,issues_extract_4,1881-08-01,bd6t53357331,True
4,issues_extract_1,1881-01-01,bd6t53347823,True


In [25]:
new_df = new_df.drop(columns=['is_full_date'])

In [26]:
new_df["Notice"] = "https://gallica.bnf.fr/" + ark

In [27]:
new_df.head()

Unnamed: 0,issues_extract,date,ark,Notice
0,issues_extract_1,1881-01-01,bd6t5343084n,https://gallica.bnf.fr/ark:/12148/cb327024729/...
1,issues_extract_2,1881-02-01,bpt6k1207922h,https://gallica.bnf.fr/ark:/12148/cb327024729/...
2,issues_extract_3,1881-08-01,bd6t53357331,https://gallica.bnf.fr/ark:/12148/cb327024729/...
3,issues_extract_4,1881-08-01,bd6t53357331,https://gallica.bnf.fr/ark:/12148/cb327024729/...
4,issues_extract_1,1881-01-01,bd6t53347823,https://gallica.bnf.fr/ark:/12148/cb327024729/...


## Metadata

this step is manual and depends on the journal. Each cell assign to the dataframe a new column with Title, City of publication, and all the metadata required by the project. Change it accordingly to your publication

In [28]:
new_df["Title"] = "L'art et la mode : journal de la vie mondaine"

In [29]:
new_df["City"] = "Paris"
new_df["Country"] = "France"
new_df["Journal Type"] = "Fashion"

In [30]:
new_df = new_df.drop(columns=['issues_extract'])

In [31]:
new_df.rename(columns={'date': 'normalized_date', 'ark':'issueArk'}, inplace=True)

In [32]:
new_df["Media URL"] = 'https://gallica.bnf.fr/iiif/ark:/12148/' + new_df['issueArk'].astype(str) + '/manifest.json'

In [33]:
new_df["wkt"] = 'POINT(2.3513888888889 48.856944444444)'

In [34]:
new_df["City_wd"] = 'Q90'
new_df["Country_wd"] = 'Q142'

In [35]:
new_df.to_csv('/Users/carboni/Downloads/art_mode1.csv', index=False)