In [1]:
## Import BeautifulSoup for xml parsing, pandas for data wrangling and os for operative system access.
from bs4 import BeautifulSoup
import os
import pandas as pd

In [2]:
## Extract metadata from every document: id, idno (link), date, title,
## comprehensiveness score, dewey field, first author and first author key.
## Each of those is a tag (or a tag attribute) in the xml file.
docs = []
for file in os.listdir('./TGB_Sample'):  # every document in the folder where the sample is stored
    # The context manager closes the file handle as soon as the document has been parsed
    with open('./TGB_Sample/' + file, 'r', encoding="utf8") as fh:  # open with utf8 encoding
        soup = BeautifulSoup(fh, 'lxml')            # making the soup
    try:
        doc_id = soup.tei['n']                      # id (the 'n' attribute of the tei tag)
        idno = soup.idno.string                     # idno
        date = soup.xenodata.date.text              # date
        titulo = soup.xenodata.title.text           # title
        compr_score = soup.find('meta-data_comprehensiveness_score').text   # comprehensiveness score
        campo = soup.xenodata.dewey.text                # dewey tag (topic/subject/field/area)
        autor = soup.xenodata.author_1.author.text      # author
        autor_key = soup.xenodata.author_1.key.text     # author key
        # One plain list per document; the lists are turned into a DataFrame later on
        docs.append([doc_id, idno, date, titulo, compr_score, campo, autor, autor_key])
    except (AttributeError, KeyError, TypeError):
        # A missing tag is looked up as None, so reading from it raises AttributeError
        # (or TypeError when subscripted); a missing attribute raises KeyError.
        # Only those are treated as "tag not present" so unrelated errors stay visible.
        # Exactly one entry is appended per file, in listing order.
        docs.append('no xenodata tag')  # marked so the document gets more pre processing

In [5]:
## Split the documents: those marked as 'no xenodata tag' need more pre processing,
## the rest already carry their metadata and only need the text extracted.
files = os.listdir('./TGB_Sample')  ## Using list of documents
# docs[i] is paired with files[i] purely by position, so both must describe the
# same folder listing; a different length means the folder changed since docs was built.
if len(files) != len(docs):
    raise ValueError(
        "'./TGB_Sample' lists %d files but docs holds %d entries; "
        "rebuild docs before splitting" % (len(files), len(docs))
    )
mas_prepoc = []
just_text_extract = []
for file_name, doc in zip(files, docs):
    if doc == 'no xenodata tag':
        mas_prepoc.append(file_name)   ## keep the file name so the document can be parsed again
    else:
        just_text_extract.append(doc)  ## metadata row already extracted

In [7]:
## Second pass over the documents marked 'no xenodata tag': collect the reduced
## set of fields that does not depend on the xenodata tag.
masdat = []
for file_name in mas_prepoc:
    # The context manager closes the file handle as soon as the document has been parsed
    with open('./TGB_Sample/' + file_name, 'r', encoding="utf8") as fh:
        soup = BeautifulSoup(fh, 'lxml')
    try:    ## try to extract some information
        doc_id = soup.tei['n']
        idno = soup.idno.string
        compr_score = soup.find('meta-data_comprehensiveness_score').text
        campo = soup.dewey.text
        masdat.append([doc_id, idno, compr_score, campo])
    except (AttributeError, KeyError, TypeError):
        # Only "tag/attribute not present" errors are handled; anything else surfaces.
        ## Fallback record: id 0, the file name in the idno slot, and the
        ## 'Nodataatall' / 'NoData' placeholders for score and field
        masdat.append([0, file_name,'Nodataatall','NoData'])

In [8]:
## Documents for which only a few metadata fields were recovered, as a DataFrame
masproc = pd.DataFrame(
    data=masdat,
    columns=['id', 'idno', 'compr_score', 'campo'],
)

In [9]:
## Documents whose full metadata was extracted, as a DataFrame
buenos = pd.DataFrame(
    data=just_text_extract,
    columns=[
        'id', 'idno', 'date', 'titulo',
        'compr_score', 'campo', 'autor', 'autor_key',
    ],
)

In [6]:
## Rows stored with the placeholder id 0, i.e. documents where the reduced extraction also failed
masproc.loc[masproc['id'].eq(0)]

Unnamed: 0,id,idno,compr_score,campo
55,0,1025034_r.xml,No_data_at_all,NoData
129,0,5403285_r.xml,No_data_at_all,NoData


In [17]:
## Per-field table: number of non-null values in every column, grouped by 'campo'
buenos.groupby('campo').count()

Unnamed: 0_level_0,id,idno,date,titulo,compr_score,autor,autor_key
campo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000 - Généralités,2,2,2,2,2,2,2
010 - Bibliographies,3,3,3,3,3,3,3
030 - Encyclopédies générales,1,1,1,1,1,1,1
050 - Publications en série d'ordre général,1,1,1,1,1,1,1
"070 - Journalisme, édition. Journaux",4,4,4,4,4,4,4
...,...,...,...,...,...,...,...
930 - Histoire générale du monde ancien,2,2,2,2,2,2,2
940 - Histoire générale de l'Europe,15,15,15,15,15,15,15
944 - Histoire de la France (depuis 486),49,49,49,49,49,49,49
960 - Histoire générale de l'Afrique,2,2,2,2,2,2,2


In [18]:
## Number of full-metadata documents per field, most frequent first
buenos['campo'].value_counts()

campo
610 - Sciences médicales. Médecine                               184
840 - Littératures des langues romanes. Littérature française    129
800 - Littérature (Belles-lettres)                                81
944 - Histoire de la France (depuis 486)                          49
320 - Science politique                                           35
                                                                ... 
890 - Littératures des autres langues                              1
670 - Fabrication industrielle                                     1
030 - Encyclopédies générales                                      1
301 - Anthropologie et sociologie                                  1
330 - Economie                                                     1
Name: count, Length: 67, dtype: int64

In [19]:
## Number of few-metadata documents per field, most frequent first
masproc['campo'].value_counts()

campo
840 - Littératures des langues romanes. Littérature française     30
944 - Histoire de la France (depuis 486)                          26
320 - Science politique                                           22
800 - Littérature (Belles-lettres)                                20
340 - Droit                                                       20
370 - Education                                                   16
610 - Sciences médicales. Médecine                                14
920 - Biographie générale et généalogie                           11
350 - Administration publique                                      8
940 - Histoire générale de l'Europe                                6
010 - Bibliographies                                               5
100 - Philosophie et disciplines connexes                          4
170 - Ethique                                                      3
780 - Musique                                                      3
070 - Journalisme, édition. 

In [20]:
## Comprehensiveness score counts for the few-metadata documents in field 610 - Sciences médicales. Médecine
(
    masproc
    .loc[masproc['campo'].eq('610 - Sciences médicales. Médecine')]
    .groupby('compr_score')
    .count()
)

Unnamed: 0_level_0,id,idno,campo
compr_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.37,14,14,14


In [21]:
## Comprehensiveness score counts for the full-metadata documents in field 610 - Sciences médicales. Médecine
(
    buenos
    .loc[buenos['campo'].eq('610 - Sciences médicales. Médecine')]
    .groupby('compr_score')
    .count()
)

Unnamed: 0_level_0,id,idno,date,titulo,campo,autor,autor_key
compr_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.64,3,3,3,3,3,3,3
0.66,84,84,84,84,84,84,84
0.67,88,88,88,88,88,88,88
0.68,6,6,6,6,6,6,6
0.73,3,3,3,3,3,3,3


In [22]:
## Number of full-metadata documents per date, most frequent first
buenos['date'].value_counts()

date
1870    50
1882    25
1822    22
1879    21
1853    19
        ..
1773     1
1782     1
1893     1
1722     1
1790     1
Name: count, Length: 129, dtype: int64

In [23]:
## Data subset, for working only in field '610 - Sciences médicales. Médecine' and comprehensiveness score bigger than .5
# compr_score was read from the xml as text, so it is converted to numbers before
# the comparison; values that cannot be parsed become NaN and fail the threshold.
medicina = buenos[
    (buenos['campo'] == '610 - Sciences médicales. Médecine')
    & (pd.to_numeric(buenos['compr_score'], errors='coerce') > 0.5)
].copy()  # independent copy, so later changes to medicina never touch buenos

In [25]:
## Write the medicine subset to an Excel workbook, leaving out the DataFrame index
medicina.to_excel(excel_writer='Medicina.xlsx', index=False)

In [138]:
## Number of documents per date in the medicine subset, least frequent first
medicina['date'].value_counts(ascending=True)

date
1871    1
1888    1
1895    1
1882    1
1887    1
       ..
1863    5
1862    5
1868    6
1891    6
1853    7
Name: count, Length: 75, dtype: int64