In [13]:
'''
Escreva código python que escreve uma função read_tei_papers(path) que lê o nome de todos os arquivos XML de path e cria um pandas df em que cada registro tem a coluna paper_id, que é um número sequencial.
'''

'\nEscreva código python que escreve uma função read_tei_papers(path) que lê o nome de todos os arquivos XML de path e cria um pandas df em que cada registro tem a coluna paper_id, que é um número sequencial.\n'

In [14]:
import re

def extract_year(citation):
    """Find the first standalone four-digit number in ``citation``.

    Args:
        citation (str): Raw citation text.

    Returns:
        tuple: ``(year, None)`` with ``year`` as an int when a four-digit
        number delimited by word boundaries is present, otherwise
        ``(None, None)``. The second slot is always ``None`` so the result
        has the same ``(year, page)`` shape as ``extract_year_and_page``.
    """
    found = re.search(r'\b(\d{4})\b', citation)
    if found is None:
        return (None, None)
    return (int(found.group(1)), None)


def extract_year_and_page(citation):
    """Pull the publication year and first cited page out of a citation string.

    The citation is searched with one regular expression made of nine
    alternatives, each covering one citation layout (for example
    ``1949, p. 258``, ``1949: 351``, ``1920, 121-122`` or ``pp. 105-6``).

    Args:
        citation (str | None): Raw citation text.

    Returns:
        tuple: ``(year, page)``. ``year`` is an int with any trailing letter
        suffix (as in ``1985b``) removed, or ``None`` for the year-less
        ``pp.`` layout. ``page`` is an int (the first page of a range).
        ``(None, None)`` is returned for ``None`` input. When no layout
        matches, the result of ``extract_year(citation)`` is returned, i.e.
        ``(year, None)`` or ``(None, None)``.
    """
    if citation is None:
        return (None, None)

    # Order matters: re.search reports the first alternative that matches
    # at the leftmost position.
    pattern = (
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?)[:\s,]*pp?\.\s*(\d+)(?:-\d+)?|"  # groups 1-2: year + "p."/"pp."
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?):\s*(\d+)|"                     # groups 3-4: "year: page"
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?),\s*p\.\s*(\d+)|"              # groups 5-6: "year, p. page"
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?),\s*(\d+)-\d+|"                # groups 7-8: "year, first-last"
        r"(?:\b|[^\(])(?:[^,]* )?(\d{4}[a-z]?),?\s*(\d+)(?:-\d+)?|"           # groups 9-10: bare "year page"
        r"(?:\b|[^\(])(?:[^,]* )?(\d{4}[a-z]?),?\s*p\.\s*(\d+)|"              # groups 11-12: "year p. page"
        r"(?:\b|[^\(])(?:[^,]* )?(\d{4}[a-z]?)\s*:\s*(\d+)|"                  # groups 13-14: "year : page"
        r"(?:\b|[^\(])(?:[^,]*)pp?\.\s*(\d+)|"                                # group 15: "pp. page", no year
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?),\s*chap\.\s*(\d+)"            # groups 16-17: "year, chap. n"
    )
    found = re.search(pattern, citation)
    if not found:
        # No page layout recognised: fall back to a year-only lookup.
        return extract_year(citation)

    # (year group, page group) for each alternative, in pattern order.
    # A year group of None marks the alternative that captures a page only.
    group_layout = (
        (1, 2), (3, 4), (5, 6), (7, 8), (9, 10),
        (11, 12), (13, 14), (None, 15), (16, 17),
    )
    for year_group, page_group in group_layout:
        probe = found.group(page_group if year_group is None else year_group)
        if probe:
            raw_year = None if year_group is None else probe
            page = int(found.group(page_group))
            break

    # Drop a trailing letter suffix such as the "b" in "1985b".
    year = int(re.match(r'\d{4}', raw_year).group()) if raw_year else raw_year

    return year, page
    

# Regression table for extract_year_and_page: (citation, expected (year, page)).
citation_cases = [
    ("Mises (1949, p.258)", (1949, 258)),
    ("(Mises, 1996, pp. 538-86)", (1996, 538)),
    ("(von Mises, 1963, p.254)", (1963, 254)),
    ("(Mises, 1920, 121-122)", (1920, 121)),
    ("(Mises 1949, 236-237)", (1949, 236)),
    ("(Mises 1920, 109)", (1920, 109)),
    ("(Mises 1920, p.162)", (1920, 162)),
    ("(von Mises, 1949: 351)", (1949, 351)),
    ("(Mises 1966: 493)", (1966, 493)),
    ("(von Mises 1998, p. 270)", (1998, 270)),
    ("Mises 1949, p. 3)", (1949, 3)),
    ("Mises 1985b, p. 236", (1985, 236)),
    ("Mises 1957b, 372", (1957, 372)),
    ("Mises, pp. 105-6;", (None, 105)),
    ("(Mises, 1949, p. 3)", (1949, 3)),
    ("(L Von Mises 1949, pp. 393)", (1949, 393)),
    ("(L Von Mises 1949 , pp. 393)", (1949, 393)),
    ("(C Berg 2022)", (2022, None)),
    ("Hayek (1976:71)", (1976, 71)),
    ("According to Mises ([1949] 1998: 116)", (1998, 116)),
    # Degenerate inputs: missing citation and text with no year at all.
    (None, (None, None)),
    ("[and]", (None, None)),
]

for citation_text, expected_pair in citation_cases:
    assert extract_year_and_page(citation_text) == expected_pair

In [15]:
import re

def extract_author(ref):
    """
    Extract a normalized first-author name from a raw in-text citation.

    The name is the shortest run of letters, whitespace and dots that is
    followed by a year, an opening bracket/parenthesis, a comma, a colon or a
    closing parenthesis. Every word is then passed through ``str.capitalize``
    (so ``von Mises`` becomes ``Von Mises`` and ``McGrath`` becomes
    ``Mcgrath``), a few known variants of Mises, Hayek and Marx are collapsed
    to the bare surname, a trailing ``et al.`` is dropped, and for
    ``"X and Y"`` only ``X`` is kept.

    Args:
        ref (str | None): A reference string
            (e.g., "McGrath et al., 2004: 96" or "Allen 2005)").

    Returns:
        str | None: The normalized author name, or None when ``ref`` is None
        or no name can be found.
    """
    if ref is None:
        return None

    # Main pattern: lazily grow the name until one of the delimiters follows.
    match = re.search(r"([A-Za-z][A-Za-z\s\.]+?)(?=\s\d{4}|\s\[|\s\(|,|:|\))", ref)
    if match:
        name = match.group(1).strip()

        # Capitalize each word except the literal tokens "et" / "al.", then
        # collapse known author variants to a single canonical surname.
        temp = " ".join(word.capitalize() if word.lower() not in {"et", "al."} else word for word in name.split()) \
                  .replace("Von Mises", "Mises") \
                  .replace("L V Mises", "Mises") \
                  .replace("L Mises", "Mises") \
                  .replace("Von Hayek", "Hayek") \
                  .replace("F A Hayek", "Hayek") \
                  .replace("K Marx", "Marx") \
                  .replace(" et al.", "")

        # Keep only the first author of "X And Y". The word boundary is
        # required: a plain split on " And" also cut surnames that merely
        # start with those letters ("J Anderson" was returned as "J").
        return re.split(r" And\b", temp, maxsplit=1)[0]

    # Fallback for a single word the main pattern cannot match, such as a
    # possessive like "Folta's (1998)".
    match_single_word = re.search(r"([A-Za-z]+(?:'s)?)(?=\s\d{4}|\s\(|:|\))", ref)
    if match_single_word:
        # Remove the possessive "'s" if present.
        name = match_single_word.group(1).replace("'s", "")
        return name.capitalize()

    return None

# Sample citations for extract_author; the trailing comment is the printed result.
author_samples = [
    "Johnson, 1999;",              # Johnson
    "Jouvenel (1961)",             # Jouvenel
    "Keen 2011",                   # Keen
    "(Menger, [1981])",            # Menger
    "von Mises (1949)",            # Mises
    "Von Hayek (1949)",            # Hayek
    "de Broglie (1924)",           # De Broglie
    "Van der Waals (1873)",        # Van Der Waals
    "(McGrath et al., 2004: 96)",  # Mcgrath
    "Allen 2005)",                 # Allen
    "Folta's (1998)",              # Folta
    "Boettke et al. (1998)",       # Boettke
    "Floss and Klein (1998)",      # Floss
    "Floss And Klein (1998)",      # Floss
    "(Von Mises 1949 )",           # Mises
    "(L V Mises 1949 )",           # Mises
    "(L Mises 1998 )",             # Mises
    "(L Von Mises 1998 )",         # Mises
]
for sample_ref in author_samples:
    print(extract_author(sample_ref))

# A missing citation yields no author.
assert extract_author(None) is None

Johnson
Jouvenel
Keen
Menger
Mises
Hayek
De Broglie
Van Der Waals
Mcgrath
Allen
Folta
Boettke
Floss
Floss
Mises
Mises
Mises
Mises


In [16]:
'''
prompt:
Gere código python que cria uma classe Reference que tem atributos: raw, context, sentence_seq_number, sentence_id, author, page, year. 
raw, context, sentence_seq_number e sentence_id são passados pelo construtor.
page e year são obtidos a partir da chamada à extract_year_and_page(raw), que retorna uma tupla (year, page).
author é obtido a partir da chamada a extract_author(raw).
'''

class Reference:
    """One in-text citation together with the sentence it was found in.

    Attributes set from the constructor arguments:
        raw: Citation text as it appears in the document.
        context: Full text of the sentence containing the citation.
        sentence_seq_number: Position of that sentence within the document.
        sentence_id: Identifier of that sentence.

    Attributes derived from ``raw``:
        year, page: Result of ``extract_year_and_page(raw)``.
        author: Result of ``extract_author(raw)``.
    """

    def __init__(self, raw, context, sentence_seq_number, sentence_id):
        self.raw = raw
        self.context = context
        self.sentence_seq_number = sentence_seq_number
        self.sentence_id = sentence_id

        # Both helpers accept None and then yield None for every derived field.
        self.year, self.page = extract_year_and_page(raw)
        self.author = extract_author(raw)

    def __repr__(self):
        # Show every attribute so a printed Reference is a complete record;
        # sentence_seq_number and author were previously left out.
        return (f"Reference(raw={self.raw!r}, context={self.context!r}, "
                f"sentence_seq_number={self.sentence_seq_number!r}, "
                f"sentence_id={self.sentence_id!r}, author={self.author!r}, "
                f"year={self.year!r}, page={self.page!r})")



# A fully populated citation: every derived field is filled in.
ref = Reference("(L Von Mises 1949 , pp. 393)", "",  3, "123")
assert (ref.year, ref.page) == (1949, 393)
assert ref.sentence_seq_number == 3
assert ref.author == "Mises"


# All-None input must not raise and leaves every checked field as None.
ref = Reference(None, None, None, None)
assert ref.year is None
assert ref.page is None
assert ref.sentence_seq_number is None
assert ref.author is None


In [17]:
'''
prompt:

Write a function parse_tei(tei_filepath) that opens the TEI XML file and counts the number of biblStruct entries. 
Also, count the number of <s></s> entries. 
Return four values:  
1. The paper title, available in teiHeader / fileDesc / titleStmt / title.
2. the number of s entries
3. the number of biblStruct entries
4. a list of Reference objects.  

Each reference object has a field raw, a field sentence_seq_number, a field sentence_id and a field context. 

Raw should be filled with the text inside the ref tag; context should be filled with the text on the parent <s> tag. 
sentence_seq_number should be filled with the sequential count of the <s> in the XML file, e.g., sentence_seq_number = 10 if s is the 10th sentence in the file.
sentence_id should be filled with the value of the property "xml:id" from the parent <s> tag.

'''

from lxml import etree
from dataclasses import dataclass
from typing import List, Tuple


def parse_tei(tei_filepath: str) -> Tuple[str, int, int, List[Reference]]:
    """Read one TEI XML file and collect its title, counts and citations.

    Args:
        tei_filepath: Path of the TEI XML file to parse.

    Returns:
        A 4-tuple ``(title, num_s, num_bibl, references)``:
        the stripped text of ``teiHeader/fileDesc/titleStmt/title`` (or
        ``"Unknown Title"`` when the element or its text is missing), the
        number of ``<s>`` elements, the number of ``<biblStruct>`` elements,
        and one ``Reference`` per ``<ref>`` found inside an ``<s>``.
    """
    root = etree.parse(tei_filepath, etree.XMLParser(ns_clean=True)).getroot()

    # Bind the document's default namespace (or the standard TEI one when the
    # root declares none) to the "tei" prefix used in the paths below.
    ns = root.nsmap.copy()
    ns['tei'] = ns.get(None, 'http://www.tei-c.org/ns/1.0')

    # 1. Paper title
    title_node = root.find('.//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title', namespaces=ns)
    if title_node is None or title_node.text is None:
        title = "Unknown Title"
    else:
        title = title_node.text.strip()

    # 2. Sentences, 3. bibliography entries
    sentences = root.findall('.//tei:s', namespaces=ns)
    bibl_total = len(root.findall('.//tei:biblStruct', namespaces=ns))

    # 4. One Reference per <ref>, tagged with its enclosing sentence.
    xml_id_attr = '{http://www.w3.org/XML/1998/namespace}id'
    references = []
    for position, sentence in enumerate(sentences, start=1):  # 1-based sentence number
        sentence_id = sentence.get(xml_id_attr)
        sentence_text = ''.join(sentence.itertext()).strip()
        references.extend(
            Reference(
                raw=''.join(ref_node.itertext()).strip(),
                sentence_seq_number=position,
                sentence_id=sentence_id,
                context=sentence_text,
            )
            for ref_node in sentence.findall('.//tei:ref', namespaces=ns)
        )

    return title, len(sentences), bibl_total, references


# Pin parse_tei's results for one known GROBID output.
paper_path = "../data/teis/from-scopus/A-Historical-Intervention-in-the-Opportunity-Wars-Forgotten-Scholarship-the-DiscoveryCreation-Disruption-and-Moving-Forward-by-Looking-Backward_2023_SAGE-Publications-Ltd.pdf.grobid.tei.xml"
title, sentence_count, reference_count, refs = parse_tei(paper_path)

assert (sentence_count, reference_count, len(refs)) == (283, 106, 222)
assert title == "A Historical Intervention in the \"Opportunity Wars\": Forgotten Scholarship, the Discovery/Creation Disruption, and Moving Forward by Looking Backward"
assert (refs[0].sentence_id, refs[0].sentence_seq_number) == ('_efadFU6', 7)

# Second file: nothing is asserted, the call only has to complete without raising.
paper_path = "../data/teis/from-scopus/A-heterodox-kzgazdasgtan-helyzete-a-gazdasgtudomnyban_2021_State-Audit-Office-of-Hungary.pdf.grobid.tei.xml"
title, sentence_count, reference_count, refs = parse_tei(paper_path)


In [18]:
'''
prompt:
Escreva código python que escreve uma função read_tei_papers(path) que lê o nome de todos os arquivos XML de path e cria dois dataframes.
Um pandas df chamado "papers_df" em que cada registro tem a coluna paper_id, que é um número sequencial, além das colunas title, filename, sentence_count, reference_count.
Para cada arquivo, invoque parse_tei(tei_filepath), que retorna title, sentence_count, reference_count e refs.  
Adicione sentence_count e reference_count no papers_df.  
Add a try/except block that catches exceptions in XML parsing.
Ao outro df, chamado refs_df, adicione todos os refs. Um ref é um objeto Reference que tem os campos raw, context, sentence_id, sentence_seq_number e page.
Cada campo deve ser uma coluna em refs_df.
'''

import os
import pandas as pd
from xml.etree.ElementTree import ParseError


def read_tei_papers(path: str):
    """Parse every ``.xml`` file in ``path`` into a papers table and a references table.

    Files are visited in sorted name order so that ``paper_id`` is assigned
    deterministically (``os.listdir`` returns entries in arbitrary order).
    A file that fails to parse is reported with ``print`` and skipped; it
    does not consume a ``paper_id``.

    Args:
        path: Directory containing the TEI XML files.

    Returns:
        ``(papers_df, refs_df)``.
        ``papers_df`` has one row per successfully parsed file with columns
        ``paper_id, title, filename, sentence_count, reference_count``.
        ``refs_df`` has one row per ``Reference`` with columns
        ``paper_id, raw, context, sentence_id, sentence_seq_number, author,
        page, year``. Both frames carry these columns even when empty.
    """
    paper_columns = ["paper_id", "title", "filename", "sentence_count", "reference_count"]
    ref_columns = ["paper_id", "raw", "context", "sentence_id",
                   "sentence_seq_number", "author", "page", "year"]

    papers = []
    refs = []
    paper_id = 0

    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".xml"):
            continue

        tei_filepath = os.path.join(path, filename)
        try:
            title, sentence_count, reference_count, ref_list = parse_tei(tei_filepath)
        except (ParseError, etree.XMLSyntaxError) as e:
            # parse_tei uses lxml, whose syntax errors are etree.XMLSyntaxError
            # (`etree` is the lxml module imported earlier in this notebook);
            # the stdlib ParseError alone never matched them.
            print(f"Erro ao processar {filename}: {e}")
            continue
        except Exception as e:
            # Best effort: report anything else and keep going with the next file.
            print(f"Erro desconhecido em {filename}: {e}")
            continue

        # One row for the paper itself.
        papers.append({
            "paper_id": paper_id,
            "title": title,
            "filename": filename,
            "sentence_count": sentence_count,
            "reference_count": reference_count
        })

        # One row per citation, linked back to the paper through paper_id.
        refs.extend(
            {
                "paper_id": paper_id,
                "raw": ref.raw,
                "context": ref.context,
                "sentence_id": ref.sentence_id,
                "sentence_seq_number": ref.sentence_seq_number,
                "author": ref.author,
                "page": ref.page,
                "year": ref.year
            }
            for ref in ref_list
        )

        paper_id += 1

    # Explicit columns keep the schema stable when nothing was parsed.
    papers_df = pd.DataFrame(papers, columns=paper_columns)
    refs_df = pd.DataFrame(refs, columns=ref_columns)

    return papers_df, refs_df

In [19]:
# Build both tables from every TEI file in the corpus directory.
papers_df, refs_df = read_tei_papers("../data/teis/from-scopus")

# Persist the tables in the row order produced by read_tei_papers.
papers_df.to_csv("../data/papers.csv")
refs_df.to_csv("../data/refs.csv")

# sort_values returns a new frame and leaves papers_df untouched, so the
# ranking is only useful as the cell's last expression, where it is displayed.
papers_df.sort_values(by='reference_count', ascending=False).head(10)

Erro desconhecido em Property-is-only-another-name-for-decentralized-creation-of-knowledge_2019_Springer-New-York-LLC-barbarabbertramgskcom.pdf.grobid.tei.xml: Document is empty, line 1, column 1 (../data/teis/from-scopus/Property-is-only-another-name-for-decentralized-creation-of-knowledge_2019_Springer-New-York-LLC-barbarabbertramgskcom.pdf.grobid.tei.xml, line 1)
