In [141]:
'''
Escreva código python que escreve uma função read_tei_papers(path) que lê o nome de todos os arquivos XML de path e criar um pandas df em que cada registro tem a coluna paper_id, que é um número sequencial.
'''

'\nEscreva código python que escreve uma função read_tei_papers(path) que lê o nome de todos os arquivos XML de path e criar um pandas df em que cada registro tem a coluna paper_id, que é um número sequencial.\n'

In [142]:
import re

def extract_year(citation):
    """Return ``(year, None)`` for the first standalone year found in ``citation``.

    A year is four digits delimited by word boundaries, optionally followed by
    a single lowercase disambiguation letter (e.g. ``2006a``); the letter is
    dropped. The second tuple slot is always ``None`` so the result has the
    same ``(year, page)`` shape as ``extract_year_and_page``.

    Args:
        citation: Citation text, or ``None``.

    Returns:
        ``(int_year, None)`` when a year is found, otherwise ``(None, None)``
        (including for ``None`` or empty input).
    """
    if not citation:
        return (None, None)
    # The optional [a-z] sits outside the capture group so "2006a" yields 2006.
    match = re.search(r'\b(\d{4})[a-z]?\b', citation)
    return (int(match.group(1)), None) if match else (None, None)


def extract_year_and_page(citation):
    """Extract a publication year and a page number from a citation string.

    The citation is searched with one regular expression made of nine
    alternatives. Each alternative captures a year (four digits plus an
    optional lowercase letter) and a number, except the eighth, which captures
    only a page introduced by "p."/"pp.". The ninth alternative captures the
    number after "chap." and returns it in the page slot.

    Args:
        citation: Citation text, or ``None``.

    Returns:
        ``(year, page)``. ``year`` is an ``int`` with any letter suffix removed,
        or ``None`` for the page-only alternative. ``page`` is an ``int`` (the
        first page when a range is given). ``None`` input gives
        ``(None, None)``. When the pattern does not match at all, the result of
        ``extract_year(citation)`` is returned instead.
    """
    if citation is None:
        return (None, None)

    citation_pattern = (
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?)[:\s,]*pp?\.\s*(\d+)(?:-\d+)?|"  # groups 1-2: year then "p."/"pp."
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?):\s*(\d+)|"                     # groups 3-4: "year: page"
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?),\s*p\.\s*(\d+)|"              # groups 5-6: "year, p. page"
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?),\s*(\d+)-\d+|"                # groups 7-8: "year, first-last"
        r"(?:\b|[^\(])(?:[^,]* )?(\d{4}[a-z]?),?\s*(\d+)(?:-\d+)?|"           # groups 9-10: year then bare page
        r"(?:\b|[^\(])(?:[^,]* )?(\d{4}[a-z]?),?\s*p\.\s*(\d+)|"              # groups 11-12: year then "p. page"
        r"(?:\b|[^\(])(?:[^,]* )?(\d{4}[a-z]?)\s*:\s*(\d+)|"                  # groups 13-14: "year : page"
        r"(?:\b|[^\(])(?:[^,]*)pp?\.\s*(\d+)|"                                # group 15: page only, no year
        r"(?:\b|[^\(])(?:[^,]* )?(\d{4}[a-z]?),\s*chap\.\s*(\d+)"            # groups 16-17: "year, chap. N"
    )
    found = re.search(citation_pattern, citation)
    if found is None:
        # Pattern did not match: fall back to a year-only lookup.
        return extract_year(citation)

    # (year group, page group) per alternative, probed in the original order.
    # A year group of None marks the page-only alternative.
    group_pairs = (
        (1, 2), (3, 4), (5, 6), (7, 8), (9, 10),
        (11, 12), (13, 14), (None, 15), (16, 17),
    )
    for year_group, page_group in group_pairs:
        probe_group = page_group if year_group is None else year_group
        if found.group(probe_group):
            year = None if year_group is None else found.group(year_group)
            page = int(found.group(page_group))
            break

    # Drop the disambiguation letter ("1985b" -> 1985) and convert to int.
    if year:
        year = int(re.match(r'\d{4}', year).group())

    return year, page
    

# Regression cases for extract_year_and_page, checked in order.
# Each entry is (citation, expected (year, page) tuple).
_CITATION_CASES = [
    ("Mises (1949, p.258)", (1949, 258)),
    ("(Mises, 1996, pp. 538-86)", (1996, 538)),
    ("(von Mises, 1963, p.254)", (1963, 254)),
    ("(Mises, 1920, 121-122)", (1920, 121)),
    ("(Mises 1949, 236-237)", (1949, 236)),
    ("(Mises 1920, 109)", (1920, 109)),
    ("(Mises 1920, p.162)", (1920, 162)),
    ("(von Mises, 1949: 351)", (1949, 351)),
    ("(Mises 1966: 493)", (1966, 493)),
    ("(von Mises 1998, p. 270)", (1998, 270)),
    ("Mises 1949, p. 3)", (1949, 3)),
    ("Mises 1985b, p. 236", (1985, 236)),
    ("Mises 1957b, 372", (1957, 372)),
    ("Mises, pp. 105-6;", (None, 105)),
    ("(Mises, 1949, p. 3)", (1949, 3)),
    ("(L Von Mises 1949, pp. 393)", (1949, 393)),
    ("(L Von Mises 1949 , pp. 393)", (1949, 393)),
    ("(C Berg 2022)", (2022, None)),
    ("Hayek (1976:71)", (1976, 71)),
    ("According to Mises ([1949] 1998: 116)", (1998, 116)),
    # Inputs with nothing to extract.
    (None, (None, None)),
    ("[and]", (None, None)),
]

for _citation, _expected in _CITATION_CASES:
    assert extract_year_and_page(_citation) == _expected

In [None]:
'''
prompt:
Gere código python que cria uma classe Reference que tem atributos: raw, context, sentence_id, page, year. 
raw, context, e sentence_id são passados pelo construtor.
page e year são obtidos a partir da chamada à extract_year_and_page(raw), que retorna uma tupla (year, page).
'''

class Reference:
    """One in-text citation together with the sentence it was found in.

    Attributes:
        raw: Citation text as passed to the constructor.
        context: Text of the surrounding sentence, as passed to the constructor.
        sentence_id: Identifier of the surrounding sentence, as passed in.
        year: First element of ``extract_year_and_page(raw)``.
        page: Second element of ``extract_year_and_page(raw)``.
    """

    # Attribute names in the order they appear in the repr.
    _REPR_FIELDS = ("raw", "context", "sentence_id", "year", "page")

    def __init__(self, raw, context, sentence_id):
        self.raw = raw
        self.context = context
        self.sentence_id = sentence_id

        # year and page are derived from the raw citation text.
        parsed = extract_year_and_page(raw)
        self.year, self.page = parsed

    def __repr__(self):
        rendered = ", ".join(
            f"{field}={getattr(self, field)!r}" for field in self._REPR_FIELDS
        )
        return f"Reference({rendered})"



# A well-formed citation yields an integer year and page on the Reference.
ref = Reference("(L Von Mises 1949 , pp. 393)", "", "123")
assert ref.year == 1949
assert ref.page == 393


# A Reference built from raw=None has neither year nor page.
# Identity comparison is the idiomatic (PEP 8) way to test for None.
ref = Reference(None, None, None)
assert ref.year is None
assert ref.page is None


In [144]:
'''
prompt:
Write a function parse_tei(tei_filepath) that opens the TEI XML file and counts the number of biblStruct entries. 
Also, count the number of <s></s> entries. 
Return four values:
1. The paper title, available in teiHeader / fileDesc / titleStmt / title.
2. the number of s entries
3. the number of biblStruct entries
4. a list of Reference objects.  

Each reference object has a field raw, a field sentence_id and a field context. 
Raw should be filled with the text inside the ref tag; context should be filled with the text on the parent <s> tag. 
sentence_id should be filled with the value of the property "xml:id" from the parent <s> tag.

'''


from lxml import etree
from dataclasses import dataclass
from typing import List, Tuple


def parse_tei(tei_filepath: str) -> Tuple[str, int, int, List[Reference]]:
    """Parse a TEI XML file and summarise its sentences and citations.

    Args:
        tei_filepath: Path of the TEI XML file, handed to ``etree.parse``.

    Returns:
        A 4-tuple ``(title, sentence_count, reference_count, references)``:

        * ``title`` - stripped text of teiHeader/fileDesc/titleStmt/title, or
          ``'Unknown Title'`` when that element or its text is missing.
        * ``sentence_count`` - number of ``<s>`` elements below ``<text>``.
        * ``reference_count`` - number of ``<biblStruct>`` elements anywhere
          in the document.
        * ``references`` - one ``Reference`` per ``<ref>`` found inside a
          counted ``<s>``, in document order.
    """
    tei_ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    document_root = etree.parse(tei_filepath).getroot()

    # Title: fall back to a placeholder when the element or its text is absent.
    title_node = document_root.find(
        './/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title', namespaces=tei_ns
    )
    if title_node is None or title_node.text is None:
        title = 'Unknown Title'
    else:
        title = title_node.text.strip()

    sentences = document_root.findall('.//tei:text//tei:s', namespaces=tei_ns)
    bibl_entries = document_root.findall('.//tei:biblStruct', namespaces=tei_ns)

    references = []
    for sentence in sentences:
        # xml:id lives in the XML namespace; a missing attribute becomes ''.
        sentence_id = sentence.get('{http://www.w3.org/XML/1998/namespace}id', '')
        # Sentence text with all nested markup flattened.
        context = ''.join(sentence.itertext()).strip()
        references.extend(
            Reference(
                raw=''.join(ref_node.itertext()).strip(),
                sentence_id=sentence_id,
                context=context,
            )
            for ref_node in sentence.findall('.//tei:ref', namespaces=tei_ns)
        )

    return title, len(sentences), len(bibl_entries), references



# Fixture with known counts: pins parse_tei's output for one paper.
paper_path = (
    "../data/teis/from-scopus/"
    "A-Historical-Intervention-in-the-Opportunity-Wars-Forgotten-Scholarship-"
    "the-DiscoveryCreation-Disruption-and-Moving-Forward-by-Looking-Backward"
    "_2023_SAGE-Publications-Ltd.pdf.grobid.tei.xml"
)
title, sentence_count, reference_count, refs = parse_tei(paper_path)
assert sentence_count == 278
assert reference_count == 106
assert len(refs) == 222
assert title == (
    'A Historical Intervention in the "Opportunity Wars": Forgotten Scholarship, '
    'the Discovery/Creation Disruption, and Moving Forward by Looking Backward'
)
assert refs[0].sentence_id == '_efadFU6'

# Second file: parsed with no assertions on the result, so this only checks
# that parse_tei completes without raising.
paper_path = (
    "../data/teis/from-scopus/"
    "A-heterodox-kzgazdasgtan-helyzete-a-gazdasgtudomnyban"
    "_2021_State-Audit-Office-of-Hungary.pdf.grobid.tei.xml"
)
title, sentence_count, reference_count, refs = parse_tei(paper_path)


raw Shane and Venkataraman's (2000)
raw (Alvarez & Barney, 2007;
raw Dimov, 2007;
raw McMullen, 2015;
raw Sarasvathy et al., 2020;
raw Shane, 2003;
raw 2012;
raw Suddaby et al., 2015;
raw Wood & McKinley, 2010)
raw (Alvarez & Porac, 2020;
raw Arikan et al., 2020;
raw Berglund et al., 2020;
raw McBride & Wuebker, 2021)
raw (Davidsson, 2015;
raw Kitching Rouse, 2017)
raw Foss and Klein (2020)
raw Alvarez and Barney (2020)
raw [and]
raw (Alvarez & Barney, 2020, p. 306)
raw Shane and Venkataraman (2000)
raw Shane and Venkataraman's (2000)
raw (Humphrey, 2005)
raw Dutton et al., 1990;
raw Thomas & McDaniel, 1990;
raw Thomas et al., 1993)
raw (Dutton & Jackson, 1987;
raw Fredrickson, 1985)
raw (Gartner et al., 2016)
raw (Dutton et al., 1983;
raw Thomas & McDaniel, 1990)
raw (Jackson & Dutton, 1988)
raw (Ramoglou & McMullen, 2021)
raw (Dutton & Ottensmeyer, 1987;
raw Ginsberg & Venkatraman, 1992)
raw (Ginsberg & Venkataraman, 1992;
raw Thomas & McDaniel, 1990)
raw Shane and Venkataraman's (20

In [145]:
'''
prompt:
Escreva código python que escreve uma função read_tei_papers(path) que lê o nome de todos os arquivos XML de path e cria dois dataframes.
Um pandas df chamado "papers_df" em que cada registro tem a coluna paper_id, que é um número sequencial, além das colunas title, filename, sentence_count, reference_count.
Para cada arquivo, invoque parse_tei(tei_filepath), que retorna sentence_count, reference_count e refs.  
Adicione sentence_count e reference_count no papers_df.  
Add a try-catch loop that catches exceptions in XML Parsing.
Ao outro df, chamado refs_df, adicione todos os refs. Um ref é um objeto Reference que tem os campos raw, context e page. Cada campo deve ser uma coluna em refs_df.
'''

import os
from xml.etree.ElementTree import ParseError

import pandas as pd


def read_tei_papers(path: str):
    """Parse every ``.xml`` file in ``path`` into a papers table and a refs table.

    Files are visited in sorted filename order so the sequential ``paper_id``
    is the same on every run (``os.listdir`` itself returns entries in
    arbitrary order). A file that fails to parse is reported with ``print``
    and skipped; it consumes no ``paper_id`` and contributes no rows.

    Args:
        path: Directory containing the TEI XML files.

    Returns:
        ``(papers_df, refs_df)``:

        * ``papers_df`` - one row per parsed file, with columns ``paper_id``,
          ``title``, ``filename``, ``sentence_count``, ``reference_count``.
        * ``refs_df`` - one row per ``Reference`` returned by ``parse_tei``,
          with columns ``paper_id``, ``raw``, ``context``, ``sentence_id``,
          ``page``, ``year``.

        Both frames carry these columns even when no file was parsed.
    """
    paper_columns = ["paper_id", "title", "filename", "sentence_count", "reference_count"]
    ref_columns = ["paper_id", "raw", "context", "sentence_id", "page", "year"]

    papers = []
    refs = []
    paper_id = 0

    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".xml"):
            continue

        tei_filepath = os.path.join(path, filename)
        try:
            title, sentence_count, reference_count, ref_list = parse_tei(tei_filepath)

            # Build this paper's rows before touching the shared lists, so a
            # failure here cannot leave a paper row without its paper_id bump.
            paper_refs = [
                {
                    "paper_id": paper_id,
                    "raw": ref.raw,
                    "context": ref.context,
                    "sentence_id": ref.sentence_id,
                    "page": ref.page,
                    "year": ref.year,
                }
                for ref in ref_list
            ]
        except (ParseError, etree.XMLSyntaxError) as e:
            # parse_tei parses with lxml, which signals malformed XML with
            # etree.XMLSyntaxError; the stdlib ParseError is kept for
            # backward compatibility.
            print(f"Erro ao processar {filename}: {e}")
            continue
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the batch.
            print(f"Erro desconhecido em {filename}: {e}")
            continue

        papers.append({
            "paper_id": paper_id,
            "title": title,
            "filename": filename,
            "sentence_count": sentence_count,
            "reference_count": reference_count,
        })
        refs.extend(paper_refs)
        paper_id += 1

    # Explicit columns keep the schema stable when nothing was parsed.
    papers_df = pd.DataFrame(papers, columns=paper_columns)
    refs_df = pd.DataFrame(refs, columns=ref_columns)

    return papers_df, refs_df

In [146]:
papers_df, refs_df = read_tei_papers("../data/teis/from-scopus")

# Persist both tables in the order read_tei_papers produced them.
papers_df.to_csv("../data/papers.csv")
refs_df.to_csv("../data/refs.csv")

# sort_values returns a new frame, so mid-cell its result was silently
# discarded. As the cell's last expression it is displayed instead; the frame
# written to disk above is unchanged.
papers_df.sort_values(by='reference_count', ascending=False)

raw 2
raw 3
raw 4
raw 5
raw 6
raw McCloskey 1994
raw McCloskey , 2008))
raw Boettke et al. (2010)
raw Lavoie (1997)
raw (Garrison, 1989, pp. 6-7)
raw Mises (2006a
raw Mises ( [1928]]
raw Mises (1954
raw Mises ( [1912], 357-366;], 357-366;
raw 1998
raw [1949]
raw (Garrison, 1989;
raw 2001, pp. 69-71;
raw Haberler, 1983
raw Haberler, [1932], pp. 14-15;], pp. 14-15;
raw Hayek, 1967
raw Hayek, [1935], pp. 54-65, 85-91;], pp. 54-65, 85-91;
raw 2008
raw [1933], pp. 60-62, 67-68, 73-75;
raw Huerta de Soto, 2006, pp. 348-360;
raw Macovei, 2015, pp. 416-418;
raw Mises, 1954
raw Mises, [1912], pp. 357-364;], pp. 357-364;
raw 1983, pp. 2-3;
raw 2006a
raw [1928], pp. 109-111;
raw 2006b
raw [1931], pp. 160-163;
raw Rothbard, 1983
raw Rothbard, [1969], pp. 29-30;], pp. 29-30;
raw 2000, pp. 9-14;
raw 2004
raw [1962
raw ], pp. 996-1004;
raw Salerno, 2012, pp. 15-24;
raw Sieroń, 2016, p. 313;
raw Strigl, 2000
raw Strigl, [1934], pp. 111-116)], pp. 111-116)
raw Simpson (2014, vol. I, p. 74
raw David How