In [None]:
'''
Escreva código python que escreve uma função read_tei_papers(path) que lê o nome de todos os arquivos XML de path e cria um pandas df em que cada registro tem a coluna paper_id, que é um número sequencial.
'''

In [None]:
# Directory containing the *.grobid.tei.xml files; passed to read_tei_papers() further down.
tei_path = "../data/interim/tei"

In [None]:
import re


def extract_year(citation):
    """Return the first four-digit year found in ``citation``.

    The result is a ``(year, page)`` tuple whose page slot is always ``None``,
    so it has the same shape as the value returned by
    ``extract_year_and_page``. A single lowercase suffix letter, as in
    ``2021a``, is accepted and dropped.

    Args:
        citation: Raw citation text, or ``None``.

    Returns:
        ``(int, None)`` when a year is found, otherwise ``(None, None)``.
    """
    if citation is None:
        return (None, None)

    # The optional [a-z] lets "2021a" match: without it the trailing \b fails,
    # because a digit followed by a letter is not a word boundary.
    match = re.search(r'\b(\d{4})[a-z]?\b', citation)
    return (int(match.group(1)), None) if match else (None, None)


def extract_year_and_page(citation):
    """Extract ``(year, page)`` from an in-text citation string.

    Recognised forms include ``1949, p. 258``, ``1996, pp. 538-86``,
    ``1949: 351``, ``1920, 121-122``, ``1920, 109``, a page without a year
    (``pp. 105-6``), ``ibid.: 26`` / ``op. cit.: 45`` and the
    "[original year] edition year" form, for which the ORIGINAL year is
    returned. For page ranges only the first page is returned. A suffix letter
    on the year (``1985b``) is dropped.

    Args:
        citation: Raw citation text, or ``None``.

    Returns:
        A ``(year, page)`` tuple of ints; either element is ``None`` when it
        cannot be found. When no page pattern matches, the result of
        ``extract_year(citation)`` is returned.
    """
    if citation is None:
        return (None, None)

    # Special case: "[original year] edition year", optionally followed by a page.
    #   (Mises [1949] 1963: 251-255)  -> (1949, 251)
    #   Mises ([1936] 1951, p. 287)   -> (1936, 287)
    #   Mises ([1949] 1998)           -> (1949, None)
    # The page part is optional: when it is absent we must still stop here,
    # otherwise the brackets are stripped below and the edition year is picked
    # up as a page by the "year page" alternative.
    special = re.search(
        r"\[\s*(\d{4})\s*\]\s*\d{4}[a-z]?"
        r"(?:\s*(?:[:;,]|\s)\s*(?:pp?\.\s*)?(\d+))?",
        citation
    )
    if special:
        original_year, first_page = special.groups()
        return int(original_year), (int(first_page) if first_page else None)

    # Collapse remaining bracketed years such as "[1949]" or "[1949]]" to "1949".
    citation = re.sub(r"\[\s*(\d{4})\s*\]+", r"\1", citation)

    match = re.search(
        # Year followed by "p." / "pp." and a page
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?)[:\s,]*pp?\.\s*(\d+)(?:-\d+)?|"

        # Year: page
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?):\s*(\d+)|"

        # Year, p. page
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?),\s*p\.\s*(\d+)|"

        # Year, page range
        r"(?:\b|[^\(])(?:[^,]*, )?(\d{4}[a-z]?),\s*(\d+)-\d+|"

        # Year, page
        r"(?:\b|[^\(])(?:[^,]* )?(\d{4}[a-z]?),?\s*(\d+)(?:-\d+)?|"

        # "p." / "pp." and a page, without a year
        r"(?:\b|[^\(])(?:[^,]*)pp?\.\s*(\d+)",
        citation
    )

    if match:
        groups = match.groups()

        # The first five alternatives each contribute a (year, page) group pair;
        # zip() stops before the unpaired last group (page without year).
        for year_text, page_text in zip(groups[0::2], groups[1::2]):
            if year_text and page_text:
                # year_text is four digits plus an optional suffix letter.
                return int(year_text[:4]), int(page_text)

        # Page only
        for text in groups:
            if text and text.isdigit():
                return None, int(text)

    # Special case: "ibid.: 26" / "op. cit.: 45"
    match_ibid = re.search(
        r"(?:ibid\.?|op\.?\s*cit\.?)\s*:\s*(\d+)",
        citation,
        re.IGNORECASE
    )
    if match_ibid:
        return None, int(match_ibid.group(1))

    return extract_year(citation)

   

# Regression cases for extract_year_and_page: (citation, expected (year, page)).
YEAR_PAGE_CASES = [
    ("Mises (1949, p.258)", (1949, 258)),
    ("(Mises, 1996, pp. 538-86)", (1996, 538)),
    ("(von Mises, 1963, p.254)", (1963, 254)),
    ("(Mises, 1920, 121-122)", (1920, 121)),
    ("(Mises 1949, 236-237)", (1949, 236)),
    ("(Mises 1920, 109)", (1920, 109)),
    ("(Mises 1920, p.162)", (1920, 162)),
    ("(von Mises, 1949: 351)", (1949, 351)),
    ("(Mises 1966: 493)", (1966, 493)),
    ("(von Mises 1998, p. 270)", (1998, 270)),
    ("Mises 1949, p. 3)", (1949, 3)),
    ("Mises 1985b, p. 236", (1985, 236)),
    ("Mises 1957b, 372", (1957, 372)),
    ("Mises, pp. 105-6;", (None, 105)),
    ("(Mises, 1949, p. 3)", (1949, 3)),
    ("(L Von Mises 1949, pp. 393)", (1949, 393)),
    ("(L Von Mises 1949 , pp. 393)", (1949, 393)),
    ("(C Berg 2022)", (2022, None)),
    ("Hayek (1976:71)", (1976, 71)),
    ("According to Mises ([1949] 1998: 116)", (1949, 116)),
    (" Mises ( [1949]], p. 405)", (1949, 405)),
    ("(Mises [1949] 1963: 251-255)", (1949, 251)),
    ("Mises ([1936] 1951, p. 287)", (1936, 287)),
    ("(ibid.: 26)", (None, 26)),
    # Inputs without any year or page
    (None, (None, None)),
    ("[and]", (None, None)),
]

for _citation, _expected in YEAR_PAGE_CASES:
    _actual = extract_year_and_page(_citation)
    # The message names the failing input, which a bare assert inside a loop would not.
    assert _actual == _expected, f"{_citation!r}: expected {_expected}, got {_actual}"



In [None]:
import re

def extract_author(ref):
    """Extract the first author's name from an in-text citation.

    The name is the shortest run of letters, spaces, dots and apostrophes that
    is followed by `` <4-digit year>``, `` [``, `` (``, ``,``, ``:`` or ``)``.
    A second author introduced by ``and`` / ``And`` / ``&`` is skipped. The name
    is then normalised: a trailing possessive ``'s`` is removed, words are
    capitalised (``de``, ``der``, ``von`` stay lowercase, ``Mc`` names keep
    their inner capital), ``et al.`` is dropped and a few known variants of
    Mises, Hayek and Marx are collapsed to the bare surname.

    Args:
        ref: Raw citation text, or ``None``.

    Returns:
        The normalised author name, or ``None`` when ``ref`` is ``None`` or no
        name can be found.
    """
    if ref is None:
        return None

    # À-ÿ covers the accented Latin-1 letters (e.g. "Ibíd", "Müller").
    match = re.search(
        r"([A-Za-zÀ-ÿ][A-Za-zÀ-ÿ\s\.']+?)"
        r"(?:\s+(?:and|And|&)\s+[A-Za-zÀ-ÿ][A-Za-zÀ-ÿ\s\.']+)?"
        r"(?=\s\d{4}|\s\[|\s\(|,|:|\))",
        ref
    )

    if match:
        name = match.group(1).strip()

        # Remove possessive 's
        name = re.sub(r"'s$", "", name)

        lowercase_particles = {"de", "der", "von"}
        ignore_words = {"et", "al."}

        def normalize(word):
            w = word.lower()
            if w in lowercase_particles:
                return w
            if w in ignore_words:
                # Left untouched so that " et al." can be removed below.
                return word
            if w.startswith("mc") and len(w) > 2:
                return "Mc" + w[2].upper() + w[3:]
            return word.capitalize()

        temp = " ".join(normalize(word) for word in name.split())

        # Collapse known spelling variants to the bare surname. Order matters:
        # "von Mises" is reduced first, which turns "L von Mises" into "L Mises".
        temp = (
            temp.replace("von Mises", "Mises")
                .replace("Von Mises", "Mises")
                .replace("L V Mises", "Mises")
                .replace("L Mises", "Mises")
                .replace("L Von Mises", "Mises")
                .replace("von Hayek", "Hayek")
                .replace("Von Hayek", "Hayek")
                .replace("F A Hayek", "Hayek")
                .replace("F. A. Hayek", "Hayek")
                .replace("K Marx", "Marx")
                .replace(" et al.", "")
        )

        return temp.rstrip(" .")

    # Fallback: a single word (optionally possessive) right before a year,
    # an opening parenthesis, a colon or a closing parenthesis.
    match_single_word = re.search(
        r"([A-Za-zÀ-ÿ]+(?:'s)?)(?=\s\d{4}|\s\(|:|\))",
        ref
    )

    if match_single_word:
        name = match_single_word.group(1).replace("'s", "")
        return name.capitalize().rstrip(" .")

    return None



# Regression cases for extract_author: (citation, expected author).
AUTHOR_CASES = [
    ("Johnson, 1999;", "Johnson"),
    ("Jouvenel (1961)", "Jouvenel"),
    ("Keen 2011", "Keen"),
    ("(Menger, [1981])", "Menger"),
    ("von Mises (1949)", "Mises"),
    ("Von Hayek (1949)", "Hayek"),
    ("de Broglie (1924)", "de Broglie"),
    ("Allen 2005)", "Allen"),
    ("Folta's (1998)", "Folta"),
    ("Boettke et al. (1998)", "Boettke"),
    ("Floss and Klein (1998)", "Floss"),
    ("Floss And Klein (1998)", "Floss"),
    ("(Von Mises 1949 )", "Mises"),
    ("(L V Mises 1949 )", "Mises"),
    ("(L Mises 1998 )", "Mises"),
    ("(L Von Mises 1998 )", "Mises"),
    ("Van der Waals (1873)", "Van der Waals"),
    ("(McGrath et al., 2004: 96)", "McGrath"),
    ("Shane and Venkataraman's (2000)", "Shane"),
    ("(Alvarez & Barney, 2007", "Alvarez"),
    # Accented marker; the literal must be real UTF-8 text, not a mis-decoded copy.
    ("(Ibíd., 1986: 40)", "Ibíd"),
    (None, None),
]

for _citation, _expected in AUTHOR_CASES:
    _actual = extract_author(_citation)
    # The message names the failing input, which a bare assert inside a loop would not.
    assert _actual == _expected, f"{_citation!r}: expected {_expected!r}, got {_actual!r}"


In [None]:
'''
prompt:
Gere código python que cria uma classe Reference que tem atributos: raw, context, sentence_seq_number, sentence_id, author, page, year. 
raw, context, e sentence_id são passados pelo construtor.
page e year são obtidos a partir da chamada à extract_year_and_page(raw), que retorna uma tupla (year, page).
author é obtido a partir da chamada a extract_author(raw).
'''

class Reference:
    """One in-text citation together with where it was found.

    ``year``, ``page`` and ``author`` are derived from the (possibly expanded)
    raw text with ``extract_year_and_page`` and ``extract_author``.
    """

    def __init__(self, raw, context, sentence_seq_number, reference_seq_number, sentence_id, paragraph_id, head_id):
        """Store the location fields and parse ``raw``.

        Args:
            raw: Citation text; expanded with ``expand_raw`` before parsing.
            context: Text of the sentence containing the citation.
            sentence_seq_number: Position of the sentence within the document.
            reference_seq_number: Position of the reference within the document.
            sentence_id: Identifier of the sentence.
            paragraph_id: Identifier of the enclosing paragraph.
            head_id: Identifier of the enclosing section heading.
        """
        self.raw = self.expand_raw(raw, context)
        self.context = context
        self.sentence_seq_number = sentence_seq_number
        self.reference_seq_number = reference_seq_number
        self.sentence_id = sentence_id
        self.paragraph_id = paragraph_id
        self.head_id = head_id
        # Not known at construction time; starts as None and is assigned from outside.
        self.co_cited_count = None

        self.year, self.page = extract_year_and_page(self.raw)
        self.author = extract_author(self.raw)

    def expand_raw(self, raw, context):
        """Complete a citation whose closing parenthesis was cut off.

        If ``raw`` contains ``(`` but no ``)``, and ``raw`` occurs in
        ``context``, return the text of ``context`` from that occurrence up to
        and including the first ``)`` after it. In every other case ``raw`` is
        returned unchanged.
        """
        if raw is None or context is None:
            return raw

        if "(" not in raw or ")" in raw:
            return raw

        idx = context.find(raw)
        if idx == -1:
            return raw

        after = context[idx:]
        close_paren = after.find(")")

        if close_paren == -1:
            return raw

        return after[: close_paren + 1]

    def __repr__(self):
        """Return a debug representation listing the main fields."""
        return (
            f"Reference(raw={self.raw!r}, context={self.context!r}, author={self.author!r}, "
            f"sentence_id={self.sentence_id!r}, year={self.year!r}, page={self.page!r}, "
            f"co_cited_count={self.co_cited_count!r})"
        )



# Fully specified citation: every parsed and positional field is populated.
ref = Reference(
    raw="(L Von Mises 1949 , pp. 393)",
    context="",
    sentence_seq_number=3,
    reference_seq_number=1,
    sentence_id="123",
    paragraph_id="124",
    head_id="125",
)
assert (ref.year, ref.page, ref.author) == (1949, 393, "Mises")
assert (ref.sentence_seq_number, ref.reference_seq_number) == (3, 1)
assert (ref.sentence_id, ref.paragraph_id, ref.head_id) == ("123", "124", "125")
assert ref.co_cited_count is None


# Truncated raw text: the closing part is recovered from the sentence.
ref = Reference(
    raw='Mises ( [1949]]',
    context='In a striking section of his Human Action, von Mises (1966Mises ( [1949]], p. 405) discusses the "epistemological import" of Menger\'s theory of money.',
    sentence_seq_number=1,
    reference_seq_number=1,
    sentence_id='123',
    paragraph_id='124',
    head_id='125',
)
assert (ref.author, ref.year, ref.page) == ("Mises", 1949, 405)
assert ref.co_cited_count is None

# Two authors, year only.
ref = Reference(
    raw="(Ramoglou & McMullen, 2021)",
    context="As importantly, the creation of a constituent condition of the possibility of entrepreneurial success cannot be the creation of opportunity, since \"an opportunity\" is never a singular condition (Ramoglou & McMullen, 2021) but a collection of \"opportunity ingredients\" (Ramoglou, 2021a).",
    sentence_seq_number=3,
    reference_seq_number=1,
    sentence_id="123",
    paragraph_id="124",
    head_id="125",
)
assert ref.year == 2021
assert ref.co_cited_count is None

# Everything missing: construction and printing must still work.
ref = Reference(None, None, None, None, None, None, None)
assert (ref.year, ref.page, ref.sentence_seq_number, ref.author) == (None, None, None, None)
assert ref.co_cited_count is None
assert print(ref) is None


In [None]:
def deduplica_refs(references):
    """De-duplicate references author by author.

    Rules:
    - If an author has references with a page, the ones without a page are
      dropped and one reference is kept per distinct page.
    - If an author only has references without a page, a single one is kept.
    - Whenever several candidates compete (same page, or no page at all), the
      first one seen is kept unless a later one has an earlier year and both
      years are known.

    Authors appear in the result in order of first appearance.
    """

    def is_older(candidate, incumbent):
        # True only when both years are known and the candidate's is earlier.
        return (
            candidate.year is not None
            and incumbent.year is not None
            and candidate.year < incumbent.year
        )

    grouped = {}
    for item in references:
        grouped.setdefault(item.author, []).append(item)

    kept = []
    for same_author in grouped.values():
        paged = [item for item in same_author if item.page is not None]

        if not paged:
            # No reference carries a page: keep exactly one.
            winner = same_author[0]
            for challenger in same_author[1:]:
                if is_older(challenger, winner):
                    winner = challenger
            kept.append(winner)
            continue

        # At least one reference carries a page: keep one per page.
        chosen = {}
        for candidate in paged:
            incumbent = chosen.setdefault(candidate.page, candidate)
            if is_older(candidate, incumbent):
                chosen[candidate.page] = candidate
        kept.extend(chosen.values())

    return kept



def _make_ref(raw, author, page, seq=1):
    """Build a Reference from ``raw`` and then pin ``author`` and ``page``.

    The year is still whatever Reference parses out of ``raw``; author and page
    are overwritten so the tests do not depend on the extraction functions.
    The identifiers are derived from ``seq`` and play no role in de-duplication.
    """
    ref = Reference(raw, "", seq, 1, str(seq), str(seq), str(seq))
    ref.author = author
    ref.page = page
    return ref


def test_single_reference():
    """A lone reference is returned as is."""
    result = deduplica_refs([_make_ref("(L Von Mises 1949)", "Mises", 10, seq=3)])

    assert len(result) == 1
    assert result[0].author == "Mises"
    assert result[0].page == 10


def test_same_author_prefers_with_page():
    """A reference with a page replaces a page-less one by the same author."""
    result = deduplica_refs([
        _make_ref("(L Von Mises 1949)", "Mises", None, seq=1),
        _make_ref("(L Von Mises 1949, p. 393)", "Mises", 393, seq=2),
    ])

    assert len(result) == 1
    assert result[0].author == "Mises"
    assert result[0].page == 393


def test_same_author_all_without_page():
    """Several page-less references by one author collapse into one."""
    result = deduplica_refs([
        _make_ref("(L Von Mises 1949)", "Mises", None, seq=1),
        _make_ref("(L Von Mises 1950)", "Mises", None, seq=2),
    ])

    assert len(result) == 1
    assert result[0].author == "Mises"
    assert result[0].page is None


def test_multiple_authors():
    """Authors are de-duplicated independently of each other."""
    result = deduplica_refs([
        _make_ref("(L Von Mises 1949)", "Mises", None, seq=1),
        _make_ref("(Hayek 1945, p. 12)", "Hayek", 12, seq=2),
        _make_ref("(L Von Mises 1949, p. 393)", "Mises", 393, seq=3),
    ])

    assert len(result) == 2

    by_author = {r.author: r for r in result}

    assert by_author["Mises"].page == 393
    assert by_author["Hayek"].page == 12


def test_first_page_wins():
    """Different pages by the same author are all kept, in input order."""
    result = deduplica_refs([
        _make_ref("(Mises 1949, p. 10)", "Mises", 10, seq=1),
        _make_ref("(Mises 1949, p. 20)", "Mises", 20, seq=2),
    ])

    assert len(result) == 2
    assert result[0].page == 10
    assert result[1].page == 20


def test_older_book_wins():
    """Among page-less references the one with the earlier year is kept."""
    result = deduplica_refs([
        _make_ref("(Mises 2008)", "Mises", None, seq=1),
        _make_ref("(Mises 1949)", "Mises", None, seq=2),
    ])

    assert len(result) == 1
    assert result[0].year == 1949
    assert result[0].page is None


def test_no_years():
    """References without a year are still separated by page."""
    result = deduplica_refs([
        _make_ref("(Mises, p. 10)", "Mises", 10, seq=1),
        _make_ref("(Mises, p. 20)", "Mises", 20, seq=2),
    ])

    assert len(result) == 2

    assert result[0].year is None
    assert result[0].page == 10

    assert result[1].year is None
    assert result[1].page == 20

# =========================
# Simple runner
# =========================

test_single_reference()
test_first_page_wins()
test_same_author_prefers_with_page()
test_same_author_all_without_page()
test_multiple_authors()
test_older_book_wins()
test_no_years()

print("All tests passed!")


In [None]:
from lxml import etree
from typing import List, Tuple
import re


def parse_tei(tei_filepath: str) -> Tuple[str, int, int, List[Reference]]:
    """Parse a TEI XML file and collect the citations found in its sentences.

    For every ``<s>`` element one ``Reference`` is built per nested ``<ref>``
    element and one per parenthesised "ibid." found in the sentence text. The
    references of a sentence are de-duplicated with ``deduplica_refs`` and each
    survivor records how many other references remain in the same sentence
    (``co_cited_count``).

    "Ibid." citations receive the author of the most recent reference whose
    author was not itself an ibid marker; this state carries over from one
    sentence to the next.

    Args:
        tei_filepath: Path of the TEI XML file.

    Returns:
        ``(title, num_s, num_bibl, references)``: the title text (or
        ``"Unknown Title"``), the number of ``<s>`` elements, the number of
        ``<biblStruct>`` elements and the list of ``Reference`` objects.
    """
    xml_id = '{http://www.w3.org/XML/1998/namespace}id'

    def local_name(element):
        # Tag without its "{namespace}" prefix. Comparing the whole local name
        # avoids matching unrelated tags that merely END in "p" or "div".
        return element.tag.rsplit('}', 1)[-1]

    parser = etree.XMLParser(ns_clean=True)
    tree = etree.parse(tei_filepath, parser)
    root = tree.getroot()

    # TEI namespace: reuse the document's default namespace under the "tei" prefix
    nsmap = root.nsmap.copy()
    nsmap['tei'] = nsmap.get(None, 'http://www.tei-c.org/ns/1.0')

    # 1. Paper title
    title_xpath = './/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title'
    title_elem = root.find(title_xpath, namespaces=nsmap)
    title = (
        title_elem.text.strip()
        if title_elem is not None and title_elem.text is not None
        else "Unknown Title"
    )

    # 2. Count <s>
    s_elems = root.findall('.//tei:s', namespaces=nsmap)
    num_s = len(s_elems)

    # 3. Count <biblStruct>
    bibl_elems = root.findall('.//tei:biblStruct', namespaces=nsmap)
    num_bibl = len(bibl_elems)

    # 4. References
    references = []
    ref_idx = 0

    # State used to resolve "ibid.": the last author that was not an ibid marker
    last_author = None

    # Author values (lower-cased) that mean "same source as before"
    ibid_markers = {"ibid", "ibid.", "ibidem", "ibíd"}

    # Parenthesised ibid with an optional page, e.g. "(ibid.: 26)", "(ibid., p. 250)"
    ibid_pattern = re.compile(
        r'\((\s*(?:ibid\.?|ibidem)\s*[:,]?\s*(?:p{1,2}\.\s*)?\d*\s*)\)',
        flags=re.IGNORECASE
    )

    for sentence_seq, s in enumerate(s_elems, start=1):

        sentence_id = s.get(xml_id)

        # paragraph_id: only when the direct parent is a <p>
        paragraph_id = None
        p = s.getparent()
        if p is not None and local_name(p) == 'p':
            paragraph_id = p.get(xml_id)

        # head_id: id of the <head> of the nearest enclosing <div>,
        # or the id of that <div> itself when it has no <head> child
        head_id = None
        div_ancestor = None

        for ancestor in s.iterancestors():
            if local_name(ancestor) == 'div':
                div_ancestor = ancestor
                break

        if div_ancestor is not None:
            head_elem = div_ancestor.find('tei:head', namespaces=nsmap)
            if head_elem is not None:
                head_id = head_elem.get(xml_id)
            else:
                head_id = div_ancestor.get(xml_id)

        # Full text of the sentence, used as the citation context
        context_text = ''.join(s.itertext()).strip()

        local_references = []

        # =====================================================
        # 1) EXPLICIT REFERENCES (<ref>)
        # =====================================================
        for ref_elem in s.findall('.//tei:ref', namespaces=nsmap):
            ref_idx += 1
            ref_text = ''.join(ref_elem.itertext()).strip()

            ref = Reference(
                raw=ref_text,
                sentence_seq_number=sentence_seq,
                reference_seq_number=ref_idx,
                sentence_id=sentence_id,
                paragraph_id=paragraph_id,
                head_id=head_id,
                context=context_text,
            )

            # Resolve an ibid inside <ref>, otherwise remember the author
            if ref.author is not None:
                if ref.author.strip().lower() in ibid_markers:
                    ref.author = last_author
                else:
                    last_author = ref.author

            local_references.append(ref)

        # =====================================================
        # 2) IBID OUTSIDE <ref>, e.g. "(ibid.: 26)"
        # =====================================================
        # The sentence text also contains the text of its <ref> children, so an
        # ibid handled above can match again here; deduplica_refs merges them.
        for match in ibid_pattern.finditer(context_text):
            if last_author is None:
                continue

            ref_idx += 1
            raw_text = f"({match.group(1).strip()})"

            ref = Reference(
                raw=raw_text,
                sentence_seq_number=sentence_seq,
                reference_seq_number=ref_idx,
                sentence_id=sentence_id,
                paragraph_id=paragraph_id,
                head_id=head_id,
                context=context_text,
            )

            # Force the resolved author
            ref.author = last_author

            # Simple page capture: the first number in the match
            page_match = re.search(r'\d+', raw_text)
            if page_match:
                ref.page = int(page_match.group())

            local_references.append(ref)

        # =====================================================
        # Post-processing
        # =====================================================
        local_references = deduplica_refs(local_references)

        # Co-citation count: the other references left in this sentence
        for ref in local_references:
            ref.co_cited_count = len(local_references) - 1

        references.extend(local_references)

    return title, num_s, num_bibl, references


In [None]:
# TEST 1 — document-level counts plus metadata of selected references

paper_path = "../data/interim/tei/A-Historical-Intervention-in-the-Opportunity-Wars-Forgotten-Scholarship-the-DiscoveryCreation-Disruption-and-Moving-Forward-by-Looking-Backward_2023_SAGE-Publications-Ltd.pdf.grobid.tei.xml"
title, sentence_count, reference_count, refs = parse_tei(paper_path)

assert (sentence_count, reference_count, len(refs)) == (283, 106, 209)
assert title == "A Historical Intervention in the \"Opportunity Wars\": Forgotten Scholarship, the Discovery/Creation Disruption, and Moving Forward by Looking Backward"

first_ref = refs[0]
assert (first_ref.sentence_seq_number, first_ref.reference_seq_number) == (7, 1)
assert (first_ref.sentence_id, first_ref.paragraph_id, first_ref.head_id) == ('_paHYmXc', "_byrr9Qs", "_Vbdbk8s")
assert first_ref.co_cited_count == 0

# (start of the sentence, expected number of references, expected sentence number)
for prefix, expected_count, expected_seq in [
    ("Besides Cole's (1959) early discussion", 6, 36),
    ("Furthermore, one should not neglect the wealth", 2, 37),
]:
    matches = [ref for ref in refs if ref.context.startswith(prefix)]
    assert len(matches) == expected_count
    assert matches[0].sentence_seq_number == expected_seq



In [None]:
# TEST 2 — one sentence with a single Mises citation and its page

paper_path = "../data/interim/tei/10.1108.17506221211282000.pdf.grobid.tei.xml"
title, sentence_count, reference_count, all_refs = parse_tei(paper_path)

sentence_refs = [ref for ref in all_refs if ref.sentence_id == "_sYcZ3cm"]
assert len(sentence_refs) == 1

only_ref = sentence_refs[0]
assert (only_ref.author, only_ref.page, only_ref.co_cited_count) == ("Mises", 417, 0)
assert only_ref.head_id == "_GaeQjWv"



In [None]:
# TEST 3 — no assertions: display the references of one sentence for inspection

paper_path = "../data/interim/tei/10.1002.9780470999059.ch17.pdf.grobid.tei.xml"
title, sentence_count, reference_count, all_refs = parse_tei(paper_path)

[ref for ref in all_refs if ref.sentence_id == "_szF6vj8"]

In [None]:
# TEST 4 — one sentence with a single Mises citation: author, year and page

paper_path = "../data/interim/tei/10.1007.978-3-030-05557-8.pdf.grobid.tei.xml"
title, sentence_count, reference_count, all_refs = parse_tei(paper_path)

sentence_refs = [ref for ref in all_refs if ref.sentence_id == "_Gk3c4QK"]
assert len(sentence_refs) == 1

only_ref = sentence_refs[0]
assert (only_ref.author, only_ref.year, only_ref.page) == ("Mises", 1949, 107)
assert only_ref.co_cited_count == 0



In [None]:
# TEST 5 — Hayek citations in two consecutive sentences (427 and 428)

paper_path = "../data/interim/tei/10.1007.978-3-030-05557-8.pdf.grobid.tei.xml"
title, sentence_count, reference_count, all_refs = parse_tei(paper_path)

first_sentence_refs = [ref for ref in all_refs if ref.sentence_id == "_NhU88KE"]
assert len(first_sentence_refs) == 1
found = first_sentence_refs[0]
assert (found.author, found.sentence_seq_number, found.page) == ("Hayek", 427, 29)

second_sentence_refs = [ref for ref in all_refs if ref.sentence_id == "_jbfghEA"]
found = second_sentence_refs[0]
assert (found.author, found.sentence_seq_number, found.page) == ("Hayek", 428, 36)





In [None]:
# TEST 6 — two Mises citations with different pages in the same sentence are both kept

paper_path = "../data/interim/tei/10.1007.s11138-009-0093-5.pdf.grobid.tei.xml"
title, sentence_count, reference_count, all_refs = parse_tei(paper_path)

sentence_refs = [ref for ref in all_refs if ref.sentence_id == "_8xhbsPH"]
assert len(sentence_refs) == 2

for found, expected_page in zip(sentence_refs, (11, 26)):
    assert (found.author, found.sentence_seq_number, found.page) == ("Mises", 310, expected_page)




In [None]:
# TEST 7 — Hayek citations in two consecutive sentences (427 and 428)
# NOTE(review): same file, sentences and expectations as the TESTE 5 cell; consider removing one.

paper_path = "../data/interim/tei/10.1007.978-3-030-05557-8.pdf.grobid.tei.xml"
title, sentence_count, reference_count, all_refs = parse_tei(paper_path)

first_sentence_refs = [ref for ref in all_refs if ref.sentence_id == "_NhU88KE"]
assert len(first_sentence_refs) == 1
found = first_sentence_refs[0]
assert (found.author, found.sentence_seq_number, found.page) == ("Hayek", 427, 29)

second_sentence_refs = [ref for ref in all_refs if ref.sentence_id == "_jbfghEA"]
found = second_sentence_refs[0]
assert (found.author, found.sentence_seq_number, found.page) == ("Hayek", 428, 36)





In [None]:
# TEST 8 — an "(ibid., p. 250)" citation yields exactly one reference in its paragraph

paper_path = "../data/interim/tei/10.1007.BF01102289.pdf.grobid.tei.xml"
title, sentence_count, reference_count, all_refs = parse_tei(paper_path)

paragraph_refs = [ref for ref in all_refs if ref.paragraph_id == "_yzMgbrV"]
assert len(paragraph_refs) == 1

ibid_ref = paragraph_refs[0]
assert (ibid_ref.raw, ibid_ref.page) == ('(ibid., p. 250)', 250)



In [None]:
# TEST 9 — an accented "Ibíd." citation is resolved to the previously cited author

paper_path = "../data/interim/tei/The-Government-of-Possible-Social-and-Solidary-Economy-Subject-and-Power_2016_Universidad-Nacional-Autonoma-de-Mexico-iieanalesgmailcom.pdf.grobid.tei.xml"

title, sentence_count, reference_count, refs = parse_tei(paper_path)

refs_1 = [ref for ref in refs if ref.paragraph_id == "_XjhWEzM"]
assert len(refs_1) == 1
# The expected text must be real UTF-8 ("í"), not a mis-decoded copy of it.
assert refs_1[0].raw == '(Ibíd., 1986: 40)'
assert refs_1[0].author == 'Mises'
assert refs_1[0].page == 40

In [None]:
'''
prompt:
Escreva código python que escreve uma função read_tei_papers(path) que lê o nome de todos os arquivos XML de path e cria dois dataframes.
Um pandas df chamado "papers_df" em que cada registro tem a coluna paper_id, que é um número sequencial, além das colunas title, filename, sentence_count, reference_count.
Para cada arquivo, invoque parse_tei(tei_filepath), que retorna sentence_count, reference_count e refs.  
Adicione sentence_count e reference_count no papers_df.  
Add a try-catch loop that catches exceptions in XML Parsing.
Ao outro df, chamado refs_df, adicione todos os refs. Um ref é um objeto Reference que tem os campos raw, context, sentence_id, sentence_seq_number e page.
Cada campo deve ser uma coluna em refs_df.
'''

import os
import pandas as pd
from xml.etree.ElementTree import ParseError


def read_tei_papers(path: str):
    """Parse every ``*.xml`` file in ``path`` into two DataFrames.

    Files are processed in sorted name order and numbered sequentially from 1
    (``paper_id``); a file that fails to parse is reported with ``print`` and
    skipped without consuming a ``paper_id``.

    Args:
        path: Directory containing the TEI XML files.

    Returns:
        ``(papers_df, refs_df)``. ``papers_df`` has one row per parsed file
        (paper_id, title, filename, sentence_count, reference_count);
        ``refs_df`` has one row per reference, keyed by ``paper_id``. Both keep
        their columns even when they have no rows.
    """
    paper_columns = ["paper_id", "title", "filename", "sentence_count", "reference_count"]
    ref_columns = [
        "paper_id", "raw", "context", "co_cited_count", "head_id", "paragraph_id",
        "sentence_id", "sentence_seq_number", "reference_seq_number",
        "author", "page", "year",
    ]

    papers = []
    refs = []

    paper_id = 1

    # os.listdir returns names in arbitrary order; sorting keeps the
    # paper_id <-> file assignment the same from run to run.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".xml"):
            continue

        tei_filepath = os.path.join(path, filename)

        # Only parsing is best-effort. Keeping the bookkeeping below outside the
        # try block means a paper is never half-recorded with a reused paper_id.
        try:
            title, sentence_count, reference_count, ref_list = parse_tei(tei_filepath)
        except (ParseError, etree.XMLSyntaxError) as e:
            # lxml reports malformed XML as XMLSyntaxError, which is not a
            # subclass of xml.etree's ParseError.
            print(f"Erro ao processar {filename}: {e}")
            continue
        except Exception as e:
            print(f"Erro desconhecido em {filename}: {e}")
            continue

        # Row for papers_df
        papers.append({
            "paper_id": paper_id,
            "title": title,
            "filename": filename,
            "sentence_count": sentence_count,
            "reference_count": reference_count
        })

        # Rows for refs_df
        refs.extend(
            {
                "paper_id": paper_id,
                "raw": ref.raw,
                "context": ref.context,
                "co_cited_count": ref.co_cited_count,
                "head_id": ref.head_id,
                "paragraph_id": ref.paragraph_id,
                "sentence_id": ref.sentence_id,
                "sentence_seq_number": ref.sentence_seq_number,
                "reference_seq_number": ref.reference_seq_number,
                "author": ref.author,
                "page": ref.page,
                "year": ref.year
            }
            for ref in ref_list
        )

        paper_id += 1

    # Explicit columns so that empty results still have the expected schema
    papers_df = pd.DataFrame(papers, columns=paper_columns)
    refs_df = pd.DataFrame(refs, columns=ref_columns)

    return papers_df, refs_df

In [None]:
import pandas as pd


# Load ../data/raw/scopus.csv and unify the two spellings of one journal name.
scopus_df = pd.read_csv("../data/raw/scopus.csv")

scopus_df["Source title"] = scopus_df["Source title"].replace(
    {"The Review of Austrian Economics": "Review of Austrian Economics"}
)

scopus_df.head()

In [None]:
# Parse every TEI file and make sure each paper got its own id.
papers_df, refs_df = read_tei_papers(tei_path)

paper_ids = papers_df["paper_id"]
assert paper_ids.is_unique, "Duplicate paper_id values found in papers_df"

print("Read TEI files!")

papers_df.shape

In [None]:
!pip install rapidfuzz

In [None]:
import pandas as pd
from rapidfuzz import process, fuzz

# Basic normalisation so that case and surrounding whitespace do not affect matching
papers_df["title_norm"] = papers_df["title"].str.lower().str.strip()
scopus_df["title_norm"] = scopus_df["Title"].str.lower().str.strip()

scopus_titles = scopus_df["title_norm"].tolist()

# Matches scoring below this value are treated as "no match".
SIMILARITY_THRESHOLD = 85


def get_best_scopus_source(title):
    """Return ``[source title, similarity]`` of the closest Scopus title.

    Args:
        title: Normalised paper title, possibly missing.

    Returns:
        A two-element ``pd.Series``: the "Source title" of the best-matching
        Scopus row and its score, or ``[None, None]`` when ``title`` is missing
        or there is nothing to match against.
    """
    if pd.isna(title):
        return pd.Series([None, None])

    best = process.extractOne(
        title,
        scopus_titles,
        scorer=fuzz.token_sort_ratio  # word order is ignored, which suits academic titles
    )
    if best is None:
        return pd.Series([None, None])

    _matched_title, score, idx = best
    source_title = scopus_df.iloc[idx]["Source title"]
    return pd.Series([source_title, score])


# Apply the fuzzy matching
papers_df[["source title", "similarity"]] = (
    papers_df["title_norm"]
    .apply(get_best_scopus_source)
)

# Quality filter: clear the source title of weak matches. This must write to
# "source title" itself; writing to any other column leaves weak matches in place.
low_quality = pd.to_numeric(papers_df["similarity"], errors="coerce") < SIMILARITY_THRESHOLD
papers_df.loc[low_quality, "source title"] = None

# Clean-up of the helper column
papers_df = papers_df.drop(columns=["title_norm"])

papers_df.head()

In [None]:
papers_df = papers_df.sort_values(by='paper_id', ascending=False)

# Attach the paper-level columns to every reference row.
refs_df = pd.merge(refs_df, papers_df, on='paper_id', how='left')

print("Creating spreadsheets...")
# Create the output directory first so to_csv does not fail when it is missing.
os.makedirs("../data/processed", exist_ok=True)
papers_df.to_csv("../data/processed/papers.csv", index=False)
refs_df.to_csv("../data/processed/refs.csv", index=False)

# Share of papers that ended up without a source title
na_percentage = papers_df['source title'].isna().mean() * 100
print(f"Percentage of NA in 'source title': {na_percentage:.2f}%")


In [None]:
# Show the last rows; papers_df was sorted by paper_id in descending order in the previous cell.
papers_df.tail()

In [None]:
# Spot check: a known paper must be matched to its journal.
target_title = 'A Historical Intervention in the "Opportunity Wars": Forgotten Scholarship, the Discovery/Creation Disruption, and Moving Forward by Looking Backward'
paper = papers_df.loc[papers_df["title"] == target_title].iloc[0]

assert paper['source title'] == 'Entrepreneurship: Theory and Practice'
