In [5]:
import pandas as pd 

# Load the processed Mises reference table. The path is relative, so it is
# resolved against the kernel's current working directory.
mises_refs_path = "../data/processed/mises_refs.csv"
mises_refs_df = pd.read_csv(mises_refs_path)

In [6]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
mises_refs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7421 entries, 0 to 7420
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        7421 non-null   int64  
 1   paper_id                          7421 non-null   int64  
 2   raw                               7421 non-null   object 
 3   context                           7421 non-null   object 
 4   co_cited_count                    7421 non-null   int64  
 5   sentence_id                       7421 non-null   object 
 6   sentence_seq_number               7421 non-null   int64  
 7   author                            7421 non-null   object 
 8   page                              4524 non-null   float64
 9   year                              7164 non-null   float64
 10  title                             7421 non-null   object 
 11  filename                          7421 non-null   object 
 12  senten

In [7]:
# Preview the first rows of the table (head() defaults to five rows).
mises_refs_df.head()

Unnamed: 0.1,Unnamed: 0,paper_id,raw,context,co_cited_count,sentence_id,sentence_seq_number,author,page,year,...,filename,sentence_count,reference_count,source title,similarity,human_action_chapter_by_ref_page,human_action_chapter_number,human_action_chapter_name,human_action_part_number,human_action_part_name
0,0,790,"von Mises, 1949, pp. 1-71)",Utilising the theory of human action developed...,1,_jm2yrry,90,Mises,1.0,1949.0,...,10.1108.eum0000000005692.pdf.grobid.tei.xml,257,71,International Journal of Social Economics,100.0,0,0,Chapter 0: Introduction,0,Part 0: Introduction
1,1,1842,"(Mises 2011, 1)","Interventionism, for its part, ""seeks to retai...",0,_5k2NTcp,352,Mises,1.0,2011.0,...,IS-THE-AUSTRIAN-SCHOOL-VALUEFREE-ON-THE-DEPEND...,458,87,Quarterly Journal of Austrian Economics,100.0,0,0,Chapter 0: Introduction,0,Part 0: Introduction
2,2,1843,"(Mises 2011, 1)","Interventionism, for its part, ""seeks to retai...",0,_WMFnAys,352,Mises,1.0,2011.0,...,IS-THE-AUSTRIAN-SCHOOL-VALUEFREE-ON-THE-DEPEND...,458,87,Quarterly Journal of Austrian Economics,100.0,0,0,Chapter 0: Introduction,0,Part 0: Introduction
3,3,1392,"Mises (2011, 1-44)","In a world of national fiat currencies, a bala...",0,_QhFwbBe,63,Mises,1.0,2011.0,...,Austrian-Monetary-Theory-Comment-on-Pascal-Sal...,91,22,Quarterly Journal of Austrian Economics,98.245614,0,0,Chapter 0: Introduction,0,Part 0: Introduction
4,4,1354,"(Mises 2009, 1-2)","By the time he finished his studies, however, ...",0,_prAxRdp,117,Mises,1.0,2009.0,...,Are-structural-fluctuations-natural-or-policyi...,402,94,Quarterly Journal of Austrian Economics,98.4375,0,0,Chapter 0: Introduction,0,Part 0: Introduction


In [8]:
# PREPARE THE DATA

import pandas as pd

# Keep only rows that have both the citing context (model input) and a chapter
# label (model target). .copy() gives an independent frame, so later edits to
# df never touch mises_refs_df.
df = mises_refs_df.dropna(subset=["context", "human_action_chapter_number"]).copy()

# Raw text and target labels, row-aligned through df's index.
X_text, y = df["context"], df["human_action_chapter_number"]


In [9]:
# VECTORIZE

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the citing contexts.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),    # unigrams and bigrams
    stop_words="english",  # built-in English stop-word list
    lowercase=True,
    max_df=0.8,            # ignore terms present in more than 80% of the documents
    min_df=5,              # important for robustness: ignore terms seen in fewer than 5 documents
)

# Learn the vocabulary / IDF weights and build the document-term matrix in one call.
X = vectorizer.fit_transform(X_text)


In [10]:
# TRAIN CLASSIFIER (multinomial logistic regression)

from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier predicting the chapter label from the TF-IDF
# features.
#
# `multi_class` is deliberately not passed: scikit-learn deprecated the
# parameter in 1.5 and schedules it for removal, and with the lbfgs solver a
# multinomial model is already what gets fitted for a multiclass target.
# `n_jobs` is dropped as well: the scikit-learn docs describe it as only
# parallelising one-vs-rest fits, so it had no effect on this model.
clf = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,  # raised above the library default to give lbfgs room to converge
)

# Last expression of the cell, so the fitted estimator is displayed.
clf.fit(X, y)




In [15]:
import numpy as np
import re

# Vocabulary of the fitted vectorizer and the class labels of the classifier.
feature_names = np.array(vectorizer.get_feature_names_out())
chapters = clf.classes_

# Mask: True only for terms that contain NO digits.
no_number_mask = np.array(
    [re.search(r"\d", term) is None for term in feature_names]
)

# Apply the mask to the term names here, and to each coefficient row below.
filtered_feature_names = feature_names[no_number_mask]

# Maps each chapter label to its 20 highest-weighted digit-free terms as
# (term, coefficient) pairs, strongest first.
# Indexing clf.coef_ by class position assumes one coefficient row per class
# (multiclass fit).
top_terms_per_chapter = {}
for class_pos, chapter_label in enumerate(chapters):
    class_weights = clf.coef_[class_pos][no_number_mask]

    # argsort is ascending: reverse it, then keep the first 20 (top 20, no digits).
    strongest = np.argsort(class_weights)[::-1][:20]

    top_terms_per_chapter[chapter_label] = list(
        zip(filtered_feature_names[strongest], class_weights[strongest])
    )


In [18]:
# CONVERT TO DATAFRAME

# Lift pandas' row-display cap (a global option) so the long table below is
# rendered in full.
pd.set_option("display.max_rows", None)

# One record per (chapter, term) pair.
# NOTE(review): the values stored under "log_loss_contribution" are the weights
# collected in top_terms_per_chapter (model coefficients), not log-loss
# contributions; the key is kept unchanged so the displayed column stays the same.
rows = [
    {
        "human_action_chapter_number": chapter,
        "term": term,
        "log_loss_contribution": weight,
    }
    for chapter, terms in top_terms_per_chapter.items()
    for term, weight in terms
]

terms_df = pd.DataFrame(rows)

# Strongest weights first; ties are ordered by chapter label (ascending).
terms_df.sort_values(
    by=["log_loss_contribution", "human_action_chapter_number"],
    ascending=[False, True],
).head(500)


Unnamed: 0,human_action_chapter_number,term,log_loss_contribution
800,WHOLE,hayek,5.03182
180,17,money,3.620979
801,WHOLE,austrian,3.130802
260,20,boom,2.865835
200,18,capital,2.78947
400,27,government,2.712358
720,6,probability,2.628326
802,WHOLE,rothbard,2.599494
120,14,entrepreneur,2.489945
680,4,goods,2.394403
