In [1]:
import pandas as pd

# Load the processed reference table; the path is relative to the notebook's
# working directory.
refs_df = pd.read_csv("../data/processed/refs.csv")

# Summarise the columns: names, non-null counts and dtypes.
refs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265726 entries, 0 to 265725
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   paper_id              265726 non-null  int64  
 1   raw                   265644 non-null  object 
 2   context               265726 non-null  object 
 3   co_cited_count        265726 non-null  int64  
 4   section_id            259027 non-null  object 
 5   paragraph_id          265684 non-null  object 
 6   sentence_id           265726 non-null  object 
 7   sentence_seq_number   265726 non-null  int64  
 8   reference_seq_number  265726 non-null  int64  
 9   author                227628 non-null  object 
 10  page                  74403 non-null   float64
 11  year                  222883 non-null  float64
 12  title                 265726 non-null  object 
 13  filename              265726 non-null  object 
 14  sentence_count        265726 non-null  int64  
 15  

In [2]:
# Number of distinct author labels (missing authors are not counted).
refs_df['author'].nunique()

27520

In [3]:
def filter_top_k_authors(refs_df, k):
    """
    Keep only the rows whose author is among the k most frequent authors.

    An author's popularity is the number of rows in ``refs_df`` that carry
    that author. Rows with a missing author are never kept.

    Parameters
    ----------
    refs_df : pandas.DataFrame
        Reference table with an ``"author"`` column.
    k : int
        How many of the most frequent authors to keep.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, original index preserved.
    """
    # value_counts() orders authors from most to least frequent.
    frequency_by_author = refs_df["author"].value_counts()
    most_frequent = set(frequency_by_author.head(k).index)

    keep_row = refs_df["author"].isin(most_frequent)
    return refs_df.loc[keep_row].copy()


# Optional: keep only the top-K most frequent authors. The call is currently
# disabled, so all authors are kept. (The earlier comment said 100 authors
# while the disabled call uses k=200.)
#refs_df = filter_top_k_authors(refs_df, k=200)

print(refs_df.shape)



(265726, 18)


In [4]:
# Number of reference rows per author, most frequent first.
refs_df['author'].value_counts()

author
Mises                             7754
Hayek                             6281
Kirzner                           3013
Rothbard                          2647
Schumpeter                        1992
                                  ... 
Con                                  1
Sterling Drug                        1
Garibaldi V. Lucky Food Stores       1
Mollahoseini                         1
Kott                                 1
Name: count, Length: 27520, dtype: int64

In [5]:
import pandas as pd

mises_refs_df = pd.read_csv("../data/processed/mises_refs.csv")

# Keep only the join keys and the part number from mises_refs_df, dropping
# rows where the sentence id or the part number is missing.
mises_parts = (
    mises_refs_df[
        ["sentence_id", "author", "human_action_part_number"]
    ]
    .dropna(subset=["sentence_id", "human_action_part_number"])
    .copy()
)

# LEFT JOIN onto refs_df so that every reference row is kept.
# NOTE(review): refs_df was loaded with 265726 rows, but the preview shown
# after this cell has row labels up to 266008, so some (sentence_id, author)
# keys seem to match several rows of mises_parts and get duplicated --
# confirm whether this fan-out is intended.
# NOTE(review): this rebinds refs_df, so re-running the cell presumably merges
# a second time on top of the already merged frame -- run it once per kernel.
refs_df = refs_df.merge(
    mises_parts,
    on=["sentence_id", "author"],
    how="left"
)


def build_author_node(row):
    """
    Return the node label for one reference row.

    Rows authored by "Mises" that carry a non-missing
    ``human_action_part_number`` are labelled ``"Mises_<part>"``; every other
    row keeps its plain ``author`` value.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"author"`` and ``"human_action_part_number"``
        (for example a DataFrame row passed by ``apply(axis=1)``).
    """
    author = row["author"]
    if author != "Mises":
        # The part number is only looked at for Mises rows.
        return author

    part = row["human_action_part_number"]
    if pd.isna(part):
        return author

    # The part number is interpolated as-is, so a float 3.0 gives "Mises_3.0".
    return f"Mises_{part}"


# Work on a fresh copy, then label every row with its node name
# (build_author_node is applied row by row).
refs_df = refs_df.copy()
refs_df["mises_part"] = refs_df.apply(build_author_node, axis=1)


In [6]:
# Preview the author next to the unit identifiers of each granularity.
refs_df[['author', 'sentence_id', 'paragraph_id', 'section_id', 'paper_id']]

Unnamed: 0,author,sentence_id,paragraph_id,section_id,paper_id
0,Streissler,_TMX2wdg,_NwgSU35,_gzuY8cP,1
1,Menger,_aJz9uAb,_Z99Bvn5,_NytzfyY,1
2,,_aJz9uAb,_Z99Bvn5,_NytzfyY,1
3,Menger,_54sgkMV,_Z99Bvn5,_NytzfyY,1
4,Menger,_F7mv66u,_ZYekVnE,_NytzfyY,1
...,...,...,...,...,...
266005,Ludwig,_t2fUWfj,_4CtFXWw,,2698
266006,Studnicki,_t2fUWfj,_4CtFXWw,,2698
266007,Franko,_t2fUWfj,_4CtFXWw,,2698
266008,Mises,_wSW7FwV,_SQU9UVs,,2698


In [7]:
# Fraction of missing values per column, largest first.
refs_df[['author', 'sentence_id', 'paragraph_id', 'section_id', 'paper_id']] \
    .isna() \
    .mean() \
    .sort_values(ascending=False)


author          0.143220
section_id      0.025183
paragraph_id    0.000158
sentence_id     0.000000
paper_id        0.000000
dtype: float64

In [8]:
import pandas as pd

def percentage_units_with_mises(refs_df):
    """
    Report, per granularity, the share of units citing "Mises" at least once.

    The levels are sentence, paragraph, section and paper, read from the
    ``sentence_id``, ``paragraph_id``, ``section_id`` and ``paper_id`` columns.
    A level with no non-missing unit id is left out of the result.

    Parameters
    ----------
    refs_df : pandas.DataFrame
        Reference table with an ``"author"`` column and the four id columns.

    Returns
    -------
    pandas.DataFrame
        One row per reported level with columns ``granularity``,
        ``total_units``, ``units_with_mises`` and ``percent_with_mises``.
    """
    level_columns = [
        ("sentence", "sentence_id"),
        ("paragraph", "paragraph_id"),
        ("section", "section_id"),
        ("paper", "paper_id"),
    ]

    records = []
    for label, column in level_columns:
        # Distinct non-missing unit ids at this level.
        n_units = refs_df[column].dropna().nunique()
        if n_units == 0:
            # Nothing to report, and nothing to divide by.
            continue

        # Distinct units with at least one row authored by Mises.
        cites_mises = refs_df["author"] == "Mises"
        n_with_mises = refs_df.loc[cites_mises, column].dropna().nunique()

        records.append({
            "granularity": label,
            "total_units": n_units,
            "units_with_mises": n_with_mises,
            "percent_with_mises": 100 * n_with_mises / n_units,
        })

    return pd.DataFrame(records)

# Coverage of Mises at every granularity; the bare name displays the table.
mises_coverage_df = percentage_units_with_mises(refs_df)
mises_coverage_df


Unnamed: 0,granularity,total_units,units_with_mises,percent_with_mises
0,sentence,190971,7623,3.991706
1,paragraph,97610,6010,6.157156
2,section,26404,3589,13.592637
3,paper,2643,1841,69.655694


In [9]:
import random

# --------------------------------------------------
# papers that contain Mises
# --------------------------------------------------
papers_with_mises = set(
    refs_df.loc[refs_df["author"] == "Mises", "paper_id"]
    .dropna()
    .unique()
)

# --------------------------------------------------
# all papers with their title (one row per paper_id; the first row seen
# for each paper is the one kept)
# --------------------------------------------------
papers_df = (
    refs_df[["paper_id", "title", "filename"]]
    .dropna(subset=["paper_id"])
    .drop_duplicates(subset=["paper_id"])
)

# --------------------------------------------------
# papers without Mises
# --------------------------------------------------
papers_without_mises_df = papers_df[
    ~papers_df["paper_id"].isin(papers_with_mises)
]

# --------------------------------------------------
# draw one paper at random
# NOTE: random_state=None leaves the draw unseeded, so each run may show a
# different paper; pass an integer to make the pick reproducible.
# --------------------------------------------------
random_row = papers_without_mises_df.sample(n=1, random_state=None)

random_row


Unnamed: 0,paper_id,title,filename
222549,2195,Robust political economy,Robust-political-economy_2006_.pdf.grobid.tei.xml


In [10]:
def pair_metrics_by_granularity(
    refs_df,
    author_a,
    author_b,
    granularities=("sentence_id", "paragraph_id", "section_id", "paper_id")
):
    """
    Compute association metrics for the rule ``author_a -> author_b``.

    For every id column in ``granularities`` the rows are grouped into units
    (rows with a missing id or a missing author are ignored) and each unit is
    reduced to the set of authors it cites. On those units:

    * ``support_abs`` -- number of units citing both authors
    * ``support``     -- ``support_abs / num_units``
    * ``confidence``  -- ``support_abs / (units citing author_a)``
    * ``lift``        -- ``confidence / (share of units citing author_b)``

    When there are no units, or either author is cited in no unit, the three
    ratios are reported as ``0.0``.

    Returns
    -------
    pandas.DataFrame
        One row per granularity with columns ``granularity``, ``num_units``,
        ``support_abs``, ``support``, ``confidence`` and ``lift``.
    """
    rows = []

    for level in granularities:
        pairs = refs_df[[level, "author"]].dropna()
        authors_per_unit = pairs.groupby(level)["author"].agg(set)
        n_units = len(authors_per_unit)

        # Start from the all-zero row and fill in whatever can be computed.
        row = {
            "granularity": level,
            "num_units": n_units,
            "support_abs": 0,
            "support": 0.0,
            "confidence": 0.0,
            "lift": 0.0
        }

        if n_units > 0:
            contains_a = authors_per_unit.apply(lambda members: author_a in members)
            contains_b = authors_per_unit.apply(lambda members: author_b in members)

            n_a = contains_a.sum()
            n_b = contains_b.sum()
            n_both = (contains_a & contains_b).sum()

            row["support_abs"] = int(n_both)

            # The ratios are only defined when both authors occur somewhere.
            if n_a != 0 and n_b != 0:
                row["support"] = n_both / n_units
                row["confidence"] = n_both / n_a
                row["lift"] = row["confidence"] / (n_b / n_units)

        rows.append(row)

    return pd.DataFrame(rows)


In [11]:
# Association metrics for the rule Lange -> Mises at every granularity.
metrics = pair_metrics_by_granularity(
    refs_df,
    author_a="Lange",
    author_b="Mises"
)

metrics

Unnamed: 0,granularity,num_units,support_abs,support,confidence,lift
0,sentence_id,160560,15,9.3e-05,0.081522,1.717058
1,paragraph_id,83708,38,0.000454,0.2375,3.307928
2,section_id,22464,75,0.003339,0.585938,3.667456
3,paper_id,2409,89,0.036945,0.956989,1.252247


In [12]:
def precompute_target_stats(refs_df, target_author="Mises"):
    """
    Build the per-granularity cache consumed by ``pair_metrics_from_cache``.

    For each of ``sentence_id``, ``paragraph_id``, ``section_id`` and
    ``paper_id`` the rows (minus those with a missing id or missing author)
    are grouped into units, and every unit is reduced to its set of authors.

    Returns
    -------
    dict
        Maps each id column name to a dict with:

        * ``"num_units"``      -- number of units at that level
        * ``"support_target"`` -- share of units whose author set contains
          ``target_author``
        * ``"units"``          -- Series of author sets indexed by unit id
          (the structural cache reused for every author pair)
    """
    cache = {}

    for level in ("sentence_id", "paragraph_id", "section_id", "paper_id"):
        complete_rows = refs_df.dropna(subset=[level, "author"])
        author_sets = complete_rows.groupby(level)["author"].apply(set)

        contains_target = author_sets.apply(
            lambda members: target_author in members
        )

        cache[level] = {
            "num_units": len(author_sets),
            "support_target": contains_target.mean(),
            "units": author_sets
        }

    return cache


In [13]:
def pair_metrics_from_cache(
    author,
    target_author,
    target_stats
):
    """
    Compute association metrics for ``author -> target_author`` from a cache.

    Parameters
    ----------
    author : str
        Antecedent author of the rule.
    target_author : str
        Consequent author; the cache's ``"support_target"`` values are
        expected to have been computed for this same author.
    target_stats : dict
        Maps a granularity name to a dict with keys ``"num_units"`` (int),
        ``"support_target"`` (share of units containing the target) and
        ``"units"`` (Series of author sets, one per unit).

    Returns
    -------
    pandas.DataFrame
        One row per granularity with columns ``granularity``, ``num_units``,
        ``support_abs``, ``support``, ``confidence`` and ``lift``.

    Notes
    -----
    The values follow the same conventions as ``pair_metrics_by_granularity``:
    ``support_abs`` is a plain ``int``, confidence is computed from counts
    (``support_abs / units containing author``), and a granularity with no
    units yields an all-zero row instead of dividing by zero.
    """
    rows = []

    for g, data in target_stats.items():
        units = data["units"]
        num_units = data["num_units"]

        support_abs = 0
        support = 0.0
        confidence = 0.0
        lift = 0.0

        # Guard: with no units there is nothing to count and nothing to
        # divide by.
        if num_units > 0:
            has_author = units.apply(lambda s: author in s).astype(bool)
            count_author = int(has_author.sum())

            # Only units that already contain `author` can contain the pair,
            # so the target check is limited to that (usually small) subset.
            support_abs = sum(
                target_author in s for s in units[has_author]
            )
            support = support_abs / num_units

            if count_author > 0:
                confidence = support_abs / count_author

            if data["support_target"] > 0:
                lift = confidence / data["support_target"]

        rows.append({
            "granularity": g,
            "num_units": num_units,
            "support_abs": support_abs,
            "support": support,
            "confidence": confidence,
            "lift": lift,
        })

    return pd.DataFrame(rows)


In [14]:
def build_authors_df_optimized(
    refs_df,
    top_k_authors,
    target_author="Mises"
):
    """
    Build the long table of pair metrics for many authors against one target.

    The per-unit author sets are computed once with
    ``precompute_target_stats`` and then reused for every author through
    ``pair_metrics_from_cache``.

    Parameters
    ----------
    refs_df : pandas.DataFrame
        Reference table handed to ``precompute_target_stats``.
    top_k_authors : iterable of str
        Authors to pair with ``target_author``.
    target_author : str, default "Mises"
        Author every other author is compared against.

    Returns
    -------
    pandas.DataFrame
        One row per (author, granularity) with columns ``author``,
        ``with_author``, ``granularity``, ``num_units``, ``support_abs``,
        ``support``, ``confidence`` and ``lift``; an empty DataFrame when no
        author produced any row.
    """
    column_order = [
        "author", "with_author", "granularity",
        "num_units", "support_abs", "support",
        "confidence", "lift"
    ]

    # Computed once, shared by every author below.
    cache = precompute_target_stats(refs_df, target_author)

    frames = []
    for name in top_k_authors:
        metrics = pair_metrics_from_cache(name, target_author, cache)
        if not metrics.empty:
            frames.append(
                metrics.assign(author=name, with_author=target_author)
            )

    if len(frames) == 0:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    return combined[column_order]


In [15]:
def get_top_k_authors(refs_df, k=20, exclude=("Mises",)):
    """
    Return the k most frequent authors, skipping the excluded ones.

    Parameters
    ----------
    refs_df : pandas.DataFrame
        Reference table with an ``"author"`` column.
    k : int, default 20
        Number of authors to return.
    exclude : str, iterable of str, or None, default ("Mises",)
        Author(s) to leave out of the ranking. Names that do not occur are
        ignored.

    Returns
    -------
    list
        Author names ordered from most to least frequent.
    """
    # Normalise `exclude` to a list. pandas' drop() treats a tuple as ONE
    # label, so passing the tuple default straight through silently excluded
    # nothing; a bare string must not be split into characters either.
    if exclude is None:
        excluded = []
    elif isinstance(exclude, str):
        excluded = [exclude]
    else:
        excluded = list(exclude)

    vc = refs_df["author"].value_counts()
    vc = vc[~vc.index.isin(excluded)]
    return vc.head(k).index.tolist()


def build_authors_df(refs_df, top_k_authors, target_author="Mises"):
    """
    Build the long table of pair metrics for many authors against one target.

    Every author is evaluated independently with
    ``pair_metrics_by_granularity`` (the units are regrouped for each author).

    Parameters
    ----------
    refs_df : pandas.DataFrame
        Reference table handed to ``pair_metrics_by_granularity``.
    top_k_authors : iterable of str
        Authors to pair with ``target_author``.
    target_author : str, default "Mises"
        Author every other author is compared against.

    Returns
    -------
    pandas.DataFrame
        One row per (author, granularity) with columns ``author``,
        ``with_author``, ``granularity``, ``num_units``, ``support_abs``,
        ``support``, ``confidence`` and ``lift``; an empty DataFrame when no
        author produced any row.
    """
    column_order = [
        "author", "with_author", "granularity",
        "num_units", "support_abs", "support",
        "confidence", "lift"
    ]

    frames = []
    for name in top_k_authors:
        metrics = pair_metrics_by_granularity(
            refs_df,
            author_a=name,
            author_b=target_author
        )
        if not metrics.empty:
            # Tag every row of this author's table with the pair it describes.
            frames.append(
                metrics.assign(author=name, with_author=target_author)
            )

    if len(frames) == 0:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    return combined[column_order]


In [16]:
# Show every row when a DataFrame is displayed (no truncation). This is a
# global pandas option and stays in effect for the rest of the session.
pd.set_option("display.max_rows", None)

# Number of most frequent authors to pair with the target author.
K = 1000

target_author = "Mises"

# The target itself is excluded so it is never paired with itself.
top_k_authors = get_top_k_authors(
    refs_df,
    k=K,
    exclude={target_author}
)

print("Top K authors generated.")

# Metrics for every (author, target) pair at every granularity.
authors_df = build_authors_df_optimized(
    refs_df,
    top_k_authors,
    target_author=target_author
)

# Persist the result; with the default settings the row index is written as
# the first, unnamed column.
authors_df.to_csv("../data/processed/author_lifts.csv")


Top K authors generated.


In [21]:
# First rows of the pair-metrics table.
authors_df.head()

Unnamed: 0,author,with_author,granularity,num_units,support_abs,support,confidence,lift
0,Hayek,Mises,sentence_id,160560,591,0.003681,0.095833,2.018482
1,Hayek,Mises,paragraph_id,83708,939,0.011218,0.193449,2.694377
2,Hayek,Mises,section_id,22464,1312,0.058405,0.442496,2.769636
3,Hayek,Mises,paper_id,2409,1133,0.47032,0.874903,1.144836
4,Kirzner,Mises,sentence_id,160560,264,0.001644,0.088472,1.863445


In [22]:
import pandas as pd

# Granularities from finest to coarsest; this is also the left-to-right order
# of the signs in each pattern string.
granularity_order = [
    "sentence_id",
    "paragraph_id",
    "section_id",
    "paper_id"
]

df = authors_df.copy()

# 1. Sign of the association: "+" when lift >= 1, "-" otherwise.
df["lift_relation"] = df["lift"].apply(lambda x: "+" if x >= 1 else "-")

# 2. Make granularity an ordered categorical so the levels keep their order.
df["granularity"] = pd.Categorical(
    df["granularity"],
    categories=granularity_order,
    ordered=True
)

# 3. One row per author, one column per granularity.
#    `observed=False` is passed explicitly: grouping on a categorical column
#    without it relies on a deprecated default and emits a FutureWarning.
#    It keeps the existing behaviour (all declared categories are kept).
author_patterns = (
    df
    .pivot_table(
        index="author",
        columns="granularity",
        values="lift_relation",
        aggfunc="first",
        observed=False
    )
    .reindex(columns=granularity_order)
)

# 4. Drop authors that lack a value for any granularity.
author_patterns = author_patterns.dropna()

# 5. Build the pattern string ("+ + - +", etc.).
author_patterns["pattern"] = author_patterns.apply(
    lambda row: " ".join(row.values),
    axis=1
)

# 6. Count how many authors share each pattern.
pattern_stats = (
    author_patterns["pattern"]
    .value_counts()
    .rename("count")
    .reset_index()
    .rename(columns={"index": "pattern"})
)

# 7. Share of authors per pattern.
pattern_stats["proportion"] = (
    pattern_stats["count"] / pattern_stats["count"].sum()
)

pattern_stats


  .pivot_table(


Unnamed: 0,pattern,count,proportion
0,- - + +,353,0.353
1,- - - +,249,0.249
2,- + + +,190,0.19
3,+ + + +,73,0.073
4,- - - -,68,0.068
5,- - + -,33,0.033
6,- + + -,14,0.014
7,- + - +,6,0.006
8,+ - + +,4,0.004
9,+ + - +,3,0.003
