In [None]:
import pandas as pd

# Load the processed references; every column except "context" is read.
REFS_CSV_PATH = "../data/processed/refs.csv"
refs_df = pd.read_csv(
    REFS_CSV_PATH,
    usecols=lambda column_name: column_name != "context",
)

# Structural overview of the loaded table (columns, dtypes, non-null counts).
refs_df.info()


In [None]:
import pandas as pd

# 1. Read the CSV that maps each author to a tradition.
author_school_df = pd.read_csv(
    "../notebooks/map-author-school.csv"
)

# Basic normalisation: strip surrounding whitespace so the join keys match.
author_school_df["author"] = author_school_df["author"].str.strip()
refs_df["author"] = refs_df["author"].str.strip()

# Keep a single mapping row per author (first occurrence wins). A duplicated
# author in the mapping would otherwise multiply every matching reference row
# in the left join below and inflate all downstream counts.
author_school_df = author_school_df.drop_duplicates(subset="author")

# Columns that a previous run of this cell already merged into refs_df are
# dropped first; otherwise re-running the cell yields "tradition_x" /
# "tradition_y" and the fillna below fails with a KeyError.
stale_columns = [
    column
    for column in author_school_df.columns
    if column != "author" and column in refs_df.columns
]

# 2. Left join on author (many references -> at most one mapping row).
refs_df = (
    refs_df
    .drop(columns=stale_columns)
    .merge(
        author_school_df,
        on="author",
        how="left",
        validate="many_to_one",
    )
)

# 3. Authors without a mapping entry are labelled 'non-classified'.
refs_df["tradition"] = refs_df["tradition"].fillna("non-classified")

refs_df.head()


In [None]:
# Lift the row limit so long tables/series are displayed in full.
pd.set_option("display.max_rows", None)

# The 100 most frequent (author, tradition) combinations, NaN keys included.
author_tradition_counts = refs_df[["author", "tradition"]].value_counts(dropna=False)
author_tradition_counts.head(100)

In [None]:
def filter_top_k_authors(refs_df, k):
    """
    Keep only the rows of ``refs_df`` whose author is among the top-K.

    An author's popularity is the number of rows in which it appears in the
    ``author`` column.

    Parameters
    ----------
    refs_df : pandas.DataFrame
        Reference table with an ``author`` column.
    k : int
        Number of most frequent authors to keep.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the rows whose author is one of the K most
        frequent ones; the original index labels are preserved.
    """
    # value_counts() orders authors by descending frequency, so the first k
    # index labels are the k most frequent authors.
    most_frequent = refs_df["author"].value_counts().head(k).index
    keep_row = refs_df["author"].isin(set(most_frequent))
    return refs_df.loc[keep_row].copy()


# Optional step (currently disabled): keep only the most cited authors.
# refs_df = filter_top_k_authors(refs_df, k=200)

# Shape of the reference table as it stands after the cells above.
print(refs_df.shape)



In [None]:
# Number of reference rows per author, most frequent first.
refs_df['author'].value_counts()

In [None]:
import pandas as pd

mises_refs_df = pd.read_csv("../data/processed/mises_refs.csv")

# Keep only the join keys and the part number, discarding rows that have no
# sentence id or no part number.
mises_parts = (
    mises_refs_df[
        ["sentence_id", "author", "human_action_part_number"]
    ]
    .dropna(subset=["sentence_id", "human_action_part_number"])
    # One row per join key (first occurrence wins): a repeated
    # (sentence_id, author) pair would otherwise multiply the matching rows
    # of refs_df in the left join below.
    .drop_duplicates(subset=["sentence_id", "author"])
    .copy()
)

# LEFT JOIN into refs_df. A part-number column left behind by a previous run
# of this cell is dropped first, so re-running does not create "_x" / "_y"
# suffixed duplicates of it.
refs_df = (
    refs_df
    .drop(columns=["human_action_part_number"], errors="ignore")
    .merge(
        mises_parts,
        on=["sentence_id", "author"],
        how="left",
    )
)


def build_author_node(row):
    """
    Return the node label for one reference row.

    Rows whose author is "Mises" and that carry a part number are labelled
    ``"Mises_<part>"``; every other row keeps its author value unchanged.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` row)
        Must provide ``"author"``; ``"human_action_part_number"`` is read
        only for Mises rows.

    Returns
    -------
    object
        The ``"Mises_<part>"`` string for Mises rows with a part number,
        otherwise the row's ``"author"`` value as-is.
    """
    if row["author"] == "Mises":
        part = row["human_action_part_number"]
        if pd.notna(part):
            # A column that contains NaN is stored as float, so a whole part
            # number arrives as e.g. 1.0; render it as "Mises_1" rather than
            # "Mises_1.0". Non-integral or non-numeric values are kept as-is.
            if isinstance(part, float) and part.is_integer():
                part = int(part)
            return f"Mises_{part}"
    return row["author"]


# Attach the per-row node label as a new column on a fresh copy of the table.
refs_df = refs_df.assign(
    mises_part=refs_df.apply(build_author_node, axis=1)
)


In [None]:
# Author together with the four unit-identifier columns used as granularities below.
refs_df[['author', 'sentence_id', 'paragraph_id', 'section_id', 'paper_id']]

In [None]:
# Fraction of missing values in the author / identifier columns, largest first.
id_columns = ["author", "sentence_id", "paragraph_id", "section_id", "paper_id"]
missing_share = refs_df[id_columns].isna().mean()
missing_share.sort_values(ascending=False)


In [None]:
import pandas as pd

def percentage_units_with_mises(refs_df, target_author="Mises"):
    """
    Percentage of units that contain the target author, per granularity.

    For each of the sentence, paragraph, section and paper levels, counts the
    distinct unit identifiers present in ``refs_df`` and how many of them
    appear on at least one row whose ``author`` equals ``target_author``.

    Parameters
    ----------
    refs_df : pandas.DataFrame
        Reference table with ``author``, ``sentence_id``, ``paragraph_id``,
        ``section_id`` and ``paper_id`` columns.
    target_author : str, default "Mises"
        Author whose coverage is measured.

    Returns
    -------
    pandas.DataFrame
        One row per granularity that has at least one unit, with columns
        ``granularity``, ``total_units``, ``units_with_mises`` and
        ``percent_with_mises``. The columns are present even when no
        granularity has any unit.
    """
    granularities = {
        "sentence": "sentence_id",
        "paragraph": "paragraph_id",
        "section": "section_id",
        "paper": "paper_id",
    }
    result_columns = [
        "granularity",
        "total_units",
        "units_with_mises",
        "percent_with_mises",
    ]

    # Row mask computed once; it does not depend on the granularity.
    is_target = refs_df["author"] == target_author

    results = []

    for level_name, level_col in granularities.items():

        # Total number of distinct units at this level (nunique ignores NaN).
        total_units = refs_df[level_col].nunique()

        # Nothing to measure at this level; also avoids dividing by zero.
        if total_units == 0:
            continue

        # Distinct units with at least one reference to the target author.
        mises_units = refs_df.loc[is_target, level_col].nunique()

        percent = 100 * mises_units / total_units

        results.append({
            "granularity": level_name,
            "total_units": total_units,
            "units_with_mises": mises_units,
            "percent_with_mises": percent
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(results, columns=result_columns)

# Coverage of Mises at each granularity, computed on the current refs_df.
mises_coverage_df = percentage_units_with_mises(refs_df)
mises_coverage_df


In [None]:
import random

# Papers that have at least one reference row authored by Mises.
cites_mises = refs_df["author"] == "Mises"
papers_with_mises = set(
    refs_df.loc[cites_mises, "paper_id"].dropna().unique()
)

# One row per paper (first occurrence), with its title and filename.
paper_columns = ["paper_id", "title", "filename"]
papers_df = (
    refs_df[paper_columns]
    .dropna(subset=["paper_id"])
    .drop_duplicates(subset=["paper_id"])
)

# Papers in which Mises never appears.
in_mises_set = papers_df["paper_id"].isin(papers_with_mises)
papers_without_mises_df = papers_df[~in_mises_set]

# Draw one such paper at random (unseeded: a different paper on each run).
random_row = papers_without_mises_df.sample(n=1, random_state=None)

random_row


In [None]:
def pair_metrics_by_granularity(
    refs_df,
    author_a,
    author_b,
    granularities=("sentence_id", "paragraph_id", "section_id", "paper_id")
):
    """
    Co-occurrence metrics of two authors at each granularity.

    For every identifier column in ``granularities`` the references are
    grouped into units (rows missing the identifier or the author are
    ignored) and the following are computed over those units:

    - ``num_units``: number of units
    - ``support_abs``: units containing both authors
    - ``support``: ``support_abs / num_units``
    - ``confidence``: ``support_abs / units containing author_a``
    - ``lift``: ``confidence / (units containing author_b / num_units)``

    ``support``, ``confidence`` and ``lift`` are 0.0 when there are no units
    or when either author appears in no unit.

    Returns
    -------
    pandas.DataFrame
        One row per granularity, in the order given.
    """
    rows = []

    for unit_col in granularities:
        # Set of distinct authors referenced inside each unit.
        authors_by_unit = (
            refs_df[[unit_col, "author"]]
            .dropna()
            .groupby(unit_col)["author"]
            .agg(set)
        )
        n_units = len(authors_by_unit)

        # Start from the all-zero row and fill in whatever can be computed.
        row = {
            "granularity": unit_col,
            "num_units": n_units,
            "support_abs": 0,
            "support": 0.0,
            "confidence": 0.0,
            "lift": 0.0
        }

        if n_units > 0:
            n_with_a = 0
            n_with_b = 0
            n_with_both = 0
            for members in authors_by_unit:
                holds_a = author_a in members
                holds_b = author_b in members
                n_with_a += holds_a
                n_with_b += holds_b
                n_with_both += holds_a and holds_b

            row["support_abs"] = n_with_both

            # Ratios are only defined when both authors occur somewhere.
            if n_with_a > 0 and n_with_b > 0:
                row["support"] = n_with_both / n_units
                row["confidence"] = n_with_both / n_with_a
                row["lift"] = row["confidence"] / (n_with_b / n_units)

        rows.append(row)

    return pd.DataFrame(rows)


In [None]:
# Example: co-occurrence metrics of Lange with Mises at every granularity.
metrics = pair_metrics_by_granularity(
    refs_df,
    author_a="Lange",
    author_b="Mises"
)

metrics

In [None]:
def precompute_target_stats(refs_df, target_author="Mises"):
    """
    Precompute, per granularity, the data shared by all pairwise metrics.

    Parameters
    ----------
    refs_df : pandas.DataFrame
        Reference table with an ``author`` column and the four identifier
        columns ``sentence_id``, ``paragraph_id``, ``section_id``,
        ``paper_id``.
    target_author : str, default "Mises"
        Author whose per-unit presence is summarised.

    Returns
    -------
    dict
        Maps each identifier column to a dict with:
        ``"num_units"`` (number of units), ``"support_target"`` (fraction of
        units containing ``target_author``) and ``"units"`` (Series indexed
        by unit id holding the set of authors in that unit, kept so callers
        can reuse it instead of regrouping).
    """

    def _summarise(unit_col):
        # Rows missing either the unit id or the author cannot be grouped.
        usable = refs_df.dropna(subset=[unit_col, "author"])
        author_sets = usable.groupby(unit_col)["author"].apply(set)
        has_target = author_sets.apply(lambda members: target_author in members)
        return {
            "num_units": len(author_sets),
            "support_target": has_target.mean(),
            "units": author_sets,
        }

    unit_columns = ("sentence_id", "paragraph_id", "section_id", "paper_id")
    return {unit_col: _summarise(unit_col) for unit_col in unit_columns}


In [None]:
def pair_metrics_from_cache(
    author,
    target_author,
    target_stats
):
    """
    Co-occurrence metrics of ``author`` with ``target_author`` from a cache.

    Parameters
    ----------
    author : str
        Author whose association with the target is measured.
    target_author : str
        Target author the cache was built for.
    target_stats : dict
        Maps a granularity name to a dict with ``"units"`` (iterable of
        author sets, one per unit), ``"num_units"`` and ``"support_target"``
        (fraction of units containing the target), as produced by
        ``precompute_target_stats``.

    Returns
    -------
    pandas.DataFrame
        One row per granularity with ``granularity``, ``num_units``,
        ``support_abs``, ``support``, ``confidence`` and ``lift``.
        The three ratios are 0.0 whenever their denominator is zero
        (no units, author absent, or target absent), matching
        ``pair_metrics_by_granularity``.
    """
    rows = []

    for g, data in target_stats.items():
        num_units = data["num_units"]
        support_target = data["support_target"]

        # Single pass over the cached author sets, counting with plain ints.
        count_author = 0
        count_both = 0
        for members in data["units"]:
            if author in members:
                count_author += 1
                if target_author in members:
                    count_both += 1

        # Guard every denominator: an empty granularity must yield zeros
        # instead of a 0/0 division.
        support = count_both / num_units if num_units > 0 else 0.0

        # Confidence is taken directly from the counts rather than from a
        # ratio of two rounded supports.
        confidence = count_both / count_author if count_author > 0 else 0.0

        # A NaN support_target (empty granularity) fails the > 0 check too.
        lift = confidence / support_target if support_target > 0 else 0.0

        rows.append({
            "granularity": g,
            "num_units": num_units,
            "support_abs": count_both,
            "support": support,
            "confidence": confidence,
            "lift": lift,
        })

    return pd.DataFrame(rows)


In [None]:
import pandas as pd


def build_authors_df(
    refs_df,
    top_k_authors,
    target_author="Mises"
):
    """
    Build the long table of co-occurrence metrics against ``target_author``.

    Parameters
    ----------
    refs_df : pandas.DataFrame
        Reference table; must contain ``author`` and ``tradition`` besides
        the identifier columns needed by ``precompute_target_stats``.
    top_k_authors : iterable of str
        Authors to pair with the target.
    target_author : str, default "Mises"
        Author every other author is compared with.

    Returns
    -------
    pandas.DataFrame
        One row per (author, granularity) with columns ``author``,
        ``with_author``, ``granularity``, ``num_units``, ``support_abs``,
        ``support``, ``confidence``, ``lift`` and ``tradition``.
        An empty, column-less DataFrame when no author yields any row.
    """
    # Group the references once; every author below reuses this cache.
    cache = precompute_target_stats(
        refs_df,
        target_author
    )

    per_author_frames = []
    for name in top_k_authors:
        author_metrics = pair_metrics_from_cache(
            name,
            target_author,
            cache
        )
        if author_metrics.empty:
            continue
        per_author_frames.append(
            author_metrics.assign(
                author=name,
                with_author=target_author
            )
        )

    if len(per_author_frames) == 0:
        return pd.DataFrame()

    column_order = [
        "author", "with_author", "granularity",
        "num_units", "support_abs", "support",
        "confidence", "lift",
    ]
    combined = pd.concat(per_author_frames, ignore_index=True)[column_order]

    # Single author -> tradition lookup taken from refs_df (first row per author).
    tradition_lookup = (
        refs_df[["author", "tradition"]]
        .dropna(subset=["author"])
        .drop_duplicates(subset=["author"])
    )

    # Left join so every metric row is kept even without a tradition.
    return combined.merge(
        tradition_lookup,
        on="author",
        how="left"
    )


In [None]:
import pandas as pd


def get_top_k_authors(refs_df, k=20, exclude=("Mises",)):
    """
    Return the ``k`` most frequent authors, skipping the excluded ones.

    Parameters
    ----------
    refs_df : pandas.DataFrame
        Reference table with an ``author`` column.
    k : int, default 20
        Maximum number of authors to return.
    exclude : str or iterable of str, default ("Mises",)
        Author name(s) to leave out of the ranking. Names that do not occur
        in ``refs_df`` are ignored. ``None`` excludes nothing.

    Returns
    -------
    list
        Author names ordered by descending number of rows.
    """
    # Normalise to a list: a lone string is one author (not a sequence of
    # characters), and Series.drop interprets a tuple as ONE label, so the
    # tuple default would otherwise silently exclude nobody.
    if exclude is None:
        excluded = []
    elif isinstance(exclude, str):
        excluded = [exclude]
    else:
        excluded = list(exclude)

    vc = refs_df["author"].value_counts()
    vc = vc.drop(labels=excluded, errors="ignore")
    return vc.head(k).index.tolist()


In [None]:
# Lift the row limit so long tables are displayed in full.
pd.set_option("display.max_rows", None)

# Upper bound on how many of the most frequent authors are paired with the target.
K = 1000

# Author against which every other author's co-occurrence is measured.
target_author = "Mises"

# The target itself is excluded from the candidate list.
top_k_authors = get_top_k_authors(
    refs_df,
    k=K,
    exclude={target_author}
)

print("Top K authors generated.")

authors_df = build_authors_df(
    refs_df,
    top_k_authors,
    target_author=target_author
)

# Persist the metrics table (the dataframe index is written as well).
authors_df.to_csv("../data/processed/author_lifts.csv")


In [None]:
# First rows of the per-author metrics table.
authors_df.head()

In [None]:
import pandas as pd

# --------------------------------------------------
# Configuration
# --------------------------------------------------

# Order in which the granularities are laid out when a sign pattern is built.
GRANULARITY_ORDER = [
    "sentence_id",
    "paragraph_id",
    "section_id",
    "paper_id"
]

def lift_to_sign(lift: float) -> str:
    """
    Classify a lift value relative to 1.

    Returns "+" when ``lift`` is greater than 1, "-" when it is smaller than
    1, and "0" otherwise (exactly 1, or a value such as NaN for which both
    comparisons are false).
    """
    if lift > 1:
        return "+"
    return "-" if lift < 1 else "0"


# --------------------------------------------------
# Main function
# --------------------------------------------------

def build_pattern_table_from_authors_df(authors_df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise each author's lift signs across granularities as one pattern.

    Starting from ``authors_df`` (one row per author x with_author x
    granularity), computes:
      - ``pattern``: the lift signs concatenated in ``GRANULARITY_ORDER``
      - ``count``: the sum of ``support_abs`` over the granularities

    Parameters
    ----------
    authors_df : pandas.DataFrame
        Must contain ``author``, ``with_author``, ``tradition``,
        ``granularity``, ``lift`` and ``support_abs``. The input is not
        modified (a copy is used).

    Returns
    -------
    pandas.DataFrame
        Columns ``author | with_author | tradition | pattern | count``,
        sorted by ``count`` in descending order with a fresh integer index.
    """

    df = authors_df.copy()

    # Sign of the lift ("+", "-" or "0") for every row.
    df["sign"] = df["lift"].apply(lift_to_sign)

    # Ordered categorical so granularities follow GRANULARITY_ORDER.
    # Values outside GRANULARITY_ORDER become NaN in this conversion.
    df["granularity"] = pd.Categorical(
        df["granularity"],
        categories=GRANULARITY_ORDER,
        ordered=True
    )

    # Pivot of the signs: one "<granularity>_sign" column per granularity.
    pivot_sign = (
        df
        .pivot_table(
            index=["author", "with_author", "tradition"],
            columns="granularity",
            values="sign",
            aggfunc="first"
        )
        .add_suffix("_sign")
    )

    # Pivot of support_abs: one "<granularity>_support" column per granularity.
    # NOTE(review): both pivots assume every granularity ends up as a column;
    # confirm this holds when a granularity has no rows at all.
    pivot_support = (
        df
        .pivot_table(
            index=["author", "with_author", "tradition"],
            columns="granularity",
            values="support_abs",
            aggfunc="sum"
        )
        .add_suffix("_support")
    )

    # Combine both pivots and turn the index levels back into columns.
    pivot = (
        pivot_sign
        .join(pivot_support)
        .reset_index()
    )

    # Build the pattern from the *_sign columns; a missing sign is rendered
    # through str(), i.e. as the text "nan".
    sign_cols = [f"{g}_sign" for g in GRANULARITY_ORDER]
    pivot["pattern"] = pivot[sign_cols].apply(
        lambda row: "".join(row.values.astype(str)),
        axis=1
    )

    # count is the row-wise sum of the *_support columns.
    support_cols = [f"{g}_support" for g in GRANULARITY_ORDER]
    pivot["count"] = pivot[support_cols].sum(axis=1)

    # Final table: selected columns, largest count first.
    table = (
        pivot[["author", "with_author", "tradition", "pattern", "count"]]
        .sort_values("count", ascending=False)
        .reset_index(drop=True)
    )

    return table


# --------------------------------------------------
# Usage
# --------------------------------------------------

# Build the pattern table and show its 20 rows with the largest count.
pattern_table = build_pattern_table_from_authors_df(authors_df)
pattern_table.head(20)


In [None]:
import pandas as pd

def build_pattern_table_from_authors_df(authors_df: pd.DataFrame) -> pd.DataFrame:
    """
    Count rows per (author, with_author, tradition, pattern) combination.

    ``authors_df`` must already contain a ``pattern`` column in addition to
    ``author``, ``with_author`` and ``tradition``. Missing values in any of
    the four keys form their own group instead of being dropped.

    Note: this definition reuses the name of the pivot-based builder defined
    in the previous cell, so it replaces that function once this cell runs.

    Returns
    -------
    pandas.DataFrame
        The four key columns plus ``count`` (number of rows in the group),
        sorted by ``count`` in descending order with a fresh integer index.
    """
    group_keys = ["author", "with_author", "tradition", "pattern"]

    group_sizes = authors_df.groupby(group_keys, dropna=False).size()
    counted = group_sizes.reset_index(name="count")
    ranked = counted.sort_values("count", ascending=False)

    return ranked.reset_index(drop=True)

# NOTE(review): the function defined in this cell groups by a "pattern"
# column, but authors_df as returned by build_authors_df has no such column,
# so this call is expected to raise KeyError — confirm the intended input.
pattern_table = build_pattern_table_from_authors_df(authors_df)

pattern_table.head(20)


In [None]:
import pandas as pd

# Order in which the granularities are laid out inside a pattern string.
granularity_order = ["sentence_id", "paragraph_id", "section_id", "paper_id"]

df = authors_df.copy()

# 1. Sign of the lift: "+" when lift >= 1, "-" otherwise (NaN included).
#    Skipped when the column is already present.
if "lift_relation" not in df.columns:
    df["lift_relation"] = df["lift"].ge(1).map({True: "+", False: "-"})

# 2. Ordered categorical so the granularities follow granularity_order.
df["granularity"] = pd.Categorical(
    df["granularity"], categories=granularity_order, ordered=True
)

# 3. One row per (author, tradition), one column per granularity.
author_patterns = df.pivot_table(
    index=["author", "tradition"],
    columns="granularity",
    values="lift_relation",
    aggfunc="first",
).reindex(columns=granularity_order)

# 4. Keep only authors that have a sign at every granularity.
author_patterns = author_patterns.dropna()

# 5. Multi-granularity pattern: the four signs separated by spaces.
author_patterns["pattern"] = author_patterns.apply(
    lambda signs: " ".join(signs.values), axis=1
)
author_patterns = author_patterns.reset_index()

# Number of distinct authors per tradition (denominator of the proportion).
tradition_totals = (
    author_patterns
    .groupby("tradition")["author"]
    .nunique()
    .reset_index(name="total_authors")
)

# 6. Number of authors for every (tradition, pattern) combination.
pattern_by_tradition = (
    author_patterns
    .groupby(["tradition", "pattern"])
    .size()
    .reset_index(name="count")
)

# 7. Attach the tradition totals and compute each pattern's share within
#    its tradition.
pattern_by_tradition = pattern_by_tradition.merge(
    tradition_totals, on="tradition", how="left"
)
pattern_by_tradition["proportion"] = pattern_by_tradition["count"].div(
    pattern_by_tradition["total_authors"]
)

# 8. Ordering for inspection: by tradition, then largest proportion first.
pattern_by_tradition.sort_values(
    by=["tradition", "proportion", "pattern"],
    ascending=[True, False, True],
).head(100)
