In [145]:
import os
import zipfile
import fitz  # PyMuPDF
import re
import math
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import shutil
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

In [146]:
# Solo una vez al inicio
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()  # No entrenamiento



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [147]:
def calculate_perplexity(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        return math.exp(loss.item())

In [148]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

In [149]:
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower()

In [150]:
def entropy(text):
    probs = [freq / len(text) for freq in Counter(text).values()]
    return -sum(p * math.log2(p) for p in probs)

In [151]:

def ngram_metrics(text, n=3):
    vectorizer = CountVectorizer(ngram_range=(n, n))
    ngrams = vectorizer.fit_transform([text])
    return dict(zip(vectorizer.get_feature_names_out(), ngrams.toarray()[0]))

In [152]:
def burstiness(text):
    words = text.split()
    word_counts = Counter(words)
    freqs = list(word_counts.values())
    if len(freqs) < 2:
        return 0.0
    return np.std(freqs) / np.mean(freqs)

In [153]:
def top_ngrams(text, n=3, top_k=10):
    vectorizer = CountVectorizer(ngram_range=(n, n))
    ngrams = vectorizer.fit_transform([text])
    freqs = ngrams.toarray()[0]
    ngram_names = vectorizer.get_feature_names_out()
    sorted_ngrams = sorted(zip(ngram_names, freqs), key=lambda x: x[1], reverse=True)
    return sorted_ngrams[:top_k]

In [154]:
def extract_zip(zip_path, extract_to="pdfs_temp"):
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    
    pdf_files = []
    for root, dirs, files in os.walk(extract_to):
        for f in files:
            if f.endswith(".pdf"):
                pdf_files.append(os.path.join(root, f))
    
    return pdf_files

In [155]:
def process_pdfs_in_zip(zip_path):
    pdf_paths = extract_zip(zip_path)
    
    results = []

    for path in pdf_paths:
        print(f"Procesando: {os.path.basename(path)}")
        raw = extract_text_from_pdf(path)
        clean = preprocess_text(raw)
        ent = entropy(clean)
        bur = burstiness(clean)
        ppl = calculate_perplexity(clean)

        results.append({
            "archivo": os.path.basename(path),
            "entropia": round(ent, 4),
            "burstiness": round(bur, 4),
            "perplejidad": round(ppl, 4)
        })

    return pd.DataFrame(results)

In [156]:
def run_pipeline(zip_path, output_csv="resultados.csv"):
    df = process_pdfs_in_zip(zip_path)
    
    shutil.rmtree("pdfs_temp", ignore_errors=True)
    print("🧹 Carpeta 'pdfs_temp' eliminada.")
    
    return df

In [157]:
df_resultados = run_pipeline("Documents.zip")

display(df_resultados)


Procesando: D1.pdf
Procesando: D2.pdf
Procesando: D3.pdf
Procesando: D4.pdf
Procesando: D5.pdf
Procesando: D6.pdf
🧹 Carpeta 'pdfs_temp' eliminada.


Unnamed: 0,archivo,entropia,burstiness,perplejidad
0,D1.pdf,4.0744,0.5679,160.4222
1,D2.pdf,4.0528,0.5415,108.0962
2,D3.pdf,3.9881,0.6576,219.6888
3,D4.pdf,4.0383,0.688,110.8801
4,D5.pdf,4.1078,0.7635,173.1369
5,D6.pdf,4.0451,0.4651,196.9493


In [158]:
def calcular_umbral_penalizacion(df, sensibilidad=1.0):  # sensibilidad = IQR factor
    umbrales = {}

    for columna in ["entropia", "burstiness", "perplejidad"]:
        q1 = df[columna].quantile(0.25)
        q3 = df[columna].quantile(0.75)
        iqr = q3 - q1

        lim_inf = q1 - sensibilidad * iqr
        lim_sup = q3 + sensibilidad * iqr

        umbrales[columna] = {
            "q1": q1,
            "q3": q3,
            "inferior": lim_inf,
            "superior": lim_sup
        }

    return umbrales


In [159]:
def marcar_sospechosos_compuesto(df, ppl_ratio=0.6, entropia_min=3.5):
    max_ppl = df["perplejidad"].max()
    df["perplejidad_relativa"] = df["perplejidad"] / max_ppl

    def sospechoso(row):
        condiciones = [
            row["perplejidad_relativa"] < ppl_ratio,
            row["entropia"] < entropia_min
        ]
        return "Sí" if sum(condiciones) >= 1 else "No"

    df["sospechoso"] = df.apply(sospechoso, axis=1)
    return df

In [160]:
df = run_pipeline("Documents.zip")
umbrales = calcular_umbral_penalizacion(df)
df_1 = marcar_sospechosos_compuesto(df)



Procesando: D1.pdf
Procesando: D2.pdf
Procesando: D3.pdf
Procesando: D4.pdf
Procesando: D5.pdf
Procesando: D6.pdf
🧹 Carpeta 'pdfs_temp' eliminada.


In [161]:
display(df_1)

Unnamed: 0,archivo,entropia,burstiness,perplejidad,perplejidad_relativa,sospechoso
0,D1.pdf,4.0744,0.5679,160.4222,0.730225,No
1,D2.pdf,4.0528,0.5415,108.0962,0.492042,Sí
2,D3.pdf,3.9881,0.6576,219.6888,1.0,No
3,D4.pdf,4.0383,0.688,110.8801,0.504714,Sí
4,D5.pdf,4.1078,0.7635,173.1369,0.788101,No
5,D6.pdf,4.0451,0.4651,196.9493,0.896492,No
