In [33]:
#!/usr/bin/env python3

import os
import glob
import pandas as pd
import hashlib

# Configurations
RSS_DIR = '/home/matias/Documents/media_monitor/data/rss_slices/'
MASTER_INDEX_PATH = './data/master_index.csv'
PROCESSED_FILES_LOG = './data/processed_files.txt'
DATA_DIR = './data/'

# Ensure output directory exists
os.makedirs(DATA_DIR, exist_ok=True)



def compute_short_hash(text):
    """Compute a short hash of a text (Title + Source)."""
    h = hashlib.sha1(text.encode('utf-8')).hexdigest()
    return h[:8]

def process_csv_file(csv_path):
    """Load a CSV and return a DataFrame with index_id added, reusing uid if available."""
    df = pd.read_csv(csv_path)

    # Drop rows with missing Title or Source
    df = df.dropna(subset=['Title', 'Source'])

    if 'uid' in df.columns:
        df['index_id'] = df['uid']
    else:
        df['index_id'] = df.apply(lambda row: compute_short_hash(row['Title'] + row['Source']), axis=1)

    return df


def load_master_index(path):
    """Load the master index CSV if it exists, else return an empty DataFrame."""
    if os.path.exists(path):
        master_df = pd.read_csv(path)
    else:
        master_df = pd.DataFrame()
    return master_df

def save_master_index(master_df, path):
    """Save the master index to a CSV."""
    master_df.to_csv(path, index=False)
    print(f"✅ Master index saved at {path} with {len(master_df)} articles.")

def load_processed_files(log_path):
    """Load the list of processed files to avoid reprocessing."""
    if os.path.exists(log_path):
        with open(log_path, 'r', encoding='utf-8') as f:
            return set(line.strip() for line in f)
    else:
        return set()

def save_processed_files(processed_files, log_path):
    """Save the updated list of processed files."""
    with open(log_path, 'w', encoding='utf-8') as f:
        for filename in processed_files:
            f.write(f"{filename}\n")

def update_master_index_from_directory(rss_dir, master_index_path, processed_files_log):
    import glob

    master_df = load_master_index(master_index_path)
    processed_files = load_processed_files(processed_files_log)

    new_files = []
    all_files = sorted(glob.glob(os.path.join(rss_dir, '**/*.csv'), recursive=True))
    for csv_file in all_files:
        filename = os.path.basename(csv_file)
        if filename not in processed_files:
            new_files.append((csv_file, filename))

    if not new_files and master_df.shape[0] > 0:
        print("✅ No new CSV files to process. Master index is up-to-date.")
        return

    new_dfs = []
    for file, filename in new_files:
        print(f"📥 Processing new file: {filename}")
        df = process_csv_file(file)
        new_dfs.append(df)
        processed_files.add(filename)

    # Acá continúa igual
    if new_dfs:
        new_data_df = pd.concat(new_dfs, ignore_index=True)
    else:
        new_data_df = pd.DataFrame()

    combined_df = pd.concat([new_data_df, master_df], ignore_index=True)

    if combined_df.empty:
        print("⚠️ Combined DataFrame is empty. No articles to process.")
        return

    if 'Published' not in combined_df.columns:
        print("⚠️ 'Published' column missing. Columns found:", combined_df.columns.tolist())
        return

    combined_df['Published'] = pd.to_datetime(combined_df['Published'], errors='coerce')
    combined_df = combined_df.sort_values('Published', ascending=False)

    if 'Title' not in combined_df.columns or 'Source' not in combined_df.columns:
        print("⚠️ Missing 'Title' or 'Source' columns.")
        return

    deduped_df = combined_df.drop_duplicates(subset=['index_id'], keep='first')
    assert deduped_df['index_id'].is_unique, "❌ Duplicate index_id found in deduped output."

    columns_to_keep = ['index_id', 'uid', 'Topic', 'Title', 'Published', 'Source', 'Link']
    deduped_df = deduped_df[[col for col in columns_to_keep if col in deduped_df.columns]]

    save_master_index(deduped_df, master_index_path)
    save_processed_files(processed_files, processed_files_log)


# def main():
#     update_master_index_from_directory(RSS_DIR, MASTER_INDEX_PATH, PROCESSED_FILES_LOG)

# if __name__ == "__main__":
#     main()


In [34]:
import pandas as pd
import os
# from glob import glob

MASTER_INDEX_PATH = './data/master_index.csv'
rss_slice_dir = '/home/matias/Documents/media_monitor/data/rss_slices/rss_dumps/'
rss_df_path = './data/digest_article_map.csv'


update_master_index_from_directory(RSS_DIR, MASTER_INDEX_PATH, PROCESSED_FILES_LOG)


# Cargar master index
master_index = pd.read_csv(MASTER_INDEX_PATH)
master_index["Title"] = master_index["Title"].str.strip()
master_index["Source"] = master_index["Source"].str.strip()

# Cargar todos los CSV de RSS slices
digest_records = []
for file in sorted(glob.glob(os.path.join(rss_slice_dir, '*.csv'))):
    digest_file = os.path.splitext(os.path.basename(file))[0]
    df = pd.read_csv(file, usecols=['article_id', 'Title', 'Source'])
    df['digest_file'] = digest_file
    df['Title'] = df['Title'].str.strip()
    df['Source'] = df['Source'].str.strip()
    digest_records.append(df)

rss_df = pd.concat(digest_records, ignore_index=True)
rss_df["article_id"] = rss_df["article_id"].astype(str)

# Realizar match con master_index usando Title + Source
merged = rss_df.merge(master_index, on=['Title', 'Source'], how='left', suffixes=('', '_master'))

# Validar si hubo pérdida de correspondencias
missing = merged[merged['index_id'].isna()]
if not missing.empty:
    print(f"⚠️ {len(missing)} artículos de digest no fueron encontrados en el master index.")

# Crear campo clave: digest_file::article_id
merged["key"] = merged["digest_file"] + "::" + merged["article_id"]

# Selección de columnas y export
master_ref = merged[['digest_file', 'article_id', 'index_id', 'Source', 'Title', 'Published', 'Link', 'key']].drop_duplicates()
master_ref.to_csv('./data/master_ref.csv', index=False)
print("✅ Archivo master_ref.csv actualizado.")

✅ No new CSV files to process. Master index is up-to-date.
✅ Archivo master_ref.csv actualizado.


In [35]:
xx

NameError: name 'xx' is not defined

In [12]:
# dont run
# import numpy as np

# for file in csv_files:
#     filename = os.path.splitext(os.path.basename(file))[0]
    
#     df = pd.read_csv(file)
#     df = df.sort_values('Published')
#     df['article_id'] = np.arange(1, len(df)+1)
#     df.to_csv(file, index = False)

# df

In [None]:
import pandas as pd
import os
# from glob import glob

MASTER_INDEX_PATH = './data/master_index.csv'

# Ruta donde están los archivos
rss_slice_dir = '/home/matias/Documents/media_monitor/data/rss_slices/rss_dumps/'

# Buscar todos los archivos CSV
csv_files = glob.glob(os.path.join(rss_slice_dir, '*.csv'))

# Cargar todos los archivos, agregando nombre del archivo como prefijo
dfs = []
for file in csv_files:
    print('file:', file)
    filename = os.path.splitext(os.path.basename(file))[0]
    print('filename:', filename)
    
    df = pd.read_csv(file, usecols=['article_id', 'Title', 'Source'])
    df['digest_file'] = filename
    dfs.append(df)

# Concatenar y eliminar duplicados
rss_df = pd.concat(dfs, ignore_index=True)
rss_df.drop_duplicates(subset=['article_id', 'Title', 'Source'], inplace=True)

# Guardar como referencia
rss_df_path = '/home/matias/Documents/media_monitor/data/article_id_reference.csv'
rss_df.to_csv(rss_df_path, index=False)

df =rss_df

file: /home/matias/Documents/media_monitor/data/rss_slices/rss_dumps/4h_window_20250618T0800.csv
filename: 4h_window_20250618T0800
file: /home/matias/Documents/media_monitor/data/rss_slices/rss_dumps/4h_window_20250612T2000.csv
filename: 4h_window_20250612T2000
file: /home/matias/Documents/media_monitor/data/rss_slices/rss_dumps/4h_window_20250618T0000.csv
filename: 4h_window_20250618T0000
file: /home/matias/Documents/media_monitor/data/rss_slices/rss_dumps/4h_window_20250611T0400.csv
filename: 4h_window_20250611T0400
file: /home/matias/Documents/media_monitor/data/rss_slices/rss_dumps/4h_window_20250531T2000.csv
filename: 4h_window_20250531T2000
file: /home/matias/Documents/media_monitor/data/rss_slices/rss_dumps/8h_window_20250606T1600.csv
filename: 8h_window_20250606T1600
file: /home/matias/Documents/media_monitor/data/rss_slices/rss_dumps/4h_window_20250621T0400.csv
filename: 4h_window_20250621T0400
file: /home/matias/Documents/media_monitor/data/rss_slices/rss_dumps/4h_window_2025

In [14]:
master_index.nunique()

index_id     21487
uid          11893
Topic            7
Title        14425
Published    10753
Source        1028
Link         14032
dtype: int64

In [15]:
master_index = pd.read_csv(MASTER_INDEX_PATH)

master_ref = master_index.merge(rss_df, how = 'inner').drop_duplicates().sort_values('Published', ascending = False)

master_ref = master_ref[['digest_file', 'article_id', 'index_id', 'Source', 'Title', 'Published', 'Link']].drop_duplicates()
master_ref["key"] = master_ref["digest_file"] + "::" + master_ref["article_id"].astype(str)

master_ref = master_ref.drop_duplicates()
master_ref.to_csv('./data/master_ref.csv')

In [16]:
master_ref.nunique()

digest_file      218
article_id       658
index_id       16202
Source           850
Title           9742
Published       9153
Link            9732
key            23878
dtype: int64

In [19]:
master_ref.index_id.str.len().describe()

count    39805.000000
mean         9.199749
std          0.979859
min          8.000000
25%          8.000000
50%         10.000000
75%         10.000000
max         10.000000
Name: index_id, dtype: float64

In [18]:
master_ref.shape

(39805, 8)