In [132]:
# importing the required libraries. Some libraries run on colab while others run locally

import os
import zipfile
from tqdm import tqdm
from openai import OpenAI

# Load API credentials. Locally they come from a .env file (via python-dotenv);
# on Google Colab, where python-dotenv is not available, they come from Colab's
# secret store instead.
try:
    from dotenv import load_dotenv
    load_dotenv()
    HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

except ImportError:
    # Only a missing python-dotenv should trigger the Colab fallback; any other
    # error is a real problem and must surface instead of being swallowed.
    from google.colab import drive
    from google.colab import userdata
    HF_TOKEN = userdata.get('HF_TOKEN')
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# Set to True to run the pipelines. Since we stored the results in a file,
# we can set this to False to avoid running the pipelines again
RUN_PIPELINES = False

In [59]:
# from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

In [11]:
# Choose where outputs are stored: Google Drive when running on Colab,
# local directories otherwise.

# ========== Mount Google Drive ==========
try:
    # Imported here so this cell works regardless of how the credentials cell
    # obtained its secrets (it only imports `drive` in its Colab fallback).
    from google.colab import drive
    drive.mount('/content/drive')

    # ========== Path Configuration ==========
    # Update these paths according to your Google Drive structure
    DRIVE_BASE = '/content/drive/MyDrive/'
    EXTRACT_DIR = '/tmp/extracted'  # Using tmp for faster I/O

except Exception as mount_error:
    # Not on Colab (google.colab cannot be imported) or the mount failed:
    # fall back to directories relative to the notebook.
    print(f"Google Drive not mounted ({type(mount_error).__name__}); using local paths.")
    DRIVE_BASE = './outputs/'
    EXTRACT_DIR = './tmp/extracted'

ZIP_PATH = "./data/elmundo_chunked_es_page1_40years.zip"
OUTPUT_DIR = os.path.join(DRIVE_BASE, 'cleaned_articles1')

# Create directories
os.makedirs(EXTRACT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ========== File Extraction ==========
def extract_files(zip_path, extract_dir):
    """
    Unpack every ``.txt`` member of a zip archive into a target directory.

    The archive's internal folder structure is kept; members that do not end
    in ``.txt`` are left inside the archive.

    Args:
    zip_path (str): Path to the zip archive.
    extract_dir (str): Directory to extract the files to.

    Returns:
    None
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        # Keep only the text members, preserving their nested paths
        text_members = [member for member in archive.namelist() if member.endswith('.txt')]
        archive.extractall(extract_dir, members=text_members)

    separator = "*" * 50
    print(separator)
    print(f"Extracted files to: {extract_dir}")

1. Extract the zip file
2. Open the extracted folder
3. For each file in the folder:
    read the content and extract the text
    <!-- chunk the text into 1000 words -->
    <!-- pass chunks to the model so it can fix the spelling -->
    translate the corrected text to English
    <!-- add the file name and the corrected text to a dictionary -->
    save the corrected text to a new file
4. Save the dictionary to a pkl file
5.


In [62]:
from openai import OpenAI
# Module-level OpenAI client that the OpenAI helper functions in this notebook
# call through `client.chat.completions.create`. OPENAI_API_KEY is set in the first cell.
client = OpenAI(
    api_key=OPENAI_API_KEY
)

def correct_with_openai(text, filename, just_text = True, max_completion_tokens = 2048, temperature = 1, top_p = 1, frequency_penalty=0, presence_penalty=0,**kwargs):
    """
    Ask gpt-4o-mini to decode an OCR-damaged Spanish news page and summarize it in English.

    The prompt instructs the model to answer with a JSON object holding a
    'metadata' field (filename, date, locations) and a 'text' field, and the
    request is sent with the JSON response format enabled.

    Args:
    text (str): The raw article text to correct and translate.
    filename (str): The name of the article file; it is embedded in the prompt.
    just_text (bool): If True, return only the message content; otherwise the whole response.
    max_completion_tokens (int): The maximum number of tokens to generate.
    temperature (float): The temperature for sampling.
    top_p (float): The nucleus sampling probability.
    frequency_penalty (float): The frequency penalty.
    presence_penalty (float): The presence penalty.
    kwargs: Additional keyword arguments forwarded to the chat completion call.

    Returns:
    str: The content of the first choice (a JSON string) when just_text is True.
    Otherwise the response object returned by the client, unchanged.
    """
    # Single-line prompt: instructions first, then the article name and its raw text
    prompt = f"Eres un experto en documentos históricos de Puerto Rico. El texto en español son noticias del siglo XX y contiene muchos errores a causa del OCR. Descifra el contenido y tradúcelo al inglés:\n1. Preserva nombres propios (ej: Mayagüez, Caguas)\n2. Ignora el \"header\" (ej:\n```EL MUNDO\nPRONOSTICOS DEL TIEMPO PARA LA ISLA, HOY: Mayormente nublado, con aguaceros dispersos temprano en la mafiana. EN SAN JUAN. AYER: Temperatura máxima. 80; mínima, 77. Presión barométrica al nivel del mar, a las 4:80 de la tarde. 38.88 pulgadas de mercurio. No hay indicios de disturbio tropical.\n40 páginas 5/\nDIARIO DE LA MARANA\nAÑO XXVIII\nEntered aa second clsss matter, Post Office, San Juan, P. R.)```\n3. Ignora los anuncios\n4. Solo mantén contenido relacionado a Puerto Rico (especialmente sobre ciudades, locaciones o eventos históricos)\n5. Traduce el texto a inglés. Solo mantén los datos mas importantes\n6.  Lista las ciudades o locaciones de Puerto Rico mencionadas\n7. Escribe solo en texto (no uses **negrillas** ni *itálicas* ni nada en markdown)\n8. return it as a JSON object with two fields:\n    - 'metadata': un diccionario con la siguiente informacion: 'filename' (nombre del articulo), 'date' (fecha del articulo), 'locations' (lista de las ciudades o locaciones de Puerto Rico mencionadas).\n    - 'text': the corrected and summarized text in English.\n8. No digas nada mas ni preguntes más. El nombre del articulo es {filename}. Usa el siguiente texto: {text}"

    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
        response_format={"type": "json_object"},
        temperature=temperature,
        max_completion_tokens=max_completion_tokens,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        **kwargs
    )

    # Callers that need usage data or other fields can ask for the whole response
    if not just_text:
        return completion
    return completion.choices[0].message.content

In [63]:
from datetime import datetime
import pickle as pkl

def save_progress(data, filename="all_docs.pkl"):
    """ Pickle ``data`` into OUTPUT_DIR and print when and where it was written.

    Args:
    data: The object to save (anything pickle can serialize).
    filename (str): Name of the file created inside OUTPUT_DIR.

    Returns:
    None
    """
    destination = os.path.join(OUTPUT_DIR, filename)

    with open(destination, 'wb') as checkpoint:
        pkl.dump(data, checkpoint)

    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"Progress saved at {timestamp} to {destination}")

In [9]:
# Plain-text log inside OUTPUT_DIR: one already-processed filename per line.
PROGRESS_FILE = os.path.join(OUTPUT_DIR, "processed_files.log")

def get_processed_files():
    """
    Read PROGRESS_FILE and return the filenames recorded in it.

    Returns:
    set: One entry per line of the log; an empty set when the log does not exist yet.
    """
    if not os.path.exists(PROGRESS_FILE):
        return set()

    with open(PROGRESS_FILE, 'r') as log:
        logged_names = log.read().splitlines()
    return set(logged_names)

def update_progress(filename):
    """
    Record a filename as processed by appending it to PROGRESS_FILE.

    Args:
    filename (str): The filename to append (written on its own line).

    Returns:
    None
    """
    log_entry = f"{filename}\n"
    with open(PROGRESS_FILE, 'a') as log:
        log.write(log_entry)

In [26]:
# ========== Processing Pipeline ==========
import json
import pickle as pkl
from langchain.docstore.document import Document
import time

# Save progress every 15 minutes
interval_minutes = 15

def process_files():
    """
    Processes the text files in the ZIP archive.

    Each not-yet-processed ``.txt`` file is sent to ``correct_with_openai``; the
    returned JSON must contain 'text' and 'metadata'. The text is written to
    OUTPUT_DIR as ``cleaned_<filename>``, wrapped in a langchain ``Document``
    and the filename is appended to the processed-files log.

    Resuming: files listed in the processed-files log are skipped. Their
    documents are restored from the ``all_docs.pkl`` checkpoint in OUTPUT_DIR,
    so a resumed run returns (and saves) the previously processed documents
    together with the new ones instead of overwriting them.

    Files whose processing raises are reported and skipped; they stay out of
    the log and are retried on the next run.

    Returns:
    list: A list of Document objects.
    """
    extract_files(ZIP_PATH, EXTRACT_DIR)  # extract files from zip

    processed = get_processed_files()
    checkpoint_path = os.path.join(OUTPUT_DIR, "all_docs.pkl")

    all_docs = []  # for storing all the documents
    if processed and os.path.exists(checkpoint_path):
        # Skipped files never get re-appended below, so without this the final
        # save would drop every document from earlier runs.
        # NOTE: pickle can execute code on load; this only reads the checkpoint
        # this notebook wrote itself.
        with open(checkpoint_path, 'rb') as f:
            all_docs = pkl.load(f)

    # Track when the last save occurred
    last_save_time = time.time()

    # Get all text files from nested directory (sorted so runs are reproducible)
    base_dir = os.path.join(EXTRACT_DIR, "elmundo_chunked_es_page1_40years")
    txt_files = sorted(f for f in os.listdir(base_dir) if f.endswith('.txt'))

    try:
        for filename in tqdm(txt_files, desc="Processing files"):

            if filename in processed:
                # Skip already processed files
                continue

            input_path = os.path.join(base_dir, filename)
            output_path = os.path.join(OUTPUT_DIR, f"cleaned_{filename}")

            with open(input_path, 'r', encoding='utf-8', errors='ignore') as f:  # open current text file
                raw_text = f.read()

            try:
                # gets gpt-4o-mini JSON object with 'metadata' and 'text' fields:
                json_object = json.loads(correct_with_openai(raw_text, filename))  # OpenAI version

                # Read both fields first so an incomplete answer fails before anything is written
                cleaned_text = json_object['text']
                metadata = json_object['metadata']

                with open(output_path, 'w', encoding='utf-8') as f:  # save text on google drive
                    f.write(cleaned_text)

                print(f"Processed: {filename} -> Saved to Drive")

                # convert text to a langchain text object (for use on Chroma later)
                all_docs.append(Document(page_content=cleaned_text, metadata=metadata))

                # Update the processed log
                update_progress(filename)

            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
                continue

            current_time = time.time()
            if (current_time - last_save_time) >= (interval_minutes * 60):
                save_progress(all_docs)
                last_save_time = current_time  # Update the last save time
    finally:
        # Runs on normal completion and on interruption (e.g. KeyboardInterrupt),
        # so the checkpoint in OUTPUT_DIR keeps up with the processed-files log.
        save_progress(all_docs)

    # Local copy that the notebook reloads when RUN_PIPELINES is False
    with open("all_docs.pkl", 'wb') as f:
        pkl.dump(all_docs, f)

    return all_docs

In [None]:
# Run the processing pipeline only when RUN_PIPELINES (set in the first cell) is True;
# otherwise reuse the documents cached in all_docs.pkl by an earlier run.
# NOTE(review): pickle.load can execute arbitrary code - only load a file this notebook produced.
if RUN_PIPELINES:
    all_docs = process_files()
else:
    with open("all_docs.pkl", 'rb') as f:
        all_docs = pkl.load(f)

In [109]:
def translate_list(lista, just_text = True, max_completion_tokens = 2048, temperature = 1, top_p = 1, frequency_penalty=0, presence_penalty=0,**kwargs):
    """
    Ask gpt-4o-mini to translate a list of Puerto Rican place/person names to English.

    The prompt requests a JSON object that maps every original item to its
    English translation; the number of items is stated in the prompt.

    Args:
    lista (list): The items to translate. A list is converted with ``str()``
        before being embedded in the prompt; other values are embedded as given.
    just_text (bool): If True, return only the message content; otherwise the whole response.
    max_completion_tokens (int): The maximum number of tokens to generate.
    temperature (float): The temperature for sampling.
    top_p (float): The nucleus sampling probability.
    frequency_penalty (float): The frequency penalty.
    presence_penalty (float): The presence penalty.
    kwargs: Additional keyword arguments forwarded to the chat completion call.

    Returns:
    str: The content of the first choice (a JSON string) when just_text is True.
    Otherwise the response object returned by the client, unchanged.
    """
    # Measured before the list is turned into text below
    largo = len(lista)
    lista = str(lista) if isinstance(lista, list) else lista

    prompt = f"""
                Eres un experto en documentos históricos, localidades, y personas ilustres de Puerto Rico. La lista que se te pasará es un de lugares conocidos y 
                personas ilustres de Puerto Rico. Tu deber es traducir la lista al inglés y corregir cualquier error que encuentres. 
                1. Preserva nombres propios (ej: Mayagüez, Caguas, Julia de Burgos)\n
                2. Escribe solo en texto (no uses **negrillas** ni *itálicas* ni nada en markdown)\n
                3. Es posible que la lista ya contenga elementos en inglés. En ese caso, no los traduzcas, pero incluyelos en la respuesta final.\n
                3. Retorna un objeto JSON con {largo} pares key-value:\n
                    - key: el texto de la lista. Podría estar en inglés o español. Depende de como te lo dieron en la lista\n
                    - value: el texto traducido al inglés. Si el elemento ya estaba en inglés, no hace falta traducirlo, pero igualmente incluyes el texto aqui\n
                4. El ejemplo de como se veria la respuesta JSON aceptable para la lista de ejemplo ["playa de Camuy", "Julia de Burgos", "Parque acuatico Las Cascadas", "Aguada Transmission Center", "Domes Beach"]:\n
                    ```
                    {{
                        "playa de Camuy": "Camuy Beach",
                        "Julia de Burgos": "Julia de Burgos",
                        "Parque acuático Las Cascadas": "Las Cascadas Water Park",
                        "Centro Ceremonial Indígena de Caguana": "Caguana Indigenous Ceremonial Center",
                        "Aguada Transmission Center": "Aguada Transmission Center",
                        "Domes beach": "Domes beach"
                    }}
                    ```
                  En general, el JSON deberia verse {{key_1: value_1, key_2: value_2, key_3: value_3, ..., key_{largo}: value_{largo}}}\n
                5. La lista de arriba solamente es un ejemplo para que te guies.
                6. Absolutamente todos los elementos en la que el usuario te de tienen que aparecer en el JSON final con su traducción correspondiente.
                7. No digas nada mas ni preguntes más.
                8. La lista del usuario que vas a usar para el JSON es la siguiente:\n 
                    {lista}
                """

    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
        response_format={"type": "json_object"},
        temperature=temperature,
        max_completion_tokens=max_completion_tokens,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        **kwargs
    )

    # Callers that parse the response themselves ask for the whole object
    if not just_text:
        return completion
    return completion.choices[0].message.content

In [65]:
def update_metadata_with_landmarks(all_docs, landmarks_dict):
    """
    Tag each document with the landmarks mentioned in its text.

    A landmark counts as mentioned when its Spanish or its English name occurs
    (case-insensitively, as a substring) in the document's page content. Both
    names are then added to the document's 'locations' metadata, English first,
    skipping names that are already listed. Documents are modified in place.

    Args:
    all_docs (list): A list of Document objects.
    landmarks_dict (dict): Maps each landmark's Spanish name to its English name.

    Returns:
    list: The same list of Document objects, with updated metadata.
    """
    for document in all_docs:
        haystack = document.page_content.lower()  # lowercase once per document for matching

        for spanish_name, english_name in landmarks_dict.items():
            mentioned = spanish_name.lower() in haystack or english_name.lower() in haystack
            if not mentioned:
                continue

            # Create the list on first match, otherwise extend the existing one
            locations = document.metadata.setdefault('locations', [])
            for name in (english_name, spanish_name):
                if name not in locations:
                    locations.append(name)

    return all_docs


In [158]:
# !pip install spacy
# Download the spaCy model that this notebook loads with spacy.load("en_core_web_sm").
!spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - ------------------------------------- 0.5/12.8 MB 932.9 kB/s eta 0:00:14
     --- ------------------------------------ 1.0/12.8 MB 1.4 MB/s eta 0:00:09
     ---- ----------------------------------- 1.3/12.8 MB 1.4 MB/s eta 0:00:09
     ----- ---------------------------------- 1.8/12.8 MB 1.6 MB/s eta 0:00:07
     ------ --------------------------------- 2.1/12.8 MB 1.6 MB/s eta 0:00:07
     ------- -------------------------------- 2.4/12.8 MB 1.6 MB/s eta 0:00:07
     --------- ------------------------------ 2.9/12.8 MB 1.6 MB/s eta 0:00:07
     --------- ------------------------------ 3.1/12.8

In [159]:
import spacy

# Load the spaCy pipeline used for named-entity recognition.
# "en_core_web_sm" must be downloaded first (see the `spacy download` cell).
nlp = spacy.load("en_core_web_sm")

# Example function to process the documents and add NER results to metadata
def enrich_metadata_with_ner(all_docs):
    """
    Enriches the metadata of the documents with named entities recognized by spaCy.

    Every GPE/LOC entity found in a document's text is added to its 'locations'
    metadata. Existing locations keep their order and come first; new entities
    follow in the order they appear in the text, without duplicates. Documents
    are modified in place.

    Args:
    all_docs (list): A list of Document objects.

    Returns:
    list: The same list of Document objects, with updated metadata.
    """
    for doc in all_docs:
        spacy_doc = nlp(doc.page_content)  # Process text through spaCy NER engine

        # 'locations' comes from model-generated JSON, so tolerate a missing
        # key, a null value, or a bare string instead of a list.
        existing_locations = doc.metadata.get('locations') or []
        if isinstance(existing_locations, str):
            existing_locations = [existing_locations]

        # Collect the detected locations (GPE and LOC entities)
        ner_locations = [ent.text for ent in spacy_doc.ents if ent.label_ in ('GPE', 'LOC')]

        # Order-preserving merge: a set would shuffle the list differently on
        # every run and would fail on unhashable items.
        updated_locations = []
        for location in list(existing_locations) + ner_locations:
            if location not in updated_locations:
                updated_locations.append(location)

        # Update the document's metadata with enriched locations
        doc.metadata['locations'] = updated_locations

    return all_docs

In [130]:
def get_landmarks_dict():
    """
    Build a landmark-name translation dictionary from the files in landmarks.zip.

    Landmark names are derived from the ``.txt`` file names in the archive's
    ``landmarks`` folder and translated in two requests via ``translate_list``.
    The two raw responses are pickled to "landmark translations.pkl" and the
    merged dictionary is written to "landmarks.json".

    Returns:
    dict: The merged JSON objects returned by the two translation requests.
    """
    archive_path = "./data/landmarks.zip"
    target_dir = './tmp/extracted_landmarks'  # Using tmp for faster I/O

    # Make sure the extraction directory exists, then unpack the archive into it
    os.makedirs(target_dir, exist_ok=True)
    extract_files(archive_path, target_dir)

    # Turn file names into landmark names: drop '.txt', use spaces for '_' and '-'
    landmarks_dir = os.path.join(target_dir, "landmarks")
    landmarks = []
    for entry in os.listdir(landmarks_dir):
        if not entry.endswith('.txt'):
            continue
        landmarks.append(entry.replace('.txt', '').replace('_', ' ').replace('-', ' '))

    # Translate in two halves to avoid exceeding the token limit.
    # Each element of `translations` is the full response object.
    midpoint = len(landmarks) // 2
    translations = [
        translate_list(half, just_text=False, max_completion_tokens=7000)
        for half in (landmarks[:midpoint], landmarks[midpoint:])
    ]

    # Keep the raw responses on disk before parsing them
    with open("landmark translations.pkl", 'wb') as f:
        pkl.dump(translations, f)

    # Parse both responses and merge them (later keys win on collision)
    first_half, second_half = (
        json.loads(completion.choices[0].message.content) for completion in translations
    )
    translations_json = {**first_half, **second_half}

    with open("landmarks.json", 'w') as f:
        json.dump(translations_json, f)

    return translations_json

In [129]:
# Delete the extracted landmark files. Pure Python instead of `rm -rf` so the
# cell also works where the `rm` command is unavailable (e.g. Windows).
# A missing directory is not an error, matching `rm -rf`.
import shutil
shutil.rmtree('./tmp/extracted_landmarks', ignore_errors=True)

In [150]:
# Enrich the documents only when RUN_PIPELINES (set in the first cell) is True;
# otherwise reuse the enriched documents cached in all_docs_updated.pkl.
# NOTE(review): pickle.load can execute arbitrary code - only load a file this notebook produced.
if RUN_PIPELINES:
    translations = get_landmarks_dict()
    # update metadata with landmarks
    all_docs = update_metadata_with_landmarks(all_docs, translations)
    # add the locations detected by spaCy NER to the metadata
    all_docs = enrich_metadata_with_ner(all_docs)
    # save the updated all_docs to pkl
    with open("all_docs_updated.pkl", 'wb') as f:
        pkl.dump(all_docs, f)

else:
    with open("all_docs_updated.pkl", 'rb') as f:
        all_docs = pkl.load(f)      # load the updated all_docs from pkl file, containing the landmarks and NER results in the metadata

In [None]:
## ========== Chroma ==========
# from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# sentence_transformer_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# print("Initialized SentenceTransformer embeddings.")

# # Load all documents into Chroma
# db = Chroma.from_documents(all_docs, sentence_transformer_embeddings, persist_directory="./chroma_db_clean_huggingface")
# print('All documents loaded and embedded.(huggingface)')