In [4]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\patricio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\patricio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\patricio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Reading Data

In [5]:
#dataset with news between 2008 and 2016 from Rosario12
df_p12 = pd.read_excel("data/p12_2008_2016.xlsx")

#dataset with news between 2008 and 2020 from El Litoral
df_el = pd.read_excel("data/el_2008_2020.xlsx")

df_el["media"] = "litoral"
df_p12["media"] = "p12"

df_el = df_el[['date', 'title', 'url', 'content', 'media']]
df_p12 = df_p12[['date', 'title', 'url', 'content', 'media']]

df = pd.concat([df_el, df_p12], axis=0, ignore_index=True)

df.value_counts("media")

media
p12        360
litoral    128
Name: count, dtype: int64

In [6]:
#transform date column to dateformat
df["date"] = pd.to_datetime(df["date"])

df["date"].dt.year.value_counts().sort_index()

date
2008     22
2009     32
2010     36
2011     33
2012     81
2013    123
2014    104
2015     57
Name: count, dtype: int64

### Function to clean text data

In [7]:
def clean_text_data(df, content_column='content', cleaned_column='cleaned_content'):
    """
    Cleans text data in a DataFrame by applying several preprocessing steps:
    - Drops rows with missing values in the content column.
    - Converts text to lowercase.
    - Removes special characters, numbers, and punctuation.
    - Tokenizes text and removes stop words.
    - Removes extra whitespaces.

    Parameters:
        df (pd.DataFrame): Input DataFrame containing text data.
        content_column (str): Name of the column containing the original text.
        cleaned_column (str): Name of the new column for cleaned text.

    Returns:
        pd.DataFrame: A DataFrame with an additional column for cleaned text.
    """
    # Ensure stopwords are downloaded
    stop_words = set(stopwords.words('spanish'))

    # Drop rows with missing content
    df = df.dropna(subset=[content_column]).copy()

    # Convert the content to lowercase
    df.loc[:, cleaned_column] = df[content_column].str.lower()

    # Remove special characters, numbers, and punctuation
    df.loc[:, cleaned_column] = df[cleaned_column].apply(
        lambda x: re.sub(r'[^a-zA-Z\sáéíóúüñÁÉÍÓÚÜÑ]', '', x)
    )

    # Tokenize the text and remove stop words
    df.loc[:, cleaned_column] = df[cleaned_column].apply(
        lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words])
    )

    # Remove extra whitespaces
    df.loc[:, cleaned_column] = df[cleaned_column].apply(lambda x: ' '.join(x.split()))

    return df


In [8]:
df = clean_text_data(df)
df.head()

Unnamed: 0,date,title,url,content,media,cleaned_content
0,2014-02-12,"""El dinero que mueve el narcotráfico está man...",https://www.ellitoral.com/edicion-online/diner...,"El Litoral, DyN, La Comisión Nacional de Justi...",litoral,litoral dyn comisión nacional justicia paz cnj...
1,2009-12-05,"""Ahora se incauta más droga porque hay más tar...",https://www.ellitoral.com/edicion-online/ahora...,Emerio Agretti - política@ellitoral La image...,litoral,emerio agretti políticaellitoral imagen camion...
2,2009-12-19,"""El enemigo es el narcotráfico, no el Frente P...",https://www.ellitoral.com/politica/enemigo-nar...,"En la tarde de este lunes, referentes del PS, ...",litoral,tarde lunes referentes ps ucr pares si partido...
3,2012-07-18,"""El narcotráfico es cuestión de Estado, no deb...",https://www.ellitoral.com/politica/narcotrafic...,"Mario Cáffaro | email protected, Los ministros...",litoral,mario cáffaro email protected ministros justic...
4,2013-05-01,"""Este gobierno es y será un enemigo declarado ...",https://www.ellitoral.com/politica/gobierno-en...,"El gobernador Antonio Bonfatti afirmó que ""la ...",litoral,gobernador antonio bonfatti afirmó seguridad d...


Vectorizar con modelo preentrenado (ver posibilidad de hacerlo con otro específicamente entrenado en español)

In [9]:
from sentence_transformers import SentenceTransformer

# Load embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings
df['embedding'] = df['cleaned_content'].apply(lambda x: model.encode(x))

df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')  # Convert to string format

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
df.to_parquet("data/embeddings.pkl")

In [None]:
import chromadb

client = chromadb.Client()

collection = client.create_collection("drug_trafficking_articles")

In [15]:
# Adding embeddings to ChromaDB collection
for index, row in df.iterrows():
    collection.add(
        ids=[str(index)],  
        documents=[row['content']],  # Original article content (optional)
        metadatas=[{'date': row['date'], 'media': row['media'], 'url': row['url']}],  # Metadata
        embeddings=[row['embedding']]  # The actual embedding vector
    )

In [29]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

query = "relación entre el gobernador y el narcotrafico"
# Generate the query embedding
query_embedding = model.encode(query)

# Perform a similarity search
results = collection.query(
    query_embeddings=[query_embedding],  # The query embedding
    n_results=5  # Number of results to retrieve
)


In [30]:
results

{'ids': [['267', '264', '262', '391', '459']],
 'embeddings': None,
 'documents': [['"Si hay alguien que no puede hablar sobre cómo luchar contra el narcotráfico es el gobernador de Santa Fe", sostuvo ayer el secretario de Seguridad de la Nación, Sergio Berni, al ser consultado sobre la propuesta del gobernador Antonio Bonfatti de debatir la despenalización del consumo de drogas blandas como la marihuana. El funcionario subrayó que la del mandatario provincial le parece "una respuesta muy liviana e irresponsable de su parte. Un hombre que no ha podido o no ha querido luchar contra el narcotráfico en los últimos años no puede hablar sobre la legalización de la droga como una manera de combatir al narcotráfico"., El secretario de Seguridad remarcó que "la ciudad de Rosario y los suburbios se han convertido en un caos, producto de la lucha territorial por la venta de droga, con tasas de homicidios que nos tiene preocupados a todo el país"., Berni consideró que el crecimiento del narcotráf

In [24]:
from huggingface_hub import login

login("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")

In [31]:
from transformers import pipeline

# Initialize the question-answering pipeline
qa_pipeline = pipeline("question-answering", model="dccuchile/bert-base-spanish-wwm-uncased")

# Retrieve the context (articles) from ChromaDB
context = results['documents'][:5]  # Using the most relevant article for the question

# Ask a question based on the context
question = "¿Cómo se relaciona el gobernador con el narcotráfico?"

# Get the answer from the model
response = qa_pipeline(question=question, context=context)
print(response)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Arguments can't be understood

In [32]:

# Query your database for similar articles (based on an input query or question)
query = "¿Cómo se relaciona el gobernador con el narcotráfico?"
results = collection.query(query_texts=[query], n_results=5)  # Get the top 5 results

# Initialize the question-answering pipeline
qa_pipeline = pipeline("question-answering", model="dccuchile/bert-base-spanish-wwm-uncased")

# Prepare the context from the first 5 documents
contexts = results['documents'][:5]  # Get the first 5 articles

# Join the top 5 results into a single context string (optional)
context = "\n".join(contexts)

# Ask a question based on the context
question = "¿Cómo se relaciona el gobernador con el narcotráfico?"

# Get the answer from the model, passing the question and context explicitly
response = qa_pipeline(question=question, context=context)

# Print the response
print(response)

C:\Users\patricio\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:28<00:00, 2.89MiB/s]
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: sequence item 0: expected str instance, list found