In [2]:
import os
os.listdir('/kaggle/input')
embeddings_path = "/kaggle/input/embeddings"
print(os.listdir(embeddings_path))

['speeches_with_embeddings.csv', 'news_with_embeddings.csv']


# Similarity algorithm efficiently 

In [3]:
import pandas as pd

speeches_embeddings = pd.read_csv(f"{embeddings_path}/speeches_with_embeddings.csv")
news_embeddings = pd.read_csv(f"{embeddings_path}/news_with_embeddings.csv")

# Display first few rows
print("News Data:")
print(news_embeddings.head())

print("\nSpeeches Data:")
print(speeches_embeddings.head())

News Data:
          Index                                               Link  \
0  1_01_12_2018  https://www.bbc.com/mundo/noticias-america-lat...   
1  2_01_12_2018  https://politica.expansion.mx/presidencia/2018...   
2  3_01_12_2018  https://oem.com.mx/elsoldemexico/mexico/en-don...   
3  4_01_12_2018  https://politica.expansion.mx/presidencia/2018...   
4  5_01_12_2018  https://www.eleconomista.com.mx/politica/Nicol...   

                                              Domain  \
0  BBC\nToma de protesta de AMLO: las 5 tradicion...   
1  Expansión Política\nAMLO rinde protesta y prom...   
2  El Sol de México\n¿Hay Ley Seca este 1 de dici...   
3  Expansión Política\nAMLO es un "líder persiste...   
4  El Economista\nNicolás Maduro llega a Palacio ...   

                                               Title        Date  \
0  Toma de protesta de AMLO: las 5 tradiciones qu...  2018-12-01   
1        AMLO rinde protesta y promete no reelegirse  2018-12-01   
2  ¿Hay Ley Seca este 1 de 

In [4]:
print(type(news_embeddings['news_embeddings'].iloc[0]))
#since it is a class 'str' then we need to convert the embedding as a numpy array
# Convert 'date' column to datetime if it is not already in datetime format
speeches_embeddings['date'] = pd.to_datetime(speeches_embeddings['date'], errors='coerce')
news_embeddings['Date'] = pd.to_datetime(news_embeddings['Date'], errors='coerce')

# Subset the data to 2019 and reset indices

news_2019_reset = news_embeddings[news_embeddings['Date'].dt.year == 2019].reset_index(drop=True)
speeches_2019_reset = speeches_embeddings[speeches_embeddings['date'].dt.year == 2019].reset_index(drop=True)


<class 'str'>


In [6]:
import numpy as np

# Function to clean and convert the embeddings
def clean_and_convert(embedding_str):
    # Remove any unwanted characters like brackets or newline characters
    cleaned_str = embedding_str.replace('[', '').replace(']', '').replace('\n', '')
    
    # Convert the cleaned string to a numpy array
    return np.array(cleaned_str.split()).astype(float)

In [7]:
import torch 

# Step 2: Clean and convert the embeddings for 2019 data
cleaned_news_embeddings_2019 = news_2019_reset['news_embeddings'].apply(clean_and_convert)
cleaned_speech_embeddings_2019 = speeches_2019_reset['speech_embeddings'].apply(clean_and_convert)

# Verify the cleaned embeddings
print("Cleaned news embedding (first sample) for 2019:", cleaned_news_embeddings_2019.iloc[0])
print("Cleaned speech embedding (first sample) for 2019:", cleaned_speech_embeddings_2019.iloc[0])


Cleaned news embedding (first sample) for 2019: [ 7.00545609e-02  1.59867704e-01  1.82033628e-01 -1.10902913e-01
 -1.68071046e-01  7.82872558e-01  1.24405897e+00  9.35690045e-01
 -8.79230499e-01  1.69901311e-01  3.14941794e-01 -8.36316347e-01
  1.94153115e-01 -1.05180137e-01 -1.43057257e-01 -4.09085423e-01
  4.37348098e-01  3.12500400e-03  5.30271530e-01 -4.83261496e-02
 -7.44194686e-01  5.34093797e-01  6.49602234e-01 -1.16227746e+00
 -8.81177306e-01 -6.07623994e-01  7.66389966e-01  3.55166703e-01
 -4.08001542e-01  5.95054999e-02 -4.70891893e-01 -3.36702853e-01
  2.59495556e-01 -8.38763833e-01 -2.42981687e-01 -3.25389415e-01
 -3.09586227e-01  3.96134198e-01  4.72463161e-01 -6.93257987e-01
 -3.02345544e-01 -5.06430805e-01  3.93638819e-01  8.52328613e-02
  5.45643926e-01  6.85154796e-01 -1.47084430e-01  2.22051278e-01
 -5.23011625e-01 -5.16374409e-01  1.38684058e+00 -5.68437874e-01
 -4.49357063e-01 -1.91186011e-01 -2.05529436e-01 -5.80088556e-01
 -2.78938144e-01  3.71591657e-01  1.321355

In [8]:
import torch
import numpy as np

news_matrix_2019 = np.vstack(cleaned_news_embeddings_2019.values)
speech_matrix_2019 = np.vstack(cleaned_speech_embeddings_2019.values)


# Convert NumPy arrays to PyTorch tensors and move to GPU
news_tensor_2019 = torch.tensor(news_matrix_2019, dtype=torch.float32).cuda()
speech_tensor_2019 = torch.tensor(speech_matrix_2019, dtype=torch.float32).cuda()


In [9]:
import torch
import pandas as pd
import time

# Step 1: Save tensors to disk before loading them
torch.save(news_tensor_2019.cpu(), '/kaggle/working/news_tensor_2019.pt')
torch.save(speech_tensor_2019.cpu(), '/kaggle/working/speech_tensor_2019.pt')

# Step 2: Load tensors from disk and move them to GPU
news_tensor_2019 = torch.load('/kaggle/working/news_tensor_2019.pt').cuda()
speech_tensor_2019 = torch.load('/kaggle/working/speech_tensor_2019.pt').cuda()

# Step 3: Compute cosine similarity in chunks
def compute_batch_cosine_similarity(embedding, batch_embeddings):
    # Ensure embedding is 2D (batch_size, embedding_size)
    embedding = embedding.unsqueeze(0) if embedding.dim() == 1 else embedding
    batch_embeddings = batch_embeddings / batch_embeddings.norm(dim=1, keepdim=True)  # Normalize batch

    # Normalize the embedding (1xembedding_size)
    embedding = embedding / embedding.norm(dim=1, keepdim=True)  # Normalize single embedding
    return torch.mm(batch_embeddings, embedding.T).squeeze()  # Cosine similarity


def compute_similarities_in_chunks(speech_tensor, news_tensor, chunk_size=1000):
    similarities = []
    for i in range(0, len(news_tensor), chunk_size):
        chunk = news_tensor[i:i+chunk_size]
        cosine_sim = compute_batch_cosine_similarity(speech_tensor, chunk)
        similarities.extend(cosine_sim.cpu().tolist())
    return similarities

# Step 4: Process the tensor in chunks
similarities_2019 = []
start_time = time.time()

for idx, speech_embedding_2019 in enumerate(speech_tensor_2019):
    cosine_similarities = compute_similarities_in_chunks(speech_embedding_2019, news_tensor_2019)
    
    # Store results
    for news_id, sim_value in enumerate(cosine_similarities):
        similarities_2019.append({
            'speech_id': idx,
            'news_id': news_id,
            'cosine_similarity': sim_value
        })
        
    # Print progress
    if idx % 1000 == 0:
        elapsed_time = time.time() - start_time
        remaining_time = (elapsed_time / (idx + 1)) * (len(speech_tensor_2019) - (idx + 1))
        print(f"Processed {idx + 1}/{len(speech_tensor_2019)} speeches. ETA: {remaining_time:.2f}s")

# Step 5: Save results to disk
similarities_df_2019 = pd.DataFrame(similarities_2019)
similarities_df_2019.to_parquet('/kaggle/working/similarities_2019.parquet')

print("Finished computing cosine similarities.")

  news_tensor_2019 = torch.load('/kaggle/working/news_tensor_2019.pt').cuda()
  speech_tensor_2019 = torch.load('/kaggle/working/speech_tensor_2019.pt').cuda()


Processed 1/32874 speeches. ETA: 7517.82s
Processed 1001/32874 speeches. ETA: 80.51s
Processed 2001/32874 speeches. ETA: 72.74s
Processed 3001/32874 speeches. ETA: 68.85s
Processed 4001/32874 speeches. ETA: 66.00s
Processed 5001/32874 speeches. ETA: 63.75s
Processed 6001/32874 speeches. ETA: 61.26s
Processed 7001/32874 speeches. ETA: 58.68s
Processed 8001/32874 speeches. ETA: 56.24s
Processed 9001/32874 speeches. ETA: 53.92s
Processed 10001/32874 speeches. ETA: 51.50s
Processed 11001/32874 speeches. ETA: 49.15s
Processed 12001/32874 speeches. ETA: 46.86s
Processed 13001/32874 speeches. ETA: 44.60s
Processed 14001/32874 speeches. ETA: 42.30s
Processed 15001/32874 speeches. ETA: 40.02s
Processed 16001/32874 speeches. ETA: 37.76s
Processed 17001/32874 speeches. ETA: 35.51s
Processed 18001/32874 speeches. ETA: 33.28s
Processed 19001/32874 speeches. ETA: 31.10s
Processed 20001/32874 speeches. ETA: 28.84s
Processed 21001/32874 speeches. ETA: 26.58s
Processed 22001/32874 speeches. ETA: 24.34s

KeyboardInterrupt: 