In [6]:
import os 
# Root directory under which the input datasets are mounted.
input_root = "/kaggle/input"

# A bare expression is only echoed when it is the last line of a cell, so the
# root listing has to be printed explicitly to be visible.
# os.listdir returns entries in arbitrary order; sort for reproducible output.
print(sorted(os.listdir(input_root)))

# Directory that holds the embedding CSV files read by the following cells.
embeddings_path = f"{input_root}/embeddings"
print(sorted(os.listdir(embeddings_path)))

['speeches_with_embeddings.csv', 'speeches_embeddings_sentiment.csv', 'news_embeddings_sentiment.csv', 'news_with_embeddings.csv']


In [7]:
import pandas as pd

# Read both embedding tables from the dataset directory (speeches first, then news).
speeches_embeddings = pd.read_csv(embeddings_path + "/speeches_with_embeddings.csv")
news_embeddings = pd.read_csv(embeddings_path + "/news_with_embeddings.csv")

# Preview the first rows of each table: a label line followed by the frame.
print("News Data:", news_embeddings.head(), sep="\n")
print("\nSpeeches Data:", speeches_embeddings.head(), sep="\n")

News Data:
          Index                                               Link  \
0  1_01_12_2018  https://www.bbc.com/mundo/noticias-america-lat...   
1  2_01_12_2018  https://politica.expansion.mx/presidencia/2018...   
2  3_01_12_2018  https://oem.com.mx/elsoldemexico/mexico/en-don...   
3  4_01_12_2018  https://politica.expansion.mx/presidencia/2018...   
4  5_01_12_2018  https://www.eleconomista.com.mx/politica/Nicol...   

                                              Domain  \
0  BBC\nToma de protesta de AMLO: las 5 tradicion...   
1  Expansión Política\nAMLO rinde protesta y prom...   
2  El Sol de México\n¿Hay Ley Seca este 1 de dici...   
3  Expansión Política\nAMLO es un "líder persiste...   
4  El Economista\nNicolás Maduro llega a Palacio ...   

                                               Title        Date  \
0  Toma de protesta de AMLO: las 5 tradiciones qu...  2018-12-01   
1        AMLO rinde protesta y promete no reelegirse  2018-12-01   
2  ¿Hay Ley Seca este 1 de 

In [8]:
import pandas as pd
import numpy as np
import torch
import re

# Convert date columns to datetime
news_embeddings['Date'] = pd.to_datetime(news_embeddings['Date'])
speeches_embeddings['date'] = pd.to_datetime(speeches_embeddings['date'])

# Sort both dataframes by date.
# kind='stable' matters: the default single-column sort (quicksort) does not
# guarantee that rows sharing the same date keep their relative order, and
# every cumcount()-based numbering below (duplicate prefixes, chunk numbers)
# is assigned in row order.
news_embeddings = news_embeddings.sort_values(by='Date', kind='stable').reset_index(drop=True)
speeches_embeddings = speeches_embeddings.sort_values(by='date', kind='stable').reset_index(drop=True)

# Fix duplicate news indices: every row whose 'Index' value occurs more than
# once gets a running counter prefix ("0_<Index>", "1_<Index>", ...).
# Rows with a unique 'Index' are left untouched.
duplicate_mask = news_embeddings.duplicated(subset=['Index'], keep=False)
news_embeddings.loc[duplicate_mask, 'Index'] = (
    news_embeddings.loc[duplicate_mask].groupby('Index').cumcount().astype(str) + "_" +
    news_embeddings.loc[duplicate_mask, 'Index']
)

# Build working copies that carry a string chunk ID per row; the source
# frames themselves are not modified here.

# Speeches: "speech_<YYYY-MM-DD>_<speech_id>_chunk_<n>", where <n> is the
# running row counter within each speech_id.
# NOTE(review): the counter groups by speech_id only; if a speech_id can occur
# on more than one date the numbering continues across dates - confirm.
speeches_embeddings_full = speeches_embeddings.assign(
    date_str=lambda frame: frame['date'].dt.strftime('%Y-%m-%d'),
    chunk_id=lambda frame: (
        "speech_" + frame['date_str'] + "_"
        + frame['speech_id'].astype(str) + "_chunk_"
        + frame.groupby('speech_id').cumcount().astype(str)
    ),
)

# News: "news_<Index>_chunk_<n>", where <n> is the running row counter within
# each Index value.
# NOTE(review): duplicate Index values were disambiguated earlier in this
# cell, so this counter is presumably always 0 - confirm that is intended.
news_embeddings_full = news_embeddings.assign(
    chunk_id=lambda frame: (
        "news_" + frame['Index'].astype(str) + "_chunk_"
        + frame.groupby('Index').cumcount().astype(str)
    ),
)

# Tracking tables: identifier columns only, ordered by (date, id), with the
# date rendered as a 'YYYY-MM-DD' string.
speech_tracking_df_full = (
    speeches_embeddings_full[['speech_id', 'chunk_id', 'date']]
    .sort_values(by=['date', 'speech_id'])
    .reset_index(drop=True)
    .assign(date=lambda frame: frame['date'].dt.strftime('%Y-%m-%d'))
)
news_tracking_df_full = (
    news_embeddings_full[['Index', 'chunk_id', 'Date']]
    .sort_values(by=['Date', 'Index'])
    .reset_index(drop=True)
    .assign(Date=lambda frame: frame['Date'].dt.strftime('%Y-%m-%d'))
)

# Report the date span of each source before any filtering is applied.
print(f"🔍 Full News Date Range: {news_embeddings['Date'].min()} to {news_embeddings['Date'].max()}")
print(f"🔍 Full Speeches Date Range: {speeches_embeddings['date'].min()} to {speeches_embeddings['date'].max()}")

# Dates (as 'YYYY-MM-DD' strings) that occur in both tracking tables.
common_dates_full = set(speech_tracking_df_full['date']) & set(news_tracking_df_full['Date'])

# Warn when the two sources share no date at all; otherwise show the shared span.
if not common_dates_full:
    print("⚠️ Warning: No common dates found!")
else:
    print(f"✅ Common Date Range: {min(common_dates_full)} to {max(common_dates_full)}")

# Keep only the rows whose date is present in both sources.
speech_tracking_df_full = speech_tracking_df_full.loc[
    speech_tracking_df_full['date'].isin(common_dates_full)
].reset_index(drop=True)
news_tracking_df_full = news_tracking_df_full.loc[
    news_tracking_df_full['Date'].isin(common_dates_full)
].reset_index(drop=True)

# Inspect the last few rows of the filtered tracking tables.
print("Last few rows of Speech Tracking DataFrame:", speech_tracking_df_full.tail(), sep="\n")
print("\nLast few rows of News Tracking DataFrame:", news_tracking_df_full.tail(), sep="\n")

# Row counts after filtering.
print(f"Length of filtered speech_tracking_df_full: {len(speech_tracking_df_full)}")
print(f"Length of filtered news_tracking_df_full: {len(news_tracking_df_full)}")

# First and last date remaining in each filtered table.
print("First date in speech_tracking_df_full:", speech_tracking_df_full['date'].min())
print("Last date in speech_tracking_df_full:", speech_tracking_df_full['date'].max())

print("First date in news_tracking_df_full:", news_tracking_df_full['Date'].min())
print("Last date in news_tracking_df_full:", news_tracking_df_full['Date'].max())


🔍 Full News Date Range: 2018-12-01 00:00:00 to 2024-10-06 00:00:00
🔍 Full Speeches Date Range: 2018-12-04 00:00:00 to 2024-09-30 00:00:00
✅ Common Date Range: 2018-12-04 to 2024-09-30
Last few rows of Speech Tracking DataFrame:
        speech_id                       chunk_id        date
171052          1  speech_2024-09-30_1_chunk_225  2024-09-30
171053          1  speech_2024-09-30_1_chunk_226  2024-09-30
171054          1  speech_2024-09-30_1_chunk_227  2024-09-30
171055          1  speech_2024-09-30_1_chunk_228  2024-09-30
171056          1  speech_2024-09-30_1_chunk_229  2024-09-30

Last few rows of News Tracking DataFrame:
                  Index                       chunk_id        Date
34969  9_353_30_09_2024  news_9_353_30_09_2024_chunk_0  2024-09-30
34970  9_483_30_09_2024  news_9_483_30_09_2024_chunk_0  2024-09-30
34971  9_504_30_09_2024  news_9_504_30_09_2024_chunk_0  2024-09-30
34972   9_54_30_09_2024   news_9_54_30_09_2024_chunk_0  2024-09-30
34973   9_69_30_09_2024   ne

In [10]:
# Step 1: Number the rows of each speech / article (0, 1, 2, ...) in current row order.
# This assumes that you already have `speeches_embeddings` and `news_embeddings` with embeddings generated for each row.
# NOTE(review): this cell rebinds speech_tracking_df_full / news_tracking_df_full,
# replacing the date-filtered frames with string chunk IDs built in the previous
# cell by unfiltered frames with integer chunk IDs - confirm that is intended.

# Create chunk IDs for speeches_embeddings
speeches_embeddings['chunk_id'] = speeches_embeddings.groupby('speech_id').cumcount()

# Create chunk IDs for news_embeddings (assuming each article has one chunk)
news_embeddings['chunk_id'] = news_embeddings.groupby('Index').cumcount()

# Step 2: Generate tracking dataframes (identifier columns only, same row order as the sources)
speech_tracking_df_full = speeches_embeddings[['speech_id', 'chunk_id', 'date']].reset_index(drop=True)
news_tracking_df_full = news_embeddings[['Index', 'chunk_id', 'Date']].reset_index(drop=True)

# Step 3: Check and print the shape of tracking dataframes
print(f"Shape of Speech Tracking DataFrame: {speech_tracking_df_full.shape}")
print(f"Shape of News Tracking DataFrame: {news_tracking_df_full.shape}")

# Step 4: Ensure each tracking dataframe has exactly one row per source row.
# (The original cell asserted this twice, once via len() and once via shape[0].)
assert len(speech_tracking_df_full) == len(speeches_embeddings), "Speech tracking dataframe length mismatch!"
assert len(news_tracking_df_full) == len(news_embeddings), "News tracking dataframe length mismatch!"

# Step 5: Print the shape of the source DataFrames.
# These are (rows, columns) of the DataFrames, not embedding tensors - no
# tensor has been built at this point - so the labels say "DataFrame".
print(f"Shape of Speech Embeddings DataFrame: {speeches_embeddings.shape}")
print(f"Shape of News Embeddings DataFrame: {news_embeddings.shape}")

# Optional: Print some sample rows for validation.
# The frame is printed on its own lines so its header row is not fused with the label.
print("Sample Speech Tracking Data:", speech_tracking_df_full.head(), sep="\n")
print("Sample News Tracking Data:", news_tracking_df_full.head(), sep="\n")


Shape of Speech Tracking DataFrame: (174818, 3)
Shape of News Tracking DataFrame: (42822, 3)
Shape of Speech Embeddings Tensor: (174818, 10)
Shape of News Embeddings Tensor: (42822, 11)
Sample Speech Tracking Data:    speech_id  chunk_id       date
0      18410         0 2018-12-04
1      18410         1 2018-12-04
2      18410         2 2018-12-04
3      18410         3 2018-12-04
4      18410         4 2018-12-04
Sample News Tracking Data:            Index  chunk_id       Date
0   1_01_12_2018         0 2018-12-01
1  27_01_12_2018         0 2018-12-01
2  28_01_12_2018         0 2018-12-01
3  29_01_12_2018         0 2018-12-01
4  30_01_12_2018         0 2018-12-01


In [11]:
import pandas as pd
import numpy as np
import torch

def process_embeddings_column(df, column_name, return_index=False):
    """
    Processes embeddings from a DataFrame column. This function:
    - Converts string representations of arrays (e.g. "[0.1 0.2\\n 0.3]" or
      "[0.1, 0.2, 0.3]") and array-likes into 1-D float32 NumPy arrays.
    - Skips invalid embeddings (unparsable, empty, or not 1-D), printing one
      message per skipped row.
    - Stacks the remaining rows into a PyTorch tensor.

    Because invalid rows are dropped, row k of the returned tensor is NOT
    guaranteed to correspond to row k of ``df``. Pass ``return_index=True`` to
    also receive the index labels of the rows that were kept.

    Args:
    - df (pd.DataFrame): DataFrame containing the embeddings.
    - column_name (str): Name of the column with embeddings.
    - return_index (bool): If True, return ``(tensor, kept_index)`` where
      ``kept_index`` is the list of ``df`` index labels, in tensor row order.
      Defaults to False (tensor only).

    Returns:
    - torch.Tensor: float32 tensor of shape (n_valid_rows, embedding_size);
      or a ``(tensor, list)`` tuple when ``return_index`` is True.

    Raises:
    - ValueError: If no row holds a valid embedding, or if the valid
      embeddings do not all have the same length.
    """
    embeddings_list = []
    kept_index = []
    column = df[column_name]

    for i, row in column.items():
        try:
            if isinstance(row, str):
                # split() tolerates any whitespace (including the newlines of
                # a wrapped NumPy repr). float() rejects a malformed token
                # such as "..." instead of silently truncating the vector,
                # which np.fromstring would do.
                tokens = row.strip().strip("[]").replace(",", " ").split()
                vector = np.array([float(tok) for tok in tokens], dtype=np.float32)
            else:
                vector = np.asarray(row, dtype=np.float32)
        except (TypeError, ValueError) as e:
            print(f"Skipping row {i} due to error: {e}")
            continue

        # Scalars (e.g. NaN for a missing cell) become 0-d arrays and are
        # rejected here together with empty or multi-dimensional values.
        if vector.ndim != 1 or vector.size == 0:
            print(
                f"Skipping row {i} due to error: expected a non-empty 1-D "
                f"embedding, got shape {vector.shape}"
            )
            continue

        embeddings_list.append(vector)
        kept_index.append(i)

    if len(embeddings_list) == 0:
        raise ValueError("No valid embeddings found.")

    skipped = len(column) - len(embeddings_list)
    if skipped:
        print(
            f"Warning: skipped {skipped} of {len(column)} rows; tensor rows no "
            "longer align positionally with the DataFrame "
            "(use return_index=True to get the kept index labels)."
        )

    # Ensure all embeddings have the same shape
    try:
        embeddings_matrix = np.vstack(embeddings_list)
    except ValueError as e:
        # Keep the underlying NumPy error attached as the cause.
        raise ValueError("Embeddings have inconsistent sizes.") from e

    embeddings_tensor = torch.tensor(embeddings_matrix, dtype=torch.float32)
    if return_index:
        return embeddings_tensor, kept_index
    return embeddings_tensor

# Process embeddings: parse the stringified vectors of each table into a tensor.
news_embeddings_tensor = process_embeddings_column(news_embeddings, 'news_embeddings')
speeches_embeddings_tensor = process_embeddings_column(speeches_embeddings, 'speech_embeddings')

# Check tensor shapes
print(f"News embeddings shape: {news_embeddings_tensor.shape}")
print(f"Speeches embeddings shape: {speeches_embeddings_tensor.shape}")

# process_embeddings_column skips rows it cannot parse. If any row was dropped,
# tensor row k no longer corresponds to DataFrame row k, so fail loudly here
# instead of letting later positional lookups silently misalign.
assert news_embeddings_tensor.shape[0] == len(news_embeddings), (
    "News embeddings tensor does not have one row per news_embeddings row!"
)
assert speeches_embeddings_tensor.shape[0] == len(speeches_embeddings), (
    "Speeches embeddings tensor does not have one row per speeches_embeddings row!"
)


News embeddings shape: torch.Size([42822, 768])
Speeches embeddings shape: torch.Size([174818, 768])
