In [2]:
import os

import pandas as pd

# Show which datasets are mounted. Jupyter only echoes the LAST expression of
# a cell, so a bare `os.listdir(...)` in the middle of the cell is silently
# discarded; print it explicitly.
print(os.listdir('/kaggle/input'))

embeddings_path = "/kaggle/input/embeddings"
print(os.listdir(embeddings_path))

# Read the two CSV files that the later cells operate on.
speeches_embeddings = pd.read_csv(f"{embeddings_path}/speeches_with_embeddings.csv")
news_embeddings = pd.read_csv(f"{embeddings_path}/news_with_embeddings.csv")

# Display first few rows
print("News Data:")
print(news_embeddings.head())

print("\nSpeeches Data:")
print(speeches_embeddings.head())

['speeches_with_embeddings.csv', 'speeches_embeddings_sentiment.csv', 'news_embeddings_sentiment.csv', 'news_with_embeddings.csv']
News Data:
          Index                                               Link  \
0  1_01_12_2018  https://www.bbc.com/mundo/noticias-america-lat...   
1  2_01_12_2018  https://politica.expansion.mx/presidencia/2018...   
2  3_01_12_2018  https://oem.com.mx/elsoldemexico/mexico/en-don...   
3  4_01_12_2018  https://politica.expansion.mx/presidencia/2018...   
4  5_01_12_2018  https://www.eleconomista.com.mx/politica/Nicol...   

                                              Domain  \
0  BBC\nToma de protesta de AMLO: las 5 tradicion...   
1  Expansión Política\nAMLO rinde protesta y prom...   
2  El Sol de México\n¿Hay Ley Seca este 1 de dici...   
3  Expansión Política\nAMLO es un "líder persiste...   
4  El Economista\nNicolás Maduro llega a Palacio ...   

                                               Title        Date  \
0  Toma de protesta de AMLO: las

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F

# Parse the raw date columns into datetime values under new column names
# (the original 'Date' / 'date' columns are left as they were).
news_embeddings['news_date'] = pd.to_datetime(news_embeddings['Date'])
speeches_embeddings['speech_date'] = pd.to_datetime(speeches_embeddings['date'])

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

#Temporal window calculation and expansion
def generate_temporal_pairs(news_df, speeches_df, window_days=3):
    """Pair every news row with the speeches dated within +/- ``window_days`` of it.

    The window is symmetric and inclusive on both ends, so the default of 3
    matches speeches from 3 days before to 3 days after the news date.

    Args:
        news_df: DataFrame with a datetime ``news_date`` column.
        speeches_df: DataFrame with a datetime ``speech_date`` column.
        window_days: Half-width of the window, in days.

    Returns:
        DataFrame with columns ``news_id`` and ``speech_id`` holding the index
        labels of the matched rows, ordered by news row and then by speech row.
        News rows with a missing date produce no pairs.
    """
    # Hoisted out of the loop: neither depends on the current news row.
    window = pd.Timedelta(days=window_days)
    speech_dates = speeches_df['speech_date']
    speech_index = speeches_df.index

    pairs = []
    # Iterate the date column directly. The previous np.array_split chunking
    # did not change the result and goes through the deprecated
    # DataFrame.swapaxes path.
    for news_id, news_date in news_df['news_date'].items():
        if pd.isna(news_date):
            # Comparisons against NaT are always False, so nothing can match.
            continue
        in_window = (speech_dates >= news_date - window) & (speech_dates <= news_date + window)
        matched = speech_index[in_window.to_numpy()].tolist()
        pairs.extend((news_id, speech_id) for speech_id in matched)

    return pd.DataFrame(pairs, columns=['news_id', 'speech_id'])


# Build the candidate news/speech pairs using the function's default window.
alignment_df = generate_temporal_pairs(news_df=news_embeddings, speeches_df=speeches_embeddings)

#optimized embeddings calculation
def load_embeddings_half(df, col_name, target_device=None):
    """Parse an embedding column into a single float16 tensor.

    Each cell may be a bracketed string of numbers (separated by whitespace
    and/or commas) or an array-like of numbers.

    Args:
        df: DataFrame holding the embeddings.
        col_name: Name of the column to parse.
        target_device: Device for the resulting tensor. Defaults to the
            notebook-level ``device`` variable.

    Returns:
        A ``torch.float16`` tensor with one row per DataFrame row, in the
        DataFrame's row order. An empty DataFrame yields a ``(0, 0)`` tensor.
    """
    if target_device is None:
        target_device = device  # notebook-level device selected earlier

    def _parse_cell(cell):
        if isinstance(cell, str):
            # Drop the surrounding brackets and treat commas as whitespace so
            # both "[1 2 3]" and "[1, 2, 3]" parse completely.
            text = cell.strip().strip("[]").replace(",", " ")
            return np.fromstring(text, sep=" ", dtype=np.float16)
        return np.asarray(cell, dtype=np.float16)

    vectors = [_parse_cell(cell) for cell in df[col_name]]
    if not vectors:
        return torch.empty((0, 0), dtype=torch.float16, device=target_device)

    # Stack on the host and move to the device once, instead of creating one
    # device tensor per row.
    return torch.from_numpy(np.stack(vectors)).to(target_device)

# Parse both embedding columns into tensors (one row per DataFrame row).
news_tensor = load_embeddings_half(df=news_embeddings, col_name='news_embeddings')
speeches_tensor = load_embeddings_half(df=speeches_embeddings, col_name='speech_embeddings')

#Batched cosine similarity computation
def compute_cosine_similarities(pairs_df, news_emb, speech_emb, batch_size=8192):
    """Cosine similarity for every (news_id, speech_id) pair, computed in batches.

    ``news_id`` / ``speech_id`` are used as positional row indices into
    ``news_emb`` / ``speech_emb``.

    Args:
        pairs_df: DataFrame with integer ``news_id`` and ``speech_id`` columns.
        news_emb: 2-D tensor of news embeddings.
        speech_emb: 2-D tensor of speech embeddings.
        batch_size: Number of pairs processed per step.

    Returns:
        1-D NumPy array with one similarity per row of ``pairs_df``, in the
        same order and in the dtype of ``news_emb``. An empty ``pairs_df``
        yields an empty array.
    """
    out_dtype = news_emb.dtype
    n_pairs = len(pairs_df)
    if n_pairs == 0:
        # np.concatenate([]) raises, so answer the empty case explicitly.
        return torch.empty(0, dtype=out_dtype).numpy()

    news_ids = torch.as_tensor(pairs_df['news_id'].to_numpy(dtype=np.int64), device=news_emb.device)
    speech_ids = torch.as_tensor(pairs_df['speech_id'].to_numpy(dtype=np.int64), device=speech_emb.device)

    similarities = []
    for start in range(0, n_pairs, batch_size):
        stop = start + batch_size
        # Upcast only the gathered batch so norms and dot products are
        # accumulated in float32. F.cosine_similarity normalises internally,
        # so no separate F.normalize pass is needed.
        news_batch = news_emb[news_ids[start:stop]].float()
        speech_batch = speech_emb[speech_ids[start:stop]].float()
        batch_sims = F.cosine_similarity(news_batch, speech_batch, dim=1)
        similarities.append(batch_sims.to(out_dtype).cpu().numpy())

    return np.concatenate(similarities)

# Score every candidate pair and store the result next to its ids.
alignment_df['cosine_similarity'] = compute_cosine_similarities(
    pairs_df=alignment_df,
    news_emb=news_tensor,
    speech_emb=speeches_tensor,
)

#include metadata to the embeddings to track temporal dependencies
def add_temporal_features(pairs_df, news_df, speeches_df):
    """Attach both dates to each pair and compute their gap in whole days.

    ``news_id`` is matched against the index of ``news_df`` and ``speech_id``
    against the index of ``speeches_df``. ``days_diff`` is
    ``news_date - speech_date`` in days, so it is positive when the news item
    is dated after the speech.

    Returns:
        A new DataFrame with ``news_date``, ``speech_date`` and ``days_diff``
        added; ``pairs_df`` itself is not modified.
    """
    news_dates = news_df[['news_date']]
    speech_dates = speeches_df[['speech_date']]

    with_news = pairs_df.merge(news_dates, left_on='news_id', right_index=True)
    with_both = with_news.merge(speech_dates, left_on='speech_id', right_index=True)

    gap = with_both['news_date'] - with_both['speech_date']
    with_both['days_diff'] = gap.dt.days
    return with_both

enriched_df = add_temporal_features(
    pairs_df=alignment_df,
    news_df=news_embeddings,
    speeches_df=speeches_embeddings,
)

# Persist the scored pairs so later cells can reload them instead of recomputing.
enriched_df.to_parquet(path='news_speech_similarities.parquet', engine='pyarrow', compression='zstd')
print("Processing complete. Results saved with columns:", list(enriched_df.columns))

Using device: cuda


  return bound(*args, **kwds)


Processing complete. Results saved with columns: ['news_id', 'speech_id', 'cosine_similarity', 'news_date', 'speech_date', 'days_diff']


In [6]:
def load_safe_data(path='news_speech_similarities.parquet'):
    """Load the saved similarity table with explicit dtypes.

    Args:
        path: Parquet file to read. Defaults to the file written by the
            similarity cell.

    Returns:
        DataFrame with ``news_id`` / ``speech_id`` as int64,
        ``cosine_similarity`` as float32 and ``news_date`` as datetime.
    """
    dtype_map = {
        'news_id': 'int64',
        'speech_id': 'int64',
        # Widen the stored similarities to float32 before any aggregation.
        'cosine_similarity': 'float32',
    }
    df = pd.read_parquet(path).astype(dtype_map)
    df['news_date'] = pd.to_datetime(df['news_date'])
    return df

# Load the processed pairs once, through load_safe_data, so the id and
# similarity columns carry the dtypes it sets. (A second raw read_parquet used
# to overwrite this frame and discard that conversion.)
df = load_safe_data()

# Convert to datetime and normalize (remove time components)
df['news_date'] = pd.to_datetime(df['news_date']).dt.normalize()
df['year'] = df['news_date'].dt.year

# Create daily aggregates with std dev
daily_agg = df.groupby('news_date')['cosine_similarity'].agg(['mean', 'std']).reset_index()
daily_agg.columns = ['date', 'cosine_similarity', 'std_dev']

# Extend full_dates to include October 2024 explicitly
end_date = pd.to_datetime('2024-10-31')  # Adjust as needed
full_dates = pd.date_range(
    start=daily_agg['date'].min(),
    end=end_date,
    freq='D'
)
# Days without any news become rows of NaN after the reindex.
daily_agg = daily_agg.set_index('date').reindex(full_dates).reset_index().rename(columns={'index': 'date'})

# Calculate bounds (a missing std dev is treated as 0, i.e. bounds equal the mean)
daily_agg['upper_bound'] = daily_agg['cosine_similarity'] + daily_agg['std_dev'].fillna(0)
daily_agg['lower_bound'] = daily_agg['cosine_similarity'] - daily_agg['std_dev'].fillna(0)

# Create monthly aggregates (fill NaN with 0 for plotting).
# 'ME' (month end) replaces the deprecated 'M' alias.
monthly_agg = daily_agg.set_index('date').resample('ME')['cosine_similarity'].mean().fillna(0).reset_index()
monthly_agg['month_label'] = monthly_agg['date'].dt.strftime('%b\n%Y')

# Get unique years present in data
years = daily_agg['date'].dt.year.unique()

  monthly_agg = daily_agg.set_index('date').resample('M')['cosine_similarity'].mean().fillna(0).reset_index()


In [7]:
import os
import re
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from urllib.parse import urlparse
from sklearn.preprocessing import OneHotEncoder

# Configuration
EMBEDDINGS_PATH = "/kaggle/input/embeddings"
OUTLET_CACHE = "outlet_cache.parquet"
# The regression helpers read these two names; define them here so the
# notebook runs top to bottom on a fresh kernel.
OUTLET_CACHE_PATH = Path(OUTLET_CACHE)
TOP_OUTLET_CACHE = Path("top_outlets_cache.csv")  # must be a Path: callers use .exists()
SIMILARITIES_PATH = "news_speech_similarities.parquet"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
import statsmodels.formula.api as smf
import statsmodels.formula.api as smf


def extract_and_cache_outlets(news_df, force_refresh=False):
    """Derive an outlet label from each article URL, with an on-disk cache.

    The label is the first hostname label after an optional ``www.``,
    lower-cased; rows whose ``Link`` does not match get ``'unknown'``.

    Side effect: when outlets are (re)extracted, an ``outlet`` column is
    written onto ``news_df`` itself and saved to ``OUTLET_CACHE_PATH``.

    Args:
        news_df: DataFrame with a ``Link`` column of URLs.
        force_refresh: Ignore any existing cache and extract again.

    Returns:
        Series named ``outlet`` indexed like ``news_df``.
    """
    cache_path = Path(OUTLET_CACHE_PATH)
    if not force_refresh and cache_path.exists():
        cached = pd.read_parquet(cache_path)['outlet']
        # Only trust the cache when it describes exactly these rows; a cache
        # built from a different news table would mislabel articles.
        if cached.index.equals(news_df.index):
            return cached
        print("Outlet cache does not match the news index; refreshing...")

    print("Extracting outlets...")
    # NOTE(review): this keeps only the FIRST hostname label, so a URL such as
    # https://politica.expansion.mx/... is labelled 'politica' rather than
    # 'expansion' — confirm that is the intended outlet definition.
    pattern = r"https?://(?:www\.)?([^/.]+)\."
    extracted = news_df['Link'].str.extract(pattern, flags=re.IGNORECASE)[0].str.lower()
    news_df['outlet'] = extracted.fillna('unknown')

    # Save just the outlet series
    news_df[['outlet']].to_parquet(cache_path)
    return news_df['outlet']

def prepare_regression_data(enriched_df, news_embeddings):
    """Add outlet and calendar columns needed by the regression.

    If ``enriched_df`` has no ``outlet`` column, outlets are obtained from
    ``extract_and_cache_outlets(news_embeddings)`` and joined on ``news_id``
    (rows without a matching outlet are dropped by the inner merge).

    Args:
        enriched_df: Pair table with ``news_id`` and ``news_date`` columns.
        news_embeddings: News table passed through to the outlet extractor.

    Returns:
        A new DataFrame with ``outlet``, ``date``, ``year`` and ``month``
        columns. The input frame is never modified, regardless of whether it
        already carried an ``outlet`` column.
    """
    if 'outlet' not in enriched_df.columns:
        outlets = extract_and_cache_outlets(news_embeddings)
        # merge needs a DataFrame with a named column on the right-hand side
        enriched_df = enriched_df.merge(
            outlets.rename('outlet').to_frame(),
            left_on='news_id',
            right_index=True
        )

    # Temporal features; assign() returns a copy instead of writing into the
    # caller's frame.
    dates = pd.to_datetime(enriched_df['news_date'])
    return enriched_df.assign(
        date=dates,
        year=dates.dt.year,
        month=dates.dt.month,
    )

def run_combined_regression(df):
    """Fit OLS of cosine similarity on day gap, outlet, year and month.

    Only rows belonging to the ten most frequent outlets are used. That list
    is cached in ``TOP_OUTLET_CACHE`` (CSV) on first use and re-read afterwards.

    Args:
        df: DataFrame with ``cosine_similarity``, ``days_diff``, ``outlet``,
            ``year`` and ``month`` columns.

    Returns:
        The fitted statsmodels results object (HC3 covariance, t statistics).
    """
    # Get top outlets (cached)
    if not TOP_OUTLET_CACHE.exists():
        top_outlets = df['outlet'].value_counts().nlargest(10).index.tolist()
        pd.Series(top_outlets).to_csv(TOP_OUTLET_CACHE, index=False)
    else:
        # Take the first column explicitly: squeeze() collapses a one-row file
        # to a scalar, which has no .tolist(). Reading as plain strings keeps
        # outlet names from being coerced to numbers or NaN.
        cached = pd.read_csv(TOP_OUTLET_CACHE, dtype=str, keep_default_na=False)
        top_outlets = cached.iloc[:, 0].tolist()

    # Filter and format; the first category is the reference level
    df = df[df['outlet'].isin(top_outlets)].copy()
    df['outlet'] = pd.Categorical(df['outlet'], categories=top_outlets)

    # Formula specification
    formula = "cosine_similarity ~ days_diff + C(outlet) + C(year) + C(month)"

    # Fit with heteroskedasticity-robust (HC3) standard errors
    model = smf.ols(formula, data=df).fit(
        cov_type='HC3',
        use_t=True
    )

    return model

def format_regression_results(model):
    """Build a compact coefficient table with significance stars.

    Works for results reporting either t or z statistics: the p-value column
    is located by its ``P>|`` prefix instead of assuming ``P>|t|``.

    Args:
        model: Fitted results object exposing ``summary2()``.

    Returns:
        DataFrame with columns ``Coef.``, ``Std.Err.`` and ``Significance``
        (``***`` p < 0.001, ``**`` p < 0.01, ``*`` p < 0.05, else empty).

    Raises:
        KeyError: If the coefficient table has no p-value column.
    """
    results = model.summary2().tables[1].copy()

    p_columns = [col for col in results.columns if str(col).startswith('P>|')]
    if not p_columns:
        raise KeyError("no p-value column (e.g. 'P>|t|') in the coefficient table")

    def _stars(p_value):
        if p_value < 0.001:
            return '***'
        if p_value < 0.01:
            return '**'
        if p_value < 0.05:
            return '*'
        return ''

    results['Significance'] = results[p_columns[0]].apply(_stars)
    return results[['Coef.', 'Std.Err.', 'Significance']]

if __name__ == "__main__":
    # Reload the saved pairs, then add the outlet and calendar columns.
    enriched_df = prepare_regression_data(
        pd.read_parquet('news_speech_similarities.parquet'),
        news_embeddings,
    )

    # Fit the combined model on the prepared table.
    combined_model = run_combined_regression(enriched_df)

    print("=== Combined Regression Results ===")
    print(format_regression_results(combined_model))

=== Combined Regression Results ===
                              Coef.  Std.Err. Significance
Intercept                  0.253413  0.000376          ***
C(outlet)[T.proceso]       0.056428  0.000048          ***
C(outlet)[T.oem]           0.077872  0.000073          ***
C(outlet)[T.politica]      0.029216  0.000073          ***
C(outlet)[T.elfinanciero]  0.019403  0.000079          ***
C(outlet)[T.forbes]        0.025054  0.000085          ***
C(outlet)[T.elpais]       -0.005327  0.000089          ***
C(outlet)[T.eleconomista]  0.060001  0.000097          ***
C(outlet)[T.lasillarota]   0.080962  0.000096          ***
C(outlet)[T.milenio]       0.042358  0.000127          ***
C(year)[T.2019]           -0.003982  0.000375          ***
C(year)[T.2020]            0.010182  0.000373          ***
C(year)[T.2021]            0.004434  0.000370          ***
C(year)[T.2022]            0.015632  0.000369          ***
C(year)[T.2023]            0.022493  0.000369          ***
C(year)[T.2024]     