In [1]:
import os

import pandas as pd

# Show what is mounted under the Kaggle input root. The listing is printed
# explicitly because a bare expression in the middle of a cell is not displayed.
print(os.listdir('/kaggle/input'))

embeddings_path = "/kaggle/input/embeddings"
print(os.listdir(embeddings_path))

# Load the speech and news tables; both CSVs already carry an embeddings column.
speeches_embeddings = pd.read_csv(f"{embeddings_path}/speeches_with_embeddings.csv")
news_embeddings = pd.read_csv(f"{embeddings_path}/news_with_embeddings.csv")

# Display first few rows
print("News Data:")
print(news_embeddings.head())

print("\nSpeeches Data:")
print(speeches_embeddings.head())

['speeches_with_embeddings.csv', 'speeches_embeddings_sentiment.csv', 'news_embeddings_sentiment.csv', 'news_with_embeddings.csv']
News Data:
          Index                                               Link  \
0  1_01_12_2018  https://www.bbc.com/mundo/noticias-america-lat...   
1  2_01_12_2018  https://politica.expansion.mx/presidencia/2018...   
2  3_01_12_2018  https://oem.com.mx/elsoldemexico/mexico/en-don...   
3  4_01_12_2018  https://politica.expansion.mx/presidencia/2018...   
4  5_01_12_2018  https://www.eleconomista.com.mx/politica/Nicol...   

                                              Domain  \
0  BBC\nToma de protesta de AMLO: las 5 tradicion...   
1  Expansión Política\nAMLO rinde protesta y prom...   
2  El Sol de México\n¿Hay Ley Seca este 1 de dici...   
3  Expansión Política\nAMLO es un "líder persiste...   
4  El Economista\nNicolás Maduro llega a Palacio ...   

                                               Title        Date  \
0  Toma de protesta de AMLO: las

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F

# Parse the raw date columns into datetimes, stored under new column names so
# the source 'Date' / 'date' columns are left as loaded.
# NOTE(review): the news 'Index' values look like <n>_DD_MM_YYYY; confirm that
# 'Date' is not being read month-first by the default parser.
speeches_embeddings['speech_date'] = pd.to_datetime(speeches_embeddings['date'])
news_embeddings['news_date'] = pd.to_datetime(news_embeddings['Date'])

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Temporal pairing: match each news item with the speeches dated within a symmetric window around it
def generate_temporal_pairs(news_df, speeches_df, window_days=4):
    """Pair every news row with the speeches dated within ``window_days`` of it.

    Args:
        news_df: DataFrame with a datetime ``news_date`` column.
        speeches_df: DataFrame with a datetime ``speech_date`` column.
        window_days: Half-width of the symmetric, inclusive window in days
            (the default 4 means -4 to +4 days around each news date).

    Returns:
        DataFrame with columns ``news_id`` and ``speech_id`` holding the index
        labels of the matched rows, ordered by news row and, within a news
        row, by the row order of ``speeches_df``. A news row whose date is
        NaT matches nothing.
    """
    window = pd.Timedelta(days=window_days)
    speech_dates = speeches_df['speech_date']

    # Collect compact integer arrays instead of one Python tuple per pair; the
    # number of pairs can be far larger than either input table.
    match_counts = np.zeros(len(news_df), dtype=np.intp)
    matched_positions = []
    for row_pos, news_date in enumerate(news_df['news_date']):
        in_window = (speech_dates >= news_date - window) & (speech_dates <= news_date + window)
        positions = np.flatnonzero(in_window.to_numpy())
        match_counts[row_pos] = positions.size
        matched_positions.append(positions)

    if matched_positions:
        speech_positions = np.concatenate(matched_positions)
    else:
        speech_positions = np.empty(0, dtype=np.intp)

    # Translate row positions back to index labels for both sides.
    return pd.DataFrame({
        'news_id': news_df.index.repeat(match_counts).to_numpy(),
        'speech_id': speeches_df.index.take(speech_positions).to_numpy(),
    })


# Build every (news, speech) candidate pair using the function's default window of 4 days.
alignment_df = generate_temporal_pairs(news_embeddings, speeches_embeddings)

# Parse the stored embeddings into half-precision tensors (no embeddings are computed here)
def load_embeddings_half(df, col_name, target_device=None):
    """Parse a column of embeddings into a single half-precision tensor.

    Each cell may be either a bracketed string of numbers separated by
    whitespace and/or commas (the form an array takes after a CSV round-trip)
    or an array-like of numbers.

    Args:
        df: DataFrame holding the embeddings.
        col_name: Name of the embedding column.
        target_device: Device for the returned tensor. When omitted, the
            notebook-level ``device`` is used.

    Returns:
        ``torch.float16`` tensor with one row per row of ``df``, in the same
        order.

    Raises:
        ValueError: If the column has no rows, a token is not numeric, or the
            parsed rows do not all have the same shape.
    """
    if target_device is None:
        target_device = device

    vectors = []
    for value in df[col_name]:
        if isinstance(value, str):
            tokens = value.strip().strip("[]").replace(",", " ").split()
            # Parse at full precision, then round once to float16.
            vec = np.array(tokens, dtype=np.float64).astype(np.float16)
        else:
            vec = np.asarray(value, dtype=np.float16)
        vectors.append(vec)

    if not vectors:
        raise ValueError(f"column {col_name!r} contains no embeddings")
    shapes = {vec.shape for vec in vectors}
    if len(shapes) > 1:
        raise ValueError(
            f"column {col_name!r} has embeddings of differing shapes: {sorted(shapes)}"
        )

    # Assemble on the host and move to the device in one transfer instead of
    # issuing a separate copy for every row.
    return torch.from_numpy(np.stack(vectors)).to(target_device)

# Parse both embedding columns into half-precision tensors on `device`.
# Tensor rows follow the row order of the corresponding DataFrame.
news_tensor = load_embeddings_half(news_embeddings, 'news_embeddings')
speeches_tensor = load_embeddings_half(speeches_embeddings, 'speech_embeddings')

#Batched cosine similarity computation
def compute_cosine_similarities(pairs_df, news_emb, speech_emb, batch_size=8192):
    """Cosine similarity of every (news_id, speech_id) pair, computed in batches.

    Args:
        pairs_df: DataFrame with integer ``news_id`` and ``speech_id`` columns;
            the values are used as row positions into ``news_emb`` and
            ``speech_emb`` respectively.
        news_emb: 2-D tensor of news embeddings, one row per news item.
        speech_emb: 2-D tensor of speech embeddings with the same width.
        batch_size: Number of pairs scored per step; bounds the size of the
            gathered temporaries.

    Returns:
        1-D NumPy array with one similarity per row of ``pairs_df``, in the
        same order (an empty array when there are no pairs).
    """
    news_ids = pairs_df['news_id'].to_numpy(dtype=np.int64)
    speech_ids = pairs_df['speech_id'].to_numpy(dtype=np.int64)
    if news_ids.size == 0:
        # np.concatenate cannot handle an empty list of batches.
        return torch.empty(0, dtype=news_emb.dtype).numpy()

    news_norm = F.normalize(news_emb, p=2, dim=1)
    speech_norm = F.normalize(speech_emb, p=2, dim=1)

    similarities = []
    for start in range(0, news_ids.size, batch_size):
        stop = start + batch_size
        news_idx = torch.tensor(news_ids[start:stop], device=news_norm.device)
        speech_idx = torch.tensor(speech_ids[start:stop], device=speech_norm.device)
        # F.cosine_similarity normalises again, so the score does not rely on
        # the pre-normalised rows being exactly unit length.
        batch_scores = F.cosine_similarity(news_norm[news_idx], speech_norm[speech_idx])
        similarities.append(batch_scores.cpu().numpy())
    return np.concatenate(similarities)

# Score every candidate pair. The ids in alignment_df are DataFrame index
# labels that are used here as row positions into the two tensors.
# NOTE(review): this assumes both frames keep their default 0..n-1 index — confirm.
alignment_df['cosine_similarity'] = compute_cosine_similarities(alignment_df, news_tensor, speeches_tensor)

# Attach the news and speech dates to each pair so the day offset between them can be tracked
def add_temporal_features(pairs_df, news_df, speeches_df):
    """Attach both dates to each pair and derive the day offset between them.

    ``news_id`` and ``speech_id`` are matched against the index of ``news_df``
    and ``speeches_df``. ``days_diff`` is ``news_date - speech_date`` in whole
    days, so it is positive when the news item is dated after the speech.
    """
    news_dates = news_df[['news_date']]
    speech_dates = speeches_df[['speech_date']]

    with_news = pairs_df.merge(news_dates, left_on='news_id', right_index=True)
    with_both = with_news.merge(speech_dates, left_on='speech_id', right_index=True)

    offset = with_both['news_date'] - with_both['speech_date']
    with_both['days_diff'] = offset.dt.days
    return with_both

enriched_df = add_temporal_features(alignment_df, news_embeddings, speeches_embeddings)

# Persist the scored pairs so later cells can start from the file instead of
# recomputing the pairing and similarities.
enriched_df.to_parquet(
    'news_speech_similarities.parquet',
    engine='pyarrow',
    compression='zstd',
)
print("Processing complete. Results saved with columns:", list(enriched_df.columns))

Using device: cuda


  return bound(*args, **kwds)


Processing complete. Results saved with columns: ['news_id', 'speech_id', 'cosine_similarity', 'news_date', 'speech_date', 'days_diff']


In [3]:
import os
import pandas as pd
from urllib.parse import urlparse

# Data location. The two read_csv calls below are left commented out, so this
# cell relies on `news_embeddings` already existing in the kernel; uncomment
# them to run the cell from a fresh kernel.
embeddings_path = "/kaggle/input/embeddings"
#speeches_embeddings = pd.read_csv(f"{embeddings_path}/speeches_with_embeddings.csv")
#news_embeddings = pd.read_csv(f"{embeddings_path}/news_with_embeddings.csv")

# Preprocessing function for media outlets
def extract_outlet(url):
    """Extract a media outlet name from the host part of a URL.

    The host is lower-cased, stripped of port and credentials, and of one
    leading ``www.``. The label in front of the top-level domain is returned,
    or the label before that when the host ends in a two-part suffix such as
    ``.com.mx`` or ``.co.uk``.

    Args:
        url: URL string. Any other type (``None``, NaN, bytes) is treated as
            missing.

    Returns:
        The outlet label, or ``"unknown"`` when the input is not a string,
        cannot be parsed, or has no host.
    """
    if not isinstance(url, str):
        return "unknown"
    try:
        # .hostname drops the port and any user:password@ prefix.
        host = urlparse(url).hostname
    except ValueError:  # e.g. malformed IPv6 literal
        return "unknown"
    if not host:
        return "unknown"

    host = host.rstrip(".")
    # Strip only a leading "www." so hosts such as "sawww.com" stay intact.
    if host.startswith("www."):
        host = host[len("www."):]
    if not host:
        return "unknown"

    parts = host.split(".")
    if len(parts) >= 3 and parts[-2] in ('co', 'com', 'org', 'net', 'edu', 'gov'):
        return parts[-3]  # Handle domains like .co.uk, .com.br
    if len(parts) >= 2:
        return parts[-2]
    return host

# Check and add outlet information
OUTLET_PATH = "news_outlets.parquet"

# Derive the outlet of every article once and persist the (news_id, outlet)
# table; when the file already exists the extraction is skipped entirely.
if not os.path.exists(OUTLET_PATH):
    print("Extracting media outlets...")
    news_embeddings['outlet'] = news_embeddings['Link'].apply(extract_outlet)
    (
        news_embeddings[['outlet']]
        .reset_index()
        .rename(columns={'index': 'news_id'})
        .to_parquet(OUTLET_PATH, engine='pyarrow', compression='zstd')
    )
    print(f"Saved outlets to {OUTLET_PATH}")

# Temporal alignment and cosine similarities are computed in the previous cell and
# saved to news_speech_similarities.parquet; this cell only adds outlet metadata on top of that file.

# Visualization with precomputed outlets
def plot_temporal_alignment_heatmap(df, outlet_path=OUTLET_PATH, resample_freq='M'):
    """Join pair similarities with the precomputed outlet table.

    Only the data preparation exists so far: the outlet table is read from
    ``outlet_path``, inner-joined to ``df`` on ``news_id``, and rows without a
    ``cosine_similarity`` value are dropped. Nothing is plotted or returned
    yet, and ``resample_freq`` is not used.
    """
    outlet_table = pd.read_parquet(outlet_path)

    with_outlets = df.merge(outlet_table, how='inner', on='news_id')
    merged_df = with_outlets.dropna(subset=['cosine_similarity'])

    # TODO: the plotting logic still has to be added here; merged_df is
    # currently computed and then discarded.

if __name__ == "__main__":
    # Best-effort run: any failure (for example a missing parquet file or a
    # missing column) is reported as a one-line message instead of a traceback.
    try:
        similarities_df = pd.read_parquet('news_speech_similarities.parquet')
        plot_temporal_alignment_heatmap(similarities_df)
    except Exception as e:
        print(f"Error: {str(e)}")

Extracting media outlets...
Saved outlets to news_outlets.parquet


In [4]:
import pandas as pd
import statsmodels.api as sm
from pathlib import Path

def prepare_regression_data(enriched_df, outlet_path="news_outlets.parquet",
                            top_n=10, top_cache_path="top_outlets.csv"):
    """Build the regression table: pairs + outlet + dummies for the top outlets.

    Args:
        enriched_df: Pair-level DataFrame with ``news_id``, ``days_diff`` and
            ``cosine_similarity`` columns.
        outlet_path: Parquet file with ``news_id`` and ``outlet`` columns.
        top_n: Number of most frequent outlets that get their own dummy
            column; every other outlet forms the reference group.
        top_cache_path: CSV file in which the list of top outlets is cached.
            An existing file always wins, even if it was written from
            different data or with a different ``top_n``; delete it to
            recompute.

    Returns:
        A new DataFrame with the columns of ``enriched_df`` plus ``outlet``
        (categorical, NaN outside the top outlets) and one integer
        ``outlet_<name>`` column per top outlet. Rows with a missing
        ``days_diff`` or ``cosine_similarity`` are dropped.
    """
    # Load and merge outlet metadata
    outlets = pd.read_parquet(outlet_path).set_index('news_id')
    df = enriched_df.join(outlets, on='news_id')

    top_cache = Path(top_cache_path)
    if top_cache.exists():
        # Read names verbatim: no NaN/numeric inference, and take the first
        # column explicitly so a single cached outlet still yields a list.
        cached = pd.read_csv(top_cache, dtype=str, keep_default_na=False)
        top_outlets = cached.iloc[:, 0].tolist()
    else:
        top_outlets = df['outlet'].value_counts().nlargest(top_n).index.tolist()
        pd.Series(top_outlets).to_csv(top_cache, index=False)

    # Outlets outside the top list become NaN and therefore all-zero dummies.
    df['outlet'] = pd.Categorical(df['outlet'], categories=top_outlets)
    dummies = pd.get_dummies(df['outlet'], prefix='outlet', dtype=int)

    # Prepare final dataset
    df = df.assign(
        days_diff=pd.to_numeric(df['days_diff'], errors='coerce'),
        cosine_similarity=df['cosine_similarity'].astype(float)
    ).join(dummies).dropna(subset=['days_diff', 'cosine_similarity'])

    return df

def run_alignment_regression(preprocessed_df):
    """Fit an OLS of cosine similarity on the day offset plus outlet dummies.

    Every column whose name starts with ``outlet_`` is used as a regressor
    next to ``days_diff`` and a constant. Standard errors use the HC3
    covariance estimator and p-values are taken from the t distribution.

    Returns:
        ``(model, summary)``: the fitted results object and its coefficient
        table, in which ``Coef.`` and ``P>|t|`` are formatted as strings.
    """
    outlet_columns = [name for name in preprocessed_df if name.startswith('outlet_')]
    predictors = ['days_diff'] + outlet_columns

    design = sm.add_constant(preprocessed_df[predictors])
    target = preprocessed_df['cosine_similarity']

    model = sm.OLS(target, design).fit(cov_type='HC3', use_t=True)

    def _format_p(p_value):
        # Very small p-values are shown as a bound rather than as 0.000.
        if p_value < 0.001:
            return "<0.001"
        return f"{p_value:.3f}"

    summary = model.summary2().tables[1]
    summary['Coef.'] = summary['Coef.'].map("{:.4f}".format)
    summary['P>|t|'] = summary['P>|t|'].map(_format_p)

    return model, summary

# Usage
if __name__ == "__main__":
    # Load precomputed data
    enriched_df = pd.read_parquet('news_speech_similarities.parquet')

    # Prepare data
    reg_df = prepare_regression_data(enriched_df)

    # Run analysis
    model, results = run_alignment_regression(reg_df)

    print("=== Media Alignment Regression Results ===")
    print(results)
    print("\nKey Insights:")
    print(f"- Temporal effect (days_diff): {model.params['days_diff']:.4f} (p={model.pvalues['days_diff']:.3f})")

    # Rank only the outlet dummies: the intercept and days_diff are also in
    # model.params and would otherwise be reported as the "top outlet".
    outlet_effects = model.params[model.params.index.str.startswith('outlet_')]
    if outlet_effects.empty:
        print("- Top aligned outlet: n/a (no outlet dummies in the model)")
    else:
        print(f"- Top aligned outlet: {outlet_effects.idxmax()} ({outlet_effects.max():.3f})")
    

=== Media Alignment Regression Results ===
                       Coef.  Std.Err.             t   P>|t|    [0.025  \
const                 0.2963  0.000026  11321.480947  <0.001  0.296266   
days_diff            -0.0009  0.000005   -176.542877  <0.001 -0.000948   
outlet_infobae       -0.0135  0.000038   -360.761344  <0.001 -0.013608   
outlet_proceso        0.0419  0.000042    985.298995  <0.001  0.041774   
outlet_expansion      0.0103  0.000062    165.356181  <0.001  0.010160   
outlet_oem            0.0518  0.000065    801.506916  <0.001  0.051643   
outlet_elfinanciero   0.0108  0.000069    156.940828  <0.001  0.010664   
outlet_forbes         0.0061  0.000075     80.613589  <0.001  0.005925   
outlet_elpais        -0.0234  0.000078   -300.680688  <0.001 -0.023528   
outlet_lasillarota    0.0495  0.000078    638.016662  <0.001  0.049304   
outlet_eleconomista   0.0456  0.000086    528.347961  <0.001  0.045424   
outlet_milenio        0.0203  0.000110    184.224925  <0.001  0.02012

In [5]:
# Full statsmodels summary of `model`, which this cell does not define — it
# must already exist in the kernel from the regression cell.
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      cosine_similarity   R-squared:                       0.078
Model:                            OLS   Adj. R-squared:                  0.078
Method:                 Least Squares   F-statistic:                 2.790e+05
Date:                Mon, 24 Mar 2025   Prob (F-statistic):               0.00
Time:                        12:39:29   Log-Likelihood:             3.5765e+07
No. Observations:            31327039   AIC:                        -7.153e+07
Df Residuals:                31327027   BIC:                        -7.153e+07
Df Model:                          11                                         
Covariance Type:                  HC3                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   0.2963   2