In [1]:
import os 
os.listdir('/kaggle/input') 

embeddings_path = "/kaggle/input/embeddings"
print(os.listdir(embeddings_path))
import pandas as pd

speeches_embeddings = pd.read_csv(f"{embeddings_path}/speeches_with_embeddings.csv")
news_embeddings = pd.read_csv(f"{embeddings_path}/news_with_embeddings.csv")

# Display first few rows
print("News Data:")
print(news_embeddings.head())

print("\nSpeeches Data:")
print(speeches_embeddings.head())

['speeches_with_embeddings.csv', 'speeches_embeddings_sentiment.csv', 'news_embeddings_sentiment.csv', 'news_with_embeddings.csv']
News Data:
          Index                                               Link  \
0  1_01_12_2018  https://www.bbc.com/mundo/noticias-america-lat...   
1  2_01_12_2018  https://politica.expansion.mx/presidencia/2018...   
2  3_01_12_2018  https://oem.com.mx/elsoldemexico/mexico/en-don...   
3  4_01_12_2018  https://politica.expansion.mx/presidencia/2018...   
4  5_01_12_2018  https://www.eleconomista.com.mx/politica/Nicol...   

                                              Domain  \
0  BBC\nToma de protesta de AMLO: las 5 tradicion...   
1  Expansión Política\nAMLO rinde protesta y prom...   
2  El Sol de México\n¿Hay Ley Seca este 1 de dici...   
3  Expansión Política\nAMLO es un "líder persiste...   
4  El Economista\nNicolás Maduro llega a Palacio ...   

                                               Title        Date  \
0  Toma de protesta de AMLO: las

In [2]:

import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F

# Convert and verify date columns
news_embeddings['news_date'] = pd.to_datetime(news_embeddings['Date'])
speeches_embeddings['speech_date'] = pd.to_datetime(speeches_embeddings['date'])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

#Temporal window calculation and expansion
def generate_temporal_pairs(news_df, speeches_df, window_days=3):
    """Generate news-speech pairs within a symmetric temporal window (-4 to +4 days)"""
    pairs = []
    chunk_size = 2000
    news_chunks = np.array_split(news_df, len(news_df) // chunk_size + 1)
    
    for chunk in news_chunks:
        for _, row in chunk.iterrows():
            news_date = row['news_date']
            start_date = news_date - pd.Timedelta(days=window_days) 
            end_date = news_date + pd.Timedelta(days=window_days)  
            
            mask = (speeches_df['speech_date'] >= start_date) & (speeches_df['speech_date'] <= end_date)
            speech_ids = speeches_df[mask].index.tolist()
            pairs.extend([(row.name, s_id) for s_id in speech_ids])
    
    return pd.DataFrame(pairs, columns=['news_id', 'speech_id'])


alignment_df = generate_temporal_pairs(news_embeddings, speeches_embeddings)

#optimized embeddings calculation
def load_embeddings_half(df, col_name):
    embeddings = []
    for i, row in df.iterrows():
        if isinstance(row[col_name], str):
            arr = np.fromstring(row[col_name].strip("[]"), sep=" ", dtype=np.float16)
        else:
            arr = np.array(row[col_name], dtype=np.float16)
        embeddings.append(torch.tensor(arr, device=device).half())
        if i % 1000 == 0: torch.cuda.empty_cache()
    return torch.stack(embeddings)

news_tensor = load_embeddings_half(news_embeddings, 'news_embeddings')
speeches_tensor = load_embeddings_half(speeches_embeddings, 'speech_embeddings')

#Batched cosine similarity computation
def compute_cosine_similarities(pairs_df, news_emb, speech_emb, batch_size=8192):
    news_norm = F.normalize(news_emb, p=2, dim=1)
    speech_norm = F.normalize(speech_emb, p=2, dim=1)
    similarities = []
    for i in range(0, len(pairs_df), batch_size):
        batch = pairs_df.iloc[i:i+batch_size]
        news_batch = news_norm[batch['news_id'].values]
        speech_batch = speech_norm[batch['speech_id'].values]
        similarities.append(F.cosine_similarity(news_batch, speech_batch).cpu().numpy())
        del news_batch, speech_batch
        torch.cuda.empty_cache()
    return np.concatenate(similarities)

alignment_df['cosine_similarity'] = compute_cosine_similarities(alignment_df, news_tensor, speeches_tensor)

#include metadata to the embeddings to track temporal dependencies
def add_temporal_features(pairs_df, news_df, speeches_df):
    pairs_df = pairs_df.merge(
        news_df[['news_date']],
        left_on='news_id',
        right_index=True
    ).merge(
        speeches_df[['speech_date']],
        left_on='speech_id',
        right_index=True
    )
    pairs_df['days_diff'] = (pairs_df['news_date'] - pairs_df['speech_date']).dt.days
    return pairs_df

enriched_df = add_temporal_features(alignment_df, news_embeddings, speeches_embeddings)

#Save data to avoid rerunning everything again 
enriched_df.to_parquet('news_speech_similarities.parquet', engine='pyarrow', compression='zstd')
print("Processing complete. Results saved with columns:", enriched_df.columns.tolist())

Using device: cpu


  return bound(*args, **kwds)


Processing complete. Results saved with columns: ['news_id', 'speech_id', 'cosine_similarity', 'news_date', 'speech_date', 'days_diff']


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def load_safe_data():
    """Load data with proper dtype conversions"""
    df = pd.read_parquet('news_speech_similarities.parquet').astype({
        'news_id': 'int64',
        'speech_id': 'int64',
        'cosine_similarity': 'float32'  # Convert from float16 to float32
    })
    df['news_date'] = pd.to_datetime(df['news_date'])
    return df

# Load processed data
df = load_safe_data()

#Code to include temporal features
def plot_temporal_trends(df, resample_freq='W', rolling_window=7):
    """Plot similarity trends with type-safe processing"""
    plt.figure(figsize=(16, 8))
    
    # Ensure float32 type
    temp_df = df[['news_date', 'cosine_similarity']].copy()
    temp_df['cosine_similarity'] = temp_df['cosine_similarity'].astype('float32')
    
    # Resample data
    df_temp = temp_df.set_index('news_date')
    resampled = df_temp['cosine_similarity'].resample(resample_freq).mean()
    
    # Create rolling average
    rolling_mean = resampled.rolling(window=rolling_window, center=True).mean()
    
    # Plot with enhanced formatting
    plt.plot(resampled.index, resampled.values, 
            alpha=0.3, label='Weekly Average')
    plt.plot(rolling_mean.index, rolling_mean.values,
            linewidth=2, label=f'{rolling_window}-week Rolling Mean')
    
    plt.title('News-Speech Content Alignment Over Time')
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Cosine Similarity', fontsize=12)
    plt.legend()
    plt.grid(alpha=0.3)
    plt.gca().spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    plt.show()

#Testing execution code, tho not for testing 
if __name__ == "__main__":
    df = load_safe_data()
    
    # Verify data types
    print("Data types:\n", df.dtypes)
    
    # Plot temporal trends only
    plot_temporal_trends(df)

In [None]:
import matplotlib.pyplot as plt
# Load processed data
df = pd.read_parquet('news_speech_similarities.parquet') #We can use this code when we compute everything again, else we have to roead the parquet
#df = pd.read_parquet(SIMILARITIES_PATH) #Piece of code when we have the path defined, otherwise it'll work by loading them again. 
# Convert to datetime and normalize (remove time components)
df['news_date'] = pd.to_datetime(df['news_date']).dt.normalize()
df['year'] = df['news_date'].dt.year

# Create daily aggregates with std dev
daily_agg = df.groupby('news_date')['cosine_similarity'].agg(['mean', 'std']).reset_index()
daily_agg.columns = ['date', 'cosine_similarity', 'std_dev']

# Extend full_dates to include October 2024 explicitly
end_date = pd.to_datetime('2024-10-31')  # Adjust as needed
full_dates = pd.date_range(
    start=daily_agg['date'].min(), 
    end=end_date, 
    freq='D'
)
daily_agg = daily_agg.set_index('date').reindex(full_dates).reset_index().rename(columns={'index': 'date'})

# Calculate bounds
daily_agg['upper_bound'] = daily_agg['cosine_similarity'] + daily_agg['std_dev'].fillna(0)
daily_agg['lower_bound'] = daily_agg['cosine_similarity'] - daily_agg['std_dev'].fillna(0)

# Create monthly aggregates (fill NaN with 0 for plotting)
monthly_agg = daily_agg.set_index('date').resample('M')['cosine_similarity'].mean().fillna(0).reset_index()
monthly_agg['month_label'] = monthly_agg['date'].dt.strftime('%b\n%Y')

# Get unique years present in data
years = daily_agg['date'].dt.year.unique()

# Set up plot
plt.figure(figsize=(18, 8 * len(years)))

for i, year in enumerate(years, 1):
    year_mask = daily_agg['date'].dt.year == year
    yearly_daily = daily_agg[year_mask]
    yearly_monthly = monthly_agg[monthly_agg['date'].dt.year == year]
    
    if yearly_daily.empty:
        continue
    
    ax = plt.subplot(len(years), 1, i)
    
    # Daily plot with variability
    ax.plot(yearly_daily['date'], 
            yearly_daily['cosine_similarity'], 
            color='#2ca02c', 
            linewidth=1.5,
            label='Daily Average')
    
    ax.fill_between(yearly_daily['date'],
                    yearly_daily['upper_bound'],
                    yearly_daily['lower_bound'],
                    color='gray', alpha=0.3, 
                    label='Daily Std Dev')
    
    # Monthly markers (plot even if value is 0)
    ax.scatter(yearly_monthly['date'], 
               yearly_monthly['cosine_similarity'],
               color='darkblue', 
               s=100,
               zorder=5,
               label='Monthly Average')
    
    # Annotate monthly values (skip if 0)
    for _, row in yearly_monthly.iterrows():
        if row['cosine_similarity'] != 0:
            ax.text(row['date'], row['cosine_similarity']+0.02,
                    f"{row['cosine_similarity']:.2f}",
                    ha='center', va='bottom',
                    fontsize=9, color='darkblue')
    
    # Highlight missing days
    missing_mask = yearly_daily['cosine_similarity'].isna()
    ax.fill_between(yearly_daily['date'],
                    yearly_daily['cosine_similarity'].min() - 0.1,
                    yearly_daily['cosine_similarity'].max() + 0.1,
                    where=missing_mask,
                    color='red', alpha=0.1,
                    label='Missing Days')
    
    # Formatting
    ax.set_title(f'{year} Daily/Monthly Speech-News Cosine Similarity', pad=20)
    ax.set_xlabel('')
    ax.set_ylabel('Cosine Similarity', fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.legend(loc='upper right')
    
    # Set monthly x-ticks
    ax.set_xticks(yearly_monthly['date'])
    ax.set_xticklabels(yearly_monthly['month_label'])
    
    # Set y-axis limits
    y_min = max(daily_agg['cosine_similarity'].min() - 0.1, 0)
    y_max = min(daily_agg['cosine_similarity'].max() + 0.1, 1)
    ax.set_ylim(y_min, y_max)

plt.tight_layout()
plt.show()

  monthly_agg = daily_agg.set_index('date').resample('M')['cosine_similarity'].mean().fillna(0).reset_index()


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
from matplotlib.colors import Normalize

#Function to extract top media outlets 
def extract_outlet(url):
    """Extract media outlet name from news article URL"""
    patterns = [
        r"https?://(?:www\.)?([^/.]+)\.",
        r"https?://([^/]+)/",
        r"([a-z0-9-]+)\.(com|org|net|edu|gov)"
    ]
    for pattern in patterns:
        match = re.search(pattern, url, re.IGNORECASE)
        if match:
            return match.group(1).lower()
    return "unknown"


def plot_temporal_alignment_heatmap(df, news_metadata, resample_freq='M'):
    """Show alignment patterns for top 10 outlets"""
    # Merge data and clean
    merged_df = df.merge(
        news_metadata[['news_id', 'outlet']],
        on='news_id',
        how='inner'
    ).dropna(subset=['cosine_similarity'])
    
    # Get top 10 outlets by article count
    top_outlets = merged_df.groupby('outlet')['news_id'].nunique() \
                          .nlargest(10).index.tolist()
    
    # Filter to top outlets only
    filtered_df = merged_df[merged_df['outlet'].isin(top_outlets)]
    
    # Prepare temporal data
    filtered_df['period'] = filtered_df['news_date'].dt.to_period(resample_freq)
    pivot_data = filtered_df.groupby(['outlet', 'period'])['cosine_similarity'] \
                          .mean().unstack().fillna(0)
    
    # Sort outlets by total articles (descending)
    outlet_order = filtered_df.groupby('outlet')['news_id'].nunique() \
                             .sort_values(ascending=False).index
    pivot_data = pivot_data.loc[outlet_order]
    
    # Fixed sizing for 10 outlets
    plt.figure(figsize=(18, 8))  # Width 18", Height 8"
    
    # Plot heatmap
    ax = sns.heatmap(
        pivot_data,
        cmap=sns.light_palette("#cc0000", as_cmap=True), #color palette and heatmap 
        norm=Normalize(vmin=0, vmax=0.5),
        linewidths=0.3,
        linecolor='lightgray',
        cbar_kws={'label': 'Alignment Score (0-0.5 scale)'}
    )
    
    # Formatting
    ax.set_title('Top 10 Media Outlets: Alignment with Presidential Speeches\n', pad=20, fontsize=14)
    ax.set_xlabel('Time Period', labelpad=15, fontsize=12)
    ax.set_ylabel('Media Outlet', labelpad=15, fontsize=12)
    ax.set_xticklabels(
        [col.strftime('%b\n%Y') if i%3==0 else '' for i, col in enumerate(pivot_data.columns)],
        rotation=0,
        fontsize=9
    )
    plt.tight_layout()
    plt.show()

#Code to execute instructions, not testing. 

if __name__ == "__main__":
    try:
        # Load data
        news_df = news_embeddings
        similarities_df = pd.read_parquet('news_speech_similarities.parquet')
        
        # Add outlet information
        news_df['outlet'] = news_df['Link'].apply(extract_outlet)
        news_metadata = news_df[['outlet']].reset_index().rename(columns={'index': 'news_id'})
        
        # Generate visualization
        plot_temporal_alignment_heatmap(
            similarities_df,
            news_metadata,
            resample_freq='M'  # Monthly aggregation
        )
        
    except Exception as e:
        print(f"Error: {str(e)}")
        print("Troubleshooting steps:")
        print(f"1. Verify files exist in: {DATA_DIR}")
        print(f"2. Check file contents: {os.listdir(DATA_DIR)}")
        print("3. Confirm required columns exist in dataframes")

## Statistical Analysis 

Arianna emphasized the unit or level. Meaning is this article level or not? And then, if it is then it makes sense to have it as an article level with time-dependencies. 

In [13]:

# 1. Daily Aggregated Data
#I have to establish within the text the level (article level and speech level)
import pandas as pd
import statsmodels.api as sm


df_daily = daily_agg[['date', 'cosine_similarity']].copy()
df_daily['time'] = (df_daily['date'] - df_daily['date'].min()).dt.days

# Drop missing values
df_daily = df_daily.dropna(subset=['cosine_similarity'])


# Basic OLS: Similarity ~ Time
X = sm.add_constant(df_daily['time'])
y = df_daily['cosine_similarity']

model_time = sm.OLS(y, X).fit()
print("=== Basic Time Regression ===")
print(model_time.summary())


=== Basic Time Regression ===
                            OLS Regression Results                            
Dep. Variable:      cosine_similarity   R-squared:                       0.048
Model:                            OLS   Adj. R-squared:                  0.047
Method:                 Least Squares   F-statistic:                     103.5
Date:                Tue, 08 Apr 2025   Prob (F-statistic):           9.12e-24
Time:                        09:37:40   Log-Likelihood:                 3876.5
No. Observations:                2078   AIC:                            -7749.
Df Residuals:                    2076   BIC:                            -7738.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2811 

## Effects of outlets. 

In [None]:
import re
# again extracting top media outlets. Maybe there is a more efficient way to do this. 
def extract_outlet(url):
    patterns = [r"https?://(?:www\.)?([^/.]+)\."]
    match = re.search(patterns[0], str(url), re.IGNORECASE)
    return match.group(1).lower() if match else "unknown"

# Create outlet column in news_embeddings
news_embeddings['outlet'] = news_embeddings['Link'].apply(extract_outlet)

# Verify
print("News columns:", news_embeddings.columns.tolist())
print("Sample outlets:", news_embeddings['outlet'].unique()[:5])

In [5]:
print(enriched_df)

          news_id  speech_id  cosine_similarity  news_date speech_date  \
0               0     150494           0.315430 2018-12-01  2018-12-04   
1               0     150495           0.315430 2018-12-01  2018-12-04   
2               0     150496           0.315430 2018-12-01  2018-12-04   
3               0     150497           0.315430 2018-12-01  2018-12-04   
4               0     150498           0.315430 2018-12-01  2018-12-04   
...           ...        ...                ...        ...         ...   
24418166    42773     150618           0.273682 2024-10-03  2024-09-30   
24418167    42773     150619           0.273682 2024-10-03  2024-09-30   
24418168    42773     150620           0.273682 2024-10-03  2024-09-30   
24418169    42773     150621           0.273682 2024-10-03  2024-09-30   
24418170    42773     150622           0.273682 2024-10-03  2024-09-30   

          days_diff  
0                -3  
1                -3  
2                -3  
3                -3  
4

In [5]:
# Now this will work

enriched_df = enriched_df.merge(
    news_embeddings[['outlet']],
    left_on='news_id',
    right_index=True,
    how='left'
)

# Final check
print("\nColumns in enriched_df:", enriched_df.columns.tolist())
print("Sample outlets:", enriched_df['outlet'].unique()[:5])


Columns in enriched_df: ['news_id', 'speech_id', 'cosine_similarity', 'news_date', 'speech_date', 'days_diff', 'outlet']
Sample outlets: ['bbc' 'politica' 'oem' 'eleconomista' 'milenio']


In [None]:
print(enriched_df['days_diff'].nunique())

In [10]:
import statsmodels.api as sm

# Simple regression: cosine similarity ~ Days between speech and news
X_time = sm.add_constant(enriched_df[['days_diff']])
y = enriched_df['cosine_similarity']

model_time = sm.OLS(y, X_time).fit(cov_type='HC3')
print("=== Time Trend Model ===")
print(model_time.summary())

=== Time Trend Model ===
                            OLS Regression Results                            
Dep. Variable:      cosine_similarity   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                 1.285e+04
Date:                Wed, 09 Apr 2025   Prob (F-statistic):               0.00
Time:                        09:26:33   Log-Likelihood:             2.6827e+07
No. Observations:            24418171   AIC:                        -5.365e+07
Df Residuals:                24418169   BIC:                        -5.365e+07
Df Model:                           1                                         
Covariance Type:                  HC3                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3045   1.6

In [6]:
import pandas as pd
import statsmodels.api as sm

# 1. Get top outlets. So again this is repeated 
top_outlets = enriched_df['outlet'].value_counts().nlargest(10).index.tolist()#im duplocating data here 
# 2. Create dummy variables with proper naming 
dummies = pd.get_dummies(enriched_df['outlet'], prefix='outlet')
# 3. Filter columns using PREFIXED names
dummy_columns = [f'outlet_{outlet}' for outlet in top_outlets if f'outlet_{outlet}' in dummies.columns]
dummies = dummies[dummy_columns]


In [7]:

# 4. Ensure numerical data types
dummies = dummies.astype(int)  # Convert boolean dummies to integers
enriched_df['days_diff'] = pd.to_numeric(enriched_df['days_diff'], errors='coerce')

# 5. Handle missing values
valid_data = enriched_df[['days_diff', 'cosine_similarity']].join(dummies).dropna()

In [14]:
# 6. Create regression matrix
X = sm.add_constant(valid_data[['days_diff'] + dummy_columns])
y = valid_data['cosine_similarity']

# 7. Run regression
model = sm.OLS(y, X).fit(cov_type='HC3')
print("\n=== Outlet Fixed Effects Model ===")
print(model.summary())


=== Outlet Fixed Effects Model ===
                            OLS Regression Results                            
Dep. Variable:      cosine_similarity   R-squared:                       0.077
Model:                            OLS   Adj. R-squared:                  0.077
Method:                 Least Squares   F-statistic:                 2.139e+05
Date:                Wed, 09 Apr 2025   Prob (F-statistic):               0.00
Time:                        09:33:01   Log-Likelihood:             2.7793e+07
No. Observations:            24418171   AIC:                        -5.559e+07
Df Residuals:                24418159   BIC:                        -5.559e+07
Df Model:                          11                                         
Covariance Type:                  HC3                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------

# Regression with temporal dependencies (dummies per month and year) 

In [13]:
print(enriched_df.columns)
print(enriched_df["news_date"])

Index(['news_id', 'speech_id', 'cosine_similarity', 'news_date', 'speech_date',
       'days_diff', 'outlet'],
      dtype='object')
0          2018-12-01
1          2018-12-01
2          2018-12-01
3          2018-12-01
4          2018-12-01
              ...    
24418166   2024-10-03
24418167   2024-10-03
24418168   2024-10-03
24418169   2024-10-03
24418170   2024-10-03
Name: news_date, Length: 24418171, dtype: datetime64[ns]


In [8]:
if 'outlet_x' in enriched_df.columns and 'outlet_y' in enriched_df.columns:
    enriched_df = enriched_df.drop(columns=['outlet_x', 'outlet_y'])

In [12]:
# 1. Parse the date columns (if not already done)
enriched_df['news_date'] = pd.to_datetime(enriched_df['news_date'])
enriched_df['speech_date'] = pd.to_datetime(enriched_df['speech_date'])

# 2. Create 'year-month' variable from news_date
enriched_df['year_month'] = enriched_df['news_date'].dt.to_period('M')

# 3. Create dummies for each month
month_dummies = pd.get_dummies(enriched_df['year_month'], prefix='month')

# 4. Rebuild valid_data matrix (if not done yet)
dummies = pd.get_dummies(enriched_df['outlet'], prefix='outlet')
dummy_columns = [f'outlet_{outlet}' for outlet in top_outlets if f'outlet_{outlet}' in dummies.columns]
dummies = dummies[dummy_columns].astype(int)
# 5. Combine everything for regression
valid_data = enriched_df[['days_diff', 'cosine_similarity']].join(dummies).join(month_dummies)
valid_data = valid_data.dropna()

In [13]:
enriched_df['speech_month'] = enriched_df['speech_date'].dt.strftime('%Y-%m')
enriched_df['news_month'] = enriched_df['news_date'].dt.strftime('%Y-%m')

enriched_df['speech_year'] = enriched_df['speech_date'].dt.year
enriched_df['news_year'] = enriched_df['news_date'].dt.year

# This will now give: '2018-12', as a string


In [14]:
print(enriched_df.head(10))

   news_id  speech_id  cosine_similarity  news_date speech_date  days_diff  \
0        0     150494            0.31543 2018-12-01  2018-12-04         -3   
1        0     150495            0.31543 2018-12-01  2018-12-04         -3   
2        0     150496            0.31543 2018-12-01  2018-12-04         -3   
3        0     150497            0.31543 2018-12-01  2018-12-04         -3   
4        0     150498            0.31543 2018-12-01  2018-12-04         -3   
5        0     150499            0.31543 2018-12-01  2018-12-04         -3   
6        0     150500            0.31543 2018-12-01  2018-12-04         -3   
7        0     150501            0.31543 2018-12-01  2018-12-04         -3   
8        0     150502            0.31543 2018-12-01  2018-12-04         -3   
9        0     150503            0.31543 2018-12-01  2018-12-04         -3   

  outlet  
0    bbc  
1    bbc  
2    bbc  
3    bbc  
4    bbc  
5    bbc  
6    bbc  
7    bbc  
8    bbc  
9    bbc  


In [10]:
#no dummies because of space complexity. 
### More efficient code: 

df = enriched_df.copy()
df['time_index'] = (df['news_date'] - df['news_date'].min()).dt.days #Now time_index = 0 corresponds to the earliest news_date, and it increases linearly with time.
#this variable time index is a numerical encoding of time. It captures the chronological order of the observations. 

top_outlets = df['outlet'].value_counts().nlargest(10).index.tolist()
df['outlet_top'] = df['outlet'].where(df['outlet'].isin(top_outlets), 'Other')

X_outlets = pd.get_dummies(df['outlet_top'], prefix='outlet', drop_first=True)



In [15]:
import statsmodels.api as sm

X = pd.concat([
    df[['days_diff', 'time_index']],
    X_outlets
], axis=1)

X = sm.add_constant(X)  # adds intercept
y = df['cosine_similarity']

In [16]:
X = X.astype('float64')
y = y.astype('float64')

print(X.dtypes)
print(y.dtype)


const                  float64
days_diff              float64
time_index             float64
outlet_eleconomista    float64
outlet_elfinanciero    float64
outlet_elpais          float64
outlet_forbes          float64
outlet_infobae         float64
outlet_lasillarota     float64
outlet_milenio         float64
outlet_oem             float64
outlet_politica        float64
outlet_proceso         float64
dtype: object
float64


In [17]:
model = sm.OLS(y, X).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:      cosine_similarity   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.107
Method:                 Least Squares   F-statistic:                 2.444e+05
Date:                Wed, 09 Apr 2025   Prob (F-statistic):               0.00
Time:                        14:04:47   Log-Likelihood:             2.8205e+07
No. Observations:            24418171   AIC:                        -5.641e+07
Df Residuals:                24418158   BIC:                        -5.641e+07
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   0.2640   4

In [18]:
import gc
gc.collect()


577

# Efficient code 

In [6]:
print(enriched_df.head(2))

   news_id  speech_id  cosine_similarity  news_date speech_date  days_diff  \
0        0     150494            0.31543 2018-12-01  2018-12-04         -3   
1        0     150495            0.31543 2018-12-01  2018-12-04         -3   

  outlet  
0    bbc  
1    bbc  


In [7]:
import pandas as pd
import statsmodels.formula.api as smf

# --- Data Preparation ---
# Convert to categorical types upfront
enriched_df = enriched_df.astype({
    'outlet': 'category',
    'news_date': 'datetime64[ns]',
    'speech_date': 'datetime64[ns]'
})

# Create temporal features without expanding memory
enriched_df['month'] = enriched_df['news_date'].dt.month.astype('int8')
enriched_df['year'] = enriched_df['news_date'].dt.year.astype('int16')

# Filter to top outlets using category reordering
top_outlets = enriched_df['outlet'].value_counts().nlargest(10).index.tolist()
enriched_df['outlet'] = enriched_df['outlet'].cat.set_categories(top_outlets + ['Other'])
enriched_df['outlet'] = enriched_df['outlet'].fillna('Other')

# Downcast numerical columns
enriched_df['cosine_similarity'] = pd.to_numeric(
    enriched_df['cosine_similarity'], 
    downcast='float'
)
enriched_df['days_diff'] = pd.to_numeric(
    enriched_df['days_diff'], 
    downcast='integer'
)

# --- Memory Optimization ---
# Keep only necessary columns
keep_cols = ['cosine_similarity', 'days_diff', 'outlet', 'month', 'year']
enriched_df = enriched_df[keep_cols].copy()

In [8]:
# Force garbage collection
import gc
gc.collect()

30

In [10]:
# --- Model Specification ---
# Use formula API with categorical variables
formula = """cosine_similarity ~ days_diff +
C(outlet, Treatment('Other')) + 
C(month) + 
C(year)
"""

# Fit model with reduced memory footprint
model = smf.ols(
    formula, 
    data=enriched_df,
    missing='drop'  # Automatically drops NA rows
).fit(cov_type='HC3')

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      cosine_similarity   R-squared:                       0.120
Model:                            OLS   Adj. R-squared:                  0.120
Method:                 Least Squares   F-statistic:                 1.316e+05
Date:                Wed, 09 Apr 2025   Prob (F-statistic):               0.00
Time:                        15:49:37   Log-Likelihood:             2.8382e+07
No. Observations:            24418171   AIC:                        -5.676e+07
Df Residuals:                24418142   BIC:                        -5.676e+07
Df Model:                          28                                         
Covariance Type:                  HC3                                         
                                                    coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------

In [14]:
import statsmodels.formula.api as smf

formula = """cosine_similarity ~ 
             days_diff + 
             C(outlet, Treatment('Other')) + 
             C(month) + 
             C(year)"""

model_fe = smf.ols(formula, data=enriched_df, missing='drop').fit(cov_type='HC3')
print(model_fe.summary())


                            OLS Regression Results                            
Dep. Variable:      cosine_similarity   R-squared:                       0.120
Model:                            OLS   Adj. R-squared:                  0.120
Method:                 Least Squares   F-statistic:                 1.316e+05
Date:                Wed, 09 Apr 2025   Prob (F-statistic):               0.00
Time:                        16:42:27   Log-Likelihood:             2.8382e+07
No. Observations:            24418171   AIC:                        -5.676e+07
Df Residuals:                24418142   BIC:                        -5.676e+07
Df Model:                          28                                         
Covariance Type:                  HC3                                         
                                                    coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------

In [9]:
import statsmodels.api as sm

md = sm.MixedLM.from_formula(
    "cosine_similarity ~ days_diff + C(outlet, Treatment('Other')) + C(month) + C(year)",
    groups="outlet",
    data=enriched_df
)
model_re = md.fit(reml=False)
print(model_re.summary())



                          Mixed Linear Model Regression Results
Model:                     MixedLM           Dependent Variable:         cosine_similarity
No. Observations:          24418171          Method:                     ML               
No. Groups:                11                Scale:                      0.0057           
Min. group size:           435409            Log-Likelihood:             28382075.1418    
Max. group size:           8279797           Converged:                  Yes              
Mean group size:           2219833.7                                                      
------------------------------------------------------------------------------------------
                                              Coef.  Std.Err.    z     P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------
Intercept                                      0.267    0.076    3.531 0.000  0.119  0.416
C(outlet, Treatment('Other

#  not efficeint code

In [16]:
# Limit the number of top outlets (you could change the number, here we are using top 10)
top_outlets = enriched_df['outlet'].value_counts().nlargest(10).index.tolist()

# Create a new column that groups outlets into 'Other' if they're not in the top outlets
enriched_df['outlet_top'] = enriched_df['outlet'].where(enriched_df['outlet'].isin(top_outlets), 'Other')

# Use `Categorical` dtype to reduce memory usage
enriched_df['outlet_top'] = enriched_df['outlet_top'].astype('category')

# Create dummies for the top outlets (drop the first to avoid multicollinearity)
outlet_dummies = pd.get_dummies(enriched_df['outlet_top'], drop_first=True)

# Add the dummies to the dataframe
enriched_df = pd.concat([enriched_df, outlet_dummies], axis=1)

# Verify memory usage
print(enriched_df.memory_usage(deep=True))


Index                       128
news_id               195345368
speech_id             195345368
cosine_similarity      48836342
news_date             195345368
speech_date           195345368
days_diff             195345368
outlet               1584360090
outlet_top             24419182
eleconomista           24418171
elfinanciero           24418171
elpais                 24418171
forbes                 24418171
infobae                24418171
lasillarota            24418171
milenio                24418171
oem                    24418171
politica               24418171
proceso                24418171
dtype: int64


In [17]:
# Extract Month and Year from `news_date`
enriched_df['month'] = enriched_df['news_date'].dt.month.astype('category')
enriched_df['year'] = enriched_df['news_date'].dt.year.astype('category')

# Create dummies for month and year, also using `drop_first=True` to avoid multicollinearity
month_dummies = pd.get_dummies(enriched_df['month'], prefix='month', drop_first=True)
year_dummies = pd.get_dummies(enriched_df['year'], prefix='year', drop_first=True)

# Concatenate the dummies into the dataframe
enriched_df = pd.concat([enriched_df, month_dummies, year_dummies], axis=1)

# Verify memory usage again
print(enriched_df.memory_usage(deep=True))


Index                       128
news_id               195345368
speech_id             195345368
cosine_similarity      48836342
news_date             195345368
speech_date           195345368
days_diff             195345368
outlet               1584360090
outlet_top             24419182
eleconomista           24418171
elfinanciero           24418171
elpais                 24418171
forbes                 24418171
infobae                24418171
lasillarota            24418171
milenio                24418171
oem                    24418171
politica               24418171
proceso                24418171
month                  24418455
year                   24418435
month_2                24418171
month_3                24418171
month_4                24418171
month_5                24418171
month_6                24418171
month_7                24418171
month_8                24418171
month_9                24418171
month_10               24418171
month_11               24418171
month_12

In [18]:
# Sort by date to ensure proper time-series structure
enriched_df = enriched_df.sort_values(by='news_date')

In [21]:
import statsmodels.api as sm
# Use `Categorical` for efficient memory use on the independent variables (outlet dummies)
X = enriched_df.drop(['cosine_similarity', 'speech_id'], axis=1)
X = sm.add_constant(X)  # Add an intercept
y = enriched_df['cosine_similarity']

In [23]:
print(X.dtypes)
print(y.dtypes)


const                  float64
news_id                  int64
news_date       datetime64[ns]
speech_date     datetime64[ns]
days_diff                int64
outlet                  object
outlet_top            category
eleconomista              bool
elfinanciero              bool
elpais                    bool
forbes                    bool
infobae                   bool
lasillarota               bool
milenio                   bool
oem                       bool
politica                  bool
proceso                   bool
month                 category
year                  category
month_2                   bool
month_3                   bool
month_4                   bool
month_5                   bool
month_6                   bool
month_7                   bool
month_8                   bool
month_9                   bool
month_10                  bool
month_11                  bool
month_12                  bool
year_2019                 bool
year_2020                 bool
year_202

In [None]:
# Convert categorical columns to dummies (one-hot encoding)
X = pd.get_dummies(X, columns=['outlet_top', 'month', 'year'], drop_first=True)
# Ensure boolean columns are integers (True -> 1, False -> 0)
bool_columns = [
    'eleconomista', 'elfinanciero', 'elpais', 'forbes', 'infobae', 
    'lasillarota', 'milenio', 'oem', 'politica', 'proceso', 
    'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 
    'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 
    'year_2019', 'year_2020', 'year_2021', 'year_2022', 'year_2023', 'year_2024'
]

X[bool_columns] = X[bool_columns].astype(int)


In [None]:
y = pd.to_numeric(y, errors='coerce')  # Convert to numeric, coerces errors to NaN
y = y.dropna()  # Drop rows with missing target values


In [22]:
import statsmodels.api as sm

# Fit the OLS model
model = sm.OLS(y, X).fit()

# Summary of the regression results
print(model.summary())


ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

# Regression with topic modelling and sentiment analysis 

In [None]:
!pip install bertopic
!pip install umap-learn hdbscan


In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from bertopic import BERTopic
import umap
import hdbscan
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Check files in Kaggle input directory
print(os.listdir('/kaggle/input')) 

# Define paths
embeddings_path = "/kaggle/input/embeddings"
print(os.listdir(embeddings_path))

# Load speech and news embeddings
speeches_embeddings = pd.read_csv(f"{embeddings_path}/speeches_with_embeddings.csv")
news_embeddings = pd.read_csv(f"{embeddings_path}/news_with_embeddings.csv")

# Convert date columns
news_embeddings['news_date'] = pd.to_datetime(news_embeddings['Date'])
speeches_embeddings['speech_date'] = pd.to_datetime(speeches_embeddings['date'])

# Show first rows
print("News Data:", news_embeddings.head())
print("\nSpeeches Data:", speeches_embeddings.head())

# Define device for efficient computations
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
#Temporal window calculation and expansion
def generate_temporal_pairs(news_df, speeches_df, window_days=3):
    """Generate news-speech pairs within a symmetric temporal window (-4 to +4 days)"""
    pairs = []
    chunk_size = 2000
    news_chunks = np.array_split(news_df, len(news_df) // chunk_size + 1)
    
    for chunk in news_chunks:
        for _, row in chunk.iterrows():
            news_date = row['news_date']
            start_date = news_date - pd.Timedelta(days=window_days) 
            end_date = news_date + pd.Timedelta(days=window_days)  
            
            mask = (speeches_df['speech_date'] >= start_date) & (speeches_df['speech_date'] <= end_date)
            speech_ids = speeches_df[mask].index.tolist()
            pairs.extend([(row.name, s_id) for s_id in speech_ids])
    
    return pd.DataFrame(pairs, columns=['news_id', 'speech_id'])


alignment_df = generate_temporal_pairs(news_embeddings, speeches_embeddings)

#optimized embeddings calculation
def load_embeddings_half(df, col_name):
    embeddings = []
    for i, row in df.iterrows():
        if isinstance(row[col_name], str):
            arr = np.fromstring(row[col_name].strip("[]"), sep=" ", dtype=np.float16)
        else:
            arr = np.array(row[col_name], dtype=np.float16)
        embeddings.append(torch.tensor(arr, device=device).half())
        if i % 1000 == 0: torch.cuda.empty_cache()
    return torch.stack(embeddings)

news_tensor = load_embeddings_half(news_embeddings, 'news_embeddings')
speeches_tensor = load_embeddings_half(speeches_embeddings, 'speech_embeddings')

#Batched cosine similarity computation
def compute_cosine_similarities(pairs_df, news_emb, speech_emb, batch_size=8192):
    news_norm = F.normalize(news_emb, p=2, dim=1)
    speech_norm = F.normalize(speech_emb, p=2, dim=1)
    similarities = []
    for i in range(0, len(pairs_df), batch_size):
        batch = pairs_df.iloc[i:i+batch_size]
        news_batch = news_norm[batch['news_id'].values]
        speech_batch = speech_norm[batch['speech_id'].values]
        similarities.append(F.cosine_similarity(news_batch, speech_batch).cpu().numpy())
        del news_batch, speech_batch
        torch.cuda.empty_cache()
    return np.concatenate(similarities)

alignment_df['cosine_similarity'] = compute_cosine_similarities(alignment_df, news_tensor, speeches_tensor)

#include metadata to the embeddings to track temporal dependencies
def add_temporal_features(pairs_df, news_df, speeches_df):
    pairs_df = pairs_df.merge(
        news_df[['news_date']],
        left_on='news_id',
        right_index=True
    ).merge(
        speeches_df[['speech_date']],
        left_on='speech_id',
        right_index=True
    )
    pairs_df['days_diff'] = (pairs_df['news_date'] - pairs_df['speech_date']).dt.days
    return pairs_df

enriched_df = add_temporal_features(alignment_df, news_embeddings, speeches_embeddings)

#Save data to avoid rerunning everything again 
enriched_df.to_parquet('news_speech_similarities.parquet', engine='pyarrow', compression='zstd')
print("Processing complete. Results saved with columns:", enriched_df.columns.tolist())

In [None]:
print("Speeches columns:", speeches_embeddings.columns)
print("News columns:", news_embeddings.columns)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 🔹 Count topics in news & speeches
news_topic_counts = news_embeddings['topic'].value_counts().sort_index()
speech_topic_counts = speeches_embeddings['topic'].value_counts().sort_index()

# 🔹 Plot topic distribution
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

sns.barplot(x=news_topic_counts.index, y=news_topic_counts.values, ax=ax[0], palette="Blues")
ax[0].set_title("News Topic Distribution")
ax[0].set_xlabel("Topic")
ax[0].set_ylabel("Count")

sns.barplot(x=speech_topic_counts.index, y=speech_topic_counts.values, ax=ax[1], palette="Oranges")
ax[1].set_title("Speeches Topic Distribution")
ax[1].set_xlabel("Topic")
ax[1].set_ylabel("Count")

plt.tight_layout()
plt.show()

# 🔹 Time trends of topics
news_embeddings['month'] = news_embeddings['news_date'].dt.to_period('M')
speeches_embeddings['month'] = speeches_embeddings['speech_date'].dt.to_period('M')

news_topic_trends = news_embeddings.groupby(['month', 'topic']).size().unstack(fill_value=0)
speech_topic_trends = speeches_embeddings.groupby(['month', 'topic']).size().unstack(fill_value=0)

# 🔹 Plot topic trends
fig, ax = plt.subplots(2, 1, figsize=(14, 10), sharex=True)

news_topic_trends.plot(ax=ax[0], cmap="Blues")
ax[0].set_title("News Topic Trends Over Time")
ax[0].set_ylabel("Count")

speech_topic_trends.plot(ax=ax[1], cmap="Oranges")
ax[1].set_title("Speeches Topic Trends Over Time")
ax[1].set_ylabel("Count")

plt.xlabel("Time (Month)")
plt.tight_layout()
plt.show()


In [None]:
topics = topic_model.get_topics()
print(f"Number of topics generated: {len(topics)}")
print(f"Topic IDs: {list(topics.keys())}")


In [None]:
# Remove the outlier topic (-1) before extracting words
valid_topics = [t for t in topic_model.get_topics().keys() if t != -1]

# Extract top 10 words per topic
topic_words = {
    topic_id: [word for word, _ in topic_model.get_topic(topic_id)[:10]]
    for topic_id in valid_topics
}

# Print extracted words per topic
for topic_id, words in topic_words.items():
    print(f"Topic {topic_id}: {', '.join(words)}")



In [None]:
# Extract top 10 words per topic
topic_words = {}

for topic_id in range(len(topic_model.get_topics())):
    words = [word for word, _ in topic_model.get_topic(topic_id)[:10]]  # Top 10 words
    topic_words[topic_id] = words

# Convert to DataFrame for better readability
topic_words_df = pd.DataFrame.from_dict(topic_words, orient='index', columns=[f'Word {i+1}' for i in range(10)])

# Display the first few topics
print(topic_words_df.head(10))

In [None]:
import pandas as pd

# 🔹 Extract top 20 words per topic
news_top_words = news_topic_model.get_topics()
speech_top_words = speech_topic_model.get_topics()

# 🔹 Function to format topic words
def extract_top_words(topic_words, top_n=20):
    top_words_dict = {}
    for topic, words in topic_words.items():
        if topic != -1:  # Ignore outliers
            top_words_dict[topic] = [word for word, _ in words[:top_n]]
    return top_words_dict

# 🔹 Get top words for each topic
news_words_dict = extract_top_words(news_top_words)
speech_words_dict = extract_top_words(speech_top_words)

# 🔹 Convert to DataFrame for easier visualization
news_words_df = pd.DataFrame.from_dict(news_words_dict, orient='index')
speech_words_df = pd.DataFrame.from_dict(speech_words_dict, orient='index')

# 🔹 Display top words per topic
print("🔹 News Topics - Top 20 Words:")
print(news_words_df.head(10))  # Print first 10 topics

print("\n🔹 Speeches Topics - Top 20 Words:")
print(speech_words_df.head(10))  # Print first 10 topics


### bin for already incorrect code 

In [None]:
# Ensure numeric type and handle NaNs safely
speeches_embeddings['avg_similarity_x'] = pd.to_numeric(speeches_embeddings['avg_similarity_x'], errors='coerce')
speeches_embeddings['avg_similarity_y'] = pd.to_numeric(speeches_embeddings['avg_similarity_y'], errors='coerce')

# Fill NaNs properly
speeches_embeddings['avg_similarity'] = speeches_embeddings['avg_similarity_x'].fillna(speeches_embeddings['avg_similarity_y'])

# Drop the redundant columns
speeches_embeddings = speeches_embeddings.drop(columns=['avg_similarity_x', 'avg_similarity_y'])


In [None]:
# Prepare regression data
X = speeches_embeddings[['topic', 'avg_similarity']]  # Features
y = speeches_embeddings['cosine_similarity']  # Target

# Convert categorical topic to numerical encoding
X = pd.get_dummies(X, columns=['topic'], drop_first=True)

# Train regression model
reg_model = LinearRegression()
reg_model.fit(X, y)

# Print model coefficients
print("Regression Coefficients:", dict(zip(X.columns, reg_model.coef_)))

# Evaluate model
y_pred = reg_model.predict(X)
print("R² Score:", reg_model.score(X, y))
