### Purpose: continue the analysis using the previously saved speech and news embeddings.

In [None]:
import os

# List the mounted Kaggle inputs (the returned listing is not used).
os.listdir('/kaggle/input')

# Directory that holds the pre-computed embedding CSV files.
embeddings_path = "/kaggle/input/embeddings"
print(os.listdir(embeddings_path))
import pandas as pd

# Load both corpora; each CSV carries its rows plus an embeddings column.
speeches_embeddings = pd.read_csv(f"{embeddings_path}/speeches_with_embeddings.csv")
news_embeddings = pd.read_csv(f"{embeddings_path}/news_with_embeddings.csv")

# Preview the first rows of each frame, one header line followed by the table.
print("News Data:", news_embeddings.head(), sep="\n")
print("\nSpeeches Data:", speeches_embeddings.head(), sep="\n")

['speeches_with_embeddings.csv', 'speeches_embeddings_sentiment.csv', 'news_embeddings_sentiment.csv', 'news_with_embeddings.csv']


In [None]:
# =====================================================================
# 1. Imports & Initialization
# =====================================================================
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from datetime import datetime

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Parse the date columns so later comparisons operate on datetimes.
# Note the two frames spell the column differently: 'Date' vs 'date'.
news_embeddings['Date'] = news_embeddings['Date'].pipe(pd.to_datetime)
speeches_embeddings['date'] = speeches_embeddings['date'].pipe(pd.to_datetime)

# =====================================================================
# 2. Data Preparation & Date Alignment
# =====================================================================
def align_dates(news_df, speeches_df):
    """Restrict both frames to the dates that occur in each of them.

    Args:
        news_df: Frame with a 'Date' column.
        speeches_df: Frame with a 'date' column.

    Returns:
        A 3-tuple: the news rows on shared dates, the speech rows on shared
        dates (both with a fresh 0..n-1 index), and the shared dates as a
        sorted list.
    """
    shared = set(news_df['Date']) & set(speeches_df['date'])

    keep_news = news_df['Date'].isin(shared)
    keep_speeches = speeches_df['date'].isin(shared)

    news_aligned = news_df[keep_news].reset_index(drop=True)
    speeches_aligned = speeches_df[keep_speeches].reset_index(drop=True)

    # Sorting gives callers a deterministic, chronological date list.
    return news_aligned, speeches_aligned, sorted(shared)

# Keep only rows whose date appears in both datasets; common_dates is the
# sorted list of those shared dates.
news_data, speeches_data, common_dates = align_dates(news_embeddings, speeches_embeddings)

# =====================================================================
# 3. GPU-Optimized Embedding Processing
# =====================================================================
def process_embeddings(df, col_name, target_device=None):
    """Convert a column of per-row embeddings into one 2-D float32 tensor.

    Each cell may be a string such as ``"[0.1 0.2]"`` or ``"[0.1, 0.2]"``
    (what a CSV round-trip typically produces, possibly spanning several
    lines) or an in-memory sequence (list, tuple, NumPy array).

    Args:
        df: DataFrame holding the embeddings.
        col_name: Name of the column with one embedding per row.
        target_device: Device for the returned tensor. When omitted, the
            module-level ``device`` is used.

    Returns:
        A tensor with one row per row of ``df``, dtype float32, on the
        requested device. Row order follows ``df[col_name]``.

    Raises:
        TypeError: If a cell is neither a string nor a supported sequence
            (for example a NaN from a missing CSV value).
        ValueError: If the column is empty, a string cannot be parsed as
            numbers, or the rows do not all have the same length.
    """
    if target_device is None:
        target_device = device

    vectors = []
    for position, row in enumerate(df[col_name]):
        if isinstance(row, str):
            # Accept both whitespace- and comma-separated number lists.
            cleaned = row.strip().strip("[]").replace(",", " ")
            arr = np.array(cleaned.split(), dtype=np.float32)
        elif isinstance(row, (list, tuple, np.ndarray)):
            arr = np.asarray(row, dtype=np.float32)
        else:
            # Fail loudly instead of silently reusing the previous row's vector.
            raise TypeError(
                f"Unsupported embedding at position {position} in column "
                f"{col_name!r}: {type(row).__name__}"
            )
        vectors.append(arr)

    if not vectors:
        raise ValueError(f"Column {col_name!r} contains no embeddings")

    shapes = {vec.shape for vec in vectors}
    if len(shapes) != 1:
        raise ValueError(
            f"Embeddings in column {col_name!r} have inconsistent shapes: "
            f"{sorted(shapes)}"
        )

    # Stack on the host and move to the device once, rather than once per row.
    return torch.from_numpy(np.stack(vectors)).to(target_device)

# Build one embedding tensor per corpus; row i of each tensor comes from
# row i of the corresponding date-aligned frame.
news_tensor = process_embeddings(news_data, 'news_embeddings')
speeches_tensor = process_embeddings(speeches_data, 'speech_embeddings')

# =====================================================================
# 4. Same-Day Cosine Similarity Calculation
# =====================================================================
def compute_daily_similarities(news_tensor, speeches_tensor, dates, news_df, speeches_df):
    """Compute same-day cosine similarity statistics between news and speeches.

    For every date, all news embeddings of that day are compared with all
    speech embeddings of that day; the mean and standard deviation of the
    resulting similarity matrix are recorded.

    Args:
        news_tensor: 2-D tensor, one embedding per row of ``news_df``.
        speeches_tensor: 2-D tensor, one embedding per row of ``speeches_df``.
        dates: Iterable of dates to evaluate.
        news_df: Frame with a 'Date' column, row-aligned with ``news_tensor``.
        speeches_df: Frame with a 'date' column, row-aligned with
            ``speeches_tensor``.

    Returns:
        A 3-tuple ``(daily_avg, daily_std, sorted_dates)`` where the first two
        are NumPy arrays aligned with ``sorted_dates``. A date lacking news or
        speeches is reported as NaN (missing) rather than as a similarity of
        0. A date with a single news/speech pair gets a standard deviation of
        0, because the sample standard deviation of one value is undefined.
    """
    # Allocate results next to the inputs instead of relying on a global device.
    work_device = news_tensor.device

    # Unit-length rows turn the dot product into cosine similarity.
    news_norm = F.normalize(news_tensor, p=2, dim=1)
    speeches_norm = F.normalize(speeches_tensor, p=2, dim=1)

    unique_dates = sorted(dates)
    daily_avg = torch.full((len(unique_dates),), float("nan"), device=work_device)
    daily_std = torch.full((len(unique_dates),), float("nan"), device=work_device)

    for slot, date in enumerate(unique_dates):
        # Positional row numbers, so a non-default DataFrame index cannot
        # misalign the frames with the tensors.
        news_rows = np.flatnonzero((news_df['Date'] == date).to_numpy())
        speech_rows = np.flatnonzero((speeches_df['date'] == date).to_numpy())
        if news_rows.size == 0 or speech_rows.size == 0:
            continue

        news_idx = torch.as_tensor(news_rows, device=news_norm.device)
        speech_idx = torch.as_tensor(speech_rows, device=speeches_norm.device)
        sim_matrix = torch.mm(news_norm[news_idx], speeches_norm[speech_idx].T)

        daily_avg[slot] = sim_matrix.mean()
        # torch's default std is the sample std, which is NaN for one element.
        daily_std[slot] = sim_matrix.std() if sim_matrix.numel() > 1 else 0.0

    return daily_avg.cpu().numpy(), daily_std.cpu().numpy(), unique_dates

# Per-date mean and standard deviation of the news-speech cosine similarity,
# together with the sorted dates those values belong to.
daily_scores, daily_stds, valid_dates = compute_daily_similarities(
    news_tensor, speeches_tensor, common_dates, news_data, speeches_data
)

# =====================================================================
# 5. Result Compilation & Handling Missing Data
# =====================================================================
# Daily calendar running from the earliest to the latest shared date
start_date = min(common_dates)
end_date = max(common_dates)
full_dates = pd.date_range(start=start_date, end=end_date, freq="D")

# One row per date that has a measured similarity
daily_avg_df = pd.DataFrame(
    {'date': valid_dates, 'cosine_similarity': daily_scores, 'std_dev': daily_stds}
)

# Left-join the measurements onto the calendar so unmeasured days show up as NaN
final_df = pd.merge(
    pd.DataFrame({'date': full_dates}),
    daily_avg_df,
    how='left',
    on='date',
)

# Carry the last measurement forward over gaps of at most 3 days;
# longer gaps stay NaN.
final_df[['cosine_similarity', 'std_dev']] = (
    final_df[['cosine_similarity', 'std_dev']].ffill(limit=3)
)

# =====================================================================
# 6. Create Bounds for Standard Deviation
# =====================================================================
# Band of one standard deviation around the daily mean
final_df['upper_bound'] = final_df['cosine_similarity'].add(final_df['std_dev'])
final_df['lower_bound'] = final_df['cosine_similarity'].sub(final_df['std_dev'])

# Persist the table so plotting can be done later without recomputing
final_df.to_csv("cosine_similarity_results_all_years.csv", index=False)

print("Processing complete. Data saved for plotting.")


In [None]:
import matplotlib.pyplot as plt

# Define the years to plot
years = range(2019, 2025)

# Keep only the years that actually have rows, so the figure contains no
# blank panel reserved for a skipped year.
yearly_frames = [
    (year, final_df[final_df['date'].dt.year == year]) for year in years
]
yearly_frames = [(year, frame) for year, frame in yearly_frames if not frame.empty]

if not yearly_frames:
    print("No data available for the requested years; nothing to plot.")
else:
    fig, axes = plt.subplots(
        len(yearly_frames), 1,
        figsize=(14, 7 * len(yearly_frames)),
        squeeze=False,  # always a 2-D array, even for a single panel
    )
    half_day = pd.Timedelta(hours=12)

    for ax, (year, yearly_df) in zip(axes[:, 0], yearly_frames):
        ax.plot(yearly_df['date'],
                yearly_df['cosine_similarity'],
                color='#2ca02c',
                linewidth=1.5,
                marker='o',
                markersize=4,
                label='Cosine Similarity')

        # Plot the upper and lower bounds for 1 standard deviation
        ax.fill_between(yearly_df['date'],
                        yearly_df['upper_bound'],
                        yearly_df['lower_bound'],
                        color='gray', alpha=0.3, label='1 Std Dev Range')

        # Highlight missing data in red. The similarity is NaN exactly where
        # data is missing, so filling relative to that curve draws nothing;
        # shade a full-height, one-day-wide band per missing day instead.
        missing_days = yearly_df.loc[yearly_df['cosine_similarity'].isna(), 'date']
        for k, day in enumerate(missing_days):
            ax.axvspan(day - half_day,
                       day + half_day,
                       color='red',
                       alpha=0.1,
                       linewidth=0,
                       # Label only the first band so the legend has one entry.
                       label='Missing Data' if k == 0 else '_nolegend_')

        # Formatting
        ax.set_title(f'{year} Daily Average Speech-News Cosine Similarity')
        ax.set_xlabel('Date')
        ax.set_ylabel('Cosine Similarity')
        ax.grid(True, alpha=0.3)
        ax.tick_params(axis='x', rotation=45)
        ax.legend()

    fig.tight_layout()
    plt.show()


In [None]:
from bertopic import BERTopic
from sklearn.cluster import KMeans

# Initialize the topic model, clustering with KMeans instead of BERTopic's
# default HDBSCAN so that every document lands in one of exactly 20 topics.
#
# BERTopic has no `umap_model_params` argument (passing it raises TypeError);
# a custom reducer must be supplied as an instance via `umap_model`. The
# default reducer is kept here.
# NOTE(review): BERTopic's documented default UMAP already uses
# n_neighbors=15 and the cosine metric; to also pin random_state=42, pass a
# configured UMAP instance through `umap_model` - confirm whether a seeded
# reducer is required for reproducibility.
#
# `calculate_probabilities` is omitted: KMeans yields hard assignments only,
# so no per-topic probabilities are produced either way.
topic_model = BERTopic(
    hdbscan_model=KMeans(n_clusters=20, random_state=42),
    nr_topics=20,  # Force consistent number of topics
)

# Cluster speech content, reusing the pre-computed embeddings
speech_docs = speeches_data['text'].tolist()  # Assuming text column exists
speech_topics, _ = topic_model.fit_transform(
    speech_docs, embeddings=speeches_tensor.cpu().numpy()
)

# Assign news content to the topics learned from the speeches
news_docs = news_data['content'].tolist()  # Assuming content column exists
news_topics, _ = topic_model.transform(
    news_docs, embeddings=news_tensor.cpu().numpy()
)


In [None]:


# =====================================================================
# 8. Prepare Regression Dataset
# =====================================================================
# Create topic dominance features
def get_dominant_topic_share(topics, probabilities=None):
    """Return the share of documents dominated by each topic.

    Args:
        topics: Hard topic assignment per document. Used only when
            ``probabilities`` is None.
        probabilities: Optional per-document topic probabilities, either a
            (n_docs, n_topics) matrix or a single 1-D vector for one
            document. When given, each document's dominant topic is the
            column with the highest probability.

    Returns:
        Dict mapping topic id to the fraction of documents whose dominant
        topic it is; the fractions sum to 1 for non-empty input.
    """
    if probabilities is None:
        # No probabilities available: the assigned topic is the dominant one.
        dominant_topic = np.asarray(topics)
    else:
        # Promote a single document's vector to a one-row matrix so argmax
        # along axis 1 is always valid.
        prob_matrix = np.atleast_2d(np.asarray(probabilities, dtype=float))
        dominant_topic = np.argmax(prob_matrix, axis=1)
    return pd.Series(dominant_topic).value_counts(normalize=True).to_dict()

# Number of topics; matches the cluster count used for the topic model and
# the speech_topic_0..19 terms used by the regression formula.
n_topics = 20

# Speech topic features: for every date, the fraction of that day's speeches
# assigned to each topic. Built from the hard assignments in `speech_topics`,
# because the frames carry no per-document probability column.
daily_speech_topics = (
    pd.crosstab(
        speeches_data['date'].to_numpy(),
        np.asarray(speech_topics),
        rownames=['date'],
        colnames=['topic'],
        normalize='index',  # each row (date) sums to 1
    )
    # Guarantee one column per topic id, including topics never observed;
    # ids outside 0..n_topics-1 (e.g. an outlier id of -1) are dropped.
    .reindex(columns=range(n_topics), fill_value=0.0)
    .add_prefix('speech_topic_')
)

# News topic features, built the same way
daily_news_topics = (
    pd.crosstab(
        news_data['Date'].to_numpy(),
        np.asarray(news_topics),
        rownames=['date'],
        colnames=['topic'],
        normalize='index',
    )
    .reindex(columns=range(n_topics), fill_value=0.0)
    .add_prefix('news_topic_')
)

# Merge with similarity data
regression_df = final_df.merge(
    daily_speech_topics,
    left_on='date',
    right_index=True,
    how='left'
).merge(
    daily_news_topics,
    left_on='date',
    right_index=True,
    how='left'
)

# A calendar day without speeches/news has no topic row after the left join;
# record that as a zero share for every topic rather than as missing.
topic_columns = list(daily_speech_topics.columns) + list(daily_news_topics.columns)
regression_df[topic_columns] = regression_df[topic_columns].fillna(0.0)



In [None]:
# =====================================================================
# 9. Time-Series Regression Model
# =====================================================================
import statsmodels.api as sm
from linearmodels import PanelOLS

# Create temporal features
regression_df['day_of_week'] = regression_df['date'].dt.dayofweek
regression_df['time_trend'] = np.arange(len(regression_df))

# Lagged similarity (t-1)
regression_df['lagged_similarity'] = regression_df['cosine_similarity'].shift(1)

# Speech occurrence indicator
regression_df['speech_occurred'] = regression_df['date'].isin(speeches_data['date']).astype(int)

# Prepare formula. It is assembled from a list of terms so that no comment
# text ends up inside the formula string (patsy rejects '#' comments).
speech_topic_columns = [f'speech_topic_{i}' for i in range(20)]
news_topic_columns = ['news_topic_0', 'news_topic_1', 'news_topic_2']  # Select key news topics
topic_terms = ' + '.join(speech_topic_columns)
formula = 'cosine_similarity ~ ' + ' + '.join([
    'speech_occurred',
    topic_terms,
    *news_topic_columns,
    'lagged_similarity',
    'C(day_of_week)',
    'time_trend',
])

# Drop only rows that are incomplete in a column the model actually uses;
# NaNs in unrelated columns (e.g. the plotting bounds) must not cost rows.
model_columns = [
    'cosine_similarity',
    'speech_occurred',
    *speech_topic_columns,
    *news_topic_columns,
    'lagged_similarity',
    'day_of_week',
    'time_trend',
]
estimation_df = regression_df.dropna(subset=model_columns)

# NOTE(review): if the speech topic shares sum to 1 on every speech day, they
# are collinear with speech_occurred (or with the intercept); consider
# dropping one topic as the reference category - confirm with the analyst.

# Fit model with HAC standard errors
model = sm.OLS.from_formula(
    formula,
    data=estimation_df
).fit(
    cov_type='HAC',
    cov_kwds={'maxlags': 3},
    use_t=True
)


In [None]:
# =====================================================================
# 10. Diagnostics & Visualization
# =====================================================================
print(model.summary())

# Plot significant coefficients (p-value below 0.05)
significant_results = model.params[model.pvalues < 0.05]
if significant_results.empty:
    # Nothing to draw; avoid handing an empty Series to the plotting code.
    print("No predictors are significant at the 5% level; skipping coefficient plot.")
else:
    plt.figure(figsize=(10, 6))
    significant_results.plot(kind='barh')
    plt.title('Significant Predictors of Media-Speech Alignment')
    plt.xlabel('Coefficient Size')
    plt.ylabel('Predictors')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# Save model results; the encoding is fixed so the file does not depend on
# the platform's default.
with open('regression_results.txt', 'w', encoding='utf-8') as f:
    f.write(str(model.summary()))