In [2]:
"""
Political News Engagement Forecasting System
Author: Mohammad Rasoul Salmani
Project: AI Final Project
Date: 2026/01/22
"""

'\nPolitical News Engagement Forecasting System\nAuthor: Mohammad Rasoul Salmani\nProject: AI Final Project\nDate: 2026/01/22\n'

In [3]:
# ==================== IMPORTS ====================
import os
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict, Counter
from sklearn.metrics import mean_absolute_error, silhouette_score
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import pickle
from scipy.spatial import distance

In [7]:
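# Import the original model classes used by the paper.
# Copy the class definitions from models.ipynb into a module named Models.py
# placed next to this notebook (the name must match the import below exactly
# on case-sensitive filesystems).
from Models import LSTMForecasterMF, BinnedSeqDataset  # requires Models.py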

In [None]:
# ==================== CONFIGURATION ====================
class Config:
    """Configuration parameters matching the original paper.

    All values are plain class attributes, so they can be read from the class
    itself or from an instance created with ``Config()``.
    """
    # Seed passed to NumPy, PyTorch and KMeans for reproducibility.
    SEED = 42
    BATCH_SIZE = 32
    # Number of quarterly input steps per sequence: 8 quarters = 2 years.
    # (Was 4, which contradicted both this comment and the 8-step windows
    # that DataProcessor.prepare_for_original_model builds.)
    SEQ_LENGTH = 8
    NUM_STANCES = 7  # -3 to +3
    NUM_CLUSTERS = 20
    # Prefer GPU index 2 (the original setting) when the machine really has
    # that many GPUs; otherwise fall back to the first GPU, then to the CPU.
    # Hard-coding "cuda:2" produced an invalid device on hosts with < 3 GPUs.
    DEVICE = torch.device(
        "cuda:2" if torch.cuda.is_available() and torch.cuda.device_count() > 2
        else "cuda:0" if torch.cuda.is_available()
        else "cpu"
    )
    # Dataset name -> path of the checkpoint holding the model for that dataset.
    MODEL_PATHS = {
        'D1': 'final_models/all_model_data_1.pth',
        'D2': 'final_models/all_model_data_2.pth',
        'D3': 'final_models/all_model_data_3.pth',
        'D4': 'final_models/all_model_data_4.pth'
    }

In [9]:
# ==================== ORIGINAL MODEL LOADER ====================
class OriginalModelLoader:
    """Load the pre-trained models and run them in inference mode.

    Attributes:
        config: object exposing ``MODEL_PATHS`` (dataset name -> checkpoint
            path) and ``DEVICE`` (a ``torch.device``).
        models: dict mapping dataset name to the loaded model. A dataset whose
            checkpoint could not be loaded is simply absent from this dict.
    """

    # Keys of ``input_data`` in the order they are passed positionally to the
    # model. The order follows the forward() signature quoted in the original
    # notebook: (count_feats, in_time_feats, out_time_feats, ns_feats,
    # mention_feats, hash_feats, text_feats).
    _INPUT_KEYS = (
        'eng_counts',
        'input_time_feat',
        'output_time_feat',
        'ns_feat',
        'mentions_feat',
        'hashtag_feat',
        'text_feat',
    )

    def __init__(self, config):
        """Store ``config`` and immediately try to load every configured model."""
        self.config = config
        self.models = {}
        self.load_all_models()

    def _load_checkpoint(self, model_path):
        """Deserialize one checkpoint file onto the configured device.

        The checkpoints are whole pickled model objects, not state dicts, so
        ``weights_only=False`` is required: since PyTorch 2.6 the default is
        ``weights_only=True``, which refuses to unpickle arbitrary classes.

        NOTE(security): unpickling can execute arbitrary code. Only load
        checkpoint files that come from a trusted source.
        """
        try:
            return torch.load(model_path,
                              map_location=self.config.DEVICE,
                              weights_only=False)
        except TypeError:
            # Older PyTorch releases do not know the ``weights_only`` keyword;
            # their default behaviour is the full unpickle we want.
            return torch.load(model_path, map_location=self.config.DEVICE)

    def load_all_models(self):
        """Load every checkpoint listed in ``config.MODEL_PATHS``.

        Loading is best-effort: a failure for one dataset is reported on
        stdout and the remaining datasets are still attempted. Callers should
        inspect ``self.models`` to see what is actually available.
        """
        print("Loading original models from paper...")

        for dataset_name, model_path in self.config.MODEL_PATHS.items():
            print(f"  Loading {dataset_name} from {model_path}")

            try:
                model = self._load_checkpoint(model_path)

                # Checkpoints saved from multi-GPU training wrap the real
                # model in DataParallel; unwrap it for single-device use.
                if isinstance(model, nn.DataParallel):
                    model = model.module

                model.eval()
                model.to(self.config.DEVICE)
                self.models[dataset_name] = model

                print(f"    ✓ Successfully loaded {dataset_name}")

            except Exception as e:
                # Deliberately broad: one bad checkpoint must not abort the rest.
                print(f"    ✗ Failed to load {dataset_name}: {e}")

    def extract_hidden_states(self, input_data, dataset_name='D1'):
        """Run one model on ``input_data`` and return its two outputs as NumPy arrays.

        Args:
            input_data: mapping that contains a tensor for every key in
                ``_INPUT_KEYS``. Each tensor is moved to ``config.DEVICE``.
            dataset_name: key into ``self.models`` selecting the model.

        Returns:
            ``(hidden_state, predictions)`` as NumPy arrays on the CPU. The
            model is assumed to return exactly these two tensors in this
            order — TODO confirm against LSTMForecasterMF.forward().

        Raises:
            ValueError: if no model is loaded under ``dataset_name``.
            KeyError: if ``input_data`` lacks one of the required keys.
        """
        if dataset_name not in self.models:
            raise ValueError(f"Model for {dataset_name} not loaded")

        model = self.models[dataset_name]
        device = self.config.DEVICE

        # Inference only: no gradients are needed.
        with torch.no_grad():
            hidden_state, predictions = model(
                *(input_data[key].to(device) for key in self._INPUT_KEYS)
            )

            return hidden_state.cpu().numpy(), predictions.cpu().numpy()

In [10]:
# ==================== DATA PROCESSOR ====================
class DataProcessor:
    """Process raw JSON data into sequences compatible with the original models."""

    # Geometry of the sequences built by prepare_for_original_model():
    # 8 input quarters followed by 1 target quarter, 7 stance bins (-3..+3).
    INPUT_STEPS = 8
    NUM_STANCES = 7
    STANCE_OFFSET = 3  # stance s is stored at index s + 3

    def __init__(self, config):
        """Keep ``config`` and seed NumPy / PyTorch from ``config.SEED``."""
        self.config = config
        np.random.seed(config.SEED)
        torch.manual_seed(config.SEED)

    def load_data(self, filepath, sample_size=None):
        """Load the JSON file and return one DataFrame row per record.

        The file must hold a JSON object whose values are records with the
        keys ``user_id_anonymized``, ``created_at``, ``news sources`` and
        ``partisan stance``.

        Args:
            filepath: path of the JSON file (read as UTF-8).
            sample_size: when truthy, only the first ``sample_size`` records
                (in file order) are used; otherwise all records are used.

        Returns:
            DataFrame with columns ``user_id``, ``timestamp``, ``sources``
            and ``stances``. It has no columns at all when there are no records.
        """
        print(f"Loading data from {filepath}...")

        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        entries = list(data.values())
        if sample_size:
            entries = entries[:sample_size]

        records = [
            {
                'user_id': value['user_id_anonymized'],
                'timestamp': pd.to_datetime(value['created_at']),
                'sources': value['news sources'],
                'stances': value['partisan stance']
            }
            for value in entries
        ]

        df = pd.DataFrame(records)
        # Report distinct users; the previous message printed the record
        # count a second time and labelled it "users".
        n_users = df['user_id'].nunique() if records else 0
        print(f"Loaded {len(df)} records from {n_users} users")
        return df

    @staticmethod
    def _iter_stances(stances):
        """Yield the integer stance values found in one record's stance field.

        ``None`` yields nothing, a bare scalar (or string) is treated as a
        single stance, and entries that cannot be converted with ``int()``
        are skipped rather than aborting the whole run.
        """
        if stances is None:
            return
        if isinstance(stances, (str, bytes)) or not hasattr(stances, '__iter__'):
            stances = (stances,)
        for stance in stances:
            try:
                yield int(stance)
            except (TypeError, ValueError):
                continue

    def _quarterly_stance_counts(self, user_df):
        """Return one 7-bin stance histogram per quarter in which the user was active.

        Quarters are returned in chronological order (groupby sorts its keys).
        The quarter is computed into a separate Series instead of being
        written onto ``user_df``, which is a slice of the caller's frame.
        """
        quarters = user_df['timestamp'].dt.to_period('Q')
        quarterly_data = []

        for _, quarter_df in user_df.groupby(quarters):
            stance_counts = [0] * self.NUM_STANCES
            for stances in quarter_df['stances']:
                for stance in self._iter_stances(stances):
                    idx = stance + self.STANCE_OFFSET
                    # Stances outside -3..+3 are ignored.
                    if 0 <= idx < self.NUM_STANCES:
                        stance_counts[idx] += 1
            quarterly_data.append(stance_counts)

        return quarterly_data

    def _build_sequences(self, quarterly_data):
        """Slide an (8 input + 1 target) window over one user's quarterly histograms."""
        steps = self.INPUT_STEPS
        sequences = []

        # Yields nothing unless there are at least steps + 1 quarters.
        for start in range(len(quarterly_data) - steps):
            input_seq = quarterly_data[start:start + steps]
            output_seq = quarterly_data[start + steps]

            sequences.append({
                'eng_counts': np.array(input_seq).reshape(1, steps, self.NUM_STANCES),
                'label_original': np.array(output_seq).reshape(1, self.NUM_STANCES),
                # Placeholder (all-zero) inputs for the features this
                # simplified pipeline does not compute.
                'input_time_feat': np.zeros((1, steps, 4)),
                'output_time_feat': np.zeros((1, 4)),
                'ns_feat': np.zeros((1, 1536)),
                'mentions_feat': np.zeros((1, 1536)),
                'hashtag_feat': np.zeros((1, 1536)),
                'text_feat': np.zeros((1, 3072))
            })

        return sequences

    def prepare_for_original_model(self, df):
        """Build model input sequences from the DataFrame returned by load_data().

        For every user, engagements are counted per stance and per quarter,
        and every run of 9 such quarters becomes one sequence (8 inputs, 1
        label); overlapping windows are all kept.

        NOTE: only quarters in which the user has at least one record are
        represented, so the 8 steps are consecutive *active* quarters, not
        necessarily consecutive calendar quarters.

        Args:
            df: DataFrame with columns ``user_id``, ``timestamp``, ``stances``.

        Returns:
            List of feature dicts (users in sorted order, windows in
            chronological order). Empty when ``df`` is empty.
        """
        print("Preparing data for original model...")

        all_sequences = []

        # An empty frame has no 'user_id' column to group by.
        if len(df) > 0:
            for user_id, user_df in tqdm(df.groupby('user_id'), desc="Processing users"):
                quarterly_data = self._quarterly_stance_counts(user_df)
                all_sequences.extend(self._build_sequences(quarterly_data))

        print(f"Created {len(all_sequences)} sequences")
        return all_sequences

In [11]:
# ==================== CLUSTERING ANALYZER ====================
class UserClusterAnalyzer:
    """Analyze user behavior patterns through clustering."""

    def __init__(self, config):
        """Keep ``config``; only ``config.SEED`` is read (as the KMeans seed)."""
        self.config = config

    def cluster_users(self, representations, n_clusters=20):
        """Cluster the rows of ``representations`` with K-means.

        Args:
            representations: 2-D array-like, one row per item to cluster.
            n_clusters: requested number of clusters. When it exceeds the
                number of rows it is reduced to the number of rows (K-means
                cannot produce more clusters than samples) and a notice is
                printed.

        Returns:
            ``(cluster_labels, cluster_centers)`` as returned by KMeans.

        Raises:
            ValueError: if ``representations`` is empty.
        """
        n_samples = len(representations)
        if n_samples == 0:
            raise ValueError("Cannot cluster an empty set of representations")

        if n_clusters > n_samples:
            print(f"Requested {n_clusters} clusters but only {n_samples} samples "
                  f"are available; using {n_samples} clusters")
            n_clusters = n_samples

        kmeans = KMeans(n_clusters=n_clusters, random_state=self.config.SEED, n_init=10)
        cluster_labels = kmeans.fit_predict(representations)

        # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1.
        n_found = len(np.unique(cluster_labels))
        if 1 < n_found < n_samples:
            silhouette_avg = silhouette_score(representations, cluster_labels)
            print(f"Silhouette Score: {silhouette_avg:.3f}")
        elif n_found <= 1:
            print("Only one cluster found, silhouette score not calculated")
        else:
            print("Every sample is its own cluster, silhouette score not calculated")

        return cluster_labels, kmeans.cluster_centers_

    def analyze_cluster_topics(self, user_texts, cluster_labels, top_n=10):
        """Find the terms that are over-represented in each cluster.

        Terms are scored by the ratio of their relative frequency inside the
        cluster to their relative frequency in all other clusters, using a
        bag-of-words over the 1000 most frequent non-stop-word terms.

        Args:
            user_texts: sequence of strings, one per clustered item.
            cluster_labels: cluster id per item, aligned with ``user_texts``
                (list or array).
            top_n: number of highest-scoring terms to keep per cluster.

        Returns:
            Dict mapping cluster id to a list of ``(term, score, cluster_freq)``
            tuples sorted by descending score. Clusters are omitted when the
            cluster or its complement has fewer than 2 items or no counted
            terms. An empty ``user_texts`` gives ``{}``.

        Raises:
            ValueError: if ``user_texts`` and ``cluster_labels`` differ in length.
        """
        if user_texts is None or len(user_texts) == 0:
            return {}

        # A plain list would make ``labels == cluster_id`` a single bool.
        labels = np.asarray(cluster_labels)
        if len(labels) != len(user_texts):
            raise ValueError(
                f"user_texts has {len(user_texts)} items but cluster_labels has {len(labels)}"
            )

        vectorizer = CountVectorizer(max_features=1000, stop_words='english')
        X = vectorizer.fit_transform(user_texts)
        feature_names = vectorizer.get_feature_names_out()

        cluster_terms = {}
        for cluster_id in np.unique(labels):
            cluster_mask = labels == cluster_id
            other_mask = ~cluster_mask

            if cluster_mask.sum() < 2 or other_mask.sum() < 2:
                continue

            # Per-term totals; np.asarray(...).ravel() works whether the
            # sparse sum comes back as a matrix or as a plain array.
            cluster_counts = np.asarray(X[cluster_mask].sum(axis=0)).ravel()
            other_counts = np.asarray(X[other_mask].sum(axis=0)).ravel()

            total_cluster = cluster_counts.sum()
            total_other = other_counts.sum()

            if total_cluster == 0 or total_other == 0:
                continue

            cluster_freq = cluster_counts / total_cluster
            other_freq = other_counts / total_other
            scores = cluster_freq / (other_freq + 1e-10)  # Avoid division by zero

            # Only terms that actually occur in the cluster are candidates.
            term_scores = [
                (term, score, freq)
                for term, score, freq in zip(feature_names, scores, cluster_freq)
                if freq > 0
            ]

            term_scores.sort(key=lambda x: x[1], reverse=True)
            cluster_terms[cluster_id] = term_scores[:top_n]

        return cluster_terms

In [12]:
# ==================== VISUALIZATION ====================
class Visualization:
    """Create visualizations for the project.

    All plotters are static methods; each one draws a figure, saves it to
    ``save_path`` (creating the target directory if needed), shows it and
    then closes it so figures do not accumulate in memory.
    """

    # Tick labels for the seven stance bins, in column order.
    STANCE_LABELS = ['-3', '-2', '-1', '0', '+1', '+2', '+3']

    @staticmethod
    def _save_and_show(save_path):
        """Lay out, save, display and close the current figure."""
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()
        plt.close()

    @staticmethod
    def plot_simple_heatmap(cluster_data, save_path='cluster_heatmap.png'):
        """Draw a cluster x stance heatmap of average engagement.

        Args:
            cluster_data: dict mapping cluster id to a dict with the keys
                ``'engagement_pattern'`` (7 values, one per stance) and
                ``'size'``. Cluster ids do not have to be contiguous.
            save_path: where the PNG is written.

        Prints a message and returns without plotting when ``cluster_data``
        is empty.
        """
        if not cluster_data:
            print("No cluster data to visualize")
            return

        ordered = sorted(cluster_data.items())
        engagement_matrix = np.zeros((len(ordered), len(Visualization.STANCE_LABELS)))
        cluster_sizes = []
        cluster_labels = []

        # Rows are addressed by position, not by cluster id: ids may have
        # gaps (e.g. an empty cluster), and indexing the matrix by id then
        # overflowed it or misaligned rows with their tick labels.
        for row, (cluster_id, data) in enumerate(ordered):
            engagement_matrix[row] = data['engagement_pattern']
            cluster_sizes.append(data['size'])
            cluster_labels.append(f"Cluster {cluster_id}\n(n={data['size']})")

        plt.figure(figsize=(14, 10))

        sns.heatmap(engagement_matrix,
                    annot=True,
                    fmt='.1f',
                    cmap='YlOrRd',
                    xticklabels=Visualization.STANCE_LABELS,
                    yticklabels=cluster_labels,
                    cbar_kws={'label': 'Average Engagement Count'})

        plt.title('Average News Engagement by Cluster and Political Stance', fontsize=16, pad=20)
        plt.xlabel('Political Stance', fontsize=14)
        plt.ylabel('Cluster (with user count)', fontsize=14)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=10, rotation=0)

        # Repeat each cluster's size as text to the right of its row.
        for i, size in enumerate(cluster_sizes):
            plt.text(engagement_matrix.shape[1] + 0.5, i + 0.5,
                     f"Size: {size}",
                     ha='left', va='center', fontsize=10)

        Visualization._save_and_show(save_path)
        print(f"Heatmap saved to {save_path}")

    @staticmethod
    def plot_cluster_sizes(cluster_labels, save_path='cluster_sizes.png'):
        """Draw a bar chart with the number of items assigned to each cluster.

        Args:
            cluster_labels: iterable of cluster ids, one per clustered item.
            save_path: where the PNG is written.
        """
        cluster_sizes = Counter(cluster_labels)

        plt.figure(figsize=(12, 6))

        clusters = sorted(cluster_sizes.keys())
        sizes = [cluster_sizes[c] for c in clusters]

        bars = plt.bar(range(len(clusters)), sizes, color='skyblue', edgecolor='black')

        # Write the count just above each bar.
        for bar, size in zip(bars, sizes):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                     str(size), ha='center', va='bottom', fontsize=10)

        plt.xlabel('Cluster ID', fontsize=12)
        plt.ylabel('Number of Users', fontsize=12)
        plt.title(f'Distribution of Users Across {len(clusters)} Clusters', fontsize=14)
        plt.xticks(range(len(clusters)), [f'Cluster {c}' for c in clusters], rotation=45)
        plt.grid(axis='y', alpha=0.3)

        Visualization._save_and_show(save_path)

    @staticmethod
    def plot_engagement_trends(cluster_data, save_path='engagement_trends.png'):
        """Draw one small line chart per cluster, one line per stance.

        Args:
            cluster_data: dict mapping cluster id to a dict with the keys
                ``'engagement_pattern'`` and ``'size'``. The pattern should be
                a (time_steps, 7) array; a 1-D pattern of 7 values is plotted
                as a single time step.
            save_path: where the PNG is written.

        The grid has 4 columns and at least 5 rows (the original 5x4 layout);
        rows are added when there are more than 20 clusters so none is
        silently dropped. Prints a message and returns without plotting when
        ``cluster_data`` is empty.
        """
        if not cluster_data:
            print("No cluster data to visualize")
            return

        ordered = sorted(cluster_data.items())
        n_cols = 4
        n_rows = max(5, -(-len(ordered) // n_cols))  # ceiling division

        _, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3 * n_rows))
        axes = axes.flatten()

        stances = Visualization.STANCE_LABELS
        colors = plt.cm.Set3(np.linspace(0, 1, len(stances)))

        for ax, (cluster_id, data) in zip(axes, ordered):
            engagement_pattern = np.atleast_2d(np.asarray(data['engagement_pattern']))

            for stance_idx, stance in enumerate(stances):
                ax.plot(engagement_pattern[:, stance_idx],
                        label=stance,
                        color=colors[stance_idx],
                        linewidth=2,
                        marker='o')

            ax.set_title(f'Cluster {cluster_id} (n={data["size"]})', fontsize=11)
            ax.set_xlabel('Time Step', fontsize=9)
            ax.set_ylabel('Engagement', fontsize=9)
            ax.grid(True, alpha=0.3)
            ax.legend(loc='upper right', fontsize=7)

        # Hide the grid cells that have no cluster to show.
        for ax in axes[len(ordered):]:
            ax.axis('off')

        plt.suptitle('Engagement Trends Across Clusters', fontsize=16, y=1.02)
        Visualization._save_and_show(save_path)

In [13]:
# ==================== MAIN PIPELINE WITH ORIGINAL MODELS ====================
def main():
    """Run the whole pipeline with the original models.

    Steps (matching the banners printed while running):
      1. Load the pre-trained models with OriginalModelLoader.
      2. Load the JSON data and build 8-quarter sequences with DataProcessor.
      3. Run model 'D1' on up to 1000 sequences to obtain hidden states and
         predictions.
      4. Cluster the hidden states with K-means.
      5. Plot cluster sizes and the per-cluster engagement heatmap.
      6. Report the MAE per stance between predictions and true labels.
      7. Pickle all results to results/clustering_results.pkl.

    Note that every clustered item is one *sequence*; a user with a long
    history contributes several. The printed messages nevertheless say "users".

    Returns:
        ``(cluster_labels, cluster_data)`` on success, or ``(None, None)``
        when the run stops early (no usable model, no data, no sequences).
    """
    stance_names = ['-3', '-2', '-1', '0', '+1', '+2', '+3']
    # Model used for feature extraction, and the inputs it is fed.
    feature_model = 'D1'
    feature_keys = ('eng_counts', 'input_time_feat', 'output_time_feat',
                    'ns_feat', 'mentions_feat', 'hashtag_feat', 'text_feat')

    def section(title):
        """Print a step banner: blank line, rule, title, rule."""
        print("\n" + "=" * 50)
        print(title)
        print("=" * 50)

    # Helper for computing the average stance
    def calculate_avg_stance(engagement_pattern):
        """Compute the engagement-weighted average political stance (0 if no engagement)."""
        stances = np.array([-3, -2, -1, 0, 1, 2, 3])
        total_engagements = np.sum(engagement_pattern)
        if total_engagements > 0:
            weighted_avg = np.sum(stances * engagement_pattern) / total_engagements
            return weighted_avg
        return 0

    print("=" * 70)
    print("POLITICAL NEWS ENGAGEMENT FORECASTING SYSTEM - ORIGINAL MODELS")
    print("=" * 70)

    # Initialize configuration
    config = Config()
    print(f"Device: {config.DEVICE}")
    print(f"Seed: {config.SEED}")

    # Step 1: Load original models
    section("STEP 1: LOADING ORIGINAL MODELS")

    model_loader = OriginalModelLoader(config)

    if not model_loader.models:
        print("ERROR: No models loaded. Check model paths.")
        return None, None

    print(f"✓ Loaded {len(model_loader.models)} original models")

    # Step 3 always runs this model, so stop here with a clear message
    # instead of failing later in the extraction loop.
    if feature_model not in model_loader.models:
        print(f"ERROR: Model {feature_model} is required for feature extraction "
              "but failed to load.")
        return None, None

    # Step 2: Load and prepare data
    section("STEP 2: LOADING AND PREPARING DATA")

    data_processor = DataProcessor(config)

    # Load sample data
    df = data_processor.load_data('data/icwsm-2024-forecasting-data-anon.json',
                                  sample_size=20000)

    if len(df) == 0:
        print("ERROR: No data loaded!")
        return None, None

    print("\nData statistics:")
    print(f"- Total records: {len(df):,}")
    print(f"- Unique users: {df['user_id'].nunique():,}")
    print(f"- Time range: {df['timestamp'].min().date()} to {df['timestamp'].max().date()}")

    # Prepare data for model
    all_sequences = data_processor.prepare_for_original_model(df)

    if len(all_sequences) == 0:
        print("ERROR: No sequences created!")
        print("Users may not have enough data points (need at least 9 quarters)")
        return None, None

    print(f"\n✓ Created {len(all_sequences):,} sequences")

    # Step 3: Extract hidden states using original model
    section("STEP 3: EXTRACTING HIDDEN STATES")

    # Only the first model (D1) is used for extraction.
    # In practice, you might want to use all models based on time period
    print("Using model D1 for feature extraction...")

    # Extract features for a subset of sequences (for speed)
    sample_size = min(1000, len(all_sequences))
    print(f"Processing {sample_size} sequences...")

    all_hidden_states = []
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for i in tqdm(range(sample_size), desc="Extracting features"):
            seq_data = all_sequences[i]

            # Convert numpy arrays to torch tensors
            input_data = {key: torch.FloatTensor(seq_data[key]) for key in feature_keys}

            # Extract hidden states
            hidden_state, prediction = model_loader.extract_hidden_states(input_data, feature_model)
            all_hidden_states.append(hidden_state.flatten())
            all_predictions.append(prediction.flatten())
            all_labels.append(seq_data['label_original'].flatten())

    user_repr = np.array(all_hidden_states)
    predictions = np.array(all_predictions)
    true_labels = np.array(all_labels)

    print(f"\n✓ Extracted features for {user_repr.shape[0]} sequences")
    print(f"  Feature dimension: {user_repr.shape[1]}")

    # Step 4: Clustering analysis
    section("STEP 4: CLUSTERING ANALYSIS")

    # K-means cannot form more clusters than there are sequences.
    n_clusters = min(config.NUM_CLUSTERS, len(user_repr))

    analyzer = UserClusterAnalyzer(config)
    cluster_labels, cluster_centers = analyzer.cluster_users(user_repr, n_clusters)

    # Analyze cluster sizes
    cluster_sizes = Counter(cluster_labels)
    print("\nCluster distribution:")
    for cluster_id in sorted(cluster_sizes.keys()):
        size = cluster_sizes[cluster_id]
        percentage = (size / len(cluster_labels)) * 100
        print(f"  Cluster {cluster_id:2d}: {size:3d} users ({percentage:5.1f}%)")

    # Step 5: Create cluster visualizations
    section("STEP 5: CREATING VISUALIZATIONS")

    # Prepare cluster data for visualization (empty clusters are left out)
    cluster_data = {}
    for cluster_id in range(config.NUM_CLUSTERS):
        cluster_users_idx = np.where(cluster_labels == cluster_id)[0]

        if len(cluster_users_idx) > 0:
            # Average true next-quarter engagement of the cluster's members
            cluster_engagements = true_labels[cluster_users_idx]
            avg_engagement = np.mean(cluster_engagements, axis=0)

            cluster_data[cluster_id] = {
                'size': len(cluster_users_idx),
                'avg_stance': calculate_avg_stance(avg_engagement),
                'engagement_pattern': avg_engagement,
                'user_indices': cluster_users_idx
            }

    # Plots and the pickle below are written under results/; create it here
    # so main() also works when called without the __main__ cell.
    os.makedirs('results', exist_ok=True)

    # Create visualizations
    viz = Visualization()

    # 1. Cluster sizes bar chart
    print("\n1. Creating cluster sizes visualization...")
    viz.plot_cluster_sizes(cluster_labels, 'results/cluster_sizes.png')

    # 2. Simple heatmap of engagement patterns
    print("\n2. Creating engagement heatmap...")
    viz.plot_simple_heatmap(cluster_data, 'results/cluster_heatmap.png')

    # 3. Engagement trends are not plotted: cluster_data holds one averaged
    #    pattern per cluster, not a per-time-step series.
    print("\n3. Skipping engagement trends visualization (needs time-series engagement data)")

    # Step 6: Performance evaluation
    section("STEP 6: MODEL PERFORMANCE EVALUATION")

    # Calculate MAE for each stance
    mae_per_stance = []
    for stance_idx in range(7):
        mae = mean_absolute_error(true_labels[:, stance_idx], predictions[:, stance_idx])
        mae_per_stance.append(mae)

    print("\nModel Performance (MAE per stance):")
    print("-" * 40)
    for stance, mae in zip(stance_names, mae_per_stance):
        print(f"  Stance {stance}: {mae:.4f}")

    avg_mae = np.mean(mae_per_stance)
    print(f"\n  Average MAE: {avg_mae:.4f}")

    # Step 7: Save results
    section("STEP 7: SAVING RESULTS")

    # Save cluster assignments
    results = {
        'cluster_labels': cluster_labels,
        'cluster_data': cluster_data,
        'user_representations': user_repr,
        'predictions': predictions,
        'true_labels': true_labels,
        'mae_per_stance': mae_per_stance,
        'avg_mae': avg_mae
    }

    with open('results/clustering_results.pkl', 'wb') as f:
        pickle.dump(results, f)

    print("✓ Results saved to 'results/' directory:")
    print("  - clustering_results.pkl")
    print("  - cluster_sizes.png")
    print("  - cluster_heatmap.png")

    print("\n" + "=" * 70)
    print("PIPELINE COMPLETED SUCCESSFULLY!")
    print("=" * 70)

    return cluster_labels, cluster_data

In [14]:
# ==================== EXECUTION ====================
if __name__ == "__main__":

    # All artefacts of a run are written below results/, so it must exist.
    os.makedirs('results', exist_ok=True)

    try:
        # main() hands back (None, None) when it had to stop early.
        cluster_labels, cluster_data = main()

        if cluster_labels is not None:
            print("\nSummary:")
            print("-" * 40)
            print(f"Total users clustered: {len(cluster_labels)}")
            print(f"Number of clusters: {len(np.unique(cluster_labels))}")
            print("Results saved in 'results/' directory")

            # Print one cluster (the first key of cluster_data) as an example.
            if cluster_data:
                sample_cluster = next(iter(cluster_data))
                print(f"\nSample cluster #{sample_cluster}:")
                print(f"  Size: {cluster_data[sample_cluster]['size']} users")
                print(f"  Avg stance: {cluster_data[sample_cluster]['avg_stance']:.2f}")

    except Exception as e:
        # Report the failure with its traceback but let the cell finish.
        import traceback
        print(f"\nERROR: {e}")
        traceback.print_exc()

    print("\nDone!")

POLITICAL NEWS ENGAGEMENT FORECASTING SYSTEM - ORIGINAL MODELS
Device: cpu
Seed: 42

STEP 1: LOADING ORIGINAL MODELS
Loading original models from paper...
  Loading D1 from final_models/all_model_data_1.pth
    ✗ Failed to load D1: [Errno 2] No such file or directory: 'final_models/all_model_data_1.pth'
  Loading D2 from final_models/all_model_data_2.pth
    ✗ Failed to load D2: [Errno 2] No such file or directory: 'final_models/all_model_data_2.pth'
  Loading D3 from final_models/all_model_data_3.pth
    ✗ Failed to load D3: [Errno 2] No such file or directory: 'final_models/all_model_data_3.pth'
  Loading D4 from final_models/all_model_data_4.pth
    ✗ Failed to load D4: [Errno 2] No such file or directory: 'final_models/all_model_data_4.pth'
ERROR: No models loaded. Check model paths.

Done!
