<a href="https://colab.research.google.com/github/muhammmad-al/decentralized-ai-content-recommender/blob/main/reddit_federated_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Cell 1: Initial Setup and Imports
# Start from /content so the clone below lands in a known location.
%cd /content
# WARNING: the next line deletes every non-hidden entry in the current
# directory (/content after the %cd above) so the clone starts from a clean slate.
!rm -rf *  # Remove everything first
!git clone https://github.com/muhammmad-al/decentralized-ai-content-recommender.git
# Work from the repository root; the data/raw/*.csv paths read later are
# relative to it.
%cd decentralized-ai-content-recommender

# Install required packages for federated learning
!pip install flwr tensorflow pandas scikit-learn

# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import flwr as fl
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
import logging
import warnings
warnings.filterwarnings('ignore')

# Cell 2: Load and Preprocess Dataset for Node 1
def load_node_data(node_number=1, primary_category='ai', data_dir='data/raw'):
    """Build the local dataset for one federated node.

    The node keeps every row of its primary category and adds a random
    sample from each of the other categories. Each sample holds half as
    many rows as the primary category, or the whole category when it is
    smaller than that.

    Args:
        node_number: Which node this is (1,2,3). It also offsets the
            sampling seed (42 + node_number), so different nodes draw
            different samples while each node stays reproducible.
        primary_category: Main category for this node ('ai','web3','music').
        data_dir: Directory containing the reddit_analysis_<category>.csv
            files. Defaults to the repository-relative 'data/raw'.

    Returns:
        A DataFrame with the combined rows (index reset) plus the derived
        columns score_log, comments_log, text_word_count and
        sentiment_compound.

    Raises:
        KeyError: If primary_category is not one of the loaded categories.
    """
    # Load all datasets; dict order fixes the order of the sampled frames.
    categories = ('ai', 'music', 'web3')
    datasets = {
        cat: pd.read_csv(f'{data_dir}/reddit_analysis_{cat}.csv')
        for cat in categories
    }

    # Primary category gets full data
    primary_df = datasets[primary_category]

    # Sample from other categories (50% of primary size from each)
    sample_size = len(primary_df) // 2
    other_dfs = [
        datasets[cat].sample(n=min(sample_size, len(datasets[cat])),
                             random_state=42 + node_number)
        for cat in datasets
        if cat != primary_category
    ]

    # Combine datasets
    df = pd.concat([primary_df] + other_dfs, ignore_index=True)

    # Create derived features.
    # Clip the score at 0 first: log1p is NaN below -1 and -inf at -1, and a
    # later fillna(0) would not remove the -inf.
    df['score_log'] = np.log1p(df['score'].clip(lower=0))
    df['comments_log'] = np.log1p(df['num_comments'])
    df['text_word_count'] = df['cleaned_text'].fillna('').str.split().str.len()
    df['sentiment_compound'] = (df['textblob_sentiment'] + df['transformer_score']) / 2

    # Print node info
    print(f"Node {node_number} Dataset Overview:")
    print(f"Total samples: {len(df)}")
    print("\nCategory distribution:")
    print(df['category'].value_counts())
    return df

# Build the dataset for node 1, whose primary category is 'ai'
node1_df = load_node_data(1, 'ai')

# Cell 3: Node Client Setup
class NodeClient(fl.client.NumPyClient):
    """Per-node client holding one node's features, labels and Keras model.

    NOTE(review): only data preparation and model construction are defined
    in this class; no get_parameters/fit/evaluate overrides appear here, so
    confirm they are provided before using it with Flower.
    """

    def __init__(self, df, node_number):
        """Store the node's dataset, then build its features and model.

        Args:
            df: DataFrame with the columns read by prepare_data (title,
                cleaned_text, category and the derived numeric columns).
            node_number: Node identifier; it also offsets the split seed.
        """
        self.df = df
        self.node_number = node_number
        # prepare_data must run first: create_model reads X_train's width.
        self.prepare_data()
        self.model = self.create_model()

    def prepare_data(self):
        """Build the feature matrix and labels and split them into train/test.

        Sets self.vectorizer, self.label_encoder, self.X, self.y and the
        X_train/X_test/y_train/y_test splits (80/20, seeded with
        42 + node_number).
        """
        # Combine text features
        text_data = self.df['title'].fillna('') + ' ' + \
                   self.df['cleaned_text'].fillna('')

        # Prepare labels
        self.label_encoder = LabelEncoder()
        self.y = self.label_encoder.fit_transform(self.df['category'])

        # Choose the train/test rows before fitting any text statistics.
        # Splitting row positions with the same test_size and random_state
        # selects the same rows as splitting the feature matrix would.
        row_positions = np.arange(len(self.df))
        train_idx, test_idx = train_test_split(
            row_positions, test_size=0.2,
            random_state=42 + self.node_number)

        # Create text features. The vocabulary and IDF weights are learned
        # from the training rows only, so nothing from the held-out rows
        # leaks into the features.
        self.vectorizer = TfidfVectorizer(
            max_features=1000,
            stop_words='english'
        )
        self.vectorizer.fit(text_data.iloc[train_idx])
        X_text = self.vectorizer.transform(text_data).toarray()

        # Add numerical features
        numerical_features = ['score_log', 'comments_log',
                            'sentiment_compound', 'text_word_count']
        X_numerical = self.df[numerical_features].fillna(0).values

        # Combine features
        self.X = np.hstack([X_text, X_numerical])

        # Split data
        self.X_train, self.X_test = self.X[train_idx], self.X[test_idx]
        self.y_train, self.y_test = self.y[train_idx], self.y[test_idx]

        print(f"\nNode {self.node_number} Training Data:")
        print(f"Training data shape: {self.X_train.shape}")
        print(f"Testing data shape: {self.X_test.shape}")

    def create_model(self):
        """Create and compile the classifier network.

        Two ReLU hidden layers (256 and 128 units) with dropout feed a
        3-way softmax output. The loss is sparse categorical cross-entropy,
        which takes the integer labels produced by the LabelEncoder.

        Returns:
            The compiled tf.keras.Sequential model.
        """
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation='relu',
                                input_shape=(self.X_train.shape[1],)),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(3, activation='softmax')  # 3 categories
        ])

        model.compile(
            optimizer='adam',
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        return model

# Cell 4: Test Node 1
# Build the client for node 1 (prepares its data and compiles its model)
node1 = NodeClient(df=node1_df, node_number=1)

# Smoke-test: train locally on this node's training split
print("\nTesting local training:")
history = node1.model.fit(
    x=node1.X_train,
    y=node1.y_train,
    batch_size=32,
    epochs=5,
    verbose=1,
    validation_split=0.2,
)

/content
Cloning into 'decentralized-ai-content-recommender'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 50 (delta 16), reused 23 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (50/50), 1.49 MiB | 15.42 MiB/s, done.
Resolving deltas: 100% (16/16), done.
/content/decentralized-ai-content-recommender
Node 1 Dataset Overview:
Total samples: 400

Category distribution:
category
ai       200
music    100
web3     100
Name: count, dtype: int64

Node 1 Training Data:
Training data shape: (320, 1004)
Testing data shape: (80, 1004)

Testing local training:
Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.4064 - loss: 2.6277 - val_accuracy: 0.4375 - val_loss: 6.5369
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5041 - loss: 2.8605 - val_accuracy: 0.4062 - val_loss: 1.2902
