In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
from statsmodels.tsa.arima.model import ARIMA
import torch
import torch.nn as nn
import torch.nn.functional as F
import networkx as nx
import numpy as np
from scipy.sparse import coo_matrix

# Load data
df = pd.read_csv('../data/processed.csv', encoding='utf-8')
required_cols = ['yea_count', 'nay_count', 'nominate_mid_1', 'nominate_mid_2', 'cumulative_rolls', 'action_count', 'sentiment', 'date']
for col in required_cols:
    if col not in df.columns:
        print(f'Warning: {col} missing; using default 0')
        df[col] = 0 if col != 'date' else pd.to_datetime('2023-01-01')

df['date'] = pd.to_datetime(df['date'])

# Logistic Regression
X = df[['yea_count', 'nay_count', 'nominate_mid_1', 'nominate_mid_2', 'cumulative_rolls', 'action_count', 'sentiment']]
y = df['passed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
preds_lr = model_lr.predict(X_test)
print(f'Logistic Accuracy: {accuracy_score(y_test, preds_lr):.4f}')
joblib.dump(model_lr, '../models/logistic_model.pkl')

# Time-Series (ARIMA on cumulative_rolls grouped by date)
ts_data = df.groupby('date')['cumulative_rolls'].mean()
ts_data = ts_data.asfreq('D', fill_value=0)  # Daily frequency, fill missing
model_arima = ARIMA(ts_data, order=(5,1,0))
model_fit = model_arima.fit()
forecast = model_fit.forecast(steps=5)
print(f'ARIMA Forecast: {forecast.values}')
joblib.dump(model_fit, '../models/arima_model.pkl')

# GNN Layer (pure PyTorch)
class GCNLayer(nn.Module):
    def __init__(self, input_dim: int, output_dim: int, A: torch.Tensor):
        super(GCNLayer, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.A = A.float()
        self.A_hat = self.A + torch.eye(self.A.size(0))
        degrees = torch.sum(self.A_hat, dim=1)
        self.D_neg_sqrt = torch.diag(torch.pow(degrees, -0.5))
        self.W = nn.Parameter(torch.rand(input_dim, output_dim))

    def forward(self, X: torch.Tensor):
        support = torch.matmul(self.D_neg_sqrt, torch.matmul(self.A_hat, self.D_neg_sqrt))
        aggregated = torch.matmul(support, torch.matmul(X, self.W))
        return F.relu(aggregated)

# Full GNN Model
class GNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, A):
        super(GNN, self).__init__()
        self.gcn1 = GCNLayer(input_dim, hidden_dim, A)
        self.gcn2 = GCNLayer(hidden_dim, output_dim, A)

    def forward(self, X):
        x = self.gcn1(X)
        x = self.gcn2(x)
        x = torch.mean(x, dim=0)  # Graph-level mean pool
        return torch.sigmoid(x)

# Mock graph (adapt with real sponsor data later)
num_nodes = 10
G = nx.Graph()
G.add_nodes_from(range(num_nodes))
G.add_edges_from([(0,1), (1,2), (2,3), (3,0), (4,5), (5,6)])
adj = nx.adjacency_matrix(G).todense()
A = torch.tensor(adj, dtype=torch.float)

# Node features: Use processed features
X_gnn = torch.tensor(df[['nominate_mid_1', 'nominate_mid_2']].values[:num_nodes], dtype=torch.float)
y_gnn = torch.tensor([df['passed'].values[0]], dtype=torch.float)

# Train GNN
model_gnn = GNN(input_dim=2, hidden_dim=16, output_dim=1, A=A)
optimizer = torch.optim.Adam(model_gnn.parameters(), lr=0.01)
criterion = nn.BCELoss()
for epoch in range(100):
    optimizer.zero_grad()
    pred = model_gnn(X_gnn)
    loss = criterion(pred, y_gnn)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}, Pred: {pred.item():.4f}')

torch.save(model_gnn.state_dict(), '../models/gnn_model.pth')

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Accuracy: 0.8409
ARIMA Forecast: [993571.69847073 989759.25659001 988778.41586316 989269.77398506
 992872.70447619]
Epoch 0, Loss: 0.9899, Pred: 0.6284
Epoch 10, Loss: 0.8581, Pred: 0.5760
Epoch 20, Loss: 0.7714, Pred: 0.5376
Epoch 30, Loss: 0.7010, Pred: 0.5039
Epoch 40, Loss: 0.6969, Pred: 0.5019
Epoch 50, Loss: 0.6963, Pred: 0.5016
Epoch 60, Loss: 0.6957, Pred: 0.5013
Epoch 70, Loss: 0.6950, Pred: 0.5009
Epoch 80, Loss: 0.6944, Pred: 0.5006
Epoch 90, Loss: 0.6939, Pred: 0.5004
