In [3]:
import json

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data

from helper import load_twitch_dataset, prepare_GNN_data

In [4]:
# Build the Twitch graph object through the project helper imported from `helper`.
# NOTE(review): the recorded output below this cell contains more lines than the
# single print() here would produce, so the helper presumably prints its own
# progress (edge columns, head, intermediate Data objects) — confirm in helper.py.
data = load_twitch_dataset()
print(data)

Index(['Source', 'Target'], dtype='object')
   Source  Target
0    6194     255
1    6194     980
2    6194    2992
3    6194    2507
4    6194     986
Data(edge_index=[2, 35324])
Data(edge_index=[2, 35324], x=[7126, 3170])
Data(edge_index=[2, 35324], x=[7126, 3170], y=[7126])
Data(edge_index=[2, 35324], x=[7126, 3170], y=[7126])


In [6]:
# Pass the loaded graph through the project helper and rebind `data` to the result.
# The recorded output shows train/val/test masks present on the returned object.
# NOTE(review): `data` is overwritten with a value derived from itself, so
# re-running this cell alone feeds already-prepared data back into the helper —
# confirm prepare_GNN_data tolerates that.
data = prepare_GNN_data(data)
print(data)

Data(edge_index=[2, 35324], x=[7126, 3170], y=[7126], train_mask=[7126], val_mask=[7126], test_mask=[7126])
Data(edge_index=[2, 35324], x=[7126, 3170], y=[7126], train_mask=[7126], val_mask=[7126], test_mask=[7126])


In [24]:
# Load the edge list: one row per edge, with 'Source' and 'Target' node-id columns.
edges = pd.read_csv('./twitch/ENGB/musae_ENGB_edges_edited.csv', sep=',')

print(edges.columns)
print(edges.head())

# Parse both endpoint columns as numbers; entries that cannot be parsed become NaN.
endpoints = edges[['Source', 'Target']].apply(pd.to_numeric, errors='coerce')

# Drop rows with an unparseable endpoint instead of mapping them to node 0:
# filling with 0 would silently add edges to/from node 0 that are not in the file.
invalid_rows = endpoints.isna().any(axis=1)
if invalid_rows.any():
    print(f"Dropping {int(invalid_rows.sum())} edge rows with non-numeric node ids")
edges = edges.loc[~invalid_rows].assign(
    Source=endpoints.loc[~invalid_rows, 'Source'].astype(int),
    Target=endpoints.loc[~invalid_rows, 'Target'].astype(int),
)

# Edge index tensor of shape (2, num_edges): row 0 = sources, row 1 = targets.
edge_index = torch.tensor(edges[['Source', 'Target']].to_numpy().T, dtype=torch.long)

# Create the graph data object.
# NOTE(review): each CSV row yields exactly one directed edge here; if the
# underlying graph is meant to be undirected, reverse edges are not added — confirm.
data = Data(edge_index=edge_index)
print(data)



Index(['Source', 'Target'], dtype='object')
   Source  Target
0    6194     255
1    6194     980
2    6194    2992
3    6194    2507
4    6194     986
Data(edge_index=[2, 35324])


In [25]:
# Load node features: a JSON object mapping node id (as a string key) to the
# list of feature indices that are active for that node.
with open('./twitch/ENGB/musae_ENGB_features.json') as f:
    features = json.load(f)

# Size the matrix from the largest node id and feature index actually present.
# `default=-1` keeps this working when a node has an empty feature list (or the
# file has no nodes); calling max() on an empty list would raise ValueError.
num_nodes = max((int(node) for node in features), default=-1) + 1
num_features = max((max(feats) for feats in features.values() if feats), default=-1) + 1

# Multi-hot encoding: entry [node, j] is 1 when feature j is listed for that node.
# float32 matches the tensor dtype requested below and avoids a float64 intermediate.
node_features = np.zeros((num_nodes, num_features), dtype=np.float32)
for node, feats in features.items():
    if feats:  # nodes without features keep an all-zero row
        node_features[int(node), feats] = 1

# Convert to tensor and attach to the graph object.
x = torch.tensor(node_features, dtype=torch.float)
data.x = x
print(data)

Data(edge_index=[2, 35324], x=[7126, 3170])


In [26]:
# Read the per-node target table.
target = pd.read_csv('./twitch/ENGB/musae_ENGB_target_edited.csv')

# The 'mature' column, cast to int, is the class label; values are taken in CSV row order.
# NOTE(review): labels are attached positionally, so row i of the CSV must describe
# node i of `data.x` — confirm the edited CSV is ordered by node index.
labels = target['mature'].astype(int).to_numpy()

# Attach the labels to the graph object as a long tensor.
data.y = y = torch.tensor(labels, dtype=torch.long)
print(data)

Data(edge_index=[2, 35324], x=[7126, 3170], y=[7126])


In [None]:
# Fixed seed so the train/val/test split (and every metric computed on it) is
# reproducible across kernel restarts; without it each run shuffles differently.
SPLIT_SEED = 42

# 70% of the nodes go to training; the remaining 30% hold-out is then halved
# into validation and test. Both splits are stratified on the label.
train_idx, holdout_idx = train_test_split(
    range(len(labels)), test_size=0.3, stratify=labels, random_state=SPLIT_SEED
)
val_idx, test_idx = train_test_split(
    holdout_idx, test_size=0.5, stratify=labels[holdout_idx], random_state=SPLIT_SEED
)

# Boolean masks over all nodes: True where the node belongs to the split.
train_mask = torch.zeros(len(labels), dtype=torch.bool)
val_mask = torch.zeros(len(labels), dtype=torch.bool)
test_mask = torch.zeros(len(labels), dtype=torch.bool)

train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True

# Every node must land in exactly one split.
assert int(train_mask.sum() + val_mask.sum() + test_mask.sum()) == len(labels)
assert not (train_mask & val_mask).any() and not (train_mask & test_mask).any()
assert not (val_mask & test_mask).any()

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask
print(data)


Data(edge_index=[2, 35324], x=[7126, 3170], y=[7126], train_mask=[7126], val_mask=[7126], test_mask=[7126])


In [None]:
# Print the tensor shapes in order:
#   data.x          should be [num_nodes, num_features]
#   data.edge_index should be [2, num_edges]
#   data.y          should be [num_nodes]
for graph_tensor in (data.x, data.edge_index, data.y):
    print(graph_tensor.shape)

# Number of nodes selected by each split mask.
n_train = data.train_mask.sum().item()
n_val = data.val_mask.sum().item()
n_test = data.test_mask.sum().item()
print(f"Train nodes: {n_train}, Validation nodes: {n_val}, Test nodes: {n_test}")

In [28]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer GCNConv network that returns per-node log-probabilities.

    Args:
        input_dim: Input size of the first GCNConv layer.
        hidden_dim: Output size of the first layer and input size of the second.
        output_dim: Output size of the second GCNConv layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Apply conv1 -> ReLU -> conv2 to `data.x` over `data.edge_index`, then log-softmax on dim 1."""
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

# Seed torch's RNG before building the model so the layer weights are initialised
# identically on every run; unseeded, each execution of this cell starts from
# different random weights and the reported accuracies cannot be reproduced.
torch.manual_seed(42)

# Initialize the model and optimizer.
input_dim = data.x.shape[1]  # number of columns in the node feature matrix
hidden_dim = 16
output_dim = 2  # Binary classification (mature or not)
model_gcn = GCN(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model_gcn.parameters(), lr=0.01, weight_decay=5e-4)

def train():
    """Run one optimisation step of `model_gcn` on the training nodes.

    The forward pass covers the whole graph; only rows selected by
    `data.train_mask` contribute to the NLL loss.

    Returns:
        The loss of this step as a Python float.
    """
    model_gcn.train()
    optimizer.zero_grad()
    train_nodes = data.train_mask
    log_probs = model_gcn(data)
    step_loss = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])
    step_loss.backward()
    optimizer.step()
    return step_loss.item()

def evaluate(mask):
    """Return the accuracy of `model_gcn` on the nodes selected by `mask`.

    Args:
        mask: Node mask used to index the model output and `data.y`
            (the notebook passes `data.train_mask`, `data.val_mask` or `data.test_mask`).

    Returns:
        Number of correct predictions divided by `mask.sum()`.
    """
    model_gcn.eval()
    with torch.no_grad():
        log_probs = model_gcn(data)
        # max over dim 1 yields (values, indices); the indices are the predicted classes.
        _, predicted = log_probs[mask].max(1)
        n_correct = (predicted == data.y[mask]).sum().item()
    return n_correct / mask.sum().item()

# Train for 200 epochs; report train/validation accuracy on every 10th epoch.
for epoch in range(200):
    loss = train()
    if epoch % 10:
        continue  # not a reporting epoch
    train_acc, val_acc = evaluate(data.train_mask), evaluate(data.val_mask)
    print(f'Epoch {epoch}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

# Score the test nodes once, using the weights left after the final epoch.
test_acc = evaluate(data.test_mask)
print(f'Test Accuracy: {test_acc:.4f}')


Epoch 0, Loss: 0.6941, Train Acc: 0.5457, Val Acc: 0.5463
Epoch 10, Loss: 0.6670, Train Acc: 0.6153, Val Acc: 0.5304
Epoch 20, Loss: 0.6263, Train Acc: 0.6534, Val Acc: 0.5295
Epoch 30, Loss: 0.5888, Train Acc: 0.6824, Val Acc: 0.5398
Epoch 40, Loss: 0.5647, Train Acc: 0.6981, Val Acc: 0.5313
Epoch 50, Loss: 0.5462, Train Acc: 0.7093, Val Acc: 0.5267
Epoch 60, Loss: 0.5288, Train Acc: 0.7310, Val Acc: 0.5220
Epoch 70, Loss: 0.5123, Train Acc: 0.7428, Val Acc: 0.5220
Epoch 80, Loss: 0.4942, Train Acc: 0.7586, Val Acc: 0.5201
Epoch 90, Loss: 0.4763, Train Acc: 0.7704, Val Acc: 0.5267
Epoch 100, Loss: 0.4633, Train Acc: 0.7759, Val Acc: 0.5313
Epoch 110, Loss: 0.4510, Train Acc: 0.7911, Val Acc: 0.5295
Epoch 120, Loss: 0.4376, Train Acc: 0.7983, Val Acc: 0.5285
Epoch 130, Loss: 0.4246, Train Acc: 0.8152, Val Acc: 0.5192
Epoch 140, Loss: 0.4157, Train Acc: 0.8256, Val Acc: 0.5201
Epoch 150, Loss: 0.4120, Train Acc: 0.8326, Val Acc: 0.5145
Epoch 160, Loss: 0.4003, Train Acc: 0.8350, Val Acc

In [32]:
from torch_geometric.nn import SAGEConv
import torch
import torch.nn.functional as F

class GraphSAGE(torch.nn.Module):
    """Two-layer SAGEConv network that returns per-node log-probabilities.

    Args:
        input_dim: Input size of the first SAGEConv layer.
        hidden_dim: Output size of the first layer and input size of the second.
        output_dim: Output size of the second SAGEConv layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, output_dim)

    def forward(self, data):
        """Apply conv1 -> ReLU -> conv2 to `data.x` over `data.edge_index`, then log-softmax on dim 1."""
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

# Seed torch's RNG before building the model so the layer weights are initialised
# identically on every run; unseeded, each execution of this cell starts from
# different random weights and the reported accuracies cannot be reproduced.
torch.manual_seed(42)

# Initialize the model and optimizer.
input_dim = data.x.shape[1]  # number of columns in the node feature matrix
hidden_dim = 16
output_dim = 2  # Binary classification (mature or not)
model_sage = GraphSAGE(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model_sage.parameters(), lr=0.01, weight_decay=5e-4)

def train():
    """Run one optimisation step of `model_sage` on the training nodes.

    The forward pass covers the whole graph; only rows selected by
    `data.train_mask` contribute to the NLL loss.

    Returns:
        The loss of this step as a Python float.
    """
    model_sage.train()
    optimizer.zero_grad()
    train_nodes = data.train_mask
    log_probs = model_sage(data)
    step_loss = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])
    step_loss.backward()
    optimizer.step()
    return step_loss.item()

def evaluate(mask):
    """Return the accuracy of `model_sage` on the nodes selected by `mask`.

    Args:
        mask: Node mask used to index the model output and `data.y`
            (the notebook passes `data.train_mask`, `data.val_mask` or `data.test_mask`).

    Returns:
        Number of correct predictions divided by `mask.sum()`.
    """
    model_sage.eval()
    with torch.no_grad():
        log_probs = model_sage(data)
        # max over dim 1 yields (values, indices); the indices are the predicted classes.
        _, predicted = log_probs[mask].max(1)
        n_correct = (predicted == data.y[mask]).sum().item()
    return n_correct / mask.sum().item()

# Train for 200 epochs; report train/validation accuracy on every 10th epoch.
for epoch in range(200):
    loss = train()
    if epoch % 10:
        continue  # not a reporting epoch
    train_acc, val_acc = evaluate(data.train_mask), evaluate(data.val_mask)
    print(f'Epoch {epoch}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

# Score the test nodes once, using the weights left after the final epoch.
test_acc = evaluate(data.test_mask)
print(f'Test Accuracy: {test_acc:.4f}')

Epoch 0, Loss: 0.7120, Train Acc: 0.5463, Val Acc: 0.5519
Epoch 10, Loss: 0.6441, Train Acc: 0.5932, Val Acc: 0.5585
Epoch 20, Loss: 0.5802, Train Acc: 0.6995, Val Acc: 0.5323
Epoch 30, Loss: 0.5175, Train Acc: 0.7833, Val Acc: 0.5023
Epoch 40, Loss: 0.4539, Train Acc: 0.8308, Val Acc: 0.4995
Epoch 50, Loss: 0.3937, Train Acc: 0.8791, Val Acc: 0.5061
Epoch 60, Loss: 0.3374, Train Acc: 0.9178, Val Acc: 0.5042
Epoch 70, Loss: 0.2779, Train Acc: 0.9487, Val Acc: 0.5070
Epoch 80, Loss: 0.2196, Train Acc: 0.9721, Val Acc: 0.5089
Epoch 90, Loss: 0.1784, Train Acc: 0.9854, Val Acc: 0.5070
Epoch 100, Loss: 0.1518, Train Acc: 0.9918, Val Acc: 0.5033
Epoch 110, Loss: 0.1319, Train Acc: 0.9950, Val Acc: 0.5051
Epoch 120, Loss: 0.1193, Train Acc: 0.9964, Val Acc: 0.5108
Epoch 130, Loss: 0.1071, Train Acc: 0.9986, Val Acc: 0.5070
Epoch 140, Loss: 0.1013, Train Acc: 0.9980, Val Acc: 0.5005
Epoch 150, Loss: 0.0957, Train Acc: 0.9982, Val Acc: 0.4977
Epoch 160, Loss: 0.0856, Train Acc: 0.9988, Val Acc

In [31]:
from torch_geometric.nn import GATConv

class GAT(torch.nn.Module):
    """Two-layer GATConv network that returns per-node log-probabilities.

    Args:
        input_dim: Input size of the first GATConv layer.
        hidden_dim: Per-head output size of the first layer.
        output_dim: Output size of the second GATConv layer.
        heads: Number of attention heads in the first layer (default 1).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, heads=1):
        super().__init__()
        # First layer concatenates its heads, so the second layer takes
        # hidden_dim * heads inputs.
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads, concat=True)
        # Second layer uses a single head with concat disabled.
        self.conv2 = GATConv(hidden_dim * heads, output_dim, heads=1, concat=False)

    def forward(self, data):
        """Apply conv1 -> ReLU -> conv2 to `data.x` over `data.edge_index`, then log-softmax on dim 1."""
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

# Seed torch's RNG before building the model so the layer weights are initialised
# identically on every run; unseeded, each execution of this cell starts from
# different random weights and the reported accuracies cannot be reproduced.
torch.manual_seed(42)

# Initialize the model and optimizer.
input_dim = data.x.shape[1]  # number of columns in the node feature matrix
hidden_dim = 32
output_dim = 2
model_gat = GAT(input_dim, hidden_dim, output_dim, heads=4)
optimizer = torch.optim.Adam(model_gat.parameters(), lr=0.005, weight_decay=5e-4)

def train():
    """Run one optimisation step of `model_gat` on the training nodes.

    The forward pass covers the whole graph; only rows selected by
    `data.train_mask` contribute to the NLL loss.

    Returns:
        The loss of this step as a Python float.
    """
    model_gat.train()
    optimizer.zero_grad()
    train_nodes = data.train_mask
    log_probs = model_gat(data)
    step_loss = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])
    step_loss.backward()
    optimizer.step()
    return step_loss.item()

def evaluate(mask):
    """Return the accuracy of `model_gat` on the nodes selected by `mask`.

    Args:
        mask: Node mask used to index the model output and `data.y`
            (the notebook passes `data.train_mask`, `data.val_mask` or `data.test_mask`).

    Returns:
        Number of correct predictions divided by `mask.sum()`.
    """
    model_gat.eval()
    with torch.no_grad():
        log_probs = model_gat(data)
        # max over dim 1 yields (values, indices); the indices are the predicted classes.
        _, predicted = log_probs[mask].max(1)
        n_correct = (predicted == data.y[mask]).sum().item()
    return n_correct / mask.sum().item()

# Train for 200 epochs; report train/validation accuracy on every 10th epoch.
for epoch in range(200):
    loss = train()
    if epoch % 10:
        continue  # not a reporting epoch
    train_acc, val_acc = evaluate(data.train_mask), evaluate(data.val_mask)
    print(f'Epoch {epoch}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

# Score the test nodes once, using the weights left after the final epoch.
test_acc = evaluate(data.test_mask)
print(f'Test Accuracy: {test_acc:.4f}')

Epoch 0, Loss: 0.6909, Train Acc: 0.5457, Val Acc: 0.5463
Epoch 10, Loss: 0.6496, Train Acc: 0.6494, Val Acc: 0.5388
Epoch 20, Loss: 0.5893, Train Acc: 0.6830, Val Acc: 0.5304
Epoch 30, Loss: 0.5285, Train Acc: 0.7386, Val Acc: 0.5220
Epoch 40, Loss: 0.4700, Train Acc: 0.7813, Val Acc: 0.5220
Epoch 50, Loss: 0.4078, Train Acc: 0.8186, Val Acc: 0.5285
Epoch 60, Loss: 0.3622, Train Acc: 0.8338, Val Acc: 0.5313
Epoch 70, Loss: 0.3038, Train Acc: 0.8843, Val Acc: 0.5164
Epoch 80, Loss: 0.2491, Train Acc: 0.9242, Val Acc: 0.5061
Epoch 90, Loss: 0.2148, Train Acc: 0.9202, Val Acc: 0.5145
Epoch 100, Loss: 0.1718, Train Acc: 0.9531, Val Acc: 0.4864
Epoch 110, Loss: 0.1479, Train Acc: 0.9667, Val Acc: 0.4911
Epoch 120, Loss: 0.1388, Train Acc: 0.9629, Val Acc: 0.5061
Epoch 130, Loss: 0.1166, Train Acc: 0.9769, Val Acc: 0.5023
Epoch 140, Loss: 0.1062, Train Acc: 0.9798, Val Acc: 0.5108
Epoch 150, Loss: 0.3417, Train Acc: 0.9493, Val Acc: 0.4780
Epoch 160, Loss: 0.1039, Train Acc: 0.9641, Val Acc