## QUESTION 23: Idea 1
Use Graph Convolutional Networks. What hyperparameters do you choose to get the optimal performance? How many layers did you choose?

In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv

# Load the dataset (Cora)
dataset = Planetoid(root='/tmp/Cora', name='Cora')

# GCN Model Definition
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        # self.conv3 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        # First GCN layer
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        # Second GCN layer
        x = self.conv2(x, edge_index)
        # x = F.relu(x)
        # x = F.dropout(x, training=self.training)
        # Third  GCN Layer
        # x = self.conv3(x, edge_index)
        return x

# Initialize model
model = GCN(dataset.num_node_features, 16, dataset.num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Move data to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
data = dataset[0].to(device)

# Training the model
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

# Evaluating the model
model.eval()
preds = model(data.x, data.edge_index).argmax(dim=1)
correct = (preds[data.test_mask] == data.y[data.test_mask]).sum()
accuracy = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {accuracy:.4f}')

  _torch_pytree._register_pytree_node(


Accuracy: 0.7960


The choice of hyperparameters we choose was 16 for the hidden_channels and used 2 layer GCM Model

## QUESTION 24: Idea 2
Extract structure-based node features using Node2Vec. Briefly describe how Node2Vec finds node features. Choose your desired classifier (one of SVM, Neural Networks, or Random Forest) and classify the documents using only Node2Vec (graph strcuture) features. Now classify the documents using only the 1433-dimensional text features. Which one outperforms? Why do you think this is the case? Combine the Node2Vec and text features and train your classifier on the combined features. What is the ebst classification accuracy you get (in terms of the percentage of test documents correctly classified)?

In [13]:
import networkx as nx
from node2vec import Node2Vec
from torch_geometric.datasets import Planetoid
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import torch

In [14]:
# Load the Cora dataset
dataset = Planetoid(root='/tmp/Cora', name='Cora')
data = dataset[0]

# Convert PyG graph to NetworkX graph
G = nx.Graph()
G.add_nodes_from(range(data.num_nodes))
edge_list = data.edge_index.t().tolist()
G.add_edges_from(edge_list)

# Node2Vec model setup
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)

# Train Node2Vec model
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# Get embeddings for all nodes
embeddings = np.array([model.wv[str(i)] for i in range(data.num_nodes)])

# Labels for nodes
labels = data.y.numpy()

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.1, random_state=42)

# Support Vector Machine classifier
svm = SVC(kernel='linear')

# Train the SVM
svm.fit(X_train, y_train)

# Predict the labels on test dataset
predictions = svm.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.4f}')

Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 50/50 [00:07<00:00,  6.75it/s]
Generating walks (CPU: 2): 100%|██████████| 50/50 [00:07<00:00,  6.77it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:07<00:00,  6.83it/s]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:07<00:00,  6.79it/s]


Accuracy: 0.8524


In [15]:
features = data.x
labels = data.y.numpy()


# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Convert torch tensor to numpy for sklearn compatibility
X_train = X_train.numpy()
X_test = X_test.numpy()

# Initialize the SVM classifier
svm = SVC(kernel='linear', C=1.0, random_state=42)

# Train the SVM
svm.fit(X_train, y_train)

# Predict the labels on test dataset
predictions = svm.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy of SVM using only text features: {accuracy:.4f}')

Accuracy of SVM using only text features: 0.7196


node2vec outperforms compared to using only text features as node2vec is using sturctural information rather than the textual information

In [16]:
combined_features = np.concatenate((embeddings, features.numpy()), axis=1)
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.2, random_state=42)

# Initialize the SVM classifier
svm = SVC(kernel='linear', C=1.0, random_state=42)

# Train the SVM
svm.fit(X_train, y_train)

# Predict the labels on test dataset
predictions = svm.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'Combined Features Accuracy: {accuracy:.4f}')

Combined Features Accuracy: 0.8469


The combined feature is the one that performs the best with 84.5% accuracy.

## QUESTION 25: Idea 3
We can find the personalized PageRank of each document in seven different runs, one per class. 
In each run, select one of the classes and take the 20 seed documents of that class. Then,
perform a random walk with the following customized properties: (a) teleportation takes t e
random walker to one of the seed documents of that class (with a uniform probability of 1/20
per seed document). Vary the teleportation probability in {0, 0.1, 0.2}. (b) the probabilit  of
transitioning to neighbors is not uniform among the neighbors. Rather, it is proportional t  the
cosine similarity between the text features of the current node and the next neighboring    
Repeat part b for every teleportation probability in part a.
Run the PageRank only on the GCC. for each seed node, do 1000 random walks. Maintai 
a class-wise visited frequency count for every unlabeled node. The predicted class for that
unlabeled node is the class which lead to maximum visits to that node. Report accuracy  nd
f1 sco  ass A.x2, x3.

In [6]:
import torch
import numpy as np

# Assuming data.x contains the feature vectors for each document/node
# and data.edge_index contains the graph structure
feature_vectors = data.x
graph_edges = data.edge_index

# Helper function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    cos_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    return cos_sim

# Perform the random walk for a given set of seed nodes
def random_walk(seed_nodes, graph_edges, feature_vectors, teleportation_prob):
    visited_counts = np.zeros(feature_vectors.shape[0], dtype=int)
    for seed_node in seed_nodes:
        current_node = seed_node
        for _ in range(1000):  # Perform 1000 random walks
            if np.random.rand() < teleportation_prob:
                current_node = np.random.choice(seed_nodes)  # Teleport to a random seed node
            else:
                neighbors = graph_edges[1][graph_edges[0] == current_node]
                if len(neighbors) > 0:
                    # Calculate transition probabilities based on cosine similarity
                    transition_probs = np.exp([cosine_similarity(feature_vectors[current_node], feature_vectors[neighbor]) for neighbor in neighbors])
                    # Normalize probabilities
                    transition_probs /= np.sum(transition_probs)
                    # Choose the next node based on transition probabilities
                    current_node = np.random.choice(neighbors, p=transition_probs)
            visited_counts[current_node] += 1  # Increment visit count for the current node
    return visited_counts

# Prepare the data and labels
train_nodes = data.train_mask == True
labels = np.unique(data.y.numpy())

# Container for visit counts of each class
all_class_visits = []

# Vary the teleportation probability
teleportation_probs = [0, 0.1, 0.2]

for label in labels:
    # Filter nodes that are both training nodes and have the current label
    selected_nodes_mask = train_nodes & (data.y == label)
    selected_node_indices = selected_nodes_mask.nonzero(as_tuple=True)[0]
    
    # Select 20 seed nodes for the label
    if len(selected_node_indices) >= 20:
        seed_nodes = np.random.choice(selected_node_indices.cpu().numpy(), size=20, replace=False)
    else:
        # If there aren't enough nodes, take all of them as seed nodes
        seed_nodes = selected_node_indices.cpu().numpy()
    
    # For each teleportation probability
    for tp in teleportation_probs:
        # Perform the random walk
        visits = random_walk(seed_nodes, graph_edges, feature_vectors, tp)
        all_class_visits.append((label, tp, visits))

In [7]:
def predictions(tele_prob, all_class_visits):
    data = []
    for list in all_class_visits:
        if (list[1] == tele_prob):
            data.append((list[0], list[2]))

    length_of_array = len(data[0][1])
    max_indices = []
    for i in range(length_of_array):
        max_value = -float('inf')
        candidates = []
        for index, array in data:
            if array[i] > max_value:
                max_value = array[i]
                candidates = [index]
            elif array[i] == max_value:
                candidates.append(index)
        if candidates:
            max_indices.append(random.choice(candidates))
    return max_indices

In [12]:
from sklearn.metrics import accuracy_score, f1_score
import random
for prob in ([0, 0.1, 0.2]):
    pred = predictions(prob, all_class_visits)
    accuracy = accuracy_score(pred, data.y.numpy())
    f1 = f1_score(pred, data.y.numpy(), average='micro')
    print(f"For teleportation probability: {prob}\n the accuracy is {accuracy:.2f} and the f1 score is {f1:.2f}")

For teleportation probability: 0
 the accuracy is 0.28 and the f1 score is 0.28
For teleportation probability: 0.1
 the accuracy is 0.68 and the f1 score is 0.68
For teleportation probability: 0.2
 the accuracy is 0.66 and the f1 score is 0.66
