In [1]:
%cd C:\Users\lukec\PycharmProjects\emissions-tracking-conda

C:\Users\lukec\PycharmProjects\emissions-tracking-conda


In [108]:
# %%writefile functions/graph_utilities.py

import itertools
import numpy as np
import pandas as pd
from more_itertools import locate
import torch
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.utils import to_networkx
from torch_geometric.data import Data
from sklearn.preprocessing import OrdinalEncoder

def concat_lists(lists):
    """List concatenation utility function"""
    return list(itertools.chain.from_iterable(lists))

def pd_to_adj_matrix(df:pd.DataFrame, columns:list, weights=False):
    """Form adjacency matrix from pandas dataframe columns"""
    adj, edgeWeights = np.zeros([2,0]), []
    for i, column in enumerate(columns):
        edgeDest = [list(locate(df[column], lambda x: x==i)) for i in df[column]]
        edgeOrig = concat_lists([list(np.ones(len(x)).astype(int)*i) for i, x in enumerate(edgeDest)])
        edgeDest = concat_lists(edgeDest)
        nonSelf = np.where([val!=edgeDest[i] for i, val in enumerate(edgeOrig)])[0]
        adj = np.concatenate((adj,np.array([np.take(edgeOrig,nonSelf), np.take(edgeDest,nonSelf)])),axis=-1)
        if weights: edgeWeights = edgeWeights + [weights[i]]*len(edgeDest)
    return adj.astype(int), edgeWeights

def encode_string_cols(df):
    """Encodes columns of a pandas dataframe with string dtype as integer classes"""
    ord_enc = OrdinalEncoder()
    columns = df.columns[[i in [object, str] for i in df.dtypes]]
    for col in columns:
        df[col] = ord_enc.fit_transform(df[col].values.reshape(-1,1))
    return df

def build(df, edges:list, y_col=False, bins=False):
    """This function builds a graph given a Pandas DataFrame"""
    adjacency, _ = pd_to_adj_matrix(df, edges)
    edge_index = torch.tensor(adjacency, dtype=torch.long)
    data_x = df.drop(y_col, axis=1) if y_col else df
    x = torch.tensor(encode_string_cols(data_x).values, dtype=torch.float)

    if y_col:
        if bins:
            y = torch.tensor(np.digitize(df[y_col],
                             bins= np.linspace(min(df[y_col].dropna()),max(df[y_col].dropna()),bins))-1,
                             dtype= torch.long)
        else: y = torch.tensor(df[y_col], dtype=torch.float)
        return Data(x=x, edge_index=edge_index, y=y)

    else: return Data(x=x, edge_index=edge_index)

def draw_pyg_graph(data, node_size=50, width=0.1, cmap=plt.cm.coolwarm):
    graph = to_networkx(data, to_undirected=True)
    pos = nx.kamada_kawai_layout(graph)
    fig, ax = plt.subplots(1,1,figsize=[12,12])
    nx.draw_networkx(graph,pos, with_labels=False,node_size=node_size,width=width, node_color=data.y, cmap=cmap)

In [109]:
# %%writefile functions/node_classification

import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

class GCN(torch.nn.Module):
    """Graph Convolutional Network model"""
    def __init__(self, dataset, hidden_channels):
        super().__init__()
        torch.manual_seed(123)
        self.conv1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, len(dataset.y.unique()))

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x

def train(data, model, optimizer, criterion = torch.nn.CrossEntropyLoss()):
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data.x, data.edge_index)  # Perform a single forward pass.
      loss = criterion(out[data.train_mask], data.y[data.train_mask])  # Compute the loss solely based on the training nodes.
      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def test(data, model):
      model.eval()
      out = model(data.x, data.edge_index)
      pred = out.argmax(dim=1)  # Use the class with highest probability.
      test_correct = pred[data.test_mask] == data.y[data.test_mask]  # Check against ground-truth labels.
      test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  # Derive ratio of correct predictions.
      return test_acc

def run(data, model, optimizer, verbose=True):
    for epoch in range(1, 101):
        loss = train(data, model, optimizer)
        if verbose: print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

    test_acc = test(graph, model)
    print(f'Test Accuracy: {test_acc:.4f}')

def visualize(h, color):
    """Define TSNE visualisation"""
    z = TSNE(n_components=2).fit_transform(h.detach().cpu().numpy())

    plt.figure(figsize=(10,10))
    plt.xticks([])
    plt.yticks([])

    plt.scatter(z[:, 0], z[:, 1], s=70, c=color, cmap="Set2")
    plt.show()

In [118]:
# %%writefile scripts/graph_run.py

import pandas as pd
from torch_geometric.transforms import RandomNodeSplit

# Build graph
data = pd.read_csv("C:/Users\lukec\OneDrive - University of Cambridge\PhD\Data\Aggregate/US_2019.csv", low_memory=False).dropna(subset='CO2e')
graph = build(data, ['PRODUCT','SITE'], y_col='CO2e', bins=6)

#draw_pyg_graph(graph)

In [119]:
# Run classification
"""Train & evaluate GCN model"""
RandomNodeSplit('train_rest', num_val=50, num_test=50)(graph)
GCN_model = GCN(graph, hidden_channels=30)
optimizer = torch.optim.Adam(GCN_model.parameters(), lr=0.01, weight_decay=5e-4)
run(graph, GCN_model, optimizer)

IndexError: Target 5 is out of bounds.

In [93]:
model.eval()

out = model(data.x, data.edge_index)
visualize(out, color=data.y)

tensor([308,  12,   6,   0,   3])

In [120]:
graph.y

tensor([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,