# Genre Classification
## From Graph+AI World 2020 Conference
This notebook builds the genre-classification Graph Convolutional Network (GCN) presented in the Graph+AI World 2020 session *From Dataframes to Graph: Data Science with pyTigerGraph* by Parker Erickson.

In [1]:
import pandas as pd 
import pyTigerGraph as tg
import networkx as nx
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

Using backend: pytorch


# Create TigerGraph Cloud Connection

In [2]:
# Open a connection to the hosted "musicGraph" graph; host URL, version and graph name are hard-coded here.
conn = tg.TigerGraphConnection("https://musicgraph.i.tgcloud.io", version="3.0.5", graphname="musicGraph")
# Create a secret, exchange it for a token, and store the result on the connection.
# NOTE(review): presumably each run creates a fresh secret on the server — confirm, and consider reusing one.
conn.apiToken = conn.getToken(conn.createSecret())

Downloading gsql client Jar
Downloading SSL Certificate


# Install Queries

In [3]:
# Create and install the GSQL query `mainGenres`: starting from all Genre vertices, it follows
# parentGenre edges out of genres whose Title is empty and prints the genres reached.
# The printed "genres" set is read back further down to define the classification targets.
print(conn.gsql('''
                CREATE QUERY mainGenres() FOR GRAPH musicGraph {
                    types = {Genre.*};
                    genres = SELECT t FROM types:g-(parentGenre:e)->Genre:t WHERE g.Title == "";
                    PRINT genres;
                }
                
                INSTALL QUERY mainGenres'''))

Trying version: v3_0_5
Connecting to musicgraph.i.tgcloud.io:14240
If there is any relative path, it is relative to <System.AppRoot>/dev/gdk/gsql
The query mainGenres has been added!
Start installing queries, about 1 minute ...
mainGenres query: curl -X GET 'https://127.0.0.1:9000/query/musicGraph/mainGenres'. Add -H "Authorization: Bearer TOKEN" if authentication is enabled.




In [4]:
# Create and install the GSQL query `artistGenre`: it selects the same genre set as `mainGenres`,
# then walks Artist - Track - genre paths and writes the genre into each artist's @genre accumulator.
# Two result sets are printed, in order: "genres" and then "artistGenres".
print(conn.gsql('''
                CREATE QUERY artistGenre(/* Parameters here */) FOR GRAPH musicGraph SYNTAX v2{ 
                    MaxAccum<VERTEX<Genre>> @genre;
                    types = {Genre.*};
                    genres = SELECT t FROM types:g-(parentGenre>:e)-Genre:t WHERE g.Title == "";
                    PRINT genres;
                    artistGenres = SELECT a FROM Artist:a-(:e)-Track:t-(:e2)-genres:g ACCUM a.@genre = g;
                    PRINT artistGenres;
                }

                INSTALL QUERY artistGenre'''))

Trying version: v3_0_5
Connecting to musicgraph.i.tgcloud.io:14240
If there is any relative path, it is relative to <System.AppRoot>/dev/gdk/gsql
The query artistGenre has been added!
Start installing queries, about 1 minute ...
artistGenre query: curl -X GET 'https://127.0.0.1:9000/query/musicGraph/artistGenre'. Add -H "Authorization: Bearer TOKEN" if authentication is enabled.




In [5]:
# Create and install the GSQL query `artistLinks`: for every pair of distinct artists joined by an
# Artist - Track - Album - Track - Artist path, it adds a (src, dest) tuple to the @@edges set
# accumulator and prints that set. The two `//` lines inside the query are a disabled alternative pattern.
print(conn.gsql('''
                CREATE QUERY artistLinks(/* Parameters here */) FOR GRAPH musicGraph SYNTAX v2{ 
                    TYPEDEF TUPLE <VERTEX src, VERTEX dest> TUPLE_RECORD;
                    SetAccum<TUPLE_RECORD> @@edges;
                    //result = SELECT tgt FROM Artist:s-(:e1)-:mid-(:e2)-Artist:tgt WHERE s != tgt
                    //        ACCUM @@edges += TUPLE_RECORD(s, tgt);
                    res = SELECT tgt FROM Artist:s-(:e1)-Track:mid1-(:e2)-Album:mid2-(:e3)-Track:mid3-(:e4)-Artist:tgt WHERE s!= tgt
                            ACCUM @@edges += TUPLE_RECORD(s, tgt);
                    PRINT @@edges;
                }
                
                INSTALL QUERY artistLinks'''))

Trying version: v3_0_5
Connecting to musicgraph.i.tgcloud.io:14240
If there is any relative path, it is relative to <System.AppRoot>/dev/gdk/gsql
The query artistLinks has been added!
Start installing queries, about 1 minute ...
artistLinks query: curl -X GET 'https://127.0.0.1:9000/query/musicGraph/artistLinks'. Add -H "Authorization: Bearer TOKEN" if authentication is enabled.




# Get Edges Between Artists

In [4]:
# Upper bound on how many artist-artist edges are kept for the DGL graph.
EDGE_SAMPLE_SIZE = 5000

# Every (src, dest) artist pair returned by the installed artistLinks query.
all_edges = [(x["src"], x["dest"]) for x in conn.runInstalledQuery("artistLinks")[0]["@@edges"]]

# Subsample WITHOUT replacement: random.choices() draws with replacement, which
# yields duplicate edges that the undirected graph later collapses, so fewer
# distinct edges than requested end up in the model. The min() guard keeps the
# cell working when the query returns fewer edges than EDGE_SAMPLE_SIZE.
edges = random.sample(all_edges, k=min(EDGE_SAMPLE_SIZE, len(all_edges)))

In [5]:
edges[:5]

[('5102', '2209'),
 ('24128', '24120'),
 ('14774', '8429'),
 ('7616', '9272'),
 ('10312', '14234')]

# Create DGL Graph
Convert the list of edges into a graph that DGL, the machine learning library we are using, can interpret.

In [6]:
# Ids ("v_id" column) of all Artist vertices; a vertex's position in this list becomes its node index below.
allNodes = list(conn.getVertexDataframe("Artist")["v_id"])

In [7]:
artistToNum = {}  # translation dictionary: artist vertex id -> node number (for dgl)
numToArtist = {}  # translation dictionary: node number -> artist vertex id

numericalNodes = []  # node numbers 0..len(allNodes)-1, in the same order as allNodes

for i, artist_id in enumerate(allNodes):
    artistToNum[artist_id] = i
    # The reverse mapping belongs in numToArtist. Writing it into artistToNum
    # (as before) left numToArtist empty and mixed integer keys into the
    # forward dictionary.
    numToArtist[i] = artist_id
    numericalNodes.append(i)

def createEdgeList(result):
    """Translate one (source id, destination id) pair into a tuple of node numbers.

    Both endpoints are looked up in the module-level ``artistToNum`` mapping;
    an id missing from that mapping raises ``KeyError``.
    """
    source_id, target_id = result[0], result[1]
    return (artistToNum[source_id], artistToNum[target_id])

# Replace the id-based edge list with its node-number equivalent.
# NOTE: this overwrites `edges` in place, so the cell is not safe to re-run without re-running the sampling cell first.
edges = [createEdgeList(thing) for thing in edges]
# Quick sanity check: edge count and a small sample of the converted pairs.
print("Number of Edges: ", len(edges))
print(edges[:5])

Number of Edges:  5000
[(11619, 15542), (10162, 14818), (8903, 532), (2601, 1081), (1306, 14523)]


In [8]:
# Build an undirected networkx graph: every artist becomes a node (even those without
# a sampled edge), then the sampled artist-artist edges are added.
g = nx.Graph()
g.add_nodes_from(numericalNodes)
g.add_edges_from(edges)

# NOTE(review): constructing DGLGraph directly from a networkx graph is tied to the DGL version
# this notebook was written against — confirm before upgrading DGL.
G = dgl.DGLGraph(g) # Convert networkx graph to a graph that DGL can work on

In [9]:
G.number_of_nodes()

16341

# Add Features to Vertices
Currently, we just one-hot encode the vertices in the graph to give them unique features.

In [10]:
# One-hot node features: an identity matrix with one row and one column per node.
# NOTE(review): this is a dense N x N tensor; with the ~16k nodes reported above that is
# presumably on the order of 1 GB in float32 — confirm it fits in memory before scaling up.
G.ndata["feat"] = torch.eye(G.number_of_nodes())
# Show the feature row of node 2 as a spot check.
print(G.nodes[2].data['feat'])

tensor([[0., 0., 1.,  ..., 0., 0., 0.]])


# Define Hyperparameters and Build GCN

In [11]:
# Training hyperparameters, read by the optimizer setup and the training loop below.
numEpochs = 100  # number of full passes of the training loop
learningRate = 0.01  # learning rate handed to the Adam optimizer

In [12]:
# Define the message and reduce function
# NOTE: We ignore the GCN's normalization constant c_ij for this tutorial.
def gcn_message(edges):
    """Message function: forward each edge's source-node feature.

    ``edges`` is a batch of edges. The value stored under ``'h'`` on the
    source side is returned unchanged under the key ``'msg'``.
    """
    source_features = edges.src['h']
    return {'msg': source_features}

def gcn_reduce(nodes):
    """Reduce function: sum the messages waiting in each node's mailbox.

    ``nodes`` is a batch of nodes. The ``'msg'`` mailbox tensor is summed
    along dimension 1 and the result is returned as the new ``'h'`` feature.
    """
    inbox = nodes.mailbox['msg']
    return {'h': inbox.sum(dim=1)}

# Define the GCNLayer module
class GCNLayer(nn.Module):
    """A single graph-convolution layer: aggregate over edges, then apply a linear map."""

    def __init__(self, in_feats, out_feats):
        super().__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g, inputs):
        """Run one round of message passing on graph ``g`` starting from ``inputs``.

        The inputs are stored as node feature 'h', messages are sent along all
        edges and received at all nodes, and the resulting 'h' is removed from
        the graph again before being passed through the linear layer.
        """
        # Stage the input features on the graph under 'h'.
        g.ndata['h'] = inputs
        # Send messages along every edge ...
        edge_ids = g.edges()
        g.send(edge_ids, gcn_message)
        # ... and aggregate them at every node.
        node_ids = g.nodes()
        g.recv(node_ids, gcn_reduce)
        # Take the aggregated features back off the graph so no state lingers.
        aggregated = g.ndata.pop('h')
        return self.linear(aggregated)

# Define a 2-layer GCN model
class GCN(nn.Module):
    """Two stacked GCNLayer modules with a ReLU between them."""

    def __init__(self, in_feats, hidden_size, num_classes):
        super().__init__()
        # in_feats -> hidden_size -> num_classes
        self.gcn1 = GCNLayer(in_feats, hidden_size)
        self.gcn2 = GCNLayer(hidden_size, num_classes)

    def forward(self, g, inputs):
        """Return the second layer's output for graph ``g`` and node features ``inputs``."""
        hidden = torch.relu(self.gcn1(g, inputs))
        return self.gcn2(g, hidden)

# Create Labels for Vertices

In [13]:
# Ids of the main genres, converted to int, taken from the "genres" result of the installed mainGenres query.
# The order of this list fixes which class index each genre gets during training.
genres = [int(x["v_id"]) for x in conn.runInstalledQuery("mainGenres")[0]["genres"]]

In [14]:
genres

[14, 2, 21, 8, 4, 1235, 15, 20, 12, 38, 5, 10, 13, 9, 3, 17]

In [15]:
# Group artist vertex ids by the genre id held in their @genre accumulator:
# {genre id (int): [artist v_id, ...]}.
artistGenres = {}

# The artistGenre query prints two result sets; the second one holds the artists.
for artist in conn.runInstalledQuery("artistGenre")[1]["artistGenres"]:
    # Ignore artists that were not loaded as graph nodes.
    if artist["v_id"] not in artistToNum:
        continue
    genre_id = int(artist["attributes"]["@genre"])
    # setdefault replaces the former bare `except:` used as control flow, which
    # would also have hidden unrelated errors raised inside the try body.
    artistGenres.setdefault(genre_id, []).append(artist["v_id"])

In [16]:
# Model: input width equals the number of nodes (one-hot features), 128 hidden units,
# and one output class per main genre.
net = GCN(G.number_of_nodes(), 128, len(genres)) #Two layer GCN
inputs = G.ndata["feat"]
# Semi-supervised setup: exactly one artist per genre is labelled — the last artist recorded
# for that genre — and it is referenced by its node number. Raises KeyError if a genre has no artist.
labeled_nodes = torch.tensor([artistToNum[artistGenres[x][-1]] for x in genres])  # one labelled node per genre; all other nodes stay unlabelled
labels = torch.tensor([x for x in range(0, len(genres))])  # class i corresponds to genres[i]
optimizer = torch.optim.Adam(net.parameters(), lr=learningRate)

In [17]:
labeled_nodes

tensor([3416,  427,  448,  444,  291,  440,  469,  183,  470,  478,  382,  461,
         174,  305,  323,  468])

In [18]:
labels

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

# Train GCN

In [19]:
# Full-graph training: every epoch runs the network over all nodes, but the loss
# is computed only on the labelled nodes.
all_logits = []
for epoch in range(numEpochs):
    logits = net(G, inputs)
    # we save the logits for visualization later
    # (detached, so the stored tensors do not take part in autograd)
    all_logits.append(logits.detach())
    # log-probabilities over the classes (dimension 1)
    logp = F.log_softmax(logits, 1)
    # we only compute loss for labeled nodes
    loss = F.nll_loss(logp[labeled_nodes], labels)
    # standard step: clear old gradients, backpropagate, update the parameters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print('Epoch %d | Loss: %6.3e' % (epoch, loss.item()))

Epoch 0 | Loss: 2.773e+00
Epoch 1 | Loss: 2.636e+00
Epoch 2 | Loss: 2.553e+00
Epoch 3 | Loss: 2.516e+00
Epoch 4 | Loss: 2.476e+00
Epoch 5 | Loss: 2.438e+00
Epoch 6 | Loss: 2.409e+00
Epoch 7 | Loss: 2.388e+00
Epoch 8 | Loss: 2.371e+00
Epoch 9 | Loss: 2.356e+00
Epoch 10 | Loss: 2.340e+00
Epoch 11 | Loss: 2.324e+00
Epoch 12 | Loss: 2.308e+00
Epoch 13 | Loss: 2.292e+00
Epoch 14 | Loss: 2.277e+00
Epoch 15 | Loss: 2.263e+00
Epoch 16 | Loss: 2.251e+00
Epoch 17 | Loss: 2.240e+00
Epoch 18 | Loss: 2.232e+00
Epoch 19 | Loss: 2.226e+00
Epoch 20 | Loss: 2.221e+00
Epoch 21 | Loss: 2.216e+00
Epoch 22 | Loss: 2.213e+00
Epoch 23 | Loss: 2.210e+00
Epoch 24 | Loss: 2.207e+00
Epoch 25 | Loss: 2.204e+00
Epoch 26 | Loss: 2.201e+00
Epoch 27 | Loss: 2.199e+00
Epoch 28 | Loss: 2.197e+00
Epoch 29 | Loss: 2.194e+00
Epoch 30 | Loss: 2.192e+00
Epoch 31 | Loss: 2.190e+00
Epoch 32 | Loss: 2.188e+00
Epoch 33 | Loss: 2.186e+00
Epoch 34 | Loss: 2.184e+00
Epoch 35 | Loss: 2.182e+00
Epoch 36 | Loss: 2.181e+00
Epoch 37 | 