In [15]:
import pickle 
import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as linalg

In [29]:
# Load the pickled training set.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
# Forward slashes keep the relative paths valid on POSIX systems as well as
# on Windows (the original backslash-separated paths only resolved on Windows).
# The `with` blocks guarantee the files are closed even if unpickling fails.
with open('data-challenge-kernel-methods-2022-2023/training_data.pkl', 'rb') as file_training_data:
    # Indexable collection of molecule graphs (later cells read data[i].nodes / data[i].edges)
    data = pickle.load(file_training_data)

with open('data-challenge-kernel-methods-2022-2023/training_labels.pkl', 'rb') as file_training_labels:
    # One label per molecule (later cells compare the values against 0 and 1)
    labels = pickle.load(file_training_labels)

# Manipulation de la donnée

In [17]:
## Lists the nodes of molecule 10 (each node stands for one atom)
print('atoms are represented by nodes = ', data[10].nodes)

## Shows the 'labels' attribute (the atom type) of node 10 of molecule 10
print('\nThe atom type is = ', (data[10].nodes[10])['labels']) 

## Lists all edges of molecule 10 (each edge stands for a chemical bond)
print('\nBonds between atoms are represented by edges between nodes', data[10].edges )

## Shows the attribute dict of edge (0, 1); its 'labels' entry encodes the bond type
## (presumably single, double or triple bond -- confirm against the dataset description)
print('\nThe bond type of bond 0-1 is', data[10].edges[0,1] ) # author's note: bond 15-16 is of type 1

## The label gives the presence or absence of a property in the molecule
print ('\nThe molecule 10 is labelled ', labels[10] ) 

atoms are represented by nodes =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]

The atom type is =  [2]

Bonds between atoms are represented by edges between nodes [(0, 1), (1, 2), (2, 3), (2, 5), (3, 4), (5, 6), (5, 7), (7, 8), (7, 13), (8, 9), (9, 10), (10, 11), (10, 12), (12, 13), (14, 15), (15, 16), (15, 17), (17, 18), (18, 19), (18, 20), (18, 24), (20, 21), (21, 22), (21, 23), (24, 25), (24, 26)]

The bond type of bond 0-1 is {'labels': [0]}

The molecule 10 is labelled  0


In [18]:
# Class balance of the training labels: convert once, then count each class
# with an element-wise comparison.
labels_array = np.array(labels)
n_label_0 = np.sum(labels_array == 0)
n_label_1 = np.sum(labels_array == 1)
print('The number of 0 in the label list is :', n_label_0)
print('The number of 1 in the label list is :', n_label_1)
print('The total number of molecule is :', len(labels))
print('\n\nThe dataset is unbalanced. ')

The number of 0 in the label list is : 5445
The number of 1 in the label list is : 555
The total number of molecule is : 6000


The dataset is unbalanced. 


# Définition du kernel

## Tensor Product (def slide 563)

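Un couple $(u, v)$ est un nœud de $G_1 \times G_2$ si et seulement si $u$ est un nœud de $G_1$, $v$ est un nœud de $G_2$ et $u$ et $v$ portent le même label.
Il y a une arête entre $(u, v)$ et $(u', v')$ si et seulement si ce sont deux nœuds de $G_1 \times G_2$, $(u, u')$ est une arête de $G_1$ et $(v, v')$ est une arête de $G_2$.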

In [19]:
"""
Graph products.
"""
from itertools import product

import networkx as nx
from networkx.utils import not_implemented_for



def _dict_product(d1, d2):
    return {k: (d1.get(k), d2.get(k)) for k in set(d1) | set(d2)}


# Generators for producing graph products
def _node_product(G, H):
    for u, v in product(G, H):
        if (G.nodes[u])['labels'][0] != (H.nodes[v])['labels'][0]: #makes sure that only nodes with the same label get paired
            continue
        yield ((u, v), _dict_product(G.nodes[u], H.nodes[v]) )


def _directed_edges_cross_edges(G, H):
    if not G.is_multigraph() and not H.is_multigraph():
        #print(G.edges(data=True), H.edges(data=True))
        for u, v, c in G.edges(data=True):
            for x, y, d in H.edges(data=True):
                #print((u, x), (v, y), )
                if (G.nodes[u])['labels'][0] == (H.nodes[x])['labels'][0] and (G.nodes[v])['labels'][0] == (H.nodes[y])['labels'][0]:  
                    yield (u, x), (v, y), _dict_product(c, d)
                #the edge is (u,v) but it can also be described as (v,u)
                if (G.nodes[u])['labels'][0] == (H.nodes[y])['labels'][0] and (G.nodes[v])['labels'][0] == (H.nodes[x])['labels'][0]:
                    yield (u, y), (v, x), _dict_product(c, d)
                
    else:
        raise NotImplemented


def _undirected_edges_cross_edges(G, H):
    if not G.is_multigraph() and not H.is_multigraph():
        for u, v, c in G.edges(data=True):
            for x, y, d in H.edges(data=True):
                yield (v, x), (u, y), _dict_product(c, d)
    if not G.is_multigraph() and H.is_multigraph():
        for u, v, c in G.edges(data=True):
            for x, y, k, d in H.edges(data=True, keys=True):
                yield (v, x), (u, y), k, _dict_product(c, d)
    if G.is_multigraph() and not H.is_multigraph():
        for u, v, k, c in G.edges(data=True, keys=True):
            for x, y, d in H.edges(data=True):
                yield (v, x), (u, y), k, _dict_product(c, d)
    if G.is_multigraph() and H.is_multigraph():
        for u, v, j, c in G.edges(data=True, keys=True):
            for x, y, k, d in H.edges(data=True, keys=True):
                yield (v, x), (u, y), (j, k), _dict_product(c, d)


def _edges_cross_nodes(G, H):
    if G.is_multigraph():
        for u, v, k, d in G.edges(data=True, keys=True):
            for x in H:
                yield (u, x), (v, x), k, d
    else:
        for u, v, d in G.edges(data=True):
            for x in H:
                if H.is_multigraph():
                    yield (u, x), (v, x), None, d
                else:
                    yield (u, x), (v, x), d


def _nodes_cross_edges(G, H):
    if H.is_multigraph():
        for x in G:
            for u, v, k, d in H.edges(data=True, keys=True):
                yield (x, u), (x, v), k, d
    else:
        for x in G:
            for u, v, d in H.edges(data=True):
                if G.is_multigraph():
                    yield (x, u), (x, v), None, d
                else:
                    yield (x, u), (x, v), d


def _edges_cross_nodes_and_nodes(G, H):
    if G.is_multigraph():
        for u, v, k, d in G.edges(data=True, keys=True):
            for x in H:
                for y in H:
                    yield (u, x), (v, y), k, d
    else:
        for u, v, d in G.edges(data=True):
            for x in H:
                for y in H:
                    if H.is_multigraph():
                        yield (u, x), (v, y), None, d
                    else:
                        yield (u, x), (v, y), d


def _init_product_graph(G, H):
    """Create the empty graph that will hold a product of ``G`` and ``H``.

    The result is a ``MultiGraph`` if either input is a multigraph and a
    ``Graph`` otherwise; it is converted to its directed counterpart when the
    inputs are directed.

    Raises
    ------
    NetworkXError
        If exactly one of ``G`` and ``H`` is directed.
    """
    directed = G.is_directed()
    if directed != H.is_directed():
        raise nx.NetworkXError("G and H must be both directed or both undirected")

    needs_multigraph = G.is_multigraph() or H.is_multigraph()
    graph_class = nx.MultiGraph if needs_multigraph else nx.Graph
    GH = graph_class()
    return GH.to_directed() if directed else GH


def tensor_product(G, H):
    r"""Returns the label-restricted tensor product of G and H.

    This is a variant of the usual tensor (categorical / direct) product
    adapted to labelled graphs:

    * the node set of the product $P$ is the set of pairs $(u, v)$ with
      $u \in V(G)$, $v \in V(H)$ such that $u$ and $v$ carry the same label
      (first entry of their ``'labels'`` node attribute);
    * $P$ has an edge between $(u, v)$ and $(x, y)$ if and only if both are
      nodes of $P$, $(u, x)$ is an edge in $G$ and $(v, y)$ is an edge in $H$.

    Edge labels are not compared when pairing edges.

    Parameters
    ----------
    G, H: graphs
     Networkx graphs whose nodes all have a ``'labels'`` attribute.
     Multigraphs are not supported.

    Returns
    -------
    P: NetworkX graph
     The label-restricted tensor product of G and H. P is directed if G and
     H are directed, and undirected if G and H are undirected.

    Raises
    ------
    NetworkXError
     If G and H are not both directed or both undirected.
    NotImplementedError
     If G or H is a multigraph.

    Notes
    -----
    Node and edge attributes in P are two-tuples of the G and H attributes.
    Missing attributes are assigned None.

    Examples
    --------
    >>> G = nx.Graph()
    >>> H = nx.Graph()
    >>> G.add_node(0, labels=[1])
    >>> H.add_node("a", labels=[1])
    >>> P = tensor_product(G, H)
    >>> list(P)
    [(0, 'a')]
    """
    GH = _init_product_graph(G, H)
    if G.is_multigraph() or H.is_multigraph():
        # Fail early with an explicit error: the edge generator used below
        # only handles simple graphs.
        raise NotImplementedError(
            "tensor_product with label matching does not support multigraphs"
        )
    GH.add_nodes_from(_node_product(G, H))
    GH.add_edges_from(_directed_edges_cross_edges(G, H))

    return GH





## N-th order kernel

In [20]:
def n_th_order_kernel(G1,G2,n=3):
    """Return the n-th order walk kernel between two labelled graphs.

    The value is the sum of all entries of ``A ** n`` (matrix power), where
    ``A`` is the adjacency matrix of ``tensor_product(G1, G2)``. This equals
    the number of length-``n`` walks in the product graph.

    Parameters
    ----------
    G1, G2 : NetworkX graphs
        Graphs accepted by ``tensor_product``.
    n : int, default 3
        Walk length; must be non-negative. ``n = 0`` counts the nodes of the
        product graph.

    Returns
    -------
    numpy.float64
        Sum of the entries of the n-th power of the adjacency matrix.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be a non-negative walk length")

    adjacency_matrix = nx.to_numpy_array(tensor_product(G1, G2))

    # sum(A^n) == 1^T A^n 1, so n matrix-vector products are enough; forming
    # the full N x N matrix power is unnecessary.
    walk_counts = np.ones(adjacency_matrix.shape[0])
    for _ in range(n):
        walk_counts = adjacency_matrix @ walk_counts
    return np.sum(walk_counts)

On applique désormais différentes techniques de machine learning à l'aide de ce kernel.

## SVM

In [21]:
##On réutilise ici une implémentation du SVM du devoir 2.
from svm import KernelSVC

In [22]:
from time import time 

In [23]:
def kernel_n_order(X,Y,same = True):
    """Build the matrix of ``n_th_order_kernel`` values between two graph collections.

    Entry ``[i, j]`` of the returned array is
    ``n_th_order_kernel(Y[j], X[i])`` with the default walk length, so the
    rows follow ``X`` and the columns follow ``Y``.

    The ``same`` argument is accepted but not used.
    """
    gram_rows = []
    for graph_x in X:
        gram_rows.append([n_th_order_kernel(graph_y, graph_x) for graph_y in Y])
    return np.array(gram_rows)

In [24]:
# Timing harness for the map/lambda formulation of the kernel matrix.
# NOTE: the benchmarked call below is commented out, so as written this cell
# only times an empty span (hence the recorded output of 0.0 s).
tic =time()
#np.array(list(map(lambda G2: list(map(lambda G1: n_th_order_kernel(G1, G2), data[:30])), data[:30])))
print(time()-tic, 's')
# Author's recorded timings (presumably for the first 20 / 30 molecules):
# 20 in 5.8 s
# 30 in 14 s

0.0 s


In [25]:
# Timing harness for kernel_n_order.
# NOTE: the line below is a comment (and not a valid call as written), so
# this cell only times an empty span (hence the recorded output of 0.0 s).
tic =time()
#kernel_n_order(X,Y,same = True):
print(time()-tic, 's')
# Author's recorded timings (presumably for the first 20 / 30 molecules):
# 20 in 6 s
# 30 in 14 s

0.0 s


In [None]:
# Hyper-parameter forwarded to KernelSVC as `C`
# (presumably the SVM regularisation constant -- confirm in svm.KernelSVC).
C=100.
# The SVM uses the walk kernel on graphs (n_th_order_kernel with its default n).
kernel = kernel_n_order
model = KernelSVC(C=C, kernel=kernel)


# Fit on every molecule except the first 30
# (presumably held out for a quick check -- TODO confirm).
# NOTE(review): if fit() evaluates the kernel on all pairs of training graphs,
# this is very expensive: the author's notes report ~14 s for a 30 x 30 kernel
# matrix. Consider a smaller subset or caching the kernel matrix.
model.fit(data[30:], labels[30:])

In [None]:
import pickle

# Serialize the model object to disk; the file is opened in binary write mode
# and closed automatically when the block exits.
with open('model_10_04.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)