#### Libraries

In [None]:
import pickle as pkl
import pandas as pd
import networkx as nx
import numpy as np
from tqdm import tqdm
from itertools import chain
from scipy import sparse
import scipy.sparse as ss

#### Helper Functions

#### Main Loop

In [None]:
with open('Data/test_data.pkl', 'rb') as file:
    test_graphs = pkl.load(file)

with open('Data/training_data.pkl', 'rb') as file:
    train_graphs = pkl.load(file)
    
def translateGraphs(GRAPHS):
    graphs = []
    traduction={0: 50, 1: 51, 2:52, 3:53}
    for i in tqdm(range(len(GRAPHS))):
        H = GRAPHS[i].copy()
        oldlbls = nx.get_edge_attributes(H, 'labels')
        newlbls = {e: [traduction[l[0]]] for e, l in oldlbls.items()}
        #set edge labels
        nx.set_edge_attributes(H, newlbls, name='labels')
        #overwrite
        graphs.append(H)
    return graphs 

test_graphs = translateGraphs(test_graphs)

In [None]:
from lib import getnodelblarr, getedgelabelarr, hashtodic, assignewlabels_node, assignewlabels_edge

def WL(graphs, h=4):
    #Number of hops
    N = len(graphs)# N = len(train_graphs)//1
    
    hashedgraphs = graphs#train_graphs.copy()
    hashedgraphs = hashedgraphs[0:N]
    Alphabet = {}#{f'{i}': i for i in range(49)} #{}
    currentmax = 53

    #Alphabet instances for each graph over all hops
    #Initialise lv - our list of node-labels
    #       and ev - our list of edge-labels
    lv = [[] for _ in range(len(hashedgraphs))]
    el = [[] for _ in range(len(hashedgraphs))]
    for i in range(len(hashedgraphs)):
        #get node labels, edgelabels
        v = nx.get_node_attributes(hashedgraphs[i], 'labels').values()
        e = nx.get_edge_attributes(hashedgraphs[i], 'labels').values()
        #extract from list-of-list
        v = list(chain.from_iterable(v))
        e = list(chain.from_iterable(e))
        #add []+a
        lv[i] = lv[i] + v
        el[i] = el[i] + e

    print(f'Hops = {h}')
    #For each hop
    for _ in range(h):
        #For each graph
        for i in tqdm(range(len(hashedgraphs))):
            #graph to be worked with
            Gi = hashedgraphs[i].copy()
            #simplify: get biggest connected subgraph
            Gi = Gi.subgraph(sorted(nx.connected_components(Gi), key=len, reverse=True)[0])
            
            #For each node
            #Array with columns: node, label, newlabel
            lblarri_v = getnodelblarr(Gi)
            lblarri_e = getedgelabelarr(Gi)
            #Update big alphabet, hash
            Alphabet, lblarri_v, currentmax = hashtodic(Alphabet, lblarri_v, currentmax, node=True)
            #sort and reset index
            lblarri_v = lblarri_v.sort_values(by='node').reset_index(drop=True)  
            #relabel to a different graph
            # print(lblarri_v, lblarri_e, sep='\n\n')
            Gi = assignewlabels_node(Gi, lblarri_v)

            if len(lblarri_e)!=0:
                Alphabet, lblarri_e, currentmax = hashtodic(Alphabet, lblarri_e, currentmax, node=False)
                lblarri_e = lblarri_e.sort_values(by='edge').reset_index(drop=True)
                Gi = assignewlabels_edge(Gi, lblarri_e)
                e = nx.get_edge_attributes(hashedgraphs[i], 'labels').values()
                e = list(chain.from_iterable(e))
                el[i] = el[i] + e

            #assign graph-value
            hashedgraphs[i] = Gi
            #add counts of labels, reuse previous variable
            v = nx.get_node_attributes(hashedgraphs[i], 'labels').values()
            v = list(chain.from_iterable(v))
            lv[i] = lv[i] + v
            
    return hashedgraphs, Alphabet, lv, el

In [None]:
hashedgraphs, Alphabet, lv, ev = WL(train_graphs + test_graphs, h=10)

#### Generate feature vectors

In [None]:
def generateFeatureVectors(hashedgraphs, Alphabet, l):
    '''Creating matrix of feature vectors for each graph'''
    M = np.zeros((len(hashedgraphs), len(Alphabet)+100))
    for i in tqdm(range(len(hashedgraphs))):
        a = pd.value_counts(l[i])
        M[i, list(a.index)] = a.values

    sM = sparse.csr_matrix(M)
    #And save so this only needs to be saved once.
    return sM

sM = generateFeatureVectors(hashedgraphs, Alphabet, lv)

Save Data for easier reloading later in sparse format

In [None]:
from lib import save_sparse_csr, load_sparse_csr
save_sparse_csr('Data/WL_allfeatures_e.npz', sM)
save_sparse_csr('Data/WL_trainfeatvec_e.npz', sM[0:len(train_graphs)])
save_sparse_csr('Data/WL_testfeatvec_e.npz', sM[len(train_graphs):len(train_graphs+test_graphs)])

#### Create kernel matrix

In [None]:
def kernelFromFeatureVectors(filename, sparse=True):
    sM = load_sparse_csr(filename) if sparse else np.loadtxt(filename) #'Data/WLfeaturevectors.npz'
    print(f'sM.shape = {sM.shape}')
    M = ss.csr_matrix.toarray(sM)
    K = np.dot(M, M.T)
    sK = ss.csr_matrix(K)
    return sK, M

sK, M = kernelFromFeatureVectors('Data/WL_allfeatures_e.npz', sparse=True)   
print('Kernel Matrix Determined')
save_sparse_csr('Data/WLKernel_traintest_e', sK)

# Compare how much sparse arrays save space
# from sys import getsizeof
# M = ss.csr_matrix.toarray(sM)
# print(M.shape, getsizeof(M), getsizeof(sM), sep='\n')


#### Classify

Load data

In [None]:
import pickle as pkl
import numpy as np
from sklearn.model_selection import train_test_split
with open('Data/training_labels.pkl', 'rb') as file:
    labels = pkl.load(file)
from lib import load_sparse_csr
# WLData = load_sparse_csr('Data/WLKernel_traintest_e.npz')
# WLData = load_sparse_csr('Data\WL\WLKernel_traintest_e_h5.npz')
# WLData = ss.csr_matrix.toarray(WLData)

Center dataset

In [None]:
# N = len(WLData)
# K = WLData
# U = (1/N) * np.ones((N,N))
# I = np.eye(N)
# Kc = (I-U) @ K @ (I-U)
# np.savetxt('Data/Kc8000.txt', Kc)
Kc = np.loadtxt('Data/Kc8000.txt')

Project entire dataset (train-validate-test) to PCA

In [None]:
Kc.shape

In [None]:
# import numpy as np
# evals, evecs = np.linalg.eigh(Kc)
# numpca = 8000
# WLDataNew = Kc.dot(evecs[0:numpca].T)
# np.savetxt('Kc8000_pcaed.txt', WLDataNew)

In [1]:
import pickle as pkl
import numpy as np
from sklearn.model_selection import train_test_split
with open('Data/training_labels.pkl', 'rb') as file:
    labels = pkl.load(file)

WLDataNew = np.loadtxt('Kc8000_pcaed.txt')
N = 6000

Split data

Train Linear Regression Classifier

In [66]:
#Choose subset
t = N//3
r = N//3
WLTrainValid = WLDataNew[0:t, 0:r] #, 0:len(train_graphs)]
WLLabels = labels[0:t]

X_train, X_validate, y_train, y_validate = train_test_split(WLTrainValid, WLLabels, test_size=0.2) # random_state=1)


print(X_train.shape, X_validate.shape)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
lr = LogisticRegression(solver='liblinear', verbose=False, penalty='l2', C=200, warm_start=False)
# lr = SVC(C=10, kernel='linear')
# Fit on training data
lr.fit(X_train, y_train)
#Predict on validation data
pred = lr.predict(X_validate)
fpr, tpr, thresholds = metrics.roc_curve(y_validate, pred, pos_label=1)
#AUC metric
print(f'AUC: {metrics.auc(fpr, tpr)}')

(1600, 2000) (400, 2000)
AUC: 0.7722304102186195




In [None]:
from lib import LIN, KernelSVC

sigma = 1.5
C=100.
kernel = LIN().kernel
linmodel = KernelSVC(C=C, kernel=kernel)

X_train, X_validate, y_train, y_validate = train_test_split(WLTrainValid, WLLabels, test_size=0.2) # random_state=1)

linmodel.fit(X_train, y_train)
linmodel.predict(X_validate)
# plotClassification(train_dataset['x'], train_dataset['y'], model, label='Training')

Predict on Validation Data

In [39]:
from lib import calculateLogits
X_test = WLDataNew[6000:8000, 0:r]
pred_test = lr.predict_proba(X_test)
logit = calculateLogits(pred_test)

Range:
 [-258.73844563838435, 36.04365338911715]


Predict on testing data

In [40]:
from lib import saveDataToFormattedSubmissionFile
ctr = int(input('Current submission attempt:\t'))
saveDataToFormattedSubmissionFile(logit, f'Data/WL_test_pred{ctr}.csv')

In [41]:
# save the model to disk
filename = 'Data/Models/lr795.sav'
pkl.dump(lr, open(filename, 'wb'))

Save to relevant file