#### Split our data

_My suggestion would be to use a split with these parameters for our validation_

In [None]:
import pandas as pd
import pickle as pkl
with open('training_data.pkl', 'rb') as file:
    train_graphs = pkl.load(file)
with open('training_labels.pkl', 'rb') as file:
    train_labels = pkl.load(file)

from sklearn.model_selection import train_test_split
X_train, X_validate, y_train, y_validate = train_test_split(train_graphs, train_labels, test_size=0.1, random_state=1)

#### Sklearn multiple classifiers

Import

In [None]:
# # Code source: Gaël Varoquaux
# #              Andreas Müller
# # Modified for documentation by Jaques Grobler
# # License: BSD 3 clause

# import numpy as np
# import matplotlib.pyplot as plt
# from matplotlib.colors import ListedColormap
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline
# from sklearn.datasets import make_moons, make_circles, make_classification
# from sklearn.neural_network import MLPClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# from sklearn.inspection import DecisionBoundaryDisplay


#### Original

Setup

In [None]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, probability=True),
    SVC(gamma=2, C=1, probability=True),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

def myplot(ax, X_train, X_test, y_train, y_test, x_min, x_max, y_min, y_max, title):
    cm_bright = ListedColormap(["#FF0000", "#0000FF"])
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")
    # testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors="k"  )
    # set axes
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

In [None]:
X, y = make_classification(
    n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1
)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable_ds = (X, y)

datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1),
    linearly_separable_ds,
]

Classify and plot

In [None]:
figure = plt.figure(figsize=(27, 9))

i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess; split into training/test part
    X, y = ds
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42 )
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

    # plot dataset
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    title = "Input data" if ds_cnt == 0 else ""
    myplot(ax, X_train, X_test, y_train, y_test, x_min, x_max, y_min, y_max, title)
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)

        #Fit data
        clf = make_pipeline(StandardScaler(), clf)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        # pred = clf.predict_proba(X_test)
        # print(metrics.f1_score(y_test, clf.predict(X_test)))
        
        cm = plt.cm.RdBu
        DecisionBoundaryDisplay.from_estimator(
            clf, X, cmap=cm, alpha=0.8, ax=ax, eps=0.5
        )

        # Plot the training points
        title = name if ds_cnt == 0 else ""
        myplot(ax, X_train, X_test, y_train, y_test, x_min, x_max, y_min, y_max, title)
        ax.text(x_max - 0.3, y_min + 0.3,  ("%.2f" % score).lstrip("0"),  size=15,  horizontalalignment="right",  )
        i += 1

plt.tight_layout()
plt.show()

#### Kaggle

Load data

In [2]:
import scipy.sparse as ss

In [9]:
with open('Data/training_labels.pkl', 'rb') as file:
    labels = pkl.load(file)
from lib import load_sparse_csr
WLData = load_sparse_csr('Data/WLKernel.npz')
WLData = ss.csr_matrix.toarray(WLData)
WLData.shape

(6000, 6000)

Single classifier: KNN, Cross-fold

In [None]:
def crossfold(classif, X, y, n_splits, classif):
    n_splits = 10
    kf = KFold(n_splits=n_splits, random_state=1, shuffle=True)
    kf.get_n_splits(X)
    scores = np.empty(n_splits)
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        #Split
        X_train = X[train_index]
        y_train = y[train_index]
        #Fit
        clf = make_pipeline(StandardScaler(), classif)
        clf.fit(X_train, y_train)
        #Hit
        X_test = X[test_index]
        y_test = y[test_index]
        scores[i] = clf.score(X_test, y_test)
        pred = clf.predict(X_test)
        print(f"Fold {i}: {scores[i]}, {metrics.f1_score(y_test, pred)}")
        # print(f'{scores[i]*len(y_test)}/{len(y_test)}')
        # print(f'')
        #Count number of 1's 
        # pred[y_test==1]
    
    return scores, pred

In [10]:
# iterate over classifiers
X_train, X_validate, y_train, y_validate = train_test_split(WLData, labels, test_size=0.3, random_state=1)

    

In [26]:
# Creating a classifier instance
#
# rf = RandomForestClassifier(n_estimators=500, min_samples_leaf=1, max_depth=50)
from sklearn import metrics
from sklearn.svm import SVC
rbf = SVC(gamma=2, C=1, probability=True, verbose=1)
rbf.fit(X_train, y_train)

In [27]:
rbf.fit(X_train, y_train)
pred = rbf.predict(X_validate)

In [None]:
print(metrics.recall_score(y_validate, pred))
print(metrics.accuracy_score(y_validate, pred))
print(metrics.f1_score(y_validate, pred))

from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(y_validate, pred))

0.35365853658536583
0.93
0.4793388429752066


In [1]:
import pickle as pkl
import scipy.sparse as ss
from sklearn.model_selection import train_test_split
with open('Data/training_labels.pkl', 'rb') as file:
    labels = pkl.load(file)
from lib import load_sparse_csr
WLData = load_sparse_csr('Data/WLKernel.npz')
WLData = ss.csr_matrix.toarray(WLData)
print(WLData.shape)
X_train, X_validate, y_train, y_validate = train_test_split(WLData, labels, test_size=0.3, random_state=1)


(6000, 6000)


### Ok restart tonight

Project

In [5]:
import numpy as np
evals, evecs = np.linalg.eigh(WLData)

In [86]:
numpca = 4000
WLDataNew = WLData.dot(evecs[0:numpca].T)
X_train, X_validate, y_train, y_validate = train_test_split(WLDataNew, labels, test_size=0.3, random_state=1)

##### SVM

In [77]:
from sklearn import metrics
from sklearn.svm import SVC
rbf = SVC(gamma=0.5, C=1, probability=True, verbose=1)
rbf.fit(X_train, y_train)

[LibSVM]

In [78]:
# X_validate_pca = pca.transform(X_validate)
pred = rbf.predict(X_validate)
fpr, tpr, thresholds = metrics.roc_curve(y_validate, pred, pos_label=1)
print(f'AUC: {metrics.auc(fpr, tpr)}')
# AUC: 0.6082279802015624 for gamma = 2

AUC: 0.6082279802015624


In [21]:
logproba = rbf.predict_proba(X_validate)
# [-0.0063112232179776875, -5.068579719299193]
logit = np.log(logproba[:,1]/(1-logproba[:,1])) 
[np.min(logit), np.max(logit)]

##### MLP

In [46]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(alpha=1, max_iter=2000)
mlp.fit(X_train, y_train)

In [48]:
pred = mlp.predict(X_validate)
fpr, tpr, thresholds = metrics.roc_curve(y_validate, pred, pos_label=1)
print(f'AUC: {metrics.auc(fpr, tpr)}')

AUC: 0.5


##### RF

In [79]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=500, n_estimators=100, max_features=10)
rf.fit(X_train, y_train)
pred = rf.predict(X_validate)
fpr, tpr, thresholds = metrics.roc_curve(y_validate, pred, pos_label=1)
print(f'AUC: {metrics.auc(fpr, tpr)}')

AUC: 0.6505978293279265


##### Logistic Regression

In [87]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0, solver='newton-cholesky')
lr.fit(X_train, y_train)
pred = lr.predict(X_validate)
fpr, tpr, thresholds = metrics.roc_curve(y_validate, pred, pos_label=1)
print(f'AUC: {metrics.auc(fpr, tpr)}')

AUC: 0.7639617747033215


In [143]:
def calculateLogits(pred):
    logproba = pred
    logproba[list(np.where(logproba[:,1]==1)[0]), 1] = 1-1e-10
    logit = np.log(logproba[:,1]/(1-logproba[:,1])) 
    print([np.min(logit), np.max(logit)])
    return logit

predValidate = lr.predict_proba(X_validate)
logit = calculateLogits(predValidate)# logproba = lr.predict_proba(X_validate)


with open('Data/test_data.pkl', 'rb') as file:
    train_graphs = pkl.load(file)
predTest = lr.predict_proba()

[-372.34819604074823, 33.33560318801494]


In [135]:
# predict on the test data 
# for instance, 


DP Kernel

In [100]:
DPdata = np.loadtxt('Data/DPKernel.txt')

In [105]:
import pickle as pkl
import networkx as nx
import numpy as np
from tqdm import tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

with open("Data/training_data.pkl", 'rb') as file:
    train_graphs = pkl.load(file)

In [107]:
def direct_product(G1, G2, node_label='labels', edge_label='labels'):
    from itertools import product
    GP = nx.Graph()
    # add nodes
    for u, v in product(G1, G2):
        if G1.nodes[u][node_label] == G2.nodes[v][node_label]:
            GP.add_node((u, v))
            GP.nodes[(u, v)].update({node_label: G1.nodes[u][node_label]})

    # add edges
    for u, v in product(GP, GP):
            if (u[0], v[0]) in G1.edges and (u[1], v[1]) in G2.edges and G1.edges[u[0],v[0]][edge_label] == G2.edges[u[1],v[1]][edge_label]:
                GP.add_edge((u[0], u[1]), (v[0], v[1]))
                GP.edges[(u[0], u[1]), (v[0], v[1])].update({edge_label:G1.edges[u[0], v[0]][edge_label]})

    return GP

In [109]:
import scipy.sparse as ss
def DPKernel(Gx):
    adjacencymatrix = nx.adjacency_matrix(Gx)
    rho = ss.linalg.norm(adjacencymatrix)
    infsum = ss.linalg.inv( ss.eye(adjacencymatrix.shape[0], format='csc') - rho*adjacencymatrix )
    k = np.sum(np.abs(infsum))
    return k


In [110]:
from lib import direct_product , DPKernel

N = len(train_graphs)
K_dp = np.zeros((N,N))

for i in tqdm(range(N)):
    G1 = train_graphs[i]
    for j in range(i+1): #K will be symmetric, so only calculate upper and then change
        G2 = train_graphs[j]        
        Gx = direct_product(G1, G2)
        K_dp[i,j] = DPKernel(Gx)
#extract diagonal and then make into diagonal matrix
K_dp = K_dp + K_dp.T - np.diag(np.diag(K_dp)) 

np.savetxt('Data/DPKernel.txt', K_dp)

  5%|▌         | 311/6000 [3:46:05<68:55:43, 43.62s/it]  


KeyboardInterrupt: 