## Main file project


In [12]:
# Main imports
import networkx as nx
import numpy as np
from sklearn.model_selection import train_test_split, KFold 

In [6]:
# Load the training dataset (with the labels)
X = []
y = []
with open("../data/training.txt", "r") as f:
    for line in f:
        line = line.split()
        X.append(np.array([int(line[0]), int(line[1])]))
        y.append(np.array(int(line[2])))
    X = np.array(X)
    y = np.array(y)

## Define some functions

In [14]:
# Fill graph from np.arrays
def fill_graph(X, y):
    G = nx.Graph()
    for nd, v in zip(X, y):
        if int(v) == 1:
            G.add_edge(nd[0], nd[1])
    return G


# Compute F1-Score
def compute_score(y_pred, y):
    # pred is the predicted vector of 0's and 1's
    # data is the expected vector of 0's and 1's
    n_pred = len(y_pred)
    n_data = len(y)
    assert(n_pred == n_data)
    tp, fp, fn = 0, 0, 0
    for i in range(n_data):
        
        if int(y_pred[i]) == 1 and int(y[i]) == 1:
            tp += 1
        elif int(y_pred[i]) == 1 and int(y[i]) == 0:
            fp += 1
        elif int(y_pred[i]) == 0 and int(y[i]) == 0:
            fn += 1
    p = 1.*tp / (tp + fp)
    r = 1.*tp / (tp + fn)
    return 2*p*r/(p+r)

# Compute jaccard predictions given a Graph and the nodes array
def jaccard_prediction(G, X):
    predictions = []
    for x in X:
        try:
            coef = [[u, v, p] for u, v, p in nx.jaccard_coefficient(G, [(x[0], x[1])])][0]
        except KeyError:  # If the node tryed isn't in the Graph we predict 0...
            coef = [0, 0, 0]
        if coef[2] > 0.005:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions


# Compute the score given a predictor and the training set

def computeTrainingScore_kFold(predictor, X, y, n_splits):
    kf = KFold(n_splits = n_splits)  # Define the split - into n_splits folds

    mean_score = 0
    i=1
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Let's reinitilize the graph
        G = fill_graph(X_train, y_train)
        y_pred = predictor(G, X_val)
        mean_score += compute_score(y_pred, y_val)
        print(mean_score/i)
        i+=1
    return mean_score/n_splits
def computeTrainingScore(predictor, X, y, test_size=0.1):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=1)
    G = fill_graph(X_train, y_train)
    y_pred = predictor(G, X_val)
    return compute_score(y_pred, y_val)

## 1) Given approach - Graph baseline - Jaccard coefficient

In [15]:
# For training purpose, as the testing.txt set doesn't have the labels
# We will split our original (training) dataset with kfolds 
# For each split we'll initilize a Graph, train our model and get the mean score
# On the training set
n_splits = 2

jacc_score = computeTrainingScore(jaccard_prediction, X, y)
print(jacc_score)

0.6158371040723982
