## Main file project


In [2]:
# Main imports
import networkx as nx
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold 
from sklearn.metrics import f1_score
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
import os 
import pandas as pd 
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
import csv

from sys import getsizeof
from natsort import natsorted

In [3]:
# Load the training dataset (with the labels)
X = []
y = []
with open("./../Data/training.txt", "r") as f:
    for line in tqdm(f):
        line = line.split()
        X.append(np.array([int(line[0]), int(line[1])]))
        y.append(np.array(int(line[2])))
    X = np.array(X)
    y = np.array(y)
    

Widget Javascript not detected.  It may not be installed or enabled properly.





In [4]:
Node_info = [] 

i =0
j=0
for root, dirs, files in os.walk("./../Data/node_information/text", topdown=False):
        for name in tqdm(natsorted(files)):
            path = os.path.join("./../Data/node_information/text",name)
            try:
                with open(path, "r",  encoding='utf-8', errors='ignore') as f:
                    Node_info.append(f.read())
            except:
                print(path)
                i+=1
            j+=1

print(i/j)

Widget Javascript not detected.  It may not be installed or enabled properly.



0.0


## Define some functions

In [None]:
# Fill graph from np.arrays
def fill_graph(X, y):
    G = nx.Graph()
    for nd, v in tqdm(zip(X, y)):
        if int(v) == 1:
            G.add_edge(nd[0], nd[1])
    return G

def node2vec_input(X, y):       
    with open('./../node2vec/data.txt', 'w') as outfile:
        for nd, v in tqdm(zip(X, y)):
            if int(v) == 1:
                outfile.write(str(nd[0])+ " "+str(nd[1])+"\n")


def export_to_json(X,y):
    data = {}
    data["edges"]=[]
    data["nodes"]=[]
    i=0
    for _ in tqdm(X):
        data["nodes"].append({
            'id': str(_[0]),
            'label' : str(_[0]),
            'group' : 1
        })
        data["nodes"].append({
            'id': str(_[1]),
            'label' : str(_[1]),
            'group' : 1
        })
        i+=1
    
    for nd, v in tqdm(zip(X, y)):
        if int(v) == 1:
            data['edges'].append({
                'from': str(nd[0]),
                'to' : str(nd[1])
            })
            
    with open('./Vis/data.json', 'w') as outfile:
        json.dump(data, outfile)
    

def compute_score(y_pred, y):
    # pred is the predicted vector of 0's and 1's
    # data is the expected vector of 0's and 1's
    n_pred = len(y_pred)
    n_data = len(y)
    assert(n_pred == n_data)
    tp = 0
    fp = 0
    fn = 0
    for i in range(n_data):
        if y_pred[i] == 1 and y[i] == 1:
            tp += 1
        elif y_pred[i] == 1 and y[i] == 0:
            fp += 1
        elif y_pred[i] == 0 and y[i] == 0:
            fn += 1
    p = 1.*tp / (tp + fp)
    r = 1.*tp / (tp + fn)
    return 2*p*r/(p+r)


# Compute jaccard predictions given a Graph and the nodes array
def jaccard_prediction(G, X):
    predictions = []
    for x in X:
        try:
            coef = [[u, v, p] for u, v, p in nx.jaccard_coefficient(G, [(x[0], x[1])])][0]
        except KeyError:  # If the node tryed isn't in the Graph we predict 0...
            coef = [0, 0, 0]
        if coef[2] > 0.005:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions


# Compute the score given a predictor and the training set

def computeTrainingScore_kFold(predictor, X, y, n_splits):
    kf = KFold(n_splits = n_splits)  # Define the split - into n_splits folds

    mean_score = 0
    i=1
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Let's reinitilize the graph
        G = fill_graph(X_train, y_train)
        y_pred = predictor(G, X_val)
        mean_score += compute_score(y_pred, y_val)
        print(mean_score/i)
        i+=1
    return mean_score/n_splits

def computeTrainingScore(predictor, X, y, test_size=0.1):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=1)
    G = fill_graph(X_train, y_train)
    y_pred = predictor(G, X_val)
    return compute_score(y_pred, y_val)

## Graph Visualization

In [None]:
#G= fill_graph(X,y)
export_to_json(X,y)

In [None]:
Z=[]
for nd, v in tqdm(zip(X, y)):
    if int(v) == 1:
        Z.append(np.array([nd[0], nd[1]]))
Z= np.array(Z)
np.savetxt("data.edgelist", Z, delimiter=" ",  fmt="%d" )

## Only NLP, supervised learning

### Preparing data (bag of words + TFIDF transform)

In [None]:
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

X_train_counts = count_vect.fit_transform(Node_info)
nodes_tfidf = tfidf_transformer.fit_transform(X_train_counts)


In [None]:
print(nodes_tfidf.shape)

### Testing cossine similarity

In [None]:
av1=0
av2=0
i,j=0,0
d=0
for nd, v in tqdm(zip(X, y)):
    if int(v) ==1:
        av1+=np.dot(np.transpose(nodes_tfidf[:,nd[0]]),nodes_tfidf[:,nd[1]])
        i+=1
    else:
        av2+=np.dot(np.transpose(nodes_tfidf[:,nd[0]]),nodes_tfidf[:,nd[1]])
        j+=1
    d+=1
    if(d==100): break;
print("Cos for linked edges : " + (str(av1/i)))
print("Cos for linked edges : " + (str(av2/j)))

### Naive Bayes Classificator

In [None]:
X_t = np.concatenate((Node_info[X[:,0][0]],Node_info[X[:,1][0]]), axis=0)
print(X_t)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df1, y, test_size=0.10, random_state=42)

clf = MultinomialNB().fit(X_train, y_train)
y_pred=clf.predict(X_test)
print(f1_score(y_test,y_pred))


# Node2vec embedding 

In [None]:
G= fill_graph(X, y)

node2vec = Node2Vec(G, dimensions=64, walk_length=9, num_walks=5, workers=20, p=1, q=2) 

In [None]:
model = node2vec.fit(window=3, min_count=1, batch_words=4)

In [None]:
model.save("node2vec_d64_wl9_nw5_w3.model")

In [None]:
from gensim.models import Word2Vec
model = Word2Vec.load("node2vec_d64_wl9_nw5_w3.model")

In [None]:
model.wv.save_word2vec_format("model_d64_wl9_nw5_w3.txt")

In [None]:
print(model.wv.most_similar('2'))  # Output node names are always strings

## NLP - word2vec and Gensin

### Data pre-processing

In [5]:
print(Node_info[2])  # The Node_info[i] refers to i.txt

   En poursuivant votre navigation sur ce site, vous acceptez que nous
   utilisions des cookies pour mesurer l'audience de nos sites et pour
   vous proposer des fonctionnalités sociales, du contenu et des
   publicités éventuellement personnalisés. (BUTTON) Personnaliser
   (BUTTON) OK, tout accepter
     * Accueil
     * Peinture à l'huile
     * Les expositions
          + Exposition "immersion" 2018
          + "Fenêtre sur Mer" 2016
     * Le peintre
     * Biographie
          + Biographie
          + Parcours
     * Les Toiles
          + Toutes les toiles
          + Grandes toiles
          + Petites toiles
          + Grands Diptyques
          + Petits Diptyques
          + Encre de chine
          + Toiles installées
          + Catalogue
          + Vidéos
     * Me contacter

     * Accueil
     * Peinture à l'huile
     * Les expositions
          + Exposition "immersion" 2018
          + "Fenêtre sur Mer" 2016
     * Le peintre
     * Biographie
          + Biographie


In [5]:
import re
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [6]:
df = pd.DataFrame(Node_info) 

In [7]:
print(df[0][:])

0           The Trade Desk\n   Looking for the Unified ...
1           #alternate alternate\n\n   [wa.pj?s=559638&...
2           En poursuivant votre navigation sur ce site...
3           #alternate alternate\n\n   [wa.pj?s=559638&...
4           #alternate alternate\n\n   [wa.pj?s=559638&...
                               ...                        
33221       #Caroline Decré » Flux Caroline Decré » Flu...
33222       #RSS\n\n     * [bb-canalblog2.png]\n     * ...
33223        #alternate alternate\n\nSaint Joseph Aubiè...
33224       #publisher Deciplus » Flux Deciplus » Flux ...
33225       #Welcome to Recall - Total Recall (2012) Tr...
Name: 0, Length: 33226, dtype: object


In [8]:
nlp = spacy.load('fr', disable=['ner', 'parser'])
def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [28]:
t = time()
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df[0][:])
i = 0
for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=8):
    #with open('./../Data/node_information/clean_text/'+str(i)+'.txt', 'w') as f:
        #f.write(cleaning(doc))
        #print('./../Data/node_information/clean_text/'+str(i)+'.txt')
    i+=1
#txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=8)]
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to generate the brief_cleaning: 0.0 mins


KeyboardInterrupt: 

In [None]:
df_clean = pd.DataFrame({'clean': txt})

In [None]:
print(df_clean)

In [None]:
df_clean.to_csv("Node_info_clean.csv",index=False)

In [None]:
np.save("list_of_nodes", Node_info, allow_pickle=True)

##  Import node2vec and basic models

In [3]:
T={}
i=0
with open("./model_d64_wl9_nw5_w5.txt", "r") as f:
     for line in tqdm(f):
        if(i==0): pass
        else:
            line = line.split()
            T[float(line[0])]=np.array([float(a) for a in (line[1:-1])])
        i+=1
#X= np.array(X[1:-1])



33163it [00:00, 40634.50it/s]


In [4]:
T[10481]

array([ 0.7404842 , -1.4362556 ,  0.5302325 , -0.1030505 , -0.6509405 ,
        0.07901148, -1.4621917 ,  0.06504963, -1.9926604 , -0.2518935 ,
        0.4129057 , -0.48431596,  0.79965603,  0.5177307 , -0.49467507,
       -0.08762395, -0.43693647, -0.12519674, -1.2167609 ,  0.15535897,
       -1.185967  ,  0.49078524, -0.13660388,  0.2120918 ,  0.3679756 ,
        0.3191221 , -0.04115665,  0.14868537,  0.10401659, -0.383966  ,
        0.9506267 , -1.3403143 , -0.6030259 , -0.13534434, -1.4833179 ,
        1.2220614 ,  0.79736674,  0.06079897, -0.8068173 ,  0.46822396,
        1.9668282 ,  0.8494924 ,  1.5610387 ,  0.40520224,  1.3071319 ,
        0.7966213 , -0.81597453,  1.370335  , -0.9699791 , -0.37755573,
        0.31234258,  0.56113994, -0.31787762, -1.3367473 ,  0.50323224,
       -0.73666686,  0.5041424 ,  1.4499984 , -0.22668447, -1.17377   ,
       -0.42792922, -0.7352172 ,  1.4161828 ])

In [None]:
print(len(T))

In [5]:
def hadamard(x,y):
    return x*y

def WeightedL1(x,y):
    return np.abs(x-y)

def WeightedL2(x,y):
    return (x-y)**2

def avg(x,y):
    return (x+y)/2

In [6]:
Z=[]
yt=[]
for nd, v in tqdm(zip(X, y)):
    try:
        Z.append(hadamard(T[nd[0]],T[nd[1]]))
        yt.append(v)
    except:
        pass
Z=np.array(Z)
print(Z)

453797it [00:01, 233293.38it/s]


[[ 3.11567243e-01  7.51485626e-01  3.51972761e-01 ...  1.70197422e-01
   5.39839552e-01  1.50590874e+00]
 [ 7.28803303e-03  1.22077349e-04  1.51414119e-02 ...  4.07585895e-03
   1.71206881e-02  1.90837207e-02]
 [ 2.36823544e-01  2.29267084e+00  1.45157386e-02 ...  4.75318510e-01
   5.03370516e-01  5.94864516e-01]
 ...
 [-1.10656749e-03  7.36422758e-04  2.98993045e-02 ...  2.21257402e-02
   4.89522059e-02  3.54070333e-02]
 [-1.22020542e-03  5.00479629e-02  1.09216373e-01 ...  6.72271185e-01
   1.72504766e-01  6.20401182e-01]
 [ 2.12931629e-03 -5.58029833e-04  1.15155316e-02 ... -6.24292242e-03
   2.61171926e-02  2.63767048e-02]]


In [7]:
X_train, X_test, y_train, y_test = train_test_split(Z, yt, test_size=0.10, random_state=42)

In [None]:
from sklearn.svm import SVC

svc= SVC().fit(X_train, y_train)
print("end of training")
y_pred=clf.predict(X_test)
print(f1_score(y_test,y_pred))

In [None]:
from sklearn.naive_bayes import MultinomialNB

clf= MultinomialNB().fit(X_train, y_train)
print("end of training")
y_pred=clf.predict(X_test)
print(f1_score(y_test,y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc= RandomForestClassifier().fit(X_train, y_train)
print("end of training")
y_pred=rfc.predict(X_test)
print(f1_score(y_test,y_pred))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

etc= ExtraTreesClassifier().fit(X_train, y_train)
print("end of training")
y_pred=etc.predict(X_test)
print(f1_score(y_test,y_pred))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier 

gbc= GradientBoostingClassifier().fit(X_train, y_train)
print("end of training")
y_pred=gbc.predict(X_test)
print(f1_score(y_test,y_pred))

In [8]:
from sklearn.neighbors import KNeighborsClassifier

knn= KNeighborsClassifier().fit(X_train, y_train)
print("end of training")
y_pred=knn.predict(X_test)
print(f1_score(y_test,y_pred))


end of training
0.9367443490450325


## 2 layers classificator with softmax and cross-entropy loss

In [None]:
import keras as keras
import tensorflow as tf
sess = tf.Session()
import wandb

In [None]:
layers = [
    keras.layers.Dense(128, input_shape=(64,), activation="relu"),
    keras.layers.Dense(1, activation="softmax")
]

model = keras.Sequential(layers)
model.summary()
model.compile(optimizer='sgd',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, Y_train, batch_size=100, epochs=50, validation_data=(X_test, Y_test))

In [None]:
def plot_history(history):
    fig, axs = plt.subplots(1,2,figsize=(15,7))
    axs[0].plot(history.epoch,history.history["loss"])
    axs[0].plot(history.epoch,history.history["val_loss"])
    axs[0].grid()
    
    axs[1].plot(history.epoch,history.history["accuracy"])
    axs[1].plot(history.epoch,history.history["val_accuracy"])
    axs[1].grid()
    plt.show()
        
plot_history(history)

## 1) Given approach - Graph baseline - Jaccard coefficient

In [None]:
# For training purpose, as the testing.txt set doesn't have the labels
# We will split our original (training) dataset with kfolds 
# For each split we'll initilize a Graph, train our model and get the mean score
# On the training set
n_splits = 2

jacc_score = computeTrainingScore(jaccard_prediction, X, y)
print(jacc_score)