## Main file project


In [0]:
# Installing libraries
!pip install tqdm -U
!pip install node2vec
!apt-get install pv

In [0]:
# Main imports
import networkx as nx
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
import json
import os 
import pandas as pd 
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder

from gensim.models import Word2Vec

import csv

from natsort import natsorted


import multiprocessing

In [0]:
# For drive
from google.colab import drive
# Connect and Mount drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Download raw input files only one time
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1T--pdLZ-M7jHadPa6e1iFR_6WxfczoLh' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1T--pdLZ-M7jHadPa6e1iFR_6WxfczoLh" -O text.tar.gz && rm -rf /tmp/cookies.txt
# Unziping the content
!mkdir -p /content/Data/node_information
!tar xpzf text.tar.gz -C /content/Data/node_information
!rm text.tar.gz
!mkdir -p /content/Data/node_information/clean_text

--2019-12-30 17:05:54--  https://docs.google.com/uc?export=download&confirm=OO0X&id=1T--pdLZ-M7jHadPa6e1iFR_6WxfczoLh
Resolving docs.google.com (docs.google.com)... 173.194.216.101, 173.194.216.113, 173.194.216.139, ...
Connecting to docs.google.com (docs.google.com)|173.194.216.101|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-04-5s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/ff50uvuna0rpd5lar2dpnfkehaq6irt3/1577721600000/17442647905740021451/*/1T--pdLZ-M7jHadPa6e1iFR_6WxfczoLh?e=download [following]
--2019-12-30 17:05:54--  https://doc-04-5s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/ff50uvuna0rpd5lar2dpnfkehaq6irt3/1577721600000/17442647905740021451/*/1T--pdLZ-M7jHadPa6e1iFR_6WxfczoLh?e=download
Resolving doc-04-5s-docs.googleusercontent.com (doc-04-5s-docs.googleusercontent.com)... 173.194.217.132, 2607:f8b0:400c:c13::84
Connecting to doc-04-5s-docs.googleuserconten

In [0]:
# Define some recurrent variables
drive_data_folder = '/content/drive/My Drive/ML-project/Data'
drive_models_folder = '/content/drive/My Drive/ML-project/models'
local_data_folder = '/content/Data'
raw_node_text_dir = '/content/Data/node_information/text'

In [0]:
# Load the training dataset (with the labels)
X = []
y = []
with open(os.path.join(drive_data_folder, 'training.txt'), "r") as f:
    for line in tqdm(f):
        line = line.split()
        X.append(np.array([int(line[0]), int(line[1])]))
        y.append(np.array(int(line[2])))
    X = np.array(X)
    y = np.array(y)
    

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [0]:
# Load the test dataset
X_final_test = []

with open(os.path.join(drive_data_folder, 'testing.txt'), "r") as f:
    for line in tqdm(f):
        line = line.split()
        X_final_test.append(np.array([int(line[0]), int(line[1])]))
    X_final_test = np.array(X_final_test)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [0]:
# Load the raw html dataset
Node_info = [] 

i = 0
j = 0
for root, dirs, files in os.walk(os.path.join(raw_node_text_dir), topdown=False):
        for name in tqdm(natsorted(files)):
            path = os.path.join(raw_node_text_dir, name)
            try:
                with open(path, "r",  encoding='utf-8', errors='ignore') as f:
                    Node_info.append(f.read())
            except:
                print(path)
                i += 1
            j += 1
df = pd.DataFrame(Node_info) 
print(i/j)

HBox(children=(FloatProgress(value=0.0, max=33226.0), HTML(value='')))


0.0


## Define some important functions

In [0]:
# Fill graph from np.arrays
def fill_graph(X, y):
    G = nx.Graph()
    for nd, v in tqdm(zip(X, y)):
        if int(v) == 1:
            G.add_edge(nd[0], nd[1])
    return G

def export_to_json(X,y):
    data = {}
    data["edges"]=[]
    data["nodes"]=[]
    i=0
    for _ in tqdm(X):
        data["nodes"].append({
            'id': str(_[0]),
            'label' : str(_[0]),
            'group' : 1
        })
        data["nodes"].append({
            'id': str(_[1]),
            'label' : str(_[1]),
            'group' : 1
        })
        i+=1
    
    for nd, v in tqdm(zip(X, y)):
        if int(v) == 1:
            data['edges'].append({
                'from': str(nd[0]),
                'to' : str(nd[1])
            })
            
    with open('./Vis/data.json', 'w') as outfile:
        json.dump(data, outfile)

def hadamard(x,y):
    return x*y

# The distance used
def compute_hadamard(model_dict, X, y):
    Z=[]
    yt=[]
    for nd, v in tqdm(zip(X, y)):
        try:
            Z.append(hadamard(model_dict[nd[0]], model_dict[nd[1]]))
            yt.append(v)
        except:
            pass
    Z=np.array(Z)
    return Z, yt

# Define the training function
def compute_results(Z, yt, test_size = 0.1, random_state = 42):
  X_train, X_test, y_train, y_test = train_test_split(Z, yt, test_size = test_size, random_state = random_state)
  
  gbc= GradientBoostingClassifier(verbose=True).fit(X_train, y_train)
  print("GBC end of training")
  y_pred=gbc.predict(X_test)
  print(f'GBC f1_score: {f1_score(y_test,y_pred)}')

## Graph Visualization

In [0]:
export_to_json(X,y)

NameError: ignored

In [0]:
Z=[]
for nd, v in tqdm(zip(X, y)):
    if int(v) == 1:
        Z.append(np.array([nd[0], nd[1]]))
Z= np.array(Z)
np.savetxt("data.edgelist", Z, delimiter=" ",  fmt="%d" )

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




# Node2vec - embedding 

In [0]:
G = fill_graph(X, y)

node2vec = Node2Vec(G, dimensions=64, walk_length=9, num_walks=5, workers=20, p=1, q=1)

model_version = 'node2vec_d64_wl9_nw5_w5'

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Computing transition probabilities:   0%|          | 0/33162 [00:00<?, ?it/s]




Computing transition probabilities:  22%|██▏       | 7185/33162 [06:47<15:14, 28.40it/s]

KeyboardInterrupt: ignored

In [0]:
model = node2vec.fit(window=5, min_count=1, batch_words=4)

In [0]:
model.save(os.path.join(drive_models_folder,f'{model_version}.model'))

In [0]:
model = Word2Vec.load(os.path.join(drive_models_folder,f'{model_version}.model'))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
model.wv.save_word2vec_format(os.path.join(drive_models_folder,f'{model_version}.txt'))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
print(model.wv.most_similar('2'))  # Output node names are always strings

[('20494', 0.9899447560310364), ('23876', 0.9859787225723267), ('23304', 0.9858320951461792), ('23873', 0.9856404066085815), ('4', 0.9848094582557678), ('13127', 0.9843376874923706), ('7', 0.9838321208953857), ('13128', 0.9833310842514038), ('5281', 0.9832398891448975), ('29929', 0.9826775789260864)]


  if np.issubdtype(vec.dtype, np.int):


#  Node2Vec - Baseline
## Basic models


In [0]:
model_version = 'node2vec_d64_wl9_nw5_w5'

In [0]:
T={}
i=0
with open(os.path.join(drive_models_folder,f'{model_version}.txt'), 'r') as f:
    for line in tqdm(f):
      if(i==0): pass
      else:
          line = line.split()
          T[float(line[0])]=np.array([float(a) for a in (line[1:-1])])
      i+=1

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [0]:
Z, yt = compute_hadamard(T, X, y)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [0]:
compute_results(Z, yt)

      Iter       Train Loss   Remaining Time 
         1           1.2152            8.05m
         2           1.1281            8.01m
         3           1.0532            7.91m
         4           0.9913            7.82m
         5           0.9378            7.73m
         6           0.8891            7.63m
         7           0.8487            7.55m
         8           0.8111            7.47m
         9           0.7802            7.39m
        10           0.7520            7.32m
        20           0.5839            6.51m
        30           0.5077            5.70m
        40           0.4602            4.89m
        50           0.4279            4.08m
        60           0.4058            3.26m
        70           0.3891            2.44m
        80           0.3771            1.63m
        90           0.3675           48.87s
       100           0.3604            0.00s
GBC end of training
GBC f1_score: 0.9416653404770745


In [0]:
# Export final predictions

gbc= GradientBoostingClassifier(verbose=True).fit(Z, yt)
print("GBC end of training")


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


      Iter       Train Loss   Remaining Time 
         1           1.2156            8.22m
         2           1.1295            8.14m
         3           1.0544            8.06m
         4           0.9922            7.99m
         5           0.9384            7.94m
         6           0.8895            7.86m
         7           0.8482            7.80m
         8           0.8127            7.71m
         9           0.7800            7.63m
        10           0.7529            7.55m
        20           0.5848            6.74m
        30           0.5080            5.90m
        40           0.4591            5.06m
        50           0.4280            4.22m
        60           0.4060            3.37m
        70           0.3895            2.53m
        80           0.3774            1.68m
        90           0.3686           50.51s
       100           0.3617            0.00s
GBC end of training


In [0]:
y_pred = []
for nd in tqdm(X_final_test):
    try:
        y_pred.append(gbc.predict(hadamard(T[nd[0]], T[nd[1]]).reshape(1,-1))[0])
    except:
        y_pred.append(0)

with open(os.path.join(drive_data_folder, 'output', 'node2vec_baseline_2.csv'), 'w') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(['id', 'predicted'])
    for index, pred in enumerate(y_pred):
      wr.writerow([index, pred])

HBox(children=(FloatProgress(value=0.0, max=113450.0), HTML(value='')))




# XGBoost

In [0]:
#Using xgboost
import xgboost as xgb
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score

##Getting the data
D_train = xgb.DMatrix(X_train, label=Y_train)

##Setting model parameters

parameters = {
   'eta': 0.3,      # regularization parameter, prevents overfitting
   'max_depth': 9,  
   'objective': 'binary-regression',  
   'num_class': 2} 

steps = 3 #20  # The number of training iterations

model = xgb.train(parameters, D_train, steps)

##Other option
# model = xgb.XGBlassifier()
# model.fit(X_train, y_Train)


#predictions = model.predict(D_test)
#bestPrediction = np.asarray([np.argmax(line) for line in predictions])

#print("Precision = {}".format(precision_score(y_test, bestPrediction, average='macro')))
#print("Recall = {}".format(recall_score(y_test, bestPrediction, average='macro')))
#print("Accuracy = {}".format(accuracy_score(y_test, bestPrediction)))

NameError: ignored

# NLP


## Data pre-processing

In [0]:
!pip install spacy==2.2.3
!python -m spacy download fr
!python -m spacy download en

Collecting spacy==2.2.3
[?25l  Downloading https://files.pythonhosted.org/packages/47/13/80ad28ef7a16e2a86d16d73e28588be5f1085afd3e85e4b9b912bd700e8a/spacy-2.2.3-cp36-cp36m-manylinux1_x86_64.whl (10.4MB)
[K     |████████████████████████████████| 10.4MB 2.8MB/s 
Collecting catalogue<1.1.0,>=0.0.7
  Downloading https://files.pythonhosted.org/packages/4b/4c/0e0fa8b1e193c1e09a6b72807ff4ca17c78f68f0c0f4459bc8043c66d649/catalogue-0.2.0-py2.py3-none-any.whl
Collecting thinc<7.4.0,>=7.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/07/59/6bb553bc9a5f072d3cd479fc939fea0f6f682892f1f5cff98de5c9b615bb/thinc-7.3.1-cp36-cp36m-manylinux1_x86_64.whl (2.2MB)
[K     |████████████████████████████████| 2.2MB 25.7MB/s 
Collecting preshed<3.1.0,>=3.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/db/6b/e07fad36913879757c90ba03d6fb7f406f7279e11dcefc105ee562de63ea/preshed-3.0.2-cp36-cp36m-manylinux1_x86_64.whl (119kB)
[K     |████████████████████████████████| 122kB 51.1MB/s 

In [0]:
import re
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim

# Spacy stopwords
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [0]:
nlp = spacy.load('fr', disable=['ner', 'parser'])
stops = set(fr_stop).union(set(en_stop)) # for spicy
def token_filter(token):
    return not (token.text in stops or token in stops or len(token.text) <= 3)

def chunks(lst, n):
    """Return successive a list of n-sized chunks from lst."""
    cks = []
    for i in range(0, len(lst), n):
        cks.append(lst[i:i + n])
    return cks

In [0]:
local_clean_text_path = os.path.join(local_data_folder, 'node_information/clean_text', 'Node_info_clean_3.csv')
drive_clean_text_path = os.path.join(drive_data_folder, 'node_information/clean_text', 'Node_info_clean_3.csv')

In [0]:
## Clear and save clean txt on data_folder/node_information/clean_text
t = time()
chunk_size = 300
chunked_docs = np.array(chunks(df.values[:,0], chunk_size))  # TODO: remove _clean
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^a-z ]+')

# Initialize the csv with the clean text
with open(local_clean_text_path, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['clean'])

# Fill the text files chunk by chunk (IMPORTANT: csv's line i <-> '(i-2).txt')

for index, docs in enumerate(chunked_docs):
  print(f'Chunk #{index+1} out of {len(chunked_docs)}')

  brief_cleaning = (BAD_SYMBOLS_RE.sub('', REPLACE_BY_SPACE_RE.sub(' ', str(row).lower())) for row in tqdm(docs))
  
  # Clear the text and save it right away
  for doc in nlp.pipe(brief_cleaning, batch_size=10000, n_threads=16):
      tokens = [token.lemma_ for token in doc if token_filter(token)]
      document = ' '.join(tokens)
      with open(local_clean_text_path, 'a') as f:
        writer = csv.writer(f)
        writer.writerow([document])
        print(f'Time passed...: {round((time() - t) / 60, 2)} mins')
#Copy the cleaned text to drive
shutil.copyfile(local_clean_text_path, drive_clean_text_path)
print(f'Time to preprocess all data: {round((time() - t) / 60, 2)} mins')

## Word2vec - Embedding

In [0]:
# Load cleaned data from drive

drive_clean_text_path = os.path.join(drive_data_folder, 'node_information/clean_text', 'Node_info_clean_3.csv')

df_clean = pd.read_csv(drive_clean_text_path)

# fill Na's with a empty string to avoid errors
df_clean['clean']=df_clean['clean'].fillna("")

# Overview of the df pre-cleaned
print(df_clean.head())

# Define the sentences array
sentences =  [row['clean'].split() for index, row in tqdm(df_clean.iterrows())]

                                               clean
0  trade desk looking unified solution control da...
1  alternate alternat wapjssphomestc idsite idcli...
2  poursuivre navigation site accepter utilision ...
3  alternate alternat wapjsspaccueilstc idsitescf...
4  alternate alternat wapjsspaccueilstc idsitescc...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [0]:
#feature_names = tfidf_vectorizer.get_feature_names()
#print(feature_index)
#print(w2v_indices['device'] in feature_index)
#{word: w2v_model.wv.vocab[word].index for word in w2v_model.wv.vocab}
# feature_indices = {} 
# for x in range(0, len(sentences)):
#   for feature_index in tfidf_vectorizer_matrix[x, :].nonzero()[1]:
#     feature_indices[feature_names[feature_index]] = feature_index
#feature_indices = {feature_names[feature_index]: feature_index for feature_index in tfidf_vectorizer_matrix[x, :].nonzero()[1] for x in range(0,len(sentences))}
# doc = 0, word = 'device'
# feature_index = tfidf_vectorizer_matrix[doc,:].nonzero()[1]
# tfidf_scores = zip(feature_index, [tfidf_vectorizer_matrix[doc, x] for x in feature_index])

# for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
#   print(w, s)

In [0]:
# Sanity Check - Effectiveness of the lemmatization and removal of stopwords

word_freq = defaultdict(int)
for sent in tqdm(sentences):
    for i in sent:
        word_freq[i] += 1

print(len(word_freq))
print(sorted(word_freq, key=word_freq.get, reverse=True)[:10])

HBox(children=(FloatProgress(value=0.0, max=33207.0), HTML(value='')))


1914129
['button', 'site', 'alternat', 'service', 'niveau', 'france', 'donnes', 'voir', 'commun', 'alternate']


In [0]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=5,
                     workers=cores-1)


# Building the Vocabulary Table

# Word2Vec requires us to build the vocabulary table 
# (simply digesting all the words and filtering out the unique words, and doing some basic counts on them):

t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 19:05:42: collecting all words and their counts
INFO - 19:05:42: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 19:05:49: PROGRESS: at sentence #10000, processed 32324855 words, keeping 831279 word types
INFO - 19:05:54: PROGRESS: at sentence #20000, processed 53456594 words, keeping 1310967 word types
INFO - 19:05:59: PROGRESS: at sentence #30000, processed 72375381 words, keeping 1790794 word types
INFO - 19:06:00: collected 1914129 word types from a corpus of 77195914 raw words and 33207 sentences
INFO - 19:06:00: Loading a fresh vocabulary
INFO - 19:06:06: effective_min_count=20 retains 161524 unique words (8% of original 1914129, drops 1752605)
INFO - 19:06:06: effective_min_count=20 leaves 71716999 word corpus (92% of original 77195914, drops 5478915)
INFO - 19:06:06: deleting the raw counts dictionary of 1914129 items
INFO - 19:06:06: sample=6e-05 downsamples 995 most-common words
INFO - 19:06:06: downsampling leaves estimated 57683458 word corpu

Time to build vocab: 1.02 mins


In [0]:
# Training the model

t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)

w2v_model_version = 'word2vec_c20_w2_s100_e10'

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))


In [0]:
# Save the model (can still be trained)
w2v_model.save(os.path.join(drive_models_folder, f'{w2v_model_version}.model'))

INFO - 19:17:23: saving Word2Vec object under /content/drive/My Drive/ML-project/models/word2vec_c20_w2_s100_e10.model, separately None
INFO - 19:17:23: storing np array 'vectors' to /content/drive/My Drive/ML-project/models/word2vec_c20_w2_s100_e10.model.wv.vectors.npy
INFO - 19:17:23: not storing attribute vectors_norm
INFO - 19:17:23: storing np array 'syn1neg' to /content/drive/My Drive/ML-project/models/word2vec_c20_w2_s100_e10.model.trainables.syn1neg.npy
INFO - 19:17:24: not storing attribute cum_table
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
INFO - 19:17:24: saved /content/drive/My Drive/ML-project/models/word2vec_c20_w2_s100_e10.model
INFO - 19:17:24: saving Word2VecKeyedVectors object under /content/drive/My Drive/ML-project/models/wordvectors_c20_w2_s100_e10.kv, separately None
INFO - 19:17:24: storing np array 'vectors' to /content/drive/My Drive/ML-project/models/wordvectors_c20_w2_s100_e10.kv.vectors.npy
INFO - 19:17:24: not storing attribute vec

In [0]:
w2v_model.wv.most_similar(positive=['important'])

INFO - 19:17:30: precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('crucial', 0.6092925667762756),
 ('relever', 0.5951700210571289),
 ('multiple', 0.5688023567199707),
 ('correct', 0.5681476593017578),
 ('primordial', 0.5627104043960571),
 ('maximum', 0.554114818572998),
 ('change', 0.5526857376098633),
 ('aspect', 0.5448793768882751),
 ('importance', 0.5439976453781128),
 ('entir', 0.5438985824584961)]

## Word2Vec - Baseline


Now that we have a Word2Vec trained model, we are able to vectorize our web pages.
We will average the vector representation for each word in a document


In [0]:
# Necessary imports
from gensim.models import KeyedVectors

from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
w2v_model_version = 'word2vec_c20_w2_s100_e10'

In [0]:
# Load the model
# The word vector loaded from disk
w2v_model =  KeyedVectors.load(os.path.join(drive_models_folder, f'{w2v_model_version}.model'))
#w2v_model =  KeyedVectors.load(os.path.join(drive_models_folder, 'word2vec_c20_w5_s300_e7.model.wv.vectors.npy'))

# here we load vectors for each word in our model
w2v_vectors = w2v_model.wv.vectors 

# here we load indices - with whom we can find an index of the particular word in our model 
w2v_indices = {word: w2v_model.wv.vocab[word].index for word in w2v_model.wv.vocab} 

INFO - 19:17:41: loading Word2VecKeyedVectors object from /content/drive/My Drive/ML-project/models/word2vec_c20_w2_s100_e10.model
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
INFO - 19:17:43: loading wv recursively from /content/drive/My Drive/ML-project/models/word2vec_c20_w2_s100_e10.model.wv.* with mmap=None
INFO - 19:17:43: loading vectors from /content/drive/My Drive/ML-project/models/word2vec_c20_w2_s100_e10.model.wv.vectors.npy with mmap=None
INFO - 19:17:43: setting ignored attribute vectors_norm to None
INFO - 19:17:43: loading vocabulary recursively from /content/drive/My Drive/ML-project/models/word2vec_c20_w2_s100_e10.model.vocabulary.* with mmap=None
INFO - 19:17:43: loading trainables recursively from /content/drive/My Drive/ML-project/models/word2vec_c20_w2_s100_e10.model.trainables.* with mmap=None
INFO - 19:17:43: loading syn1neg from /content/drive/My Drive/ML-project/models/word2vec_c20_w2_s100_e10.model.trainables.syn1neg.npy with mmap=None
IN

In [0]:
# Define the vectorizer fonctions (receive a document as input and return the corresponding vector vector)
def vectorize(line):
    # Vectorize the lines (here, the HTML pages...) as an average
    # we assume that a line is the mean contribution of each word
    words = []
    for word in line: # line - iterable, for example list of tokens 
        try:
            w2v_idx = w2v_indices[word]
            words.append(w2v_vectors[w2v_idx])  # Put all words of a line in a list
        except KeyError: # if you does not have a vector for this word in your w2v model, continue 
            continue
    if words:  # If there's a list of words for this line 
        words = np.asarray(words)
        return np.mean(words, axis=0)  # Mean element-wise ()
    if not words:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(w2v_model.vector_size,)
  


In [0]:
# Create the vectorized document
Node_info_vectorized = {}
for index, row in tqdm(df_clean.iterrows()):
  Node_info_vectorized[index] = vectorize(row['clean'].split())
Node_info_vectorized = pd.DataFrame(Node_info_vectorized).T

In [0]:
# Save to CSV
Node_info_vectorized.to_csv(os.path.join(drive_data_folder, 'node_information/vectorized', 'Node_info_vectorized_c20_w2_s100_e10.csv'), index = None, header=True)

In [0]:
# Load Vectorized node info
Node_info_vectorized = pd.read_csv(os.path.join(drive_data_folder, 'node_information/vectorized', 'Node_info_vectorized_c20_w2_s100_e10.csv'))
print(Node_info_vectorized.shape)
print(Node_info_vectorized.head)

(33207, 100)
<bound method NDFrame.head of               0         1         2  ...        97        98        99
0     -0.233384  0.060287 -0.385286  ... -0.894999 -0.658560  0.463766
1      0.405912 -0.419194  0.071104  ... -0.389931 -0.128761  0.378947
2      0.222122 -0.321445  0.027319  ... -0.244004 -0.189464  0.269860
3      0.551546 -0.299921  0.018399  ... -0.389727 -0.362058  0.252638
4      0.452876 -0.352998 -0.040099  ... -0.322568 -0.107959  0.297919
...         ...       ...       ...  ...       ...       ...       ...
33202  0.393666  0.012739  0.037500  ... -0.150504 -0.155597  0.172013
33203  0.577671  0.142126  0.144092  ... -0.138839 -0.299346  0.247666
33204  0.487699 -0.162908 -0.313750  ... -0.586065  0.022578  0.024109
33205 -0.058005  0.243857 -0.142366  ... -0.358236 -0.279509  0.189254
33206 -0.427266  0.396401 -0.491637  ... -0.578566 -0.230975  0.535062

[33207 rows x 100 columns]>


In [0]:
T_w2v = {}
for index, value in Node_info_vectorized.iterrows():
  T_w2v[float(index)] = np.array([float(x) for x in value])
print(T_w2v[50])

In [0]:
# Compute the distance ('edge' array)
Z_w2v, yt_w2v = compute_hadamard(T_w2v, X, y)

In [0]:
# Train with diverse models and compute the results
compute_results(Z_w2v, yt_w2v)

      Iter       Train Loss   Remaining Time 
         1           1.3189           14.32m
         2           1.3153           14.21m
         3           1.3120           14.04m
         4           1.3092           13.89m
         5           1.3064           13.74m
         6           1.3043           13.63m
         7           1.3021           13.48m
         8           1.3003           13.33m
         9           1.2984           13.17m
        10           1.2966           12.99m
        20           1.2823           10.97m
        30           1.2728            9.25m
        40           1.2658            7.77m
        50           1.2597            6.39m
        60           1.2548            5.07m
        70           1.2499            3.78m
        80           1.2458            2.51m
        90           1.2421            1.25m
       100           1.2386            0.00s
GBC end of training
GBC f1_score: 0.7674091105043828


## Doc2Vec

In [0]:
import gensim
from sklearn import utils
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument

### Distributed Bag of Words (DBOW)

In [0]:
cores = multiprocessing.cpu_count()

#### Building a Vocabulary

In [0]:
documents = [TaggedDocument(words = doc.split(), tags = [i]) for i, doc in enumerate(df_clean['clean'])]

In [0]:
model_dbow = Doc2Vec(dm=0, vector_size=150, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(documents)])

HBox(children=(FloatProgress(value=0.0, max=33207.0), HTML(value='')))




### Doc2Vec - Embedding

In [0]:
%%time
for epoch in range(10):
    model_dbow.train(utils.shuffle([x for x in tqdm(documents)]), total_examples=len(documents), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

HBox(children=(FloatProgress(value=0.0, max=33207.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33207.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33207.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33207.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33207.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33207.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33207.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33207.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33207.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33207.0), HTML(value='')))


CPU times: user 45min 44s, sys: 10.6 s, total: 45min 55s
Wall time: 12min 15s


In [0]:
# Save the model (can still be trained)
model_dbow.save(os.path.join(drive_models_folder, 'doc2vec_dbow_s150_n5_hs0_e10.model'))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Doc2Vec - Baseline



In [0]:
# Load the model
# The word vector loaded from disk
model_dbow =  Doc2Vec.load(os.path.join(drive_models_folder, 'doc2vec_dbow_s150_n5_hs0_e10.model'))

In [0]:
# Create the vectorized document
Node_info_vectorized_d2v = {}
for index, row in tqdm(df_clean.iterrows()):
  Node_info_vectorized_d2v[index] = model_dbow.docvecs[index]
Node_info_vectorized_d2v = pd.DataFrame(Node_info_vectorized_d2v).T

# Export to CSV
Node_info_vectorized_d2v.to_csv(os.path.join(drive_data_folder, 'node_information/vectorized', 'Node_info_vectorized_doc2vec_dbow_s150_n5_hs0_e10.csv'), index = None, header=True)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [0]:
# Load from CSV
Node_info_vectorized_d2v = pd.read_csv(os.path.join(drive_data_folder, 'node_information/vectorized', 'Node_info_vectorized_c20_w2_s100_e10.csv'))

In [0]:
# Reformat array

T_d2v = {}
for index, value in Node_info_vectorized_d2v.iterrows():
  T_d2v[float(index)] = np.array([float(x) for x in value])

In [0]:
# Compute the distance ('edge' array)
Z_d2v, yt_d2v = compute_hadamard(T_d2v, X, y)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [0]:
compute_results(Z_d2v, yt_d2v)

      Iter       Train Loss   Remaining Time 
         1           1.3202           23.66m
         2           1.3173           23.53m
         3           1.3144           23.22m
         4           1.3123           22.99m
         5           1.3100           22.75m
         6           1.3078           22.55m
         7           1.3060           22.30m
         8           1.3042           22.06m
         9           1.3025           21.80m
        10           1.3010           21.56m
        20           1.2884           19.15m
        30           1.2787           16.81m
        40           1.2715           14.54m
        50           1.2648           12.14m
        60           1.2589            9.68m
        70           1.2539            7.24m
        80           1.2498            4.82m
        90           1.2457            2.40m
       100           1.2420            0.00s
GBC end of training
GBC f1_score: 0.7699102516916282


# Doc2Vec and Node2Vec together

In [0]:
print(type(T_d2v))
# Concatenate the columns for each document
T_reinforced = {}
for key in T.keys():
  sizeT = T[key].size
  break
for key in T_d2v.keys():
  if key in T:
    T_reinforced[key] = np.concatenate((T[key], T_d2v[key]), axis=0) 
  else:
    T_reinforced[key] = np.concatenate((np.zeros(sizeT), T_d2v[key]), axis=0) 
#T_reinforced 

<class 'dict'>


In [0]:
# Compute the distance ('edge' array)
Z_reinforced, yt_reinforced = compute_hadamard(T_reinforced, X, y)
print(Z_reinforced.shape)
print(len(yt_reinforced))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


(453546, 213)
453546


In [0]:
# Train with diverse models and compute the results
compute_results(Z_reinforced, yt_reinforced)

KeyboardInterrupt: ignored

Final Predictions


In [0]:
# Export final predictions

gbc= GradientBoostingClassifier(verbose=True).fit(Z_reinforced, yt_reinforced)


      Iter       Train Loss   Remaining Time 
         1           1.2161           34.14m
         2           1.1298           33.85m
         3           1.0546           33.42m
         4           0.9923           33.01m
         5           0.9385           32.66m
         6           0.8895           32.29m
         7           0.8481           31.94m
         8           0.8125           31.54m
         9           0.7798           31.18m
        10           0.7526           30.82m
        20           0.5843           27.36m
        30           0.5079           24.16m
        40           0.4606           20.94m
        50           0.4289           17.48m
        60           0.4067           13.99m
        70           0.3904           10.48m
        80           0.3782            6.97m
        90           0.3690            3.48m
       100           0.3618            0.00s


In [0]:
y_pred = []
for nd in tqdm(X_final_test):
    try:
        y_pred.append(gbc.predict(hadamard(T_reinforced[nd[0]], T_reinforced[nd[1]]).reshape(1,-1))[0])
    except:
        y_pred.append(0)

with open(os.path.join(drive_data_folder, 'output', 'node2vec_doc2vec_final_prediction.csv'), 'w') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(['id', 'predicted'])
    for index, pred in enumerate(y_pred):
      wr.writerow([index, pred])

HBox(children=(FloatProgress(value=0.0, max=113450.0), HTML(value='')))