In [33]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
import nltk
import csv
import networkx as nx
import operator
import matplotlib.pyplot as plt
import pandas as pd
import re

##### loading the data

In [34]:
nltk.download('punkt') # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()


def _read_csv_rows(path):
    """Return the non-empty rows of the CSV file at `path` as lists of strings."""
    # The csv module asks for files to be opened with newline="" so that
    # line breaks embedded in quoted fields are parsed correctly.
    with open(path, "r", newline="") as f:
        # csv.reader yields [] for a blank line; dropping those avoids an
        # IndexError on row[0] further down.
        return [row for row in csv.reader(f) if row]


# The citation files are space-separated: csv.reader returns each line as a
# one-field row, which is then split on single spaces.
testing_set = [element[0].split(" ") for element in _read_csv_rows("testing_set.txt")]
training_set = [element[0].split(" ") for element in _read_csv_rows("training_set.txt")]

# One row per article. Later cells read column 0 as the article ID, column 1 as
# the year and column 3 as a comma-separated author list.
node_info = _read_csv_rows("node_information.csv")

IDs = [element[0] for element in node_info]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kingr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kingr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


##### Graph features computation on authors and articles

##### One might want to add a directed link only when the temporal difference between the two articles allows us to do so

In [35]:
def graph_articles(citation_set, node_info, directed_or_not = 'n'):
    """Build the article graph: one node per article, one edge per positive pair.

    Parameters
    ----------
    citation_set : iterable of sequences
        Each item is read as ``(id_a, id_b, label)``. Only items whose label
        is the string ``'1'`` produce an edge.
    node_info : iterable of sequences
        ``item[0]`` becomes the node key and ``item[1]`` is stored as the
        node attribute ``year``.
    directed_or_not : str
        ``'y'`` builds an ``nx.DiGraph``; any other value builds an ``nx.Graph``.

    Returns
    -------
    The populated graph. For each positive pair the edge is added from the
    node with the smaller ``year`` to the other one (ties go ``id_b -> id_a``).

    Raises
    ------
    KeyError
        If a positive pair references an ID that is not in ``node_info``.
    """
    G = nx.DiGraph() if directed_or_not == 'y' else nx.Graph()

    for node in node_info:
        G.add_node(node[0], year=node[1])

    for citation in citation_set:
        if citation[2] != '1':
            continue
        first, second = citation[0], citation[1]
        # Use G.nodes[...]: the G.node alias was removed in NetworkX 2.4.
        # NOTE(review): years are compared as stored (strings when they come
        # straight from csv.reader) — fine for 4-digit years, confirm otherwise.
        if G.nodes[first]['year'] < G.nodes[second]['year']:
            G.add_edge(first, second)
        else:
            G.add_edge(second, first)
    return G

In [36]:
def graph_authors(citation_set, node_info, IDs=None, directed_or_not = 'n'):
    """Build the author graph induced by the article pairs in `citation_set`.

    For a pair labelled ``'1'`` every author of the first article is linked to
    every author of the second one. For any other label the authors of both
    articles are only added as nodes.

    Parameters
    ----------
    citation_set : iterable of sequences
        Each item is read as ``(source_id, target_id, label)``.
    node_info : iterable of sequences
        ``item[0]`` is the article ID and ``item[3]`` its comma-separated
        author list. When an ID appears several times the first row wins.
    IDs : optional
        Ignored. Kept (now optional) so existing positional calls keep working.
    directed_or_not : str
        ``'y'`` builds an ``nx.DiGraph``; any other value builds an ``nx.Graph``.

    Returns
    -------
    The populated graph; author names are used as node keys exactly as they
    come out of ``split(",")`` (no whitespace stripping).

    Raises
    ------
    KeyError
        If a pair references an article ID that is not in ``node_info``.
    """
    G = nx.DiGraph() if directed_or_not == 'y' else nx.Graph()

    # Index the rows once instead of scanning node_info for every citation.
    row_by_id = {}
    for element in node_info:
        row_by_id.setdefault(element[0], element)

    for counter, citation in enumerate(citation_set, start=1):
        source_authors = row_by_id[citation[0]][3].split(",")
        target_authors = row_by_id[citation[1]][3].split(",")

        if citation[2] == '1':
            for auth1 in source_authors:
                for auth2 in target_authors:
                    G.add_edge(auth1, auth2)
        else:
            for auth in source_authors + target_authors:
                G.add_node(auth)

        # Progress report every 5000 pairs. The previous `== True` test
        # compared against 1 and therefore fired at 1, 5001, 10001, ...
        if counter % 5000 == 0:
            print(counter, "training examples processsed")

    return G
    

In [37]:
def compute_page_rank_feature_for_articles(citation_set, G=None, node_info=None):
    """Sum of the PageRank scores of the two articles of every pair.

    Parameters
    ----------
    citation_set : iterable of sequences
        Each item is read as ``(id_a, id_b, ...)``.
    G : graph, optional
        Article graph to score. When omitted it is built with
        ``graph_articles(citation_set, node_info)``.
    node_info : optional
        Article rows; only needed (and then required) when ``G`` is omitted.

    Returns
    -------
    ``np.ndarray`` with one value per pair when ``G`` was supplied, otherwise
    the tuple ``(features, G)`` so the freshly built graph can be reused.

    Raises
    ------
    ValueError
        If neither ``G`` nor ``node_info`` is given.
    """
    # `G != G` is never true for None, so the graph was never built; test
    # for None explicitly.
    built_here = G is None
    if built_here:
        if node_info is None:
            raise ValueError("node_info is required to build the article graph when G is not supplied")
        G = graph_articles(citation_set, node_info)

    pg_rk = nx.pagerank(G)
    pg_rk_features = np.array(
        [pg_rk[citation[0]] + pg_rk[citation[1]] for citation in citation_set]
    )

    if built_here:
        return pg_rk_features, G
    return pg_rk_features

In [49]:
def compute_page_club_feature_for_articles(citation_set, node_info, G=None):
    """Rich-club style coefficient computed along the PageRank ranking.

    Nodes are visited in decreasing PageRank order. After adding the i-th node
    to the "club" (the i best-ranked nodes) the value appended is::

        s * k_in * n / (kincum * koutcum)

    where ``s`` is the number of edges (direction ignored) with both ends in
    the club, ``k_in`` the mean in-degree of the graph, ``n`` the number of
    nodes, and ``kincum`` / ``koutcum`` the summed in-/out-degrees of the club
    members. While either sum is still zero the value is ``1``.

    Parameters
    ----------
    citation_set, node_info
        Only used to build the graph when ``G`` is omitted.
    G : directed graph, optional
        Must provide ``in_degree`` / ``out_degree``. When omitted a directed
        graph is built with ``graph_articles(..., directed_or_not='y')``.

    Returns
    -------
    list
        One value per node, ordered by decreasing PageRank (NOT keyed by
        node ID).
    """
    if G is None:
        # in_degree / out_degree only exist on directed graphs.
        G = graph_articles(citation_set, node_info, directed_or_not='y')

    pg_rk = nx.pagerank(G)
    sorted_pgr = sorted(pg_rk.items(), key=operator.itemgetter(1), reverse=True)
    ranked_nodes = [node for node, _ in sorted_pgr]

    n = len(ranked_nodes)
    if n == 0:
        return []

    k_in = sum(degree for _, degree in G.in_degree()) / float(n)

    # Build the undirected view once; it used to be rebuilt for every node.
    undirected = G.to_undirected()

    pageclub = []
    club = set()  # best-ranked nodes seen so far, for O(1) membership tests
    s = 0
    kincum = 0
    koutcum = 0
    for node in ranked_nodes:
        club.add(node)
        for a, b in undirected.edges(node):
            # Compare node keys directly. The previous int(a) == k test
            # compared an int with a string key, was always false, and made
            # `s` count every incident edge instead of intra-club edges.
            other = b if a == node else a
            if other in club:
                s += 1

        kincum += G.in_degree(node)
        koutcum += G.out_degree(node)
        if kincum > 0 and koutcum > 0:
            pageclub.append((s * k_in * n) / (kincum * koutcum))
        else:
            pageclub.append(1)

    return pageclub
    
    
    
    
    
    

In [None]:
# Build the directed article graph from the labelled training pairs.
G = graph_articles(training_set, node_info, directed_or_not = 'y')
# `features` is a plain list with one value per node of G, ordered by
# decreasing PageRank (it is not keyed by article ID).
features = compute_page_club_feature_for_articles(training_set, node_info, G=G)
print(features)

In [None]:
# `features` is ordered by decreasing PageRank, not keyed by article ID, so
# rebuild the same ranking to get an ID -> page-club value lookup.
ranked_nodes = sorted(nx.pagerank(G).items(), key=operator.itemgetter(1), reverse=True)
page_club_by_node = {node: value for (node, _), value in zip(ranked_nodes, features)}

# Edge feature: sum of the page-club values of BOTH endpoints of the pair
# (the previous version added citation[0] twice).
features_edges = [
    page_club_by_node[citation[0]] + page_club_by_node[citation[1]]
    for citation in training_set
]

In [None]:
# Dump the per-pair feature list (one entry per element of training_set).
print(features_edges)

### The feature to save is `features_edges` above; the rest is tests and leftovers from Yannis's file

In [47]:
# PageRank of every node of G, then the node IDs sorted from highest to
# lowest score.
pagerank = nx.pagerank(G)
sorted_pgr = sorted(pagerank.items(), key=operator.itemgetter(1), reverse = True)
# Keep only the node IDs, in ranking order.
fin_pgr = [a for (a,b) in sorted_pgr]

In [48]:
# Dump the node IDs in decreasing-PageRank order (one entry per node of G).
print(fin_pgr)

['303256', '304263', '304187', '304131', '201176', '304262', '303207', '204253', '304018', '303095', '207116', '302075', '204089', '303144', '301136', '211178', '304271', '302150', '304180', '304019', '304256', '9905111', '303011', '304232', '304257', '303191', '304241', '301011', '212107', '303198', '304119', '210157', '304178', '304231', '111258', '304076', '303194', '304255', '304147', '210224', '304258', '303151', '303138', '303032', '303185', '304211', '109212', '304268', '303015', '210070', '303115', '304197', '303089', '303060', '111273', '303024', '110055', '301162', '207130', '208020', '207208', '301213', '304046', '304184', '211053', '201253', '304249', '301227', '304216', '303268', '303197', '304158', '303153', '303237', '211181', '303119', '303248', '211263', '209241', '304208', '302132', '303135', '304138', '303124', '301217', '110026', '304198', '304195', '304217', '211245', '302167', '301229', '304045', '304169', '301066', '8222', '9506171', '211041', '301090', '304143',

In [7]:
# Page-club value of the best-ranked articles. `features` holds one value per
# node of G, so it must be sliced to the same length as the x values,
# otherwise matplotlib rejects the mismatched x/y lengths.
top_n = 200
plt.plot(fin_pgr[:top_n], features[:top_n])
plt.show()

NameError: name 'fin_pgr' is not defined

In [9]:
# Re-rank the nodes by PageRank score, highest first.
sorted_pgr = sorted(pagerank.items(), key=operator.itemgetter(1), reverse = True)
# Despite its name, sp_keys holds the PageRank *scores* here, not the node IDs.
sp_keys = [float(b) for (a,b) in sorted_pgr]
# Prints True when at least two nodes share exactly the same score.
print(len(sp_keys) != len(set(sp_keys)))

True


In [60]:
# Dump whatever sp_keys currently holds in the kernel.
# NOTE(review): the recorded output shows integer IDs while the cell above
# stores float scores, so this was presumably run from a different state — confirm.
print(sp_keys)

[9711200, 9802150, 9407087, 9802109, 9908142, 9905111, 9610043, 9906064, 9408099, 9510017, 9503124, 9711162, 9510209, 9611050, 9510135, 9410167, 9601029, 9401139, 9204099, 9409089, 9412184, 9602022, 9803131, 9603142, 9411149, 9210010, 106048, 9711165, 9602052, 9405029, 9306002, 9710046, 9412228, 9603167, 9301042, 9611230, 9608024, 9703166, 9912072, 9910053, 9612115, 9703030, 9501068, 204253, 9508143, 109162, 9408074, 9602043, 9510169, 9602070, 9402002, 9512062, 9204064, 9512077, 9506171, 9802067, 9401153, 9607201, 9204083, 3190, 9803002, 101126, 9810188, 9511030, 9509066, 9512059, 202021, 9802042, 9704080, 9505105, 9202057, 9402044, 9504090, 9904207, 9507158, 9507121, 9304154, 9411048, 9301068, 9812219, 7170, 9805114, 110055, 12062, 9402032, 9206084, 9201056, 9903205, 9809039, 2034, 9802183, 9802051, 3160, 9306153, 9205068, 9212149, 9804058, 9601038, 9304011, 9211021, 9303046, 9909134, 9805170, 9603161, 9612121, 9511222, 9611042, 9712251, 204089, 2245, 9206070, 9710009, 9207016, 980608

##### Beware: each of these takes quite a long time to compute

In [3]:
def compute_page_rank_feature_for_authors(citation_set, node_info, G=None):
    """Sum of the author-graph PageRank scores of all authors of each pair.

    Parameters
    ----------
    citation_set : iterable of sequences
        Each item is read as ``(source_id, target_id, ...)``.
    node_info : iterable of sequences
        ``item[0]`` is the article ID and ``item[3]`` its comma-separated
        author list. When an ID appears several times the first row wins.
    G : graph, optional
        Author graph to score. When omitted it is built with ``graph_authors``.

    Returns
    -------
    ``np.ndarray`` with one value per pair when ``G`` was supplied, otherwise
    the tuple ``(features, G)``. An author listed on both articles is counted
    twice.

    Raises
    ------
    KeyError
        If an article ID is missing from ``node_info`` or an author is not a
        node of ``G``.
    """
    # `G != G` is never true for None, so the graph was never built.
    built_here = G is None
    if built_here:
        # graph_authors takes the ID list as third argument; derive it from
        # node_info rather than relying on a notebook-level global.
        G = graph_authors(citation_set, node_info, [element[0] for element in node_info])

    pg_rk = nx.pagerank(G)

    # Index the rows once instead of scanning node_info for every pair.
    row_by_id = {}
    for element in node_info:
        row_by_id.setdefault(element[0], element)

    pg_rk_features = []
    for citation in citation_set:
        source_authors = row_by_id[citation[0]][3].split(",")
        target_authors = row_by_id[citation[1]][3].split(",")
        pg_rk_features.append(sum(pg_rk[auth] for auth in source_authors + target_authors))

    if built_here:
        return np.array(pg_rk_features), G
    return np.array(pg_rk_features)

In [26]:
def compute_rich_club_feature_for_articles(citation_set, G=None, node_info=None):
    """Sum of the rich-club coefficients attached to the two articles of each pair.

    ``nx.rich_club_coefficient`` returns a dictionary keyed by *degree*, not
    by node, so every article is mapped to the coefficient of its own degree
    in ``G``. Degrees for which NetworkX reports no coefficient contribute
    ``0.0``.

    Parameters
    ----------
    citation_set : iterable of sequences
        Each item is read as ``(id_a, id_b, ...)``.
    G : graph, optional
        Article graph. NetworkX documents ``rich_club_coefficient`` as not
        implemented for directed graphs or multigraphs. When omitted the
        graph is built with ``graph_articles(citation_set, node_info)``.
    node_info : optional
        Article rows; only needed (and then required) when ``G`` is omitted.

    Returns
    -------
    ``np.ndarray`` with one value per pair when ``G`` was supplied, otherwise
    the tuple ``(features, G)``.

    Raises
    ------
    ValueError
        If neither ``G`` nor ``node_info`` is given.
    """
    # `G != G` is never true for None, so the graph was never built.
    built_here = G is None
    if built_here:
        if node_info is None:
            raise ValueError("node_info is required to build the article graph when G is not supplied")
        G = graph_articles(citation_set, node_info)

    rc_cl = nx.rich_club_coefficient(G)

    def coefficient_of(node):
        # Look the coefficient up by the node's degree (the dictionary key).
        return rc_cl.get(G.degree(node), 0.0)

    rc_cl_features = np.array(
        [coefficient_of(citation[0]) + coefficient_of(citation[1]) for citation in citation_set]
    )

    if built_here:
        return rc_cl_features, G
    return rc_cl_features

In [25]:
def compute_rich_club_feature_for_authors(citation_set, node_info, G=None):
    """Sum of the rich-club coefficients attached to all authors of each pair.

    ``nx.rich_club_coefficient`` returns a dictionary keyed by *degree*, not
    by node, so every author is mapped to the coefficient of their own degree
    in ``G``. Degrees for which NetworkX reports no coefficient contribute
    ``0.0``.

    Parameters
    ----------
    citation_set : iterable of sequences
        Each item is read as ``(source_id, target_id, ...)``.
    node_info : iterable of sequences
        ``item[0]`` is the article ID and ``item[3]`` its comma-separated
        author list. When an ID appears several times the first row wins.
    G : graph, optional
        Author graph. NetworkX documents ``rich_club_coefficient`` as not
        implemented for directed graphs or multigraphs. When omitted the
        graph is built with ``graph_authors``.

    Returns
    -------
    ``np.ndarray`` with one value per pair when ``G`` was supplied, otherwise
    the tuple ``(features, G)``.

    Raises
    ------
    KeyError
        If an article ID is missing from ``node_info``.
    """
    # `G != G` is never true for None, so the graph was never built.
    built_here = G is None
    if built_here:
        # graph_authors takes the ID list as third argument; derive it from
        # node_info rather than relying on a notebook-level global.
        G = graph_authors(citation_set, node_info, [element[0] for element in node_info])

    rc_cl = nx.rich_club_coefficient(G)

    # Index the rows once instead of scanning node_info for every pair.
    row_by_id = {}
    for element in node_info:
        row_by_id.setdefault(element[0], element)

    rc_cl_features = []
    for citation in citation_set:
        source_authors = row_by_id[citation[0]][3].split(",")
        target_authors = row_by_id[citation[1]][3].split(",")
        # Look each coefficient up by the author's degree (the dictionary key).
        rc_cl_features.append(
            sum(rc_cl.get(G.degree(auth), 0.0) for auth in source_authors + target_authors)
        )

    if built_here:
        return np.array(rc_cl_features), G
    return np.array(rc_cl_features)

In [16]:
def compute_shorthest_path_feature_for_articles(citation_set, G=None, node_info=None, no_path_value=30):
    """Shortest-path length between the two articles of each pair.

    For a pair labelled ``'1'`` the direct edge between the two articles is
    taken out of ``G`` while the path is measured and put back afterwards, so
    the feature does not simply encode the label.

    Parameters
    ----------
    citation_set : iterable of sequences
        Each item is read as ``(source_id, target_id, label)``.
    G : graph, optional
        Article graph. When omitted it is built with
        ``graph_articles(citation_set, node_info)``.
        NOTE(review): edge restoring assumes a simple Graph/DiGraph (what
        ``graph_articles`` builds), not a multigraph — confirm for other inputs.
    node_info : optional
        Article rows; only needed (and then required) when ``G`` is omitted.
    no_path_value : number
        Value used when no path links the two articles (default 30, the
        constant previously hard-coded).

    Returns
    -------
    ``np.ndarray`` with one value per pair when ``G`` was supplied, otherwise
    the tuple ``(features, G)``.

    Raises
    ------
    ValueError
        If neither ``G`` nor ``node_info`` is given.
    """
    # `G != G` is never true for None, so the graph was never built.
    built_here = G is None
    if built_here:
        if node_info is None:
            raise ValueError("node_info is required to build the article graph when G is not supplied")
        G = graph_articles(citation_set, node_info)

    sht_pth_features = []
    for citation in citation_set:
        source, target = citation[0], citation[1]

        # Hide the known link in whichever direction it is stored: in a
        # directed graph the edge may be (target, source), in which case
        # remove_edge(source, target) would raise and the re-add would create
        # a spurious reversed edge.
        hidden_edges = []
        if citation[2] == '1':
            for u, v in ((source, target), (target, source)):
                if G.has_edge(u, v):
                    hidden_edges.append((u, v, dict(G.get_edge_data(u, v))))
                    G.remove_edge(u, v)

        try:
            length = nx.shortest_path_length(G, source, target)
        except nx.NetworkXNoPath:
            length = no_path_value
        finally:
            # Always restore the graph, even if the path search fails.
            for u, v, data in hidden_edges:
                G.add_edge(u, v, **data)

        sht_pth_features.append(length)

    if built_here:
        return np.array(sht_pth_features), G
    return np.array(sht_pth_features)