# All necessary classes and methods for the implementation

In [1]:
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
class CitationDatabase:
    """In-memory citation network of papers, authors and citations.

    The database stores the registered papers and authors, the list of
    citations (pairs ``(citing_paper_id, cited_paper_id)``) and the rankings
    derived from them: the pagerank-value of every paper and the h-index,
    pagerank-value and pagerank-index (percentile rank) of every author.
    Rankings are (re)computed by :meth:`ranking_update`.
    """

    def __init__(self, papers=None, authors=None, citations=None, alpha=0.9, keep_updated=False):
        """Create a database, optionally pre-filled.

        Args:
            papers: list of paper objects to start with (stored as given).
            authors: iterable of author objects; each one is registered
                through :meth:`add_author`.
            citations: list of ``(citing_paper_id, cited_paper_id)`` pairs
                (stored as given; citation counts are not touched).
            alpha: damping parameter passed to the pagerank algorithm.
            keep_updated: when true, rankings are recomputed after every
                added paper, author or citation.
        """
        # parameter alpha for the pagerank algorithm
        self.alpha = alpha
        # keep the ranking lists updated after each change
        # (must be set before add_author() is called below, which reads it)
        self.keep_updated = keep_updated

        # indices for all papers and authors
        self.paper_pvalues = {}
        self.author_h_indices = {}
        self.author_pvalues = {}
        self.author_pindices = {}

        # all papers in the citation network
        self.all_papers = [] if papers is None else papers

        # all citations in the citation network
        self.citations = [] if citations is None else citations

        # all authors in the citation network
        self.all_authors = []
        # all authors from the "negative" group
        self.negative_authors = []
        # all authors from the "positive" group
        self.positive_authors = []
        if authors is not None:
            for author in authors:
                self.add_author(author)

    # update all rankings based on citations in the database
    def ranking_update(self):
        """Recompute paper pagerank-values and all author rankings."""
        # construct citation-graph (kept on the instance so it can be drawn)
        G = nx.DiGraph()
        self.G = G
        G.add_nodes_from(self.paper_ids)
        G.add_edges_from(self.citations)

        # calculate papers' pagerank-values
        self.paper_pvalues = nx.pagerank(G, alpha=self.alpha)

        # author rankings; pagerank-indices depend on the pagerank-values,
        # so the order of the first two calls matters
        self._update_author_pvalues()
        self._update_author_pindices()
        self._update_author_h_indices()

    # calculate authors' pagerank-values
    def _update_author_pvalues(self):
        """Distribute each paper's pagerank-value among its co-authors.

        The share of a paper's value given to an author decreases linearly
        with the author's position in the paper's author list; the weights
        of all positions of one paper sum to 1.
        """
        for author in self.all_authors:
            author_pvalue = 0
            for paper in author.paper_list:
                # N - number of authors of the paper
                N = paper.num_of_authors
                # R - position of the author in the paper's author list (1-based)
                R = paper.author_list.index(author) + 1
                # w - weight of the pagerank-value assigned to the author
                w = (N - R + 1) / (0.5 * N * (N + 1))
                # aggregate the weighted pagerank-values of all author's papers
                author_pvalue += w * paper.pagerank_value
            self.author_pvalues[author.author_id] = author_pvalue

    # calculate authors' pagerank-indices
    def _update_author_pindices(self):
        """Compute the percentile rank of every author's pagerank-value.

        Percentile rank = 100 * (cf + 0.5 * f) / n, where cf is the number of
        scores strictly below the author's score, f the number of scores
        equal to it and n the number of authors.
        """
        scores = list(self.author_pvalues.values())
        # n - number of scores in the distribution
        n = len(self.all_authors)
        for author in self.all_authors:
            score = author.pagerank_value
            # cf - cumulative frequency: scores less than the score of interest
            cf = sum(1 for value in scores if value < score)
            # f - frequency of the score of interest (ties, including itself)
            f = sum(1 for value in scores if value == score)
            self.author_pindices[author.author_id] = 100 * (cf + 0.5 * f) / n

    # calculate authors' h-indices
    def _update_author_h_indices(self):
        """Store the h-index of every author in ``author_h_indices``."""
        for author in self.all_authors:
            counts = [paper.citation_count for paper in author.paper_list]
            self.author_h_indices[author.author_id] = self._h_index(counts)

    @staticmethod
    def _h_index(citation_counts):
        """Return the largest h such that at least h counts are >= h."""
        h = 0
        # after sorting in descending order, the paper at 1-based position i
        # guarantees i papers with at least that many citations
        for position, count in enumerate(sorted(citation_counts, reverse=True), start=1):
            if count < position:
                break
            h = position
        return h

    # find a registered paper by its id
    def _paper_by_id(self, paper_id):
        """Return the registered paper with the given id, or None."""
        # fast path: ids frequently coincide with the position in the list
        if isinstance(paper_id, int) and 0 <= paper_id < len(self.all_papers):
            candidate = self.all_papers[paper_id]
            if candidate.paper_id == paper_id:
                return candidate
        for paper in self.all_papers:
            if paper.paper_id == paper_id:
                return paper
        return None

    # add new paper to the database
    def add_paper(self, paper):
        """Register ``paper`` and point its ``database`` attribute here."""
        paper.database = self
        self.all_papers.append(paper)
        # keep rankings updated
        if self.keep_updated:
            self.ranking_update()

    # add new author to the database
    def add_author(self, author):
        """Register ``author`` and sort it into its group.

        Authors with a truthy ``member`` attribute go to the "negative"
        group, all others to the "positive" group.
        """
        author.database = self
        self.all_authors.append(author)
        if author.member:
            self.negative_authors.append(author)
        else:
            self.positive_authors.append(author)
        # keep rankings updated
        if self.keep_updated:
            self.ranking_update()

    # add new citation to the database
    def add_citation(self, citation):
        """Record a citation ``(citing_paper_id, cited_paper_id)``.

        The cited paper is looked up by its ``paper_id`` (the same identifier
        that is used for the nodes of the citation graph).

        Raises:
            IndexError: if no registered paper has the cited id; in that case
                the database is left unchanged.
        """
        cited = self._paper_by_id(citation[1])
        if cited is None:
            raise IndexError("cited paper {!r} is not in the database".format(citation[1]))
        self.citations.append(citation)
        # update citation count for the cited paper
        cited.citation_count += 1
        # keep rankings updated
        if self.keep_updated:
            self.ranking_update()

    # all author ids from the database
    @property
    def author_ids(self):
        """List of ids of all registered authors, in registration order."""
        return [author.author_id for author in self.all_authors]

    # all paper ids from the database
    @property
    def paper_ids(self):
        """List of ids of all registered papers, in registration order."""
        return [paper.paper_id for paper in self.all_papers]

    # recommend ID for new author
    def new_author_id(self):
        """Return the number of registered authors as the next free id."""
        return len(self.all_authors)

In [3]:
class Paper():
    """A paper in the citation network.

    Creating a paper adds it to the paper list of each of its authors and
    registers it in the given database.
    """

    def __init__(self, database, paper_id, member=False, author_list=None, impact_factor=None, 
                 num_of_refs=None, num_of_inter_refs=None, title=None):
        """Create the paper and register it with its authors and database.

        Args:
            database: database object; its ``add_paper`` method is called
                with this paper.
            paper_id: the paper's unique identifier.
            member: whether the paper belongs to the "negative group".
            author_list: ordered list of author objects; ``None`` means the
                paper has no authors.
            impact_factor: impact factor of the paper.
            num_of_refs: total number of references the paper makes.
            num_of_inter_refs: number of references within the studied field.
            title: title of the paper.
        """
        self.database = database

        # paper's unique identifier
        self.paper_id = paper_id
        # member of "negative group" of papers or not
        self.member = member

        # list of authors of the paper (an omitted list means "no authors")
        self.author_list = [] if author_list is None else author_list
        # how many authors are assigned to the paper
        self.num_of_authors = len(self.author_list)

        # the impact factor of the paper
        # used in simulating the author behaviour, in synthesized systems only 
        # the pagerank-index doesn't use impact factors
        self.impact_factor = impact_factor
        
        # total number of references a paper makes
        self.num_of_refs = num_of_refs
        
        # number of references paper makes within the particular field studied
        # (always less than total number of references)
        self.num_of_inter_refs = num_of_inter_refs

        # title of the paper
        self.title = title

        # number of citations paper receives
        self.citation_count = 0

        # set this paper as new paper for all authors from the paper's author list
        for author in self.author_list:
            author.add_paper(self)

        # "register" paper in specified database
        self.database.add_paper(self)
        

    def __repr__(self):
        # authors are shown by name, falling back to the id for unnamed authors
        author_labels = [author.name if author.name is not None else str(author.author_id)
                         for author in self.author_list]
        return "Paper {} | {} | {}".format(self.paper_id, self.title, ', '.join(author_labels))
        
    # pagerank-value of paper based on data in the database
    @property
    def pagerank_value(self):
        """Pagerank-value stored for this paper in the database.

        Raises KeyError if the database holds no value for this paper id.
        """
        return self.database.paper_pvalues[self.paper_id]
            

In [4]:
class Author():
    """An author in the citation network.

    The author keeps a list of own papers and reads all rankings
    (h-index, pagerank-value, pagerank-index) from the database it
    is registered in.
    """

    def __init__(self, database, author_id, member=False, paper_list=None, name=None):
        # database the author belongs to
        self.database = database
        # unique identifier, "negative group" flag and display name
        self.author_id = author_id
        self.member = member
        self.name = name
        # papers authored or co-authored by the author
        self.paper_list = paper_list if paper_list is not None else []

        # "register" author in specified database
        self.database.add_author(self)

    def __repr__(self):
        own_paper_ids = [paper.paper_id for paper in self.paper_list]
        return "Author {} | {} | {}".format(self.author_id, self.name, own_paper_ids)

    # add new paper to the author's paper list
    def add_paper(self, paper):
        """Append ``paper`` to the author's papers unless already present."""
        if paper in self.paper_list:
            return
        self.paper_list.append(paper)

    # number of papers authored or co-authored by this author
    @property
    def num_of_papers(self):
        """Length of the author's paper list."""
        return len(self.paper_list)

    # number of citations author has for all of his papers
    @property
    def citation_count(self):
        """Sum of the citation counts of all the author's papers."""
        return sum(paper.citation_count for paper in self.paper_list)

    # h-index of author based on data in the database
    @property
    def h_index(self):
        """h-index stored for this author in the database."""
        h_indices = self.database.author_h_indices
        return h_indices[self.author_id]

    # pagerank-value of author based on data in the database
    @property
    def pagerank_value(self):
        """Pagerank-value stored for this author in the database."""
        pvalues = self.database.author_pvalues
        return pvalues[self.author_id]

    # pagerank-index of author based on data in the database
    @property
    def pagerank_index(self):
        """Pagerank-index stored for this author in the database."""
        pindices = self.database.author_pindices
        return pindices[self.author_id]



In [5]:
def plot_indices(db, title=None, figheight=6, figwidth=20, 
                 marker_pos='o', marker_neg='x', color_pos='green', color_neg='red', 
                 legend=True, vertical=False):
    """Scatter-plot h-index, pagerank-value and pagerank-index per author.

    Three panels are drawn (one per ranking), each showing the authors of the
    positive and of the negative group against their author ids.

    Args:
        db: database providing ``positive_authors`` and ``negative_authors``;
            every author must expose ``author_id``, ``h_index``,
            ``pagerank_value`` and ``pagerank_index``.
        title: optional figure title.
        figheight, figwidth: figure size passed to the figure.
        marker_pos, marker_neg: markers of the positive / negative group.
        color_pos, color_neg: colours of the positive / negative group.
        legend: whether to show a legend on each panel.
        vertical: stack the panels vertically instead of side by side.
    """
    negative_ids = [author.author_id for author in db.negative_authors]
    positive_ids = [author.author_id for author in db.positive_authors]

    # (name used in the panel texts, author attribute holding the ranking)
    rankings = (
        ("h-index", "h_index"),
        ("pagerank-value", "pagerank_value"),
        ("pagerank-index", "pagerank_index"),
    )
    # collect all values before any figure is created, so a missing ranking
    # fails without leaving an empty figure behind
    panels = [
        (
            label,
            [getattr(author, attr) for author in db.positive_authors],
            [getattr(author, attr) for author in db.negative_authors],
        )
        for label, attr in rankings
    ]

    if vertical:
        fig, axes = plt.subplots(3, 1)
    else:
        fig, axes = plt.subplots(1, 3)

    fig.set_figheight(figheight)
    fig.set_figwidth(figwidth)
    if title is not None:
        fig.suptitle(title)

    for ax, (label, positive_values, negative_values) in zip(axes, panels):
        ax.set_title("Spread of the {} for authors".format(label))
        ax.scatter(positive_ids, positive_values, color=color_pos, marker=marker_pos, label='positive group')
        ax.scatter(negative_ids, negative_values, color=color_neg, marker=marker_neg, label='negative group')
        ax.set_xlabel("Author ID")
        ax.set_ylabel("Author {}".format(label))
        if legend:
            ax.legend()

    plt.show()

In [6]:
def draw_graph(G, img_name):
    """Draw graph ``G`` with a random layout, save it to ``img_name`` and show it."""
    node_style = dict(node_color='cadetblue', node_size=100)
    label_style = dict(font_color='black', font_size=10, font_family='arial')
    edge_style = dict(edgelist=G.edges, width=1, edge_color='purple')

    plt.figure(figsize=(30, 25))
    # one shared set of node positions for nodes, labels and edges
    layout = nx.random_layout(G)
    nx.draw_networkx_nodes(G, layout, **node_style)
    nx.draw_networkx_labels(G, layout, **label_style)
    nx.draw_networkx_edges(G, layout, **edge_style)
    plt.axis('off')
    # save before show()
    plt.savefig(img_name)
    plt.show()

## Some tests:

In [7]:
# Demo: create an empty database and register three authors in it.
if __name__ == '__main__':
    db = CitationDatabase()
    # third positional argument is `member`: Jeff and Bob are flagged True
    # (negative group), Helen False (positive group) and has no name
    Jeff = Author(db, db.new_author_id(), True, name="Jeff")
    Bob = Author(db, db.new_author_id(), True, name="Bob")
    Helen = Author(db, db.new_author_id(), False)

    # each Author registered itself in db on construction
    for author in db.all_authors:
        print(author)

Author 0 | Jeff | []
Author 1 | Bob | []
Author 2 | None | []


In [8]:
# Demo: create three papers; each one registers itself in db and in the
# paper lists of its authors.
if __name__ == '__main__':
    # NOTE(review): positional arguments are (database, paper_id, member,
    # author_list, impact_factor, num_of_refs, num_of_inter_refs), so the
    # strings below are bound to `member`, not `title` (titles stay None) --
    # confirm this is intended.
    paper1 = Paper(db, 0, "nes", [Jeff, Bob, Helen], 20, 3, 8)
    paper2 = Paper(db, 1, "nessss", [Jeff, Bob], 10, 3, 8)
    paper3 = Paper(db, 2, "niss", [Helen], 9, 3, 8)

    for paper in db.all_papers:
        print(paper)

Paper 0 | None | Jeff, Bob, 2
Paper 1 | None | Jeff, Bob
Paper 2 | None | 2


In [9]:
# Demo: print the authors again; their paper lists now contain the new papers.
if __name__ == '__main__':
    for author in db.all_authors:
        print(author)

Author 0 | Jeff | [0, 1]
Author 1 | Bob | [0, 1]
Author 2 | None | [0, 2]


In [10]:
# Demo: paper pagerank-values before and after adding a single citation.
if __name__ == '__main__':
    # no citations yet
    db.ranking_update()
    print(db.paper_pvalues)
    # record that paper 1 cites paper 2, then recompute the rankings
    db.add_citation((1, 2))
    db.ranking_update()
    print(db.paper_pvalues)

{0: 0.3333333333333333, 1: 0.3333333333333333, 2: 0.3333333333333333}
{0: 0.2564101201433333, 1: 0.2564101201433333, 2: 0.48717975971333327}
