In [1]:
import numpy as np
import scipy.sparse as ssp
from collections import defaultdict
import pickle

# Define paper data
papers = [
    ("A review of generative adversarial networks and its application in cybersecurity", "Artificial Intelligence Review", 2020),
    ("Biometric fingerprint generation using generative adversarial networks", "Artificial Intelligence for Cyber Security", 2021),
    ("Object detection for robot coordination in robotics soccer", "Nigerian Journal of Technological Development", 2022),
    ("Conflict resolution via emerging technologies?", "Journal of Physics: Conference Series", 2019),
    ("A predictive model for automatic generation control in smart grids using artificial neural networks", "Emerging Technologies for Developing Countries", 2019),
    ("Estimating the time-lapse between medical insurance reimbursement with non-parametric regression models", "Advances in Information and Communication", 2020)
]

# Create mapping of journals to papers
journal_to_papers = defaultdict(list)
for idx, (title, journal, year) in enumerate(papers):
    journal_to_papers[journal].append(idx)

# List of journals
journals = list(journal_to_papers.keys())

In [2]:
print(journal_to_papers)
print(journals)

defaultdict(<class 'list'>, {'Artificial Intelligence Review': [0], 'Artificial Intelligence for Cyber Security': [1], 'Nigerian Journal of Technological Development': [2], 'Journal of Physics: Conference Series': [3], 'Emerging Technologies for Developing Countries': [4], 'Advances in Information and Communication': [5]})
['Artificial Intelligence Review', 'Artificial Intelligence for Cyber Security', 'Nigerian Journal of Technological Development', 'Journal of Physics: Conference Series', 'Emerging Technologies for Developing Countries', 'Advances in Information and Communication']


In [8]:
# Create row and column indices for the sparse matrix
rows = []
cols = []

for journal_idx, journal in enumerate(journals):
    for paper_idx in journal_to_papers[journal]:
        rows.append(paper_idx)
        cols.append(journal_idx)

# Create sparse matrix G
G = ssp.coo_matrix((np.ones(len(rows), dtype=np.int8), (rows, cols)), shape=(len(papers), len(journals)), dtype=np.int8)

# Extract publication years
paper_dates = np.array([year for _, _, year in papers], dtype=np.int32)

In [18]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Define the dataframes
papers_df = pd.DataFrame({
    'paper_id': [0, 1, 2, 3, 4],
    'open_alex_id': [
        'https://openalex.org/W100013003',
        'https://openalex.org/W1000167386',
        'https://openalex.org/W1000334729',
        'https://openalex.org/W1000340018',
        'https://openalex.org/W1000355943',
    ]
})

topics_df = pd.DataFrame({
    'topic_id': [1, 2, 3],
    'topic': ['Regression Models', 'Non-Parametric Regression Models', 'Multi Agents']
})

# Example paper-topic associations (this should be based on your actual data)
# Here, we're assuming each paper is associated with a specific topic.
# You would replace this with actual relationships from your data.
# Example paper-topic associations
associations_df = pd.DataFrame({
    'paper_id': [0, 0, 1, 2, 3, 4],
    'topic_id': [1, 2, 1, 0, 1, 0],  # Example topic associations
})

# Create row and column indices
row_indices = associations_df['paper_id'].values
col_indices = associations_df['topic_id'].values

# Define the data (all ones in this case, indicating presence of association)
data = np.ones(len(row_indices), dtype=np.int8)

# Create the sparse matrix
num_papers = len(papers_df)
num_topics = len(topics_df)
sparse_matrix = csr_matrix((data, (row_indices, col_indices)), shape=(num_papers, num_topics), dtype=np.int8)

# Print the sparse matrix
print(sparse_matrix.toarray())


[[0 1 1]
 [0 1 0]
 [1 0 0]
 [0 1 0]
 [1 0 0]]


In [19]:
print(sparse_matrix)


<Compressed Sparse Row sparse matrix of dtype 'int8'
	with 6 stored elements and shape (5, 3)>
  Coords	Values
  (0, 1)	1
  (0, 2)	1
  (1, 1)	1
  (2, 0)	1
  (3, 1)	1
  (4, 0)	1


In [7]:
# Save the hypergraph matrix and paper dates
with open('hypergraph.pkl', 'wb') as f:
    pickle.dump((G.row.tolist(), G.col.tolist()), f)

with open('paper_dates.pkl', 'wb') as f:
    pickle.dump(paper_dates, f)


In [1]:
import pandas as pd

# Example data based on the provided structure
papers_df = pd.DataFrame({
    'paper_id': [0, 1, 2, 3, 4, 5],
    'title': [
        'A review of generative adversarial networks and its application in cybersecurity',
        'Biometric fingerprint generation using generative adversarial networks',
        'Object detection for robot coordination in robotics soccer',
        'Conflict resolution via emerging technologies?',
        'A predictive model for automatic generation control in smart grids using artificial neural networks',
        'Estimating the time-lapse between medical insurance reimbursement with non-parametric regression models'
    ]
})

# Define terms for Predictive AI
predictive_ai_df = pd.DataFrame({
    'paper_id': [4, 5, 3],
    'term': ['Regression Models', 'Non-Parametric Regression Models', 'Multi Agents']
})

# Define terms for Computer Vision
computer_vision_df = pd.DataFrame({
    'paper_id': [0, 1, 2],
    'term': ['gans', 'Object Detection', 'Object Detection']
})


In [2]:
import numpy as np
import scipy.sparse as ssp
import pickle

def create_hierarchical_incidence_matrix(papers_df, topics_df, term_column):
    # Create mapping from term to index
    term_to_index = {term: idx for idx, term in enumerate(terms_df[term_column].unique())}
    
    # Initialize matrix
    num_papers = len(papers_df)
    num_terms = len(term_to_index)
    rows = []
    cols = []
    
    # Fill matrix
    for _, row in terms_df.iterrows():
        paper_id = row['paper_id']
        term = row[term_column]
        if term in term_to_index:
            rows.append(paper_id)
            cols.append(term_to_index[term])
    
    data = np.ones(len(rows), dtype=np.int8)  # Binary matrix
    incidence_matrix = ssp.coo_matrix((data, (rows, cols)), shape=(num_papers, num_terms))
    
    return incidence_matrix, term_to_index

# Create incidence matrices for new categories
predictive_ai_matrix, predictive_ai_to_index = create_hierarchical_incidence_matrix(papers_df, predictive_ai_df, 'term')
computer_vision_matrix, computer_vision_to_index = create_hierarchical_incidence_matrix(papers_df, computer_vision_df, 'term')



In [12]:
print(predictive_ai_matrix)
print(predictive_ai_to_index)
print(computer_vision_matrix)
print(computer_vision_to_index)

  (4, 0)	1
  (5, 1)	1
  (3, 2)	1
{'Regression Models': 0, 'Non-Parametric Regression Models': 1, 'Multi Agents': 2}
  (0, 0)	1
  (1, 1)	1
  (2, 1)	1
{'gans': 0, 'Object Detection': 1}


In [13]:
# Save matrices and mappings
def save_to_pickle(matrix, filename):
    with open(filename, 'wb') as f:
        pickle.dump(matrix, f)

def save_mapping(mapping, filename):
    with open(filename, 'wb') as f:
        pickle.dump(mapping, f)

save_to_pickle(predictive_ai_matrix, 'predictive_ai.pkl')
save_mapping(predictive_ai_to_index, 'predictive_ai_mapping.pkl')

save_to_pickle(computer_vision_matrix, 'computer_vision.pkl')
save_mapping(computer_vision_to_index, 'computer_vision_mapping.pkl')


In [3]:
# Create id2chemical.pkl (for demonstration, we're using a general term set)
id2predictive_ai = predictive_ai_df.groupby('paper_id')['term'].apply(list).to_dict()
id2computer_vision = computer_vision_df.groupby('paper_id')['term'].apply(list).to_dict()

In [15]:
print(id2predictive_ai)

{3: ['Multi Agents'], 4: ['Regression Models'], 5: ['Non-Parametric Regression Models']}


In [4]:
# Save to pickle files
with open('id2predictive_ai.pkl', 'wb') as f:
    pickle.dump(id2predictive_ai, f)

with open('id2computer_vision.pkl', 'wb') as f:
    pickle.dump(id2computer_vision, f)

In [17]:
import numpy as np
import scipy.sparse as ssp
import pickle

# Number of papers
num_papers = len(papers_df)

# Create a random citation matrix for demonstration
# In a real scenario, you would replace this with actual citation data
citation_matrix = np.random.randint(0, 5, size=(num_papers, num_papers))
citation_matrix = ssp.csr_matrix(citation_matrix)

In [18]:
print(citation_matrix)

  (0, 0)	3
  (0, 1)	4
  (0, 2)	1
  (0, 3)	1
  (0, 4)	2
  (0, 5)	1
  (1, 0)	3
  (1, 1)	1
  (1, 2)	2
  (1, 3)	1
  (1, 5)	3
  (2, 0)	3
  (2, 1)	2
  (2, 2)	2
  (2, 4)	4
  (2, 5)	3
  (3, 0)	4
  (3, 1)	1
  (3, 2)	1
  (3, 3)	2
  (3, 4)	2
  (3, 5)	4
  (4, 0)	2
  (4, 1)	3
  (4, 2)	2
  (4, 3)	3
  (4, 4)	1
  (4, 5)	4
  (5, 0)	2
  (5, 1)	4
  (5, 2)	2
  (5, 3)	1
  (5, 4)	2
  (5, 5)	2


In [19]:
# Save to pickle file
with open('citations.pkl', 'wb') as f:
    pickle.dump(citation_matrix, f)

In [23]:
import time
import pandas as pd
import scipy.sparse as ssp
import numpy as np
from multiprocessing import Pool, cpu_count
from scipy.stats import dirichlet
from collections import Counter
from datetime import datetime
import random
from collections import defaultdict
from scipy.optimize import minimize
from itertools import zip_longest
from numba import guvectorize, vectorize
from numba import int64, float64, int32
from numba import cuda
import math
import pickle as pickle
from scipy.stats import pearsonr
from sklearn.metrics import precision_recall_curve
from functools import partial
import numpy as np
from collections import Counter
from multiprocessing import Pool, cpu_count
from functools import partial
import pylab as plt
import seaborn as sns
import numpy as np
import math
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix


class Stopwatch:
    start_time = None

    def go(self, msg=''):
        if msg:
            print(msg, flush=True)
        self.start_time = time.time()

    def stop(self, msg=''):
        if msg:
            print("{}: {} seconds".format(msg, time.time() - self.start_time), flush=True)
        else:
            print("Elapsed time: {} seconds".format(time.time() - self.start_time), flush=True)

    def check(self):
        return time.time() - self.start_time


tic = Stopwatch()


def load_date(filename):
    tic = Stopwatch()
    print("Loading paper dates %s from disk..." % filename),
    tic.go()
    pkl_file = open(filename, 'rb')
    A = pickle.load(pkl_file, encoding='latin1')
    pkl_file.close()
    tic.stop()
    return A


def load_hypergraph(filename):
    tic = Stopwatch()
    print("Loading file %s from disk..." % filename),
    tic.go()
    pkl_file = open(filename, 'rb')
    (row, col) = pickle.load(pkl_file, encoding='latin1')
    pkl_file.close()
    A = ssp.coo_matrix((np.ones(len(row), dtype=np.int8), (row, col)), shape=(19916562, max(col) + 1), dtype=np.int8)
    tic.stop()
    return A


tic = Stopwatch()
G = load_hypergraph('medline/journals.pkl').tocsr()
paper_dates = load_date('medline/paper_dates.pkl')

# G = G[paper_dates > 0, :]
# paper_dates = paper_dates[paper_dates > 0]
# G = G[paper_dates < 2010, :]
# paper_dates = paper_dates[paper_dates < 2010]
# paper_dates[paper_dates < 1950] = 1950

Loading file medline/journals.pkl from disk...
Elapsed time: 105.71194052696228 seconds
Loading paper dates medline/paper_dates.pkl from disk...
Elapsed time: 3.307645797729492 seconds


In [30]:
print(paper_dates)

[1975 1975 1975 ...   -1   -1 2009]


In [27]:
print(G)

<Compressed Sparse Row sparse matrix of dtype 'int8'
	with 96663162 stored elements and shape (19916562, 30126)>
  Coords	Values
  (0, 5168)	1
  (0, 7337)	2
  (0, 8907)	1
  (0, 8989)	2
  (0, 15846)	1
  (0, 20721)	1
  (0, 22609)	1
  (0, 23131)	2
  (0, 24442)	1
  (0, 24723)	1
  (1, 2833)	1
  (1, 5486)	2
  (1, 7337)	3
  (1, 10365)	1
  (1, 18842)	6
  (1, 24660)	6
  (2, 7015)	1
  (2, 7337)	1
  (2, 8989)	2
  (2, 9528)	1
  (2, 15471)	2
  (2, 18842)	3
  (2, 20155)	1
  (2, 24075)	1
  (2, 27910)	1
  :	:
  (16953034, 26828)	3
  (16953122, 885)	1
  (16953122, 2833)	2
  (16953122, 4961)	1
  (16953122, 5458)	1
  (16953122, 6428)	1
  (16953122, 7337)	1
  (16953122, 8097)	1
  (16953122, 15471)	1
  (16953122, 23872)	1
  (16953122, 24468)	1
  (16953122, 27290)	1
  (16953122, 28953)	1
  (16953158, 2084)	1
  (16953158, 10087)	2
  (16953158, 10914)	1
  (16953158, 13735)	2
  (16953158, 16797)	10
  (16953158, 17101)	1
  (16953158, 18443)	1
  (16953158, 20712)	2
  (16953158, 25113)	1
  (16953158, 26851)	2
  (

In [16]:
C = load_hypergraph('medline/chemical.pkl').tocsr()
D = load_hypergraph('medline/disease.pkl').tocsr()
M = load_hypergraph('medline/method.pkl').tocsr()

Loading file medline/chemical.pkl from disk...
Elapsed time: 7.957335472106934 seconds
Loading file medline/disease.pkl from disk...
Elapsed time: 4.053662300109863 seconds
Loading file medline/method.pkl from disk...
Elapsed time: 5.240198135375977 seconds


In [32]:
print(C)

<Compressed Sparse Row sparse matrix of dtype 'int8'
	with 32157364 stored elements and shape (19916562, 9159)>
  Coords	Values
  (0, 150)	1
  (0, 159)	1
  (0, 861)	1
  (0, 1954)	1
  (1, 3069)	1
  (1, 3075)	1
  (2, 788)	1
  (2, 869)	1
  (2, 2859)	1
  (2, 5007)	1
  (3, 964)	1
  (3, 1027)	1
  (3, 1412)	1
  (3, 2087)	1
  (3, 4451)	1
  (4, 2221)	1
  (4, 2870)	1
  (4, 3043)	1
  (5, 1147)	1
  (5, 2234)	1
  (5, 2561)	1
  (5, 2695)	1
  (5, 3339)	1
  (5, 3343)	1
  (6, 918)	1
  :	:
  (19904833, 1755)	1
  (19904833, 2692)	1
  (19904833, 3818)	1
  (19904833, 5558)	1
  (19904834, 5426)	1
  (19904834, 5860)	1
  (19904835, 5134)	1
  (19904835, 5558)	1
  (19908621, 368)	1
  (19908624, 368)	1
  (19908626, 382)	1
  (19908626, 6194)	1
  (19908627, 382)	1
  (19908627, 2719)	1
  (19908627, 6194)	1
  (19910909, 4694)	1
  (19910910, 214)	1
  (19910910, 446)	1
  (19910910, 4131)	1
  (19910911, 2486)	1
  (19911454, 41)	1
  (19911454, 5863)	1
  (19911459, 2486)	1
  (19911459, 6488)	1
  (19916200, 376)	1


In [13]:
tic.go('Loading citation data...')
citations=pickle.load(open('medline/citations.pkl','rb'),encoding='latin1').tocsr()
tic.stop()
# Load hypergraphs
PM=[] # paper by mesh terms
for i in ['chemical','disease','method']:
    PM.append(load_hypergraph('medline/'+i+'.pkl'))
PM=ssp.hstack(PM).tocsr()
PJ=load_hypergraph('medline/journals.pkl').tocsr() # paper by journal

paper_dates=load_date('medline/paper_dates.pkl') # publication date

id2chemical=pickle.load(open('medline/id2chemical.pkl','rb'),encoding='latin1')
id2disease=pickle.load(open('medline/id2disease.pkl','rb'),encoding='latin1')
id2method=pickle.load(open('medline/id2method.pkl','rb'),encoding='latin1')
id2name=np.array(id2chemical+id2disease+id2method)

Loading citation data...


  citations=pickle.load(open('medline/citations.pkl','rb'),encoding='latin1').tocsr()


Elapsed time: 9.711724281311035 seconds
Loading file medline/chemical.pkl from disk...
Elapsed time: 7.945269584655762 seconds
Loading file medline/disease.pkl from disk...
Elapsed time: 4.725998163223267 seconds
Loading file medline/method.pkl from disk...
Elapsed time: 6.2029829025268555 seconds
Loading file medline/journals.pkl from disk...
Elapsed time: 106.63767647743225 seconds
Loading paper dates medline/paper_dates.pkl from disk...
Elapsed time: 2.090975284576416 seconds


In [31]:
print(citations)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 56185074 stored elements and shape (19916562, 2007)>
  Coords	Values
  (1, 1976)	2.0
  (1, 1977)	3.0
  (1, 1978)	3.0
  (1, 1979)	3.0
  (1, 1980)	6.0
  (1, 1981)	3.0
  (1, 1982)	3.0
  (1, 1983)	4.0
  (1, 1984)	5.0
  (1, 1985)	1.0
  (1, 1986)	2.0
  (1, 1987)	3.0
  (1, 1989)	3.0
  (1, 1990)	2.0
  (1, 1993)	1.0
  (1, 1994)	1.0
  (1, 1995)	1.0
  (1, 1998)	1.0
  (2, 1976)	1.0
  (2, 1977)	2.0
  (2, 1978)	1.0
  (2, 1979)	2.0
  (2, 1981)	3.0
  (2, 1982)	1.0
  (3, 1976)	1.0
  :	:
  (16952961, 2006)	1.0
  (16952964, 2006)	1.0
  (16952965, 2005)	2.0
  (16952965, 2006)	2.0
  (16953008, 2006)	1.0
  (16953019, 2006)	1.0
  (16953123, 2000)	1.0
  (16953123, 2001)	4.0
  (16953123, 2002)	2.0
  (16953123, 2003)	1.0
  (16953123, 2004)	1.0
  (16953123, 2005)	4.0
  (16953159, 2006)	1.0
  (16953244, 2005)	3.0
  (16953244, 2006)	1.0
  (16953247, 2002)	3.0
  (16953247, 2003)	7.0
  (16953247, 2004)	7.0
  (16953247, 2005)	10.0
  (16953247, 2006)	2.0
  

In [10]:
G=[]
for thing in ['chemical','disease','method']:
    G.append(load_hypergraph('medline/'+thing+'.pkl'))
G=ssp.hstack(G).tocsr()

paper_dates=load_date('medline/paper_dates.pkl') # Load publicatioin dates

G=G[paper_dates>0,:]

Loading file medline/chemical.pkl from disk...
Elapsed time: 8.875566244125366 seconds
Loading file medline/disease.pkl from disk...
Elapsed time: 4.8578102588653564 seconds
Loading file medline/method.pkl from disk...
Elapsed time: 6.609812259674072 seconds
Loading paper dates medline/paper_dates.pkl from disk...
Elapsed time: 2.4513227939605713 seconds


In [21]:
G = load_hypergraph('journal.pkl').tocsr()

Loading file journal.pkl from disk...
Elapsed time: 0.0008120536804199219 seconds


In [15]:
G=[]
for thing in ['predictive_ai','computer_vision']:
    G.append(load_hypergraph(''+thing+'.pkl'))
G=ssp.hstack(G).tocsr()

Loading file predictive_ai.pkl from disk...
Elapsed time: 0.0006499290466308594 seconds
Loading file computer_vision.pkl from disk...
Elapsed time: 0.00060272216796875 seconds


In [4]:
tic.go('Loading citation data...')
citations=pickle.load(open('citations.pkl','rb'),encoding='latin1').tocsr()
tic.stop()
# Load hypergraphs
PM=[] # paper by mesh terms
for i in ['predictive_ai','computer_vision']:
    PM.append(load_hypergraph(''+i+'.pkl'))
PM=ssp.hstack(PM).tocsr()
PJ=load_hypergraph('journal.pkl').tocsr() # paper by journal

paper_dates=load_date('paper_dates.pkl') # publication date

id2predictive_ai=pickle.load(open('id2predictive_ai.pkl','rb'),encoding='latin1')
id2computer_vision=pickle.load(open('id2computer_vision.pkl','rb'),encoding='latin1')
id2name=np.array(id2predictive_ai+id2computer_vision)

Loading citation data...
Elapsed time: 0.0004878044128417969 seconds
Loading file predictive_ai.pkl from disk...
Elapsed time: 0.00038433074951171875 seconds
Loading file computer_vision.pkl from disk...
Elapsed time: 0.000576019287109375 seconds
Loading file journal.pkl from disk...
Elapsed time: 0.00046062469482421875 seconds
Loading paper dates paper_dates.pkl from disk...
Elapsed time: 0.0002658367156982422 seconds


  citations=pickle.load(open('citations.pkl','rb'),encoding='latin1').tocsr()


In [7]:
print(citations)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 34 stored elements and shape (6, 6)>
  Coords	Values
  (0, 0)	3
  (0, 1)	4
  (0, 2)	1
  (0, 3)	1
  (0, 4)	2
  (0, 5)	1
  (1, 0)	3
  (1, 1)	1
  (1, 2)	2
  (1, 3)	1
  (1, 5)	3
  (2, 0)	3
  (2, 1)	2
  (2, 2)	2
  (2, 4)	4
  (2, 5)	3
  (3, 0)	4
  (3, 1)	1
  (3, 2)	1
  (3, 3)	2
  (3, 4)	2
  (3, 5)	4
  (4, 0)	2
  (4, 1)	3
  (4, 2)	2
  (4, 3)	3
  (4, 4)	1
  (4, 5)	4
  (5, 0)	2
  (5, 1)	4
  (5, 2)	2
  (5, 3)	1
  (5, 4)	2
  (5, 5)	2
