In [None]:
import argparse
import gzip, os, csv
import numpy as np
import random
import time
import networkx as nx
import pandas as pd


In [None]:
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext

In [None]:
# Stop a SparkContext left over from an earlier run of this cell. Without this,
# re-running the cell in a live kernel fails because only one SparkContext may
# be active at a time. Calling stop() on an already stopped context is harmless.
if 'sc' in globals():
    sc.stop()

# Configuration for a single-machine ("local") Spark run.
# NOTE(review): the memory sizes below are machine-specific; adjust them to the
# host that executes the notebook.
config = SparkConf()
config.setMaster("local")
config.set("spark.executor.memory", "70g")
config.set('spark.driver.memory', '90g')
config.set("spark.memory.offHeap.enabled", True)
config.set("spark.memory.offHeap.size", "50g")
config.set("spark.shuffle.blockTransferService", "nio")

# Create the context used by every Spark cell below and show it.
sc = SparkContext(conf=config)
print(sc)

In [None]:
# Report the version of the Spark installation behind the context.
print("spark version=", sc.version)


In [None]:
def getLinks(net, source):
    """Return the links stored for ``source`` in ``net``.

    Parameters:
        net: mapping indexed by source node.
        source: node whose links are requested.

    Returns:
        ``net[source]`` when the node has an entry, otherwise a new empty
        dict (the node has no links). ``net`` itself is never modified.
    """
    return net[source] if source in net else {}


In [None]:
def randomWalkUniform(triples, startNode, max_depth=5):
    """Perform one uniform random walk and return it as a string.

    Parameters:
        triples: nested mapping ``source -> target -> iterable of edge ids``.
        startNode: node the walk starts from.
        max_depth: maximum number of hops to take.

    Returns:
        The walk encoded as ``'n<start>->e<edge>->n<node>->...'``. Every
        element, including the last one, is followed by ``'->'``, so a walk
        that cannot leave the start node is ``'n<start>->'``.

    At each hop every (edge, neighbour) pair leaving the current node is an
    equally likely choice. The walk stops early when the current node has no
    outgoing edge.
    """
    next_node = startNode
    steps = ['n' + str(startNode)]

    for _ in range(max_depth):
        neighs = getLinks(triples, next_node)

        # Flatten the adjacency into one candidate per (edge, neighbour) pair.
        queue = [(edge, neigh) for neigh in neighs for edge in neighs[neigh]]

        # Covers both "no neighbours" and "neighbours whose edge lists are all
        # empty"; the latter would make random.choice raise IndexError.
        if not queue:
            break

        edge, next_node = random.choice(queue)
        steps.append('e' + str(edge))
        steps.append('n' + str(next_node))

    # The trailing separator is part of the established output format.
    return '->'.join(steps) + '->'

In [None]:
def preprocess(folders, filename):
    """Read gzipped, space-delimited statement files and index them by integer id.

    Parameters:
        folders: iterable of directory paths to scan.
        filename: substring a file name must contain to be read.

    Returns:
        A tuple ``(entity2id, relation2id, triples)``:
        ``entity2id`` maps each head/tail string to an id assigned in order of
        first appearance, ``relation2id`` does the same for relation strings,
        and ``triples`` is the structure filled by ``addTriple``.

    Rows are skipped when the third field does not start with ``'<'`` or when
    the relation contains ``'ddi-interactor-in'``. Rows with fewer than three
    fields (for example blank lines) are skipped instead of raising
    IndexError.

    NOTE(review): ``addTriple`` is not defined in the visible part of the
    notebook; it must be defined before this function is called.
    """
    entity2id = {}    # entity string -> integer id
    relation2id = {}  # relation string -> integer id
    triples = {}      # filled by addTriple(triples, head_id, tail_id, relation_id)

    for dirname in folders:
        for fname in os.listdir(dirname):
            if filename not in fname:
                continue
            print(fname)

            # The context manager closes the gzip handle even when parsing fails.
            with gzip.open(os.path.join(dirname, fname), mode='rt', errors='ignore') as gzfile:
                for line in csv.reader(gzfile, delimiter=' ', quotechar='"'):
                    if len(line) < 3:
                        continue  # blank or truncated row
                    h, r, t = line[0], line[1], line[2]

                    # Keep only rows whose tail is written as '<...>'.
                    if not t.startswith('<'):
                        continue

                    # Skip triples with this specific relation.
                    if 'ddi-interactor-in' in r:
                        continue

                    # setdefault returns the existing id, or stores the next free
                    # one (the current dictionary size) for an unseen key.
                    hid = entity2id.setdefault(h, len(entity2id))
                    tid = entity2id.setdefault(t, len(entity2id))
                    rid = relation2id.setdefault(r, len(relation2id))

                    addTriple(triples, hid, tid, rid)

            print('Relation:', len(relation2id), ' Entity:', len(entity2id))

    return entity2id, relation2id, triples

In [None]:
import os
import sys
# Display the directory that contains the Python executable running this kernel.
os.path.dirname(sys.executable)

In [None]:
# Directories that are scanned for dataset files.
folders = ['./data']

# Substring a file name must contain to be picked up by preprocess.
fileext = 'drugbank.nq.gz'

# Parse the matching files:
#   entity2id   - entity string   -> integer id
#   relation2id - relation string -> integer id
#   triples     - the id-based triple structure built during parsing
entity2id, relation2id, triples = preprocess(folders=folders, filename=fileext)

In [None]:
# rdflib is required by this cell only; it was used here without being imported.
from rdflib import Graph

# Number of leading lines to parse and print.
PREVIEW_LINES = 60

# Path of the first file, in folder order, whose name ends with the dataset extension.
filepath = next(
    (
        os.path.join(folder, filename)
        for folder in folders
        for filename in os.listdir(folder)
        if filename.endswith(fileext)
    ),
    None,
)

# Fail with a clear message instead of an obscure error inside gzip.open.
if filepath is None:
    raise FileNotFoundError('No file ending with %r found in %r' % (fileext, folders))

# Parse each previewed line as n-quads (which validates it) and echo it.
g = Graph()
with gzip.open(filepath, 'rt') as f:
    for i, line in enumerate(f):
        if i == PREVIEW_LINES:
            break
        g.parse(data=line, format='nquads')
        print(line.strip())

In [None]:
# Total number of triples: each (source, target) pair contributes one triple
# per relation id stored for it.
num_triples = sum(
    len(relations)
    for targets in triples.values()
    for relations in targets.values()
)

print('Number of triples:', num_triples)

In [None]:
def randomNWalkUniform(triples, n, walks, path_depth):
    """Generate several random walks from one start node and drop duplicates.

    Parameters:
        triples: the triple dictionary handed on to ``randomWalkUniform``.
        n: the start node of every walk (passed as ``startNode``).
        walks: how many walks to attempt.
        path_depth: maximum depth handed on to ``randomWalkUniform``.

    Returns:
        A list of the distinct walk strings that were produced. It holds at
        most ``walks`` entries and its order is that of a set, i.e. arbitrary.
    """
    generated = [randomWalkUniform(triples, n, path_depth) for _ in range(walks)]

    # Going through a set removes walks that were produced more than once.
    return list(set(generated))

In [None]:
# Small demonstration run from a single node.
walks = 5         # number of walks to attempt
path_depth = 10   # maximum number of hops per walk

# 100 is the id of the start node (it is not a node count).
paths = randomNWalkUniform(triples, n=100, walks=walks, path_depth=path_depth)

# One walk per output line.
print(*paths, sep='\n')


In [None]:
# All entity ids; each one is later used as the start node of its own walks.
entities = [*entity2id.values()]

# Hand the triple dictionary to Spark as a broadcast variable; tasks read it
# through b_triples.value.
b_triples = sc.broadcast(triples)

In [None]:

# Directory that receives the generated walks; created when missing.
folder = './walks/'
os.makedirs(folder, exist_ok=True)

# Number of walks attempted per entity and the exclusive upper bound of the
# depths that are generated.
walks = 250
maxDepth = 5

# One output per depth, for depths 1 .. maxDepth - 1.
for path_depth in range(1, maxDepth):
    # Spark writes a directory of part files under this name.
    filename = folder + 'randwalkstest_n%d_depth%d_pagerank_uniform.txt' % (walks, path_depth)
    print(filename)

    # saveAsTextFile refuses to write to a path that already exists, which made
    # this cell fail on every re-run. Outputs finished earlier are skipped;
    # a finished output is recognised by the _SUCCESS marker the Hadoop output
    # committer normally writes. Delete the directory to force regeneration.
    if os.path.exists(os.path.join(filename, '_SUCCESS')):
        print('Output already exists, skipping:', filename)
        continue

    start_time = time.time()

    # Each entity id yields its list of distinct walks; flatMap flattens them
    # so that the RDD holds one walk string per element.
    rdd = sc.parallelize(entities).flatMap(lambda n: randomNWalkUniform(b_triples.value, n, walks, path_depth))

    # Action: runs the job and writes the walks as text.
    rdd.saveAsTextFile(filename)

    elapsed_time = time.time() - start_time

    # Elapsed wall-clock time as HH:MM:SS.
    print('Time elapsed to generate features:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

In [None]:
def saveData(entity2id, relation2id, triples, dirname):
    """Write the id mappings and the triples as three text files in ``dirname``.

    Parameters:
        entity2id: mapping entity string -> integer id.
        relation2id: mapping relation string -> integer id.
        triples: nested mapping ``head id -> tail id -> iterable of relation ids``.
        dirname: output directory; created when it does not exist.

    Files written (each starts with a line holding its number of entries):
        train2id.txt     one ``"head tail relation"`` line per triple
        entity2id.txt    one ``"entity<TAB>id"`` line per entity, ordered by id
        relation2id.txt  one ``"relation<TAB>id"`` line per relation, ordered by id
    """
    os.makedirs(dirname, exist_ok=True)

    # Count the triples of the argument itself rather than relying on a
    # notebook-level variable computed in another cell.
    triple_count = sum(
        len(rids)
        for targets in triples.values()
        for rids in targets.values()
    )

    # Context managers close every file even when a write fails.
    with open(os.path.join(dirname, 'train2id.txt'), 'w') as train_file:
        train_file.write(str(triple_count) + '\n')
        for hid, targets in triples.items():
            for tid, rids in targets.items():
                for rid in rids:
                    train_file.write("%d %d %d\n" % (hid, tid, rid))

    with open(os.path.join(dirname, 'entity2id.txt'), 'w') as entity2id_file:
        entity2id_file.write(str(len(entity2id)) + '\n')
        for e in sorted(entity2id, key=entity2id.__getitem__):
            entity2id_file.write(e + '\t' + str(entity2id[e]) + '\n')

    with open(os.path.join(dirname, 'relation2id.txt'), 'w') as relation2id_file:
        relation2id_file.write(str(len(relation2id)) + '\n')
        for r in sorted(relation2id, key=relation2id.__getitem__):
            relation2id_file.write(r + '\t' + str(relation2id[r]) + '\n')


In [None]:
# Directory that receives the exported mappings and triples.
dirname = 'db_graph'

# Export the entity mapping, the relation mapping and the triples.
saveData(entity2id, relation2id, triples, dirname=dirname)

In [None]:
import gensim

class MySentences(object):
    """Re-iterable stream of walk sentences read from part files on disk.

    ``dirname`` is scanned for sub-directories whose name contains
    ``filename``. Inside each of them every file whose name contains
    ``'part'`` and not ``'.crc'`` is read line by line, and each line is
    yielded as the list of tokens obtained by splitting on ``'->'``.

    A line that ends with ``'->'`` therefore yields a trailing empty-string
    token; it is passed on unchanged.
    """

    def __init__(self, dirname, filename):
        # Directory to scan and the substring selecting its sub-directories.
        self.dirname = dirname
        self.filename = filename

    def __iter__(self):
        print('Processing', self.filename)

        for subfname in os.listdir(self.dirname):
            if self.filename not in subfname:
                continue

            fpath = os.path.join(self.dirname, subfname)

            # Only directories can hold part files; a plain file that happens
            # to match the pattern would make os.listdir raise.
            if not os.path.isdir(fpath):
                continue

            for fname in os.listdir(fpath):
                if 'part' not in fname or '.crc' in fname:
                    continue

                try:
                    # The context manager closes the file when it is exhausted,
                    # when reading fails, or when iteration is abandoned early.
                    with open(os.path.join(fpath, fname), mode='r') as handle:
                        for line in handle:
                            yield line.rstrip('\n').split("->")
                except Exception as err:
                    # Best effort: report the unreadable file and move on.
                    print("Failed reading file:")
                    print(fname)
                    print(err)

In [None]:
def extractFeatureVector(model, drugs, id2entity, output):
    """Write the embedding of every drug as one tab-separated row.

    Parameters:
        model: trained embedding model whose vectors are read via ``model.wv``.
        drugs: iterable of drug ids; the vocabulary key of id ``x`` is ``'n' + str(x)``.
        id2entity: mapping id -> entity name, used as the first column.
        output: path of the file to write.

    The first row is the header ``Entity, feature0, feature1, ...``. Rows
    follow in ascending id order. Ids missing from the model vocabulary are
    printed and left out of the file.
    """
    ns = "n"  # prefix that turns an id into its vocabulary key

    # Take the vector width from the model itself. Looking up the first drug
    # failed for an empty drug list and for a first drug that is out of
    # vocabulary, although the loop below tolerates such ids.
    header = "Entity" + "".join("\tfeature" + str(i) for i in range(model.wv.vector_size))

    # The context manager closes the file even when a row cannot be written.
    with open(output, 'w') as fw:
        fw.write(header + "\n")

        for id_ in sorted(drugs):
            nid = ns + str(id_)

            if nid not in model.wv:
                print(nid)  # report the id that has no vector
                continue

            vec = "\t".join(map(str, model.wv[nid]))
            fw.write(id2entity[id_] + '\t' + vec + '\n')


In [None]:
def _trainAndExport(sentences, w2v_params, model_path, vector_path, drugs, id2entity):
    """Train one Word2Vec model on ``sentences``, save it and export the drug vectors."""
    model = gensim.models.Word2Vec(**w2v_params)
    model.build_vocab(sentences)

    # build_vocab records the number of sentences it scanned in corpus_count,
    # so no separate throwaway model is needed to obtain it.
    model.train(sentences, total_examples=model.corpus_count, epochs=5)

    model.save(model_path)
    extractFeatureVector(model, drugs, id2entity, vector_path)
    # The model is released when this function returns.


def trainModel(drugs, id2entity, datafilename, model_output, vector_output, pattern, maxDepth):
    """Train a skip-gram and a CBOW Word2Vec model on the stored walks.

    Parameters:
        drugs: list of drug ids whose vectors are exported.
        id2entity: mapping id -> entity name.
        datafilename: directory containing the walk output directories.
        model_output: directory for the saved models (a sub-directory named
            after ``pattern`` is created inside it).
        vector_output: directory for the exported vector files.
        pattern: substring selecting the walk directories; also used in the
            output names.
        maxDepth: only used as a suffix of the output names.

    Both directories are created when missing. For each model the function
    writes ``<model_output>/<pattern>/<modelname>`` and
    ``<vector_output>/<modelname>_<pattern>.txt``.

    NOTE(review): the ``size`` and ``iter`` keyword names belong to the
    gensim 3.x API; gensim 4 renamed them to ``vector_size`` and ``epochs``.
    Confirm against the installed version.
    """
    output = os.path.join(model_output, pattern)
    for directory in (model_output, vector_output, output):
        os.makedirs(directory, exist_ok=True)

    # Memory-friendly iterator: the walks are re-read from disk on every pass.
    sentences = MySentences(datafilename, filename=pattern)

    # Skip-gram model with negative sampling.
    modelname = 'Entity2Vec_sg_200_5_5_15_2_500' + '_d' + str(maxDepth)
    _trainAndExport(
        sentences,
        dict(size=200, workers=8, window=5, sg=1, negative=15, iter=5),
        os.path.join(output, modelname),
        os.path.join(vector_output, modelname + '_' + pattern + '.txt'),
        drugs,
        id2entity,
    )

    # CBOW model.
    modelname = 'Entity2Vec_cbow_200_5_5_2_500' + '_d' + str(maxDepth)
    _trainAndExport(
        sentences,
        dict(size=200, workers=8, window=5, sg=0, iter=5, cbow_mean=1, alpha=0.05),
        os.path.join(output, modelname),
        os.path.join(vector_output, modelname + '_' + pattern + '.txt'),
        drugs,
        id2entity,
    )


In [None]:
# Load the tab-separated DDI table.
ddi_df = pd.read_csv('data/input/ddi_v5.txt', sep='\t')

In [None]:
# Base URI that is prepended to every drug identifier.
db_ns = 'http://bio2rdf.org/drugbank:'

# Rewrite both drug columns as '<' + base URI + value + '>' so that the values
# can be looked up among the keys of entity2id.
# NOTE: this is not idempotent. Running the cell twice wraps the values twice;
# reload ddi_df before re-running.
ddi_df['Drug1'] = '<' + db_ns + ddi_df['Drug1'] + '>'
ddi_df['Drug2'] = '<' + db_ns + ddi_df['Drug2'] + '>'

In [None]:
# Display the first rows of the table as a quick visual check.
ddi_df.head()


In [None]:
# Distinct drug identifiers found in either drug column of the table.
drugs = set(ddi_df.Drug1.unique()) | set(ddi_df.Drug2.unique())

# Ids of those drugs that have an entry in entity2id; drugs without an entry
# are left out.
db_entities = {entity2id[dbid] for dbid in drugs if dbid in entity2id}


In [None]:
# Turn the collection into a list and report how many drug ids it holds.
db_entities = [*db_entities]
print(len(db_entities))

In [None]:
# Inverse lookup of entity2id: integer id -> entity string.
id2entity = dict(zip(entity2id.values(), entity2id.keys()))


In [None]:
# Where the walks are read from and where models and vectors are written.
datafilename = './walks/'
model_output = './models/'
vector_output = './vectors/'

# Selects the walk directories and is used in the output names.
pattern = 'uniform'

# Train the models and export the vectors of the drug ids in db_entities.
# maxDepth is the value assigned in the walk-generation cell.
trainModel(
    drugs=db_entities,
    id2entity=id2entity,
    datafilename=datafilename,
    model_output=model_output,
    vector_output=vector_output,
    pattern=pattern,
    maxDepth=maxDepth,
)
