In [40]:
import xml.etree.ElementTree as ET
import mrjob

In [42]:
def parse_dynetml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Extract nodes
    node_count = 0
    nodes = {}
    nodes_inverse = {}
    for node_class in root.findall('.//nodeclass'):
        for idx, node in enumerate(node_class.findall('.//node')):
            nodes[node.attrib['id']] = idx    
            nodes_inverse[idx] = node.attrib['id']
            node_count += 1

    # Extract networks
    networks = {}
    for network in root.findall('.//network'):
        links = [[] for _ in range(node_count)]
        network_id = network.attrib['id']
        linksXml = network.findall('.//link')
        for link in linksXml :
            # Check if the link exists
            if float(link.attrib['value']) > 0:
                source = nodes[link.attrib['source']]
                target = nodes[link.attrib['target']]
                links[source].append(target)
        
        node_count = len(links)
        for i in range(len(links)):
            links_temp = links[i]
            print(links[i])
            links[i] = [1.0 / node_count, len(links[i]), links_temp]

        networks[network_id] = links

    return nodes, networks, nodes_inverse

# Example usage
nodes, networks, inverse = parse_dynetml('padgett.xml')

print("Nodes:")
for node_id, node_name in enumerate(nodes):
    print(f"Node ID: {node_id} - Node Name: {node_name}")

print("\nNetworks:")
with open('PADGB.txt', 'w') as f:
    for network_id, network_links in networks.items():
        if network_id == 'PADGM':
            continue
        print(f"Network ID: {network_id}")
        for idx, el in enumerate(network_links):
            f.write(f"{idx}: {el}\n")

[8]
[5, 6, 8]
[4, 8]
[6, 10, 14]
[2, 10, 14]
[1]
[1, 3, 7, 15]
[6]
[0, 1, 2, 12, 13, 15]
[13]
[3, 4, 14]
[]
[8, 14, 15]
[8, 9]
[3, 4, 10, 12]
[6, 8, 12]
[]
[]
[4, 5, 8, 10]
[6, 7, 10]
[2, 7, 10]
[2, 8]
[3, 7]
[3, 4, 6, 10]
[2, 5, 9, 13, 15]
[8]
[2, 3, 4, 7]
[]
[]
[8]
[]
[8]
Nodes:
Node ID: 0 - Node Name: ACCIAIUOL
Node ID: 1 - Node Name: ALBIZZI
Node ID: 2 - Node Name: BARBADORI
Node ID: 3 - Node Name: BISCHERI
Node ID: 4 - Node Name: CASTELLAN
Node ID: 5 - Node Name: GINORI
Node ID: 6 - Node Name: GUADAGNI
Node ID: 7 - Node Name: LAMBERTES
Node ID: 8 - Node Name: MEDICI
Node ID: 9 - Node Name: PAZZI
Node ID: 10 - Node Name: PERUZZI
Node ID: 11 - Node Name: PUCCI
Node ID: 12 - Node Name: RIDOLFI
Node ID: 13 - Node Name: SALVIATI
Node ID: 14 - Node Name: STROZZI
Node ID: 15 - Node Name: TORNABUON

Networks:
Network ID: PADGB


In [39]:
%%file main.py
from mrjob.job import MRJob
import xml.etree.ElementTree as ET
import ast
from mrjob.step import MRStep


D = 0
s = 0.85
node_count = 0

class MRPageRank(MRJob):

    def steps(self):
        return [
            MRStep(mapper=self.ip_mapper,
                     reducer=self.ip_reducer),
            MRStep(mapper=self.mapper,
                   reducer=self.reducer)
        ]

    def ip_mapper(self, _, line):
        idx, line = line.split(':', 1)
        key = int(idx)
        global node_count, D
        D = 0
        if key + 1 > node_count:
            node_count = key + 1

        parsed_line = ast.literal_eval(line)
        p_self = parsed_line[0]
        neighbor_count = parsed_line[1]
        neighbors = parsed_line[2]

        if neighbor_count == 0: yield 1, [p_self]

        yield key, (p_self, neighbor_count, neighbors) 
    
    def ip_reducer(self, key, values):
        global D
        for value in values:
            if len(value) == 1:
                D += float(value[0])
                continue
            
            yield key, value


    def mapper(self, key, vals):
        p_self, neighbor_count, neighbors = vals
        for neigh in neighbors:
            yield neigh, p_self/neighbor_count

        yield key, (0.0, p_self, neighbor_count, neighbors)


    def reducer(self, key, values):
        global D, s, node_count
        probSum = 0
        for val in values:
            if not isinstance(val, float):
                p_old = val[1]
                neighbor_count = val[2]
                neighbors = val[3]
            else:
                probSum += val
        p_new = s * probSum + (s * D  + 1.0 - s) / node_count
        yield key, [p_new, neighbor_count, neighbors, p_old]


if __name__ == '__main__':
    MRPageRank.run()

Writing main.py


In [38]:
from main import MRPageRank

fileName = 'PADGB.txt'
tolerance = 0.00001
change = 5
mr_job = MRPageRank(args=[fileName])

while change > tolerance:
    print(f'CHANGE: {change}')
    with mr_job.make_runner() as runner:
        change = 0
        runner.run()
        with open(fileName, 'w') as f:
            for el in mr_job.parse_output(runner.cat_output()):
                old_rank = el[1][3]
                new_rank = el[1][0]
                change += abs(new_rank - old_rank)
                f.write(f'{el[0]}: {el[1][:3]}\n')

No configs specified for inline runner


CHANGE: 5


No configs specified for inline runner


CHANGE: 0.5569270833333332


No configs specified for inline runner


CHANGE: 0.2477714029947915


No configs specified for inline runner


CHANGE: 0.14416794281005854


No configs specified for inline runner


CHANGE: 0.09510088077280263


No configs specified for inline runner


CHANGE: 0.06655461158382334


No configs specified for inline runner


CHANGE: 0.046644505992680926


No configs specified for inline runner


CHANGE: 0.0345415575008707


No configs specified for inline runner


CHANGE: 0.024302100440839455


No configs specified for inline runner


CHANGE: 0.01796412252386087


No configs specified for inline runner


CHANGE: 0.012635101917654074


No configs specified for inline runner


CHANGE: 0.009320178333978607


No configs specified for inline runner


CHANGE: 0.0065423157993076755


No configs specified for inline runner


CHANGE: 0.004830022672327461


No configs specified for inline runner


CHANGE: 0.003381403367670288


No configs specified for inline runner


CHANGE: 0.002502305602196546


No configs specified for inline runner


CHANGE: 0.001746364635276977


No configs specified for inline runner


CHANGE: 0.001296452884412859


No configs specified for inline runner


CHANGE: 0.0009016742748082923


No configs specified for inline runner


CHANGE: 0.0006718436796632685


No configs specified for inline runner


CHANGE: 0.0004655158615147273


No configs specified for inline runner


CHANGE: 0.00034826434586695404


No configs specified for inline runner


CHANGE: 0.00024034434224538825


No configs specified for inline runner


CHANGE: 0.00018059213631759635


No configs specified for inline runner


CHANGE: 0.00012414759732488206


No configs specified for inline runner


CHANGE: 9.368099734877872e-05


No configs specified for inline runner


CHANGE: 6.427606263694624e-05


No configs specified for inline runner


CHANGE: 4.8615905559948325e-05


No configs specified for inline runner


CHANGE: 3.328811225097178e-05


No configs specified for inline runner


CHANGE: 2.5240029982054324e-05


No configs specified for inline runner


CHANGE: 1.7245111405307656e-05


No configs specified for inline runner


CHANGE: 1.31098011012451e-05
