In [1]:
import json

import wikipedia as wp
import pandas as pd
from tqdm.notebook import tqdm

from marvel import config

In [2]:
def get_all_links_single_page(page_name):
    """Return the ``links`` attribute of the Wikipedia page ``page_name``.

    The page is resolved with ``wp.page``; nothing is caught here, so any
    error raised by that lookup propagates to the caller.
    """
    wiki_page = wp.page(page_name)
    return wiki_page.links


def get_all_links_all_pages(char_pages):
    """Concatenate the links of every page in ``char_pages`` into one list.

    Pages are visited in the given order and duplicates are kept, so a link
    that occurs on several pages occurs several times in the result.
    """
    return [
        link
        for page in char_pages
        for link in get_all_links_single_page(page)
    ]


def get_links_to_other_characters(char_dict, page_dict):
    """Count how often a character's pages link to pages of known characters.

    Args:
        char_dict: Mapping with a ``'name'`` entry (used as the edge source)
            and a ``'pages'`` entry (page names whose links are collected).
        page_dict: Mapping from page name to the character names associated
            with that page.

    Returns:
        A list of ``{"source", "target", "count"}`` dicts, one per linked
        character, ordered by first occurrence. Links pointing at the source
        character's own pages are counted as well, so a self-edge can appear.
    """
    source_name = char_dict['name']
    tally = {}
    for link in get_all_links_all_pages(char_dict['pages']):
        # Links that are not a known character page contribute nothing.
        for target_name in page_dict.get(link, ()):
            tally[target_name] = tally.get(target_name, 0) + 1

    return [
        {"source": source_name, "target": target_name, "count": hits}
        for target_name, hits in tally.items()
    ]

In [3]:
# Load the per-character page lists. The context manager closes the file
# handle as soon as the JSON has been parsed instead of leaving it open.
with (config.cleandata / 'all_character_pages_processed.json').open() as pages_file:
    mcu_char_page_names = json.load(pages_file)

# Converting to mapping from page names -> characters
# (several characters can share a page, hence the list values).
pages_to_names = {}
for dct in mcu_char_page_names:
    charname = dct['name']
    for page_name in dct['pages']:
        pages_to_names.setdefault(page_name, []).append(charname)


# Collect the directed character -> character link counts, one batch of
# edge dicts per character. Each iteration looks up that character's pages
# through get_links_to_other_characters, so this loop is the slow step.
all_links = []
for char_dict in tqdm(mcu_char_page_names):
    all_links += get_links_to_other_characters(char_dict, pages_to_names)

HBox(children=(IntProgress(value=0, max=22), HTML(value='')))




In [6]:
# Inspect the directed links: one dict per (source, target) name pair with
# the number of links counted from the source's pages to the target's pages.
all_links

[{'source': 'Iron Man', 'target': 'Ant-Man', 'count': 11},
 {'source': 'Iron Man', 'target': 'Black Panther', 'count': 5},
 {'source': 'Iron Man', 'target': 'Black Widow', 'count': 16},
 {'source': 'Iron Man', 'target': 'Hulk', 'count': 19},
 {'source': 'Iron Man', 'target': 'Captain America', 'count': 32},
 {'source': 'Iron Man', 'target': 'Captain Marvel', 'count': 7},
 {'source': 'Iron Man', 'target': 'Doctor Strange', 'count': 6},
 {'source': 'Iron Man', 'target': 'Drax the Destoyer', 'count': 10},
 {'source': 'Iron Man', 'target': 'Falcon', 'count': 2},
 {'source': 'Iron Man', 'target': 'Gamora', 'count': 10},
 {'source': 'Iron Man', 'target': 'Groot', 'count': 10},
 {'source': 'Iron Man', 'target': 'Star-Lord', 'count': 10},
 {'source': 'Iron Man', 'target': 'Rocket Raccoon', 'count': 10},
 {'source': 'Iron Man', 'target': 'Hawkeye', 'count': 2},
 {'source': 'Iron Man', 'target': 'Iron Man', 'count': 20},
 {'source': 'Iron Man', 'target': 'Loki', 'count': 1},
 {'source': 'Iron Ma

In [7]:
# Character names in file order; a name's position in this list is used
# as its numeric id when the links are converted below.
mcu_char_names = [entry['name'] for entry in mcu_char_page_names]

In [8]:
# Replace character names by their position in mcu_char_names.
# A lookup table built once avoids rescanning the name list with
# list.index() for every source and target. setdefault keeps the first
# position of a repeated name, which is what list.index() would return.
# NOTE: an unknown name now raises KeyError rather than ValueError.
name_to_index = {}
for position, char_name in enumerate(mcu_char_names):
    name_to_index.setdefault(char_name, position)

all_links_numeric = [
    {
        'source': name_to_index[link_dict['source']],
        'target': name_to_index[link_dict['target']],
        'count': link_dict['count'],
    }
    for link_dict in all_links
]

In [9]:
# Inspect the links after names were replaced by indices into mcu_char_names.
all_links_numeric

[{'source': 0, 'target': 6, 'count': 11},
 {'source': 0, 'target': 18, 'count': 5},
 {'source': 0, 'target': 3, 'count': 16},
 {'source': 0, 'target': 4, 'count': 19},
 {'source': 0, 'target': 1, 'count': 32},
 {'source': 0, 'target': 19, 'count': 7},
 {'source': 0, 'target': 13, 'count': 6},
 {'source': 0, 'target': 15, 'count': 10},
 {'source': 0, 'target': 9, 'count': 2},
 {'source': 0, 'target': 14, 'count': 10},
 {'source': 0, 'target': 12, 'count': 10},
 {'source': 0, 'target': 10, 'count': 10},
 {'source': 0, 'target': 11, 'count': 10},
 {'source': 0, 'target': 5, 'count': 2},
 {'source': 0, 'target': 0, 'count': 20},
 {'source': 0, 'target': 21, 'count': 1},
 {'source': 0, 'target': 16, 'count': 11},
 {'source': 0, 'target': 17, 'count': 19},
 {'source': 0, 'target': 8, 'count': 1},
 {'source': 0, 'target': 20, 'count': 1},
 {'source': 0, 'target': 2, 'count': 26},
 {'source': 0, 'target': 7, 'count': 2},
 {'source': 1, 'target': 6, 'count': 15},
 {'source': 1, 'target': 18, 'c

In [32]:
# Persist the directed, index-based links. The context manager flushes and
# closes the file deterministically instead of leaving an open write handle
# to be cleaned up by garbage collection.
with (config.cleandata / 'all_character_links.json').open('w') as links_file:
    json.dump(all_links_numeric, links_file)

In [16]:
# Fold the directed links into undirected ones with summed counts.
# Every link is stored under (smaller index, larger index), so the two
# directions of a pair accumulate into the same nested-dict entry.
undirected_links = {}
for link in tqdm(all_links_numeric):
    lower, upper = sorted((link['source'], link['target']))
    per_lower = undirected_links.setdefault(lower, {})
    per_lower[upper] = per_lower.get(upper, 0) + link['count']

# Flatten the nested mapping back into a list of edge dicts, grouped by the
# smaller index in order of first appearance.
all_character_links_undir = [
    {'source': lower, 'target': upper, 'count': count}
    for lower, per_lower in undirected_links.items()
    for upper, count in per_lower.items()
]

HBox(children=(IntProgress(value=0, max=478), HTML(value='')))




In [17]:
# Inspect the undirected links: each pair appears once, stored with
# source index <= target index and the summed count of both directions.
all_character_links_undir

[{'source': 0, 'target': 6, 'count': 24},
 {'source': 0, 'target': 18, 'count': 14},
 {'source': 0, 'target': 3, 'count': 38},
 {'source': 0, 'target': 4, 'count': 38},
 {'source': 0, 'target': 1, 'count': 66},
 {'source': 0, 'target': 19, 'count': 15},
 {'source': 0, 'target': 13, 'count': 11},
 {'source': 0, 'target': 15, 'count': 22},
 {'source': 0, 'target': 9, 'count': 4},
 {'source': 0, 'target': 14, 'count': 21},
 {'source': 0, 'target': 12, 'count': 21},
 {'source': 0, 'target': 10, 'count': 22},
 {'source': 0, 'target': 11, 'count': 20},
 {'source': 0, 'target': 5, 'count': 4},
 {'source': 0, 'target': 0, 'count': 20},
 {'source': 0, 'target': 21, 'count': 2},
 {'source': 0, 'target': 16, 'count': 22},
 {'source': 0, 'target': 17, 'count': 37},
 {'source': 0, 'target': 8, 'count': 2},
 {'source': 0, 'target': 20, 'count': 3},
 {'source': 0, 'target': 2, 'count': 52},
 {'source': 0, 'target': 7, 'count': 4},
 {'source': 1, 'target': 6, 'count': 32},
 {'source': 1, 'target': 18,

In [18]:
# Persist the undirected links. The context manager flushes and closes the
# file deterministically instead of leaving an open write handle to be
# cleaned up by garbage collection.
with (config.cleandata / 'all_character_links_undirected.json').open('w') as undirected_file:
    json.dump(all_character_links_undir, undirected_file)