In [1]:
import json
import multiprocessing as mp

import wikipedia as wp
import pandas as pd
from tqdm.notebook import tqdm
from mwviews.api import PageviewsClient

from marvel import config

## Processing nodes

Getting the following variables for each superhero

* REST API
    * Total page views
    * Total page edits
    * Total links to page
* Raw page result
    * \# of references on page
    * \# of links on the page
    * \# of words on the page

In [2]:
mcu_char_page_names = json.loads((config.cleandata / 'all_character_pages_processed.json').open().read())

In [9]:
def get_page_statistics(page_name):
    page = wp.page(page_name)
    wc = len(page.content)
    n_links = len(page.links)
    try:
        n_refs = len(page.references)
    except KeyError:
        n_refs = 0
    return {'word_count': wc, 'num_links': n_links, 'num_refs': n_refs}


def pool_page_stats(char_page_stats):
    df = pd.DataFrame(char_page_stats)
    avg = df.mean().to_dict()
    return avg


user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
start_date = '20150701'
end_date = '20191114'
p = PageviewsClient(user_agent=user_agent)


def get_avg_monthly_page_views(page_list):
    pviews = p.article_views('en.wikipedia', page_list, granularity='monthly', start=start_date, end=end_date)
    avgs = []
    for elem in pviews.items():
        (date, count_dict) = elem
        page_views = [c for c in count_dict.values() if c is not None]
        if len(page_views) > 0:
            avg_page_views_month = sum(page_views) / len(page_views)
            avgs.append(avg_page_views_month)
    avg_page_views_all = sum(avgs) / len(avgs)
    return avg_page_views_all


def worker(char_dict):
    name = char_dict['name']
    pages = char_dict['pages']
    
    print(f'Processing {name}')
    char_page_stats = []
    for page in pages:
        page_stats = get_page_statistics(page)
        char_page_stats.append(page_stats)
    pooled_stats = pool_page_stats(char_page_stats)
    num_pages = len(pages)
    avg_monthly_views = get_avg_monthly_page_views(pages)
    pooled_stats.update({
        'num_pages': num_pages,
        'avg_monthly_views': avg_monthly_views
    })
    res = {'id': name, 'data': pooled_stats}
    
    print(f'Done processing {name}')
    return res


In [10]:
char_stats = []
total = sum([len(inner_list['pages']) for inner_list in mcu_char_page_names])
with mp.Pool(mp.cpu_count()) as pool:
    char_stats = pool.map(worker, mcu_char_page_names)

Processing Thor
Processing Hulk
Processing Iron Man
Processing Ant-Man
Done processing Ant-Man
Processing Vision
Done processing Hulk
Processing Hawkeye
Done processing Hawkeye
Processing Scarlet Witch
Done processing Vision
Processing Star-Lord
Done processing Iron Man
Processing Captain America
Done processing Thor
Processing Black Widow
Done processing Scarlet Witch
Processing Falcon
Done processing Falcon
Processing Groot
Done processing Star-Lord
Processing Rocket Raccoon
Done processing Black Widow
Processing Gamora
Done processing Groot
Processing Doctor Strange
Done processing Captain America
Processing Nick Fury
Done processing Rocket Raccoon
Processing Black Panther
Done processing Gamora
Processing Drax the Destoyer
Done processing Doctor Strange
Processing Thanos
Done processing Thanos
Processing Loki
Done processing Nick Fury
Processing Spider Man
Done processing Loki
Done processing Black Panther
Processing Captain Marvel
Done processing Drax the Destoyer
Done processing 

In [11]:
char_stats

[{'id': 'Iron Man',
  'data': {'word_count': 43730.8,
   'num_links': 829.4,
   'num_refs': 242.8,
   'num_pages': 5,
   'avg_monthly_views': 137564.4721153846}},
 {'id': 'Captain America',
  'data': {'word_count': 35401.25,
   'num_links': 700.375,
   'num_refs': 208.125,
   'num_pages': 8,
   'avg_monthly_views': 171090.2987637362}},
 {'id': 'Thor',
  'data': {'word_count': 39843.5,
   'num_links': 785.3333333333334,
   'num_refs': 237.66666666666666,
   'num_pages': 6,
   'avg_monthly_views': 186426.34615384619}},
 {'id': 'Black Widow',
  'data': {'word_count': 17549.8,
   'num_links': 545.0,
   'num_refs': 118.6,
   'num_pages': 5,
   'avg_monthly_views': 40276.84070512822}},
 {'id': 'Hulk',
  'data': {'word_count': 38575.0,
   'num_links': 732.5,
   'num_refs': 159.75,
   'num_pages': 4,
   'avg_monthly_views': 77845.54967948717}},
 {'id': 'Hawkeye',
  'data': {'word_count': 39048.0,
   'num_links': 969.0,
   'num_refs': 83.0,
   'num_pages': 1,
   'avg_monthly_views': 662.4038461

In [15]:
mcu_char_names = [d['name'] for d in mcu_char_page_names]
char_stats_numeric = []
for char_dict in char_stats:
    new_dict = {'id': mcu_char_names.index(char_dict['id']),
                'name': char_dict['id']
               }
    new_dict.update(char_dict['data'])
    char_stats_numeric.append(new_dict)

In [16]:
char_stats_numeric

[{'id': 0,
  'name': 'Iron Man',
  'word_count': 43730.8,
  'num_links': 829.4,
  'num_refs': 242.8,
  'num_pages': 5,
  'avg_monthly_views': 137564.4721153846},
 {'id': 1,
  'name': 'Captain America',
  'word_count': 35401.25,
  'num_links': 700.375,
  'num_refs': 208.125,
  'num_pages': 8,
  'avg_monthly_views': 171090.2987637362},
 {'id': 2,
  'name': 'Thor',
  'word_count': 39843.5,
  'num_links': 785.3333333333334,
  'num_refs': 237.66666666666666,
  'num_pages': 6,
  'avg_monthly_views': 186426.34615384619},
 {'id': 3,
  'name': 'Black Widow',
  'word_count': 17549.8,
  'num_links': 545.0,
  'num_refs': 118.6,
  'num_pages': 5,
  'avg_monthly_views': 40276.84070512822},
 {'id': 4,
  'name': 'Hulk',
  'word_count': 38575.0,
  'num_links': 732.5,
  'num_refs': 159.75,
  'num_pages': 4,
  'avg_monthly_views': 77845.54967948717},
 {'id': 5,
  'name': 'Hawkeye',
  'word_count': 39048.0,
  'num_links': 969.0,
  'num_refs': 83.0,
  'num_pages': 1,
  'avg_monthly_views': 662.403846153846

In [17]:
json.dump(char_stats_numeric, (config.cleandata / 'all_character_nodes.json').open('w'))