In [None]:
## See https://blog.matteoferla.com/2019/07/wikipedia-datamining.html
## copypaste from imperial jupyter
import requests, re, csv, pickle
import wikitextparser as wtp
import requests, re, csv, pickle, json
import pandas as pd
import numpy as np
from collections import defaultdict
from datetime import datetime
from IPython.display import display, HTML

from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
init_notebook_mode()


class WikicatParser():
    """
    Gets all the pages recursively within a category, parses their content (via a supplied function) and gets pageviews.

    Usage:
        pages = WikicatParser(cat_name, custom_page_parser=my_function, extra_fields=[], forbidden_categories_keywords=[...])
        pages.get_pages_recursively()
        print(pages.data)
        pandas.DataFrame.from_records(list(pages.data.values()))

    custom_page_parser is for content mining: a function that, given wiki text, returns a dictionary of whatever it mined.
    wanted_templates (a template name or a list of names) can be given instead: the arguments of those templates
    are then mined into 'info_<argument>' fields. If neither is given, page content is not fetched.
    Any extra fields need to be added to extra_fields, otherwise to_csv leaves them out of the file.

    .get_pages_recursively gets everything downwards. Do note that .forbidden_categories_keywords may need to be set.
    This calls both .get_pages and .get_subcategories, both of which actually call .get_members which calls .get,
    which is the web fetcher. Every member fetched is also recorded in .data (keyed by title).
    .get_pageviews gets the page views.
    """
    api = "https://en.wikipedia.org/w/api.php"

    def __init__(self, category,
                 no_views=False,
                 no_content=False,
                 custom_page_parser=None,
                 wanted_templates=None,
                 extra_fields=None,
                 forbidden_categories_keywords=None):
        """
        :param category: category name, with or without the 'Category:' prefix.
        :param no_views: if True, page views are not fetched.
        :param no_content: if True, page content is not fetched/parsed.
        :param custom_page_parser: callable taking wiki text and returning a dict of mined fields.
        :param wanted_templates: template name(s) to mine when no custom parser is given.
        :param extra_fields: additional column names written by to_csv.
        :param forbidden_categories_keywords: keyword(s); subcategories whose title contains one are skipped.
        """
        self.session = requests.Session()
        self.no_views = no_views
        self.no_content = no_content
        self.data = {}
        if 'Category:' not in category:
            self.category = 'Category:'+category
        else:
            self.category = category
        self.category_map = {}
        self.category_cleaned = category.replace(' ','_').replace('Category:','')
        if custom_page_parser:
            self.page_parser = custom_page_parser
        elif wanted_templates:
            if isinstance(wanted_templates, str):
                self.wanted_templates = [wanted_templates.lower()]
            else:
                self.wanted_templates = [t.lower() for t in wanted_templates]
            self.page_parser = self.parse_templates
        else:
            # nothing to mine: do not bother downloading the content.
            self.no_content = True
            self.page_parser = lambda text: {}
        self.extra_fields = extra_fields if extra_fields else []
        if forbidden_categories_keywords:
            if isinstance(forbidden_categories_keywords, str):
                # use the argument: the attribute does not exist yet at this point.
                self.forbidden_categories_keywords = [forbidden_categories_keywords.lower()]
            else:
                self.forbidden_categories_keywords = [k.lower() for k in forbidden_categories_keywords]
        else:
            self.forbidden_categories_keywords = []

    def get(self, params):
        """
        Fetch data from the API, following 'cmcontinue' continuations.

        The returned dictionary is the last response received, with the list under the
        first key of 'query' replaced by the concatenation of the lists of all responses.
        Note that params is modified in place (the 'cmcontinue' key is added).
        """
        data = self.session.get(url=self.api, params=params).json()
        if 'continue' not in data:
            return data
        key = list(data['query'].keys())[0]
        collected = list(data['query'][key])
        # iterate rather than recurse: large categories would otherwise nest one call per result page.
        while 'continue' in data:
            params['cmcontinue'] = data['continue']['cmcontinue']
            data = self.session.get(url=self.api, params=params).json()
            collected.extend(data['query'][key])
        data['query'][key] = collected
        return data

    def _add_datum(self, data, cat):
        """
        Record each member dictionary of data in self.data (keyed by title).

        New entries get their category and, unless disabled, their views and parsed content.
        Entries already present only get cat appended to their '|'-separated category field.
        """
        for d in data:
            name = d["title"]
            if name in self.data:
                self.data[name]["category"] += '|' + cat
                continue
            self.data[name] = d
            self.data[name]['category'] = cat
            if not self.no_views:
                self.data[name]['views'] = self.get_pageviews(name)
            if not self.no_content:
                wiki = self.get_content(name)
                for key, value in self.page_parser(wiki).items():
                    self.data[name][key] = value

    def get_subcategories(self, cat):
        """
        Get the subcategories of cat, dropping those whose title contains a forbidden keyword.
        The titles kept are stored in self.category_map[cat].
        """
        subcats = []
        for subcat in self.get_members(cat, 'subcat'):
            for k in self.forbidden_categories_keywords:
                if k in subcat['title'].lower():
                    print(f'BAN: {subcat["title"]} removed because it contained {k}')
                    break
            else:
                subcats.append(subcat)
        self.category_map[cat] = [s['title'] for s in subcats]
        return subcats

    def get_page_by_name(self, name, cat='Manual'):
        #gets the page by the name specified! This is a fix!
        self._add_datum([{'title': name}], cat)

    def get_pages(self, cat):
        #gets all the pages within the category
        return self.get_members(cat, 'page')

    def get_members(self, cat, cmtype='subcat|page'):
        """
        Get the members of category cat of the given type(s) and record them via _add_datum.
        Returns the list of member dictionaries, or an empty list if the reply has no 'query'.
        """
        params = {
            'action': "query",
            'list': "categorymembers",
            'cmtitle': cat,
            'cmtype': cmtype,
            'cmdir': "desc",
            'cmlimit': "max",  # the API default is 10 per request: ask for the most allowed to cut round trips.
            'format': "json"
            }
        r = self.get(params)
        if 'query' not in r:
            print(f'{cat} replied with {str(r)}.')
            return []
        data = r['query']['categorymembers']
        self._add_datum(data, cat)
        return data

    def get_pages_recursively(self, cat=None, _visited=None):
        """
        Get the pages of cat (default: the category given at construction) and of all its subcategories.

        :param cat: category title to start from.
        :param _visited: internal, the set of categories already walked in this call tree.
        :return: list of member dictionaries.
        """
        if cat is None:
            cat = self.category
        if _visited is None:
            _visited = set()
        if cat in _visited:
            # a category can be reached through several parents or be part of a loop:
            # walking it again would repeat its pages or never terminate.
            return []
        _visited.add(cat)
        subcats = [s['title'] for s in self.get_subcategories(cat)]
        data = self.get_pages(cat)
        for c in subcats:
            ndata = self.get_pages_recursively(c, _visited)
            print(f'{c} has {len(data)} pages directly and {len(ndata)} in subcategories')
            data.extend(ndata)
        return data

    def get_pageviews(self, page):
        """
        Sum the monthly views in the reply for the fixed 2018-06-01 to 2019-06-01 window and divide by 365.
        Returns the string 'NA' if the reply has no 'items'.
        """
        url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{page.replace(' ','_').replace('/','%2F')}/monthly/2018060100/2019060100"
        r = self.session.get(url).json()
        if 'items' in r:
            return sum(i['views'] for i in r['items'])/365
        else:
            print('error',page, r)
            return 'NA'

    def get_content(self, page):
        """
        Get the wiki markup of section 0 of the page via the API.
        Returns an empty string (after printing the reply) if the page has no revision content.
        """
        params = {
            'action': "query",
            'prop': 'revisions',
            'rvprop': 'content',
            'rvsection': 0,
            'titles': page,
            'format': "json"
            }
        data = self.session.get(url=self.api, params=params).json()
        page_records = data['query']['pages']
        pageid = list(page_records.keys())[0]
        try:
            return page_records[pageid]['revisions'][0]['*']
        except (KeyError, IndexError):
            # e.g. a title that does not exist: same print-and-carry-on style as get_pageviews.
            print('error', page, page_records[pageid])
            return ''

    def to_csv(self):
        """Don't save as csv for storage. Save as pickle. This is just for casual inspection in Excel."""
        with open(f'{self.category_cleaned}.csv','w',newline='') as w:
            dw = csv.DictWriter(w,['title','category', 'ns','views','pageid']+self.extra_fields,
                                extrasaction='ignore')
            dw.writeheader()
            dw.writerows(self.data.values())
        return self

    ####### code to convert template to dictionary
    def parse_templates(self, text):
        """
        Merge the arguments of every wanted template found in text into one dictionary,
        with each key prefixed by 'info_'. Later templates override earlier ones.
        """
        dex = {}
        for t in wtp.parse(text).templates:
            if t.normal_name().lower() in self.wanted_templates:  # not using t.name as it has trailing space.
                dex.update(self._template_to_dict(t))
        return {'info_'+d: dex[d] for d in dex}

    @staticmethod
    def _date_str(date, year, month, day):
        # str() of the datetime built from the three named entries of the template dictionary.
        return str(datetime(year=int(date[year]), month=int(date[month]), day=int(date[day])))

    def _arg_to_val(self, arg):
        """
        Convert a template argument into a cleaned-up value.

        Known nested templates (nowrap/val, dates, coords) are resolved on a best-effort basis,
        then tags, comments, wiki links and a few symbols are stripped from the text.
        Returns a string, or a (lat, lon) tuple for a coords template.
        """
        val = arg.value
        try:
            for t in arg.templates:
                if not t.arguments:
                    continue
                tval = t.arguments[0].value
                tnorm = t.normal_name().lower()
                if tnorm in ('nowrap', 'val'):
                    if any('ul' in a.name for a in t.arguments): #unit!
                        tval += [a.value for a in t.arguments if 'u' in a.name][0] #u= and ul=
                    val = val.replace(t.string, tval)
                elif tnorm == 'coords':
                    # NOTE(review): the template documented in _parse_coords is 'Coord' (singular) - confirm the name.
                    lat, lon = self._parse_coords(self._template_to_dict(t))
                    return (lat, lon)
                elif tnorm == 'death date and age':
                    date = self._template_to_dict(t)
                    val = self._date_str(date, '1', '2', '3')+' - '+self._date_str(date, '4', '5', '6')
                elif tnorm in ('death date','birth date'):
                    date = self._template_to_dict(t)
                    val = self._date_str(date, '1', '2', '3')
                elif tnorm in ('birth year and age', 'death year and age'):
                    date = self._template_to_dict(t)
                    if '2' not in date:
                        val = str(datetime(year=int(date['1']), month=1, day=1))
                    elif '3' not in date:
                        val = str(datetime(year=int(date['1']), month=int(date['2']), day=1))
        except Exception:
            # deliberate best effort: a malformed nested template leaves val as it currently is.
            pass
        val = re.sub(r'<.*?/>','',val) #remove self closing tags
        val = val.replace('&nbsp;',' ')
        val = re.sub(r'<.*?>.*?</.*?>','',val) # remove tags
        val = re.sub(r'<!--.*?-->','',val) # remove comments
        val = val.replace('\u2013','-') # en dash to hyphen minus
        val = val.replace('\u2014','-') # em dash to hyphen minus
        val = val.replace('{{snd}}','-')
        val = re.sub(r'±\s+\d+\.?\d*','', val) #clear error for safety
        val = val.replace('{{circa}}','')
        # raw strings: '\1' in a normal string is the control character chr(1), not a backreference.
        # the link target may not contain brackets, so an earlier unpiped link is not swallowed.
        val = re.sub(r'\[\[[^\[\]|]*\|(.*?)\]\]',r'\1', val)
        val = re.sub(r'\[\[(.*?)\]\]',r'\1', val)
        val = re.sub(r'\{\{\s*[cC]ite.*?\}\}','', val)
        return val.strip()

    def _arg_to_key(self, arg):
        return arg.name.strip()

    def _template_to_dict(self, template):
        return {self._arg_to_key(arg): self._arg_to_val(arg) for arg in template.arguments}

    @staticmethod
    def _dms_to_decimal(coords, start, parts, positive, negative):
        """
        Convert `parts` consecutive positional values (degrees[, minutes[, seconds]]) starting at
        position `start`, followed by a hemisphere letter, into signed decimal degrees.
        Raises ValueError if the hemisphere letter is neither `positive` nor `negative`.
        """
        hemisphere = coords[str(start + parts)]
        if hemisphere == positive:
            sign = 1
        elif hemisphere == negative:
            sign = -1
        else:
            raise ValueError(str(coords))
        magnitude = sum(float(coords[str(start + k)]) / 60 ** k for k in range(parts))
        # the sign applies to the whole magnitude, not only to the degrees.
        return sign * magnitude

    def _parse_coords(self, coords):
        """
        Coord parser for location, see https://en.wikipedia.org/wiki/Template:Coord

        :param coords: dictionary of the template arguments (modified in place).
        :return: (latitude, longitude) in decimal degrees.
        :raises ValueError: on an unexpected hemisphere letter or a non-numeric value.
        """
        # fix badly parsed key argument parsed as positional
        for k in list(coords.keys()):
            if coords[k].find(':') != -1:
                del coords[k]
        # kill non-positional just because...
        for k in list(coords.keys()):
            if not k.isnumeric():
                del coords[k]
        if '3' not in coords:
            # case {{coord|latitude|longitude|coordinate parameters|template parameters}}
            lat = float(coords['1'])
            long = float(coords['2'])
        else:
            if '5' not in coords:
                # case {{coord|dd|N/S|dd|E/W|coordinate parameters|template parameters}}
                parts = 1
            elif '7' not in coords:
                # case {{coord|dd|mm|N/S|dd|mm|E/W|coordinate parameters|template parameters}}
                parts = 2
            else:
                # case {{coord|dd|mm|ss|N/S|dd|mm|ss|E/W|coordinate parameters|template parameters}}
                parts = 3
            lat = self._dms_to_decimal(coords, 1, parts, 'N', 'S')
            long = self._dms_to_decimal(coords, parts + 2, parts, 'E', 'W')
        if long > 180:
            long = - (360 - long)
        elif long < -180:
            long = (360 + long)
        return (lat, long)

In [None]:
# Category:Dinosaurs by geologic time unit
# Walk the category tree, then save both the raw member list and the dataframe built from pages.data.
dinotype = 'Mesozoic dinosaurs'
pages = WikicatParser('Category:'+dinotype,
                forbidden_categories_keywords=[],
                wanted_templates=['automatic taxobox'])

data = pages.get_pages_recursively()

# context manager: the pickle file is flushed and closed as soon as the dump is done.
with open(dinotype.replace(' ','_')+'.dict.p','wb') as fh:
    pickle.dump(data, fh)
df = pd.DataFrame.from_records(list(pages.data.values()))
#df = df.loc[~df.info_reign.isna()]
df.to_pickle(dinotype.replace(' ','_')+'.pd.p')
display(df)

In [None]:
# get list of dinosaurs from jp
#https://en.wikipedia.org/wiki/List_of_cloned_animals_in_the_Jurassic_Park_series
import wikitextparser as wtp
import pandas as pd
import re
from IPython.display import display, HTML

from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
init_notebook_mode()

import requests

# Download the raw wikitext (action=raw) of the list of cloned animals in the Jurassic Park series.
jp_page = requests.get('https://en.wikipedia.org/w/index.php?title=List_of_cloned_animals_in_the_Jurassic_Park_series&action=raw').text

def polish_cell(cell):
    """
    Clean one wikitable cell.

    Strips tags, colspan attributes, asterisks and {{N/A}}; if the cell contains a wiki link,
    the cell becomes the target of the first link (the part before any '|').
    Returns the result stripped of surrounding whitespace.
    """
    for pattern in (r'<.*?>', r'colspan="\d+"', r'\*', r'\{\{N/A\}\}'):
        cell = re.sub(pattern, '', cell)
    # non-greedy: with several links in a cell only the first one is captured.
    link = re.search(r'\[\[(.*?)\]\]', cell)
    if link:
        cell = link.group(1).split('|')[0]
    return cell.strip()

def polish(data):
    """Apply polish_cell to every cell of a table (list of rows); None cells become empty strings."""
    cleaned = []
    for row in data:
        cleaned.append(['' if cell is None else polish_cell(cell) for cell in row])
    return cleaned
            

# First table of the page: row 0 supplies the column names, rows from index 2 onwards are the data.
tb = wtp.parse(jp_page).tables[0].data()
jp = pd.DataFrame(polish(tb[2:]), columns=tb[0])

# A species is flagged as cloned when any of its cells contains 'Appearance'.
has_appearance = jp.apply(lambda row: row.str.contains('Appearance').any(), axis=1)
jp = jp.assign(cloned=has_appearance)[['Species', 'cloned']]

jp_cloned = jp.loc[jp['cloned'], 'Species'].values
jp_skeleton = jp.loc[~jp['cloned'], 'Species'].values

In [None]:
# Flag the titles that appear in the Jurassic Park lists.
df = df.assign(jp_cloned=df.title.isin(jp_cloned),
               jp_skeleton=df.title.isin(jp_skeleton))

# One boolean column per period, true when the period name occurs in the category field.
era_flags = {era.lower(): df.category.str.contains(era)
             for era in ('Jurassic', 'Cretaceous', 'Triassic', 'Paleocene', 'Permian')}
has_views = df.views != 'NA'
df = df.assign(**era_flags).loc[has_views].sort_values('views', ascending=False)

display(df[['title','jp_cloned','permian', 'triassic', 'jurassic', 'cretaceous', 'paleocene']])

In [None]:
import pandas as pd
# Replace the in-memory dataframe with the one stored in the local file dino.csv.
df = pd.read_csv('dino.csv')

In [None]:
##what about Capra aegagrus
# Species of the cloned list with no page flagged in df.
# The flagged titles are collected once rather than re-filtering df for every species.
matched_titles = set(df.loc[df.jp_cloned, 'title'])
[x for x in jp_cloned if x not in matched_titles]

In [None]:
## two entries don't have era categories
for genus in ('Brontosaurus', 'Apatosaurus'):
    df.at[df.loc[df.title == genus].index[0], 'jurassic'] = True
top = 20
# Leave out titles containing 'birds' or 'Feathered ', keep the `top` most viewed, least viewed first.
is_bird_page = df.title.str.contains('birds') | df.title.str.contains('Feathered ')
d = df.loc[~is_bird_page].sort_values('views', ascending=False).head(top).sort_values('views')


In [None]:
# Show the row(s) whose title is Protoceratops.
df[df.title == 'Protoceratops']

In [52]:
from IPython.display import display, HTML
# Pictogram of the rows of d: one marker per 100 daily views, on a coloured band per period.
x = []
y = []
text = []
m = len(d) - 1  # position of the last row of d
# Legend swatches, drawn at x 18.5-20.5 on rows 1 to 5.
shapes = [go.layout.Shape(type="rect",
            x0=19-0.5,
            y0=1 + 0.3 + i,
            x1=20 + 0.5,
            y1=1 - 0.3 + i,
            fillcolor=color,
            layer="below",
            line_width=0 if i < 4 else 4,
            opacity=0.5) for i, color in enumerate(['coral', 'GoldenRod','teal', 'silver', 'white'])]
legend = ['Triassic', 'Jurassic','Cretaceous', 'Error', 'Starred in Jurassic Park(TM)']
# Use the position of the row within d, not its index label, to place the band:
# the labels only coincide with positions when the index happens to be the view rank without gaps.
for pos, (_, row) in enumerate(d.iterrows()):
    n_icons = round(row.views/1e2)
    x.extend(range(n_icons))
    y.extend([row.title] * n_icons)
    if row.triassic:
        color = 'coral'
    elif row.jurassic:
        color = 'GoldenRod' # no mustard...
    elif row.cretaceous:
        color = 'teal'
    else:
        color = 'silver'
    shapes.append(go.layout.Shape(type="rect",
        x0=-0.5,
        y0=pos + 0.4,
        # right edge derived from this row's own count, never from a previous row.
        x1=(n_icons - 1) + 0.5,
        y1=pos - 0.4,
        fillcolor=color,
        layer="below",
        line_width=3 if row.jp_cloned else 0,
        opacity=0.5))
iplot(dict(data=[go.Scatter(x=x, y=y, mode='markers'),
                 # sample marker on the same row as the '100 daily views' legend text.
                 go.Scatter(y=[d.iloc[1 + len(legend)].title], x=[20], mode='markers'),
                 go.Scatter(y=[d.iloc[1 + i].title for i in range(len(legend)+1)],
                            x=[21] * (len(legend)+1), text=legend+['100 daily views'], mode='text', textposition='middle right')],
           layout=go.Layout(
                            title={'text': 'Top 20 Wikipedia Pageview for Mesozoic Dinosaurs', 'x': 0.5, 'xanchor': 'center'},
                            paper_bgcolor='rgba(255,255,255,1)',
                            plot_bgcolor='rgba(0,0,0,0)',
                            shapes=shapes,
                            showlegend=False,
                            xaxis=dict(side='top', showgrid=False,zeroline= False, showline=False, visible=True, showticklabels=True),
                            # pin the category order to the rows of d so that the band positions always match.
                            yaxis=dict(showgrid=False,zeroline= False, showline=False, visible=True, showticklabels=True,
                                       categoryorder='array', categoryarray=list(d.title))
                        )))
display(HTML("""
<div height=12>
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">  
<symbol id="dino" >
<path d="m133.707 177.437s4.872 14.615 0 34.102l14.615 9.743s0 29.23-19.487 48.717h-24.359c0-6.627 5.373-12 12-12h7.487s14.615-12.359 9.743-26.974c0 0-24.359-9.743-34.102-24.359s-14.615-43.846-14.615-43.846" fill="#a9ba5a"/><path d="m128.835 274h-24.359c-2.211 0-4-1.791-4-4 0-8.822 7.176-16 16-16h5.906c2.719-2.713 9.91-10.91 7.98-20.047-6.086-2.67-25.285-11.863-34.086-25.066-10.102-15.156-15.027-44.18-15.23-45.406-.363-2.18 1.109-4.24 3.289-4.604 2.184-.359 4.234 1.109 4.602 3.287.047.283 4.848 28.562 13.996 42.285 8.926 13.393 32.027 22.77 32.262 22.863 1.09.438 1.934 1.334 2.305 2.447 5.664 16.99-10.27 30.719-10.953 31.295-.723.609-1.637.945-2.582.945h-7.488c-2.957 0-5.543 1.611-6.926 4h17.59c14.199-15.131 16.668-36.035 17.094-42.635l-12.746-8.498c-1.406-.938-2.07-2.658-1.66-4.299 4.535-18.146.129-31.73.086-31.865-.699-2.096.434-4.361 2.527-5.061 2.109-.709 4.359.436 5.059 2.529.207.613 4.738 14.586.727 33.572l12.316 8.211c1.113.742 1.781 1.99 1.781 3.328 0 1.262-.234 31.123-20.66 51.545-.752.752-1.768 1.174-2.83 1.174z" fill="#4c241d"/><path d="m84.989 55.643s9.744 0 19.487-4.872l9.815 3.272c2.754.918 5.764-.329 7.063-2.926 1.441-2.882.273-6.386-2.609-7.827l-14.269-7.135s-2.165-4.33-7.939-5.774c-5.798-1.45-11.845 1.348-15.16 6.321l-6.131 9.198s-29.231 29.231-9.743 87.692c0 0-14.615 24.359 24.359 48.717 0 0 14.615 29.231 68.205 9.744 0 0 58.461 14.615 87.692 4.872 0 0-53.589-14.615-92.563-43.846-22.008-16.506-39.356-19.031-50.289-18.101-10.049.854-19.414-5.09-22.923-14.544-5.454-14.691-8.672-37.437 5.005-64.791z" fill="#a9ba5a"/><path d="m219.585 204.278c-26.098.002-54.969-6.541-61.297-8.055-50.199 17.803-68.023-5.566-71.402-11.061-16.688-10.635-26.031-22.68-27.785-35.818-1-7.5.895-13.289 2.098-16.047-18.219-56.592 7.605-86.398 10.918-89.914l5.934-8.898c4.387-6.588 12.199-9.799 19.461-7.982 5.449 1.363 8.449 4.699 9.758 6.58l13.266 6.631c2.348 1.174 4.102 3.193 4.934 5.686.828 2.492.641 5.16-.535 7.508-2.176 4.352-7.289 6.465-11.906 4.932l-8.285-2.762c-6.953 
3.16-13.648 4.133-17.184 4.434-11.957 25.348-8.738 46.291-3.824 59.529 2.867 7.734 10.621 12.654 18.832 11.951 11.414-.965 29.871 1.52 53.027 18.889 37.887 28.414 90.688 43.041 91.219 43.186 1.699.465 2.895 1.986 2.945 3.748.047 1.762-1.062 3.35-2.734 3.906-7.776 2.59-17.409 3.557-27.44 3.557zm-61.52-16.225c.324 0 .652.039.969.119.484.119 40.875 10.104 70.281 7.758-18.957-6.525-51.855-19.652-78.52-39.652-21.145-15.855-37.535-18.168-47.551-17.314-11.824.969-22.895-6.049-27.012-17.141-5.602-15.082-9.246-39.119 5.18-67.969.676-1.352 2.059-2.207 3.57-2.211.09 0 8.984-.09 17.703-4.449.953-.471 2.043-.551 3.055-.217l9.816 3.271c.855.291 1.812-.105 2.219-.918.297-.594.191-1.133.102-1.402-.09-.268-.328-.764-.918-1.059l-14.273-7.135c-.773-.387-1.402-1.016-1.789-1.789-.031-.055-1.527-2.732-5.328-3.682-3.996-.99-8.344.877-10.863 4.656l-6.133 9.199c-.145.219-.312.424-.5.609-.059.059-6.891 7.102-11.363 20.818-4.137 12.689-7.047 33.881 2.586 62.779.371 1.109.238 2.322-.363 3.324-.484.838-11.848 21.455 23.051 43.266.625.391 1.129.945 1.457 1.604.539 1.031 14.039 25.674 63.258 7.773.44-.158.901-.238 1.366-.238z" fill="#4c241d"/><path d="m158.065 177.437s4.872 14.615 0 34.102l14.615 9.743s0 29.23-19.487 48.717h-24.359c0-6.627 5.373-12 12-12h7.487s14.615-12.359 9.744-26.974c0 0-24.359-9.743-34.102-24.359s-14.615-43.846-14.615-43.846" fill="#a9ba5a"/><path d="m153.194 274h-24.359c-2.211 0-4-1.791-4-4 0-8.822 7.176-16 16-16h5.906c2.719-2.713 9.91-10.91 7.98-20.047-6.086-2.67-25.285-11.863-34.086-25.066-10.102-15.154-15.031-44.178-15.234-45.406-.363-2.18 1.109-4.24 3.285-4.604 2.18-.359 4.238 1.107 4.605 3.287.047.283 4.852 28.562 14 42.285 8.926 13.393 32.027 22.77 32.262 22.863 1.09.438 1.934 1.334 2.305 2.447 5.664 16.99-10.27 30.719-10.953 31.295-.723.609-1.637.945-2.582.945h-7.488c-2.957 0-5.543 1.611-6.926 4h17.59c14.199-15.131 16.668-36.035 17.094-42.635l-12.746-8.498c-1.406-.938-2.07-2.658-1.66-4.299 4.535-18.146.129-31.73.086-31.865-.699-2.096.434-4.361 2.527-5.061 2.113-.709 
4.359.436 5.059 2.529.207.613 4.738 14.586.727 33.572l12.316 8.211c1.113.742 1.781 1.99 1.781 3.328 0 1.262-.234 31.123-20.66 51.545-.751.752-1.766 1.174-2.829 1.174z" fill="#4c241d"/><path d="m94.733 177.437c0 19.487-14.615 29.231-14.615 29.231l-14.615-4.872-4.872 14.615s-14.473-16.667.142-26.411l16 4s-1.527-36.05 8.217-40.922" fill="#a9ba5a"/><g fill="#4c241d"><path d="m60.632 220.41c-1.145 0-2.25-.492-3.02-1.377-.352-.404-8.578-10.01-7.219-20.107.668-4.967 3.414-9.09 8.16-12.254.938-.625 2.09-.826 3.188-.553l10.902 2.725c-.105-12.174 1.039-34.586 10.559-39.344 1.973-.988 4.383-.186 5.367 1.789.988 1.977.188 4.379-1.789 5.367-4.445 2.221-6.695 21.014-6.012 37.174.055 1.262-.492 2.473-1.469 3.27-.98.795-2.277 1.082-3.496.781l-14.168-3.541c-1.93 1.623-3.016 3.477-3.312 5.631-.312 2.299.293 4.781 1.227 7.031l2.16-6.473c.332-1.006 1.055-1.838 2.004-2.312.953-.473 2.047-.553 3.055-.217l12.594 4.199c3.379-2.861 11.371-11.08 11.371-24.762 0-2.209 1.789-4 4-4s4 1.791 4 4c0 21.381-15.73 32.113-16.398 32.559-1.023.688-2.309.857-3.484.467l-10.816-3.607-3.609 10.82c-.453 1.361-1.602 2.377-3.012 2.656-.26.053-.526.078-.783.078z"/><circle cx="94.772" cy="40" r="2"/></g>
</symbol>
</svg></div>
"""))
# Inject a script into the notebook output that, after a 0.5 s delay, gives the id 'myGraph'
# to the svg found from this output's ancestors and replaces every marker path
# ('.point' and '.legendpoints path') with a <use> element referencing the '#dino' symbol.
display(HTML("""<div id="this_element"></div>
<script>
// wait 0.5 s before doing anything.
setTimeout(() => {
    //name the svg canvas
    $('#this_element').parent().parent().parent().find('svg').attr('id','myGraph');
    
    //this function changes the path-tag element el into the icon f11b added as text.
    function changer(idx, el) {
        const size = 15;
        let qjel = $(el);
        // the transform is middle-center'ed, but we want it to be top-left'ed
        let t = qjel.attr('transform');
        t = t.replace(/([\d\.]+),([\d\.]+)/,(m, x,y) => (parseFloat(x)-size/2)+','+(parseFloat(y)-size/2))
        
        _Plotly.d3.select(qjel.parent()[0])
                        .append("use")
                              .attr("transform", t+' scale(0.04)')
                              .attr("xlink:href","#dino")
        //goodbye marker
        qjel.detach();
    }
    // run the function for the traces and the legends.
    $('#myGraph .point').each(changer);
    $('#myGraph .legendpoints path').each(changer);
}, 500);
</script>
"""))

In [None]:
## make top 100 table.
h = df.head(100)

# Markdown link to each article, then a '|'-separated dump of the selected columns.
wiki_links = h.title.apply(lambda v: f'[{v}](https://en.wikipedia.org/wiki/{v.replace(" ","_")})')
table_columns = ['dinosaur','views','jp_cloned','triassic','jurassic','cretaceous']
print(h.assign(dinosaur=wiki_links)[table_columns].to_csv(sep='|'))

In [None]:
# inspect each dinosaur icon.
import os, re
from lxml import etree as ET

# Display every svg icon of the folder, in reverse order of file name.
svg_dir = '1939411-dinosaurs/svg'
for file in sorted(os.listdir(svg_dir),
                   key=lambda v: re.search(r'(\w+)\.svg', v).group(1) if '.svg' in v else '',
                   reverse=True):
    if '.svg' not in file:
        continue
    print(file)
    fp = os.path.join(svg_dir, file)
    tree = ET.parse(fp)
    root = tree.getroot()
    # distinct names: `h` is also used for the top-100 dataframe elsewhere in the notebook.
    height = int(root.attrib['height'])
    width = int(root.attrib['width'])
    # NOTE(review): only the children of root are serialised below, so this resizing does not reach the output.
    root.attrib['width']="24"
    root.attrib['height']=str(round(24*height/width))
    name = root.attrib['id']
    print(name)
    # drop the first circle element, if any.
    for child in root:
        if 'circle' in child.tag:
            root.remove(child)
            break
    svg = ''.join([ET.tostring(child, encoding='utf-8').decode() for child in root])
    # the hidden svg holding the symbol must be closed, otherwise the visible one is nested inside it.
    display(HTML(f"""
        <div height=12>
        <svg style="display: none;">
        <symbol id="{name}">
        {svg}
        </symbol>
        </svg>
        <svg><use xlink:href="#{name}"/></svg>
        </div>"""))

In [59]:
## era distribution

import numpy as np
from scipy.signal import savgol_filter
# Density of daily page views per period, for the rows whose title does not contain 'Category:'.
series = []
bins = 1000
win = 15
X = np.linspace(0,4e3,bins)
dfc = df.loc[~df.title.str.contains('Category:')]

for era in ('triassic' , 'jurassic', 'cretaceous'):
    # build the mask from dfc itself so that it is aligned with the frame it filters.
    t = dfc.loc[dfc[era].eq(True)].views
    y, x = np.histogram(t, bins=X, density=True)
    # np.histogram returns one more edge than densities: plot each density at its left edge.
    series.append(go.Scatter(x=x[:-1], y=y, name=f'{era.title()} ({len(t)})'))
    # optional smoothing (disabled): yhat = savgol_filter(y, win, 3)
fig = go.Figure(data=series,
                layout=dict(title='Distribution of page views of Dinosaur articles, split into era',
                            xaxis={'title': 'Daily page views'},
                            xaxis_type="log",
                            yaxis={'title': 'Density'}
                                        ))
fig.show()

In [61]:
import numpy as np
from scipy.signal import savgol_filter
# Density of daily page views, split by whether the article is flagged jp_cloned.
series = []
bins = 200
win = 15
X = np.linspace(0,4e3,bins)
dfc = df.loc[~df.title.str.contains('Category:')]
starring = {'Featured': True, 'Not Featured': False}
for b, flag in starring.items():
    # build the mask from dfc itself so that it is aligned with the frame it filters.
    t = dfc.loc[dfc.jp_cloned == flag].views
    y, x = np.histogram(t, bins=X, density=True)
    # np.histogram returns one more edge than densities: plot each density at its left edge.
    series.append(go.Scatter(x=x[:-1], y=y, name=b))
    # optional smoothing (disabled): yhat = savgol_filter(y, win, 3)
fig = go.Figure(data=series,
                layout=dict(title='Distribution of page views of Dinosaur articles, split into appearance on JP',
                            xaxis={'title': 'Daily page views'},
                            xaxis_type="log",
                            yaxis={'title': 'Density'}
                                        ))
fig.show()

In [62]:
## The unloved ones.
# Last 20 rows once the titles containing 'Category:' are left out.
df[~df.title.str.contains('Category:')].tail(20)

Unnamed: 0.1,Unnamed: 0,category,info_authority,info_display_parents,info_fossil_range,info_grandparent_authority,info_greatgrandparent_authority,info_image,info_image2,info_image2_alt,...,pageid,title,views,jp_cloned,jp_skeleton,jurassic,cretaceous,triassic,paleocene,permian
1487,1365,Category:Early Cretaceous birds of Asia|Catego...,"Wang ''et al.'', ",,,,,Yumenornis.png,,,...,40770210,Yumenornis,2.076712,False,False,False,True,False,False,False
1489,1532,Category:Mesozoic birds,,,,,,,,,...,42141987,Avipeda,2.013699,False,False,False,False,False,False,False
1491,8,Category:Mesozoic dinosaurs|Category:Mesozoic ...,,,,,,,,,...,15371825,Proximodorsal process,1.99726,False,False,False,False,False,False,False
1492,609,Category:Late Cretaceous dinosaurs of North Am...,,,,,,,,,...,49313191,Triprismatoolithus,1.978082,False,False,False,True,False,False,False
1493,1533,Category:Mesozoic birds,,,,,,,,,...,42142010,Aviadactyla,1.969863,False,False,False,False,False,False,False
1495,1003,Category:Late Cretaceous birds of Asia|Categor...,,,,,,,,,...,30332249,Laevisoolithus,1.928767,False,False,False,True,False,False,False
1496,1515,Category:Cretaceous dinosaurs of Asia,,,,,,,,,...,48494721,Paradictyoolithus,1.906849,False,False,False,True,False,False,False
1497,1014,Category:Late Cretaceous birds of Asia|Categor...,,,,,,,,,...,42473648,Cretaaviculus,1.879452,False,False,False,True,False,False,False
1498,1136,Category:Early Cretaceous dinosaurs of North A...,"Lull, 1911",2.0,", {{fossilrange|112}}",,,,,,...,34612318,Dryosaurus grandis,1.876712,False,False,False,True,False,False,False
1499,1430,Category:Early Cretaceous birds of Asia|Catego...,,,,,,,,,...,59392852,Chiappeavis,1.876712,False,False,False,True,False,False,False
