# Wikipedia Traffic

This notebook aims at modeling user traffic on [Wikipedia](https://wikipedia.org) using a recurrent graph convolutional neural network.

Goal: anomaly detection. Can be used to detect events in the real world. Other applications:
* intrusion detection on telecommunication networks,
* anomaly detection on energy networks,
* accident detection on transportation networks.

Events: Super Bowl, Academy Awards, Grammy, Miss Universe, Golden Globe. Mostly December-February.
Missed: Charlie Hebdo, Ebola

Network is very large: 5M nodes, 300M edges. Downsampling ideas:
* Choose a category, e.g. science.
* Take most active ones.
* Concatenate in modules / communities / super-nodes.

Raw data
* [Wikimedia SQL dumps](https://dumps.wikimedia.org/enwiki/), to construct the hyperlink graph.
    * Network size: 5M nodes, 300M edges.
* [Pagecounts](https://dumps.wikimedia.org/other/pagecounts-all-sites/) as activations on the graph.
    * Data from 2014-09-23 0h to 2015-06-05 22h.
    * 6142 hours in total.

In [None]:
%matplotlib inline

import os
import datetime

import IPython.display as ipd
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import graph_tool.all as gt

In [None]:
%load_ext dotenv
%dotenv .env

#WIKI_RAW = os.environ.get('WIKI_RAW')  # Downloaded from dumps.wikimedia.org.
#WIKI_CLEAN = os.environ.get('WIKI_CLEAN')  # Processed by Kirell Benzi.

DATA_DIR = os.path.join('..', 'data', 'wikipedia')

In [None]:
sns.set_context("notebook", font_scale=1.5)
plt.rcParams['figure.figsize'] = (17, 5)
plt.rcParams['agg.path.chunksize'] = 10000  # OverflowError when plotting large series.

## 1 Hyperlink graph

In [None]:
graph = gt.load_graph(os.path.join(DATA_DIR, 'enwiki-20150403-graph.gt'))

In [None]:
graph.is_directed()
#graph.set_directed(False)

In [None]:
def print_graph(graph):
    """Print the number of vertices and edges of `graph` on one line."""
    n_vertices = graph.num_vertices()
    n_edges = graph.num_edges()
    print('{} vertices, {} edges'.format(n_vertices, n_edges))

print_graph(graph)
graph.list_properties()

In [None]:
# Look up the stored title and id of one arbitrary vertex as a sanity check.
idx = 42
page_title = graph.vertex_properties['page_title'][idx]
# NOTE: `page_id` is reused by a later cell to query the redirect table.
page_id = graph.vertex_properties['page_id'][idx]
print('{}: {}'.format(page_id, page_title))

In [None]:
def plot_degree_distribution(graph):
    """Draw the total-degree histogram of `graph` on log-log axes."""
    counts, bins = gt.vertex_hist(graph, 'total')
    # The last bin entry is dropped, presumably so that both arrays have the
    # same length.
    plt.loglog(bins[:-1], counts)
    plt.ylabel('#nodes')
    plt.xlabel('#edges')
plot_degree_distribution(graph)

## 2 Pages

A lot of pages in `pagecounts` are redirections to actual pages. We need to merge the hits.

In [None]:
# Page / redirect table: a pipe-separated, gzip-compressed CSV file.
filepath = os.path.join(DATA_DIR, 'enwiki-20150403-page-redirect.csv.gz')
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as plain text.
# index_col=1 takes the second column as index (presumably the page id, since
# later cells look rows up by page id -- confirm against the file header).
redirect = pd.read_csv(filepath, compression='gzip', sep='|', encoding='utf-8', quoting=3, index_col=1)

redirect.head()

In [None]:
#assert len(redirect) == len(redirect['page_id'].unique())
print('{:.2e} unique pages, {:.2e} pages including redirections'.format(
        len(redirect['fix_page_id'].unique()),
        len(redirect)))

In [None]:
redirect.loc[page_id]

In [None]:
def id2title(page_id):
    """Return the `fix_page_title` of `page_id`, printing it with the page URL.

    The title is read from the global `redirect` table.
    """
    title = redirect.at[page_id, 'fix_page_title']
    message = '{}: https://en.wikipedia.org/?curid={}'.format(title, page_id)
    print(message)
    return title
id2title(12)

In [None]:
def find_in_title(string):
    """Return the non-redirect pages whose title contains `string`.

    The match is a case-insensitive substring test against the `page_title`
    column of the global `redirect` table. Only rows with `is_redirect == 0`
    are returned.

    Raises
    ------
    AttributeError
        If `string` has no `lower()` method (i.e. it is not a string).
    """
    # Lower-case the query once instead of once per row.
    needle = string.lower()

    def find(page_title):
        # Titles that are not strings (e.g. missing values) never match. This
        # replaces a bare `except:` which silently swallowed every error.
        return isinstance(page_title, str) and needle in page_title.lower()

    b = redirect['page_title'].apply(find)
    return redirect[b & (redirect['is_redirect'] == 0)]

find_in_title('ebola')

## 3 Page views / counts

The graph has 4M nodes, but a lot of pages are rarely viewed. `signal_500.h5` lists only 118k pages.

In [None]:
# Kirell's signal which includes views when greater than 500.
filepath = os.path.join(DATA_DIR, 'signal_500.h5')
signal = pd.read_hdf(filepath, 'data')
signal['count_views'].plot(kind='hist', logy=True)
print(len(signal), len(signal['page_id'].unique()), len(signal['layer'].unique()), signal['count_views'].max())
signal.head()

In [None]:
def get_pagecounts(date):
    """Load the cleaned page counts of the hour designated by `date`.

    Parameters
    ----------
    date : datetime-like
        Its `year`, `month`, `day` and `hour` attributes select the file
        `pagecounts-YYYYMMDD-HH0000.csv.gz` in `DATA_DIR/pagecounts_clean`.

    Returns
    -------
    The content of the file, indexed by its first column. If a single column
    remains, it is returned as a Series.
    """
    filename = 'pagecounts-{:4d}{:02d}{:02d}-{:02d}0000.csv.gz'.format(date.year, date.month, date.day, date.hour)
    filepath = os.path.join(DATA_DIR, 'pagecounts_clean', filename)
    # `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0.
    # Squeezing the loaded frame is the documented replacement.
    return pd.read_csv(filepath, compression='gzip', index_col=0).squeeze('columns')

In [None]:
# Time span of the analysis, sampled every hour.
START = datetime.datetime(2014, 9, 23, 2)
# Shorter alternative range. It used to be assigned and then immediately
# overwritten by the line below, so it never had any effect.
#END = datetime.datetime(2014, 9, 24, 3)
END = datetime.datetime(2015, 6, 5, 20)
dates = pd.date_range(START, END, freq='H')

# Total number of views per page id of the graph, accumulated over all hours.
activations_tot = pd.Series(
    data=0,
    index=graph.vp['page_id'].get_array(),
    dtype=np.int64
)

for date in tqdm_notebook(dates):
    pagecounts = get_pagecounts(date)
    # Page ids without an entry for that hour count as zero views.
    activations_tot += pagecounts.reindex(activations_tot.index).fillna(0).astype(np.int32)

print(activations_tot.shape)

In [None]:
# The largest is the main page.
plt.semilogy(np.sort(activations_tot.values)[::-1])

# idxmax() returns the index label, i.e. the page id. argmax() returns an
# integer position in pandas >= 1.0, which is not a valid page id for the
# label-based lookups below.
main_page = activations_tot.idxmax()
print('{} ({}): {:.2e} views in total'.format(id2title(main_page), main_page, activations_tot[main_page]))

print('{:.2e} views in total'.format(activations_tot.sum()))

In [None]:
# Power law: histogram of the total views per page, main page excluded.
activations_tot.drop(main_page).plot(bins=100, logy=True, kind='hist');
# Same histogram in a new figure, restricted to pages below 1e7 total views.
plt.figure()
activations_tot.drop(main_page).loc[lambda views: views < 1e7].plot(bins=100, logy=True, kind='hist');

In [None]:
# Minimum average number of views per hour for a page to be kept.
MIN_AVG_VIEWS = 100

# The threshold is applied to the total over the whole period, hence the
# factor len(dates). The comparison is inclusive (>=).
keep = activations_tot.index[activations_tot >= MIN_AVG_VIEWS * len(dates)]
print('{} pages have more than {} views in total ({:.0f} per hour on average)'.format(
    len(keep), MIN_AVG_VIEWS * len(dates), MIN_AVG_VIEWS))

In [None]:
# One row per kept page id, one column per hour.
activations = pd.DataFrame(
    data=0,
    index=keep,
    columns=dates,
    dtype=np.int32
)

for date in tqdm_notebook(dates):
    pagecounts = get_pagecounts(date)
    # Page ids without an entry for that hour count as zero views.
    activations[date] = pagecounts.reindex(activations.index).fillna(0).astype(np.int32)

activations.sort_index(inplace=True)

# Use DATA_DIR like the other cells. `filepath` is reused by a later cell to
# load the file back.
filepath = os.path.join(DATA_DIR, 'activations_{}.h5'.format(MIN_AVG_VIEWS))
# `key` is passed by keyword: recent pandas versions no longer accept it
# positionally.
activations.to_hdf(filepath, key='activations')

ipd.display(activations.head())
# info() prints its report and returns None, so it must not be passed to
# display() (which would render a spurious "None").
activations.info()

* Predictable fluctuations with unpredictable spikes. Those are outliers.
* Anomalies should be outliers persisting for many hours.

In [None]:
DROP = [
   15580374,  # Main page draws ~10% traffic.
   42727860,  # Undefined has the largest peaks of traffic while being inactive after 2014-10.
#   8063851,   # Feynman point has a very large traffic peak which is probably an error.
#   2697304,   # Gold_as_an_investment has many traffic peaks.
]

def load_activations(filepath, drop=DROP):
    """Read the `activations` table stored in the HDF5 file `filepath`.

    Rows whose index label is listed in `drop` are removed. Pass None or an
    empty list to keep every row. The shape and size of the result are
    printed before it is returned.
    """
    table = pd.read_hdf(filepath, 'activations')

    if drop:
        table = table.drop(drop)

    n_pages, n_hours = table.shape
    print('activations: {} page ids x {} hours = {}'.format(n_pages, n_hours, table.size))
    return table

activations = load_activations(filepath)

In [None]:
# unstack() gives a Series indexed by (time, page id) pairs. idxmax() returns
# that label, which is unpacked into the message. argmax() returns a single
# integer position in pandas >= 1.0 and cannot be unpacked.
print('Max of {0} views at page id {2} and time {1}'.format(
    activations.unstack().max(), *activations.unstack().idxmax()))
plt.plot(activations.values.reshape(-1));

In [None]:
plt.hist(activations.values.reshape(-1), bins=100, log=True);

In [None]:
def plot_activation(page_id):
    """Plot the hourly views of `page_id` with a logarithmic y-axis.

    The series is read from the global `activations` table and labelled with
    the page title returned by `id2title`.
    """
    title = id2title(page_id)
    legend_label = '{} ({})'.format(title, page_id)
    views = activations.loc[page_id]
    ax = views.plot(logy=True, label=legend_label)
    ax.set_ylabel('#views per hour')
    ax.legend()

# Events.
plot_activation(2251390)   # Charlie Hebdo
plot_activation(44969225)  # Charlie Hebdo shooting
plt.figure()
plot_activation(27718)     # Super Bowl
plt.figure()
#plot_activation(40817806)  # Ebola
plot_activation(44635)     # Grammy
plot_activation(150340)    # Miss Universe
#plot_activation(324)       # Academy Awards

# Neighbors of Charlie Hebdo.
#plot_activation(44969610)  # Charb
#plot_activation(206682)    # Caricature
#plot_activation(15012)     # Islamism
#plot_activation(7826589)   # Jihadism
#plot_activation(50100)     # Journalist

# Remarkable things.
#plot_activation(25)
#plot_activation(15580374)  # Main Page --> largest traffic (~10%)
#plot_activation(42727860)  # Undefined --> hits only before mid-oct 2014
#plot_activation(670)       # Alphabet --> strange drop
#plot_activation(8063851)   # Shall distinguish outliers (counting errors?) from real events
#plot_activation(2697304)   # Lots of peaks --> correlated with fluctuations on market?

## 4 Matching graph & activations

Further analysis
* Ratio of in / out neighbors.
* Proportion of bidirectional hyperlinks.

In [None]:
# Reload the activations and the full hyperlink graph from disk.
# (`DAT_DIR` was a typo for `DATA_DIR` and raised a NameError.)
activations = load_activations(os.path.join(DATA_DIR, 'activations_100.h5'))
graph = gt.load_graph(os.path.join(DATA_DIR, 'enwiki-20150403-graph.gt'))

In [None]:
def compute_diameter(graph):
    """Print the pseudo-diameter of `graph`, converted to an integer."""
    result = gt.pseudo_diameter(graph)
    distance = int(result[0])
    print('Pseudo-diameter: {}'.format(distance))

In [None]:
# Size and pseudo-diameter of the graph before any filtering.
print_graph(graph)
compute_diameter(graph)

# Keep only the vertices whose page id appears in the activations table.
mask = np.in1d(graph.vp['page_id'].get_array(), activations.index)
graph = gt.GraphView(graph, vfilt=mask)
print_graph(graph)

# Restrict that view further to its largest component.
l = gt.label_largest_component(graph)
graph = gt.GraphView(graph, vfilt=l)
print_graph(graph)
compute_diameter(graph)

In [None]:
graph = gt.Graph(graph, prune=True)

def sort_vertices(graph, vp):
    """Return a new graph built from `graph` with vertices ordered by `vp`.

    Each vertex gets its rank in the ascending sort of the values of `vp`.
    The ranks are passed as the `vorder` of the new graph.
    """
    # The argsort of an argsort is the rank of each element.
    ranks = np.argsort(np.argsort(vp.get_array()))
    vorder = graph.new_vertex_property('int64_t', ranks)
    return gt.Graph(graph, vorder=vorder)

graph = sort_vertices(graph, graph.vp['page_id'])
# directed=False

In [None]:
activations = activations.loc[graph.vp['page_id'].get_array()]

In [None]:
# Vertices of the graph and rows of the activations must be in the same order.
np.testing.assert_equal(graph.vp['page_id'].get_array(), activations.index)

# Save under DATA_DIR, the constant used by the cells which load these files.
graph.save(os.path.join(DATA_DIR, 'graph.gt'))
graph.save(os.path.join(DATA_DIR, 'graph.graphml'))
# `key` is passed by keyword: recent pandas versions no longer accept it
# positionally.
activations.to_hdf(os.path.join(DATA_DIR, 'activations.h5'), key='activations')

In [None]:
#gt.sfdp_layout()
#gt.graph_draw(graph)

## 5 Analysis

In [None]:
graph = gt.load_graph(os.path.join(DATA_DIR, 'graph.gt'))
activations = load_activations(os.path.join(DATA_DIR, 'activations.h5'), drop=None)

In [None]:
plot_degree_distribution(graph)

In [None]:
def plot_adjacency(graph, ax=None, max_nodes=10000):
    """Show the sparsity pattern of the adjacency matrix of `graph`.

    Parameters
    ----------
    graph : graph whose adjacency matrix is obtained with `gt.adjacency`.
    ax : matplotlib axes, optional
        Axes to draw on. A new 10x10 inch figure is created when omitted.
    max_nodes : int, optional
        Only the top-left `max_nodes` x `max_nodes` corner of the matrix is
        drawn. The title always reports the numbers of the full matrix.
    """
    A = gt.adjacency(graph)
    # Explicit None test: do not rely on the truthiness of an axes object.
    if ax is None:
        _, ax = plt.subplots(figsize=(10, 10))
    ax.spy(A[:max_nodes, :max_nodes], markersize=0.2)
    n_rows, n_cols = A.shape
    # Python integers cannot overflow, unlike the product of fixed-width
    # NumPy integers computed by np.multiply on very large shapes.
    n_entries = n_rows * n_cols
    density = A.nnz / n_entries if n_entries else float('nan')
    ax.set_title('{} nodes, {} edges ({:.2%})'.format(n_rows, A.nnz, density))

plot_adjacency(graph)

In [None]:
def order_adjacency_plot(graph, ax=None, **kwargs):
    """Plot the adjacency matrix of `graph` with vertices sorted by block.

    The blocks are those of the state returned by
    `gt.minimize_blockmodel_dl`, to which `kwargs` are forwarded.
    """
    blocks = gt.minimize_blockmodel_dl(graph, **kwargs).get_blocks()
    plot_adjacency(sort_vertices(graph, blocks), ax)

fig, axes = plt.subplots(1, 3)
for ax, n_blocks in zip(axes, [10, 20, 30]):
    order_adjacency_plot(graph, ax=ax, B_max=n_blocks)

In [None]:
plt.hist(activations.values.reshape(-1), bins=100, log=True);
plt.figure()
plt.hist(activations.sum(axis=1).values.reshape(-1), bins=100, log=True);