#### __Imports__

In [2]:
from collections import Counter

import pandas as pd  # for easy and effective catalogue manipulation
import numpy as np  # for mathematic stuff

from ipywidgets import widgets  # for easy implement UI directly in notebook

import plotly.graph_objs as go
import plotly.io as pio
from plotly.offline import init_notebook_mode, iplot  # for beautifull plot

pio.renderers.default = "colab"

import cufflinks as cf  # to directly bind pandas and plotly

import requests  # for dealing with API
import json  # to deal with json inputs/outputs
import pprint  # for more friendly console formatting
import operator  # often faster than lambda expression

import sklearn.metrics.pairwise as skdist
import statistics

cf.go_offline()  # set plotly to offline mode

#### __Download and rename datasets (catalogue and user data)__

In [2]:
! rm x5gon_catelogue.tsv*
! rm x5gon_user_data.psv*
! mkdir datasets
! wget https://gitlab.univ-nantes.fr/x5gon/x5gon-hackathon-datasets/raw/master/datasets/x5gon_catelogue.tsv
! wget https://gitlab.univ-nantes.fr/x5gon/x5gon-hackathon-datasets/raw/master/datasets/x5gon_user_data.psv
! mv x5gon_catelogue.tsv datasets/catalogue.tsv
! mv x5gon_user_data.psv datasets/user_data.psv

rm: cannot remove 'x5gon_catelogue.tsv*': No such file or directory
rm: cannot remove 'x5gon_user_data.psv*': No such file or directory
mkdir: cannot create directory ‘datasets’: File exists
--2020-02-25 11:07:37--  https://gitlab.univ-nantes.fr/x5gon/x5gon-hackathon-datasets/raw/master/datasets/x5gon_catelogue.tsv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving gitlab.univ-nantes.fr (gitlab.univ-nantes.fr)... 193.52.101.66
Connecting to gitlab.univ-nantes.fr (gitlab.univ-nantes.fr)|193.52.101.66|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 63850145 (61M) [text/plain]
Saving to: ‘x5gon_catelogue.tsv’


2020-02-25 11:07:42 (19.0 MB/s) - ‘x5gon_catelogue.tsv’ saved [63850145/63850145]

--2020-02-25 11:07:42--  https://gitlab.univ-nantes.fr/x5gon/x5gon-hackathon-datasets/raw/master/datasets/x5gon_user_data.psv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving gitlab.univ-nantes.fr (gitlab.univ-nantes.fr)... 193.52.101.66

#### __Preview catalogue__

In [3]:
list_parser = lambda x: x[1:-1].split(",")
catalogue = pd.read_csv(
    "datasets/catalogue.tsv",
    sep="\t",
    converters={"keywords": list_parser, "concepts": list_parser},
)
# This is added in case initial dataset hasn't the right columns names:
catalogue.columns = ["id", "title", "language", "type", "keywords", "concepts"]
catalogue.set_index("id", inplace=True)
catalogue.head(20)

Unnamed: 0_level_0,title,language,type,keywords,concepts
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
59260,C7 - Computing with Space,en,pdf,"[space, omicini, c7, omicini disi, disi un...","['http://en.wikipedia.org/wiki/Bologna', 'htt..."
3904,"Electromagnetic Fields, Forces, and Motion",en,pdf,"[forces motion, fields forces, electromagnet...",['http://en.wikipedia.org/wiki/Massachusetts_I...
4796,Uncertain Allies,en,pdf,"[north korea, korea, china, north, pyongya...","['http://en.wikipedia.org/wiki/North_Korea', ..."
5757,Statistics for Brain and Cognitive Science,en,pdf,"[pr, pr pr, probability, probability theory...",['http://en.wikipedia.org/wiki/Probability_the...
6930,Classification of Web Documents Using a Graph-...,en,mp4,"[graph, subgraph, document, contrast, clas...","['http://en.wikipedia.org/wiki/Hello', 'http:..."
8160,Advanced Fluid Dynamics of the Environment,en,pdf,"[fluid, mei, layer, temperature, water sur...","['http://en.wikipedia.org/wiki/Homework', 'ht..."
11812,Lecture 5 - Work-Energy Theorem and Law of Con...,en,mov,"[force, velocity, minus, energy, function,...",['http://en.wikipedia.org/wiki/Conservation_of...
23191,Medical Decision Support,en,pdf,"[clinical, database, db, risk, report, la...","['http://en.wikipedia.org/wiki/P-value', 'htt..."
40540,Distinguishing Causes from Effects using Nonli...,fr,mp4,"[causal, nonlinear, model, ica, disturbanc...",['http://en.wikipedia.org/wiki/Nonlinear_syste...
43621,Computer Graphics,en,pdf,"[ray, dir, intersection, y1, y2, traverse...","['http://en.wikipedia.org/wiki/K-d_tree', 'ht..."


#### __Preview users__

In [4]:
list_parser = lambda x: x[1:-1].split("|")
users = pd.read_csv(
    "datasets/user_data.psv",
    sep="|",
    converters={"keywords": list_parser, "concepts": list_parser},
)
# This is added in case initial dataset hasn't the right columns names:
users.columns = ["id", "url", "timestamp"]
users.set_index("timestamp", inplace=True)
users.head(20)

Unnamed: 0_level_0,id,url
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-05 16:54:55.407645,0,85079
2020-02-05 21:29:13.219211,0,85079
2020-02-05 16:54:55.410975,1,141207
2020-02-06 14:45:36.547601,1,141207
2020-02-07 00:49:01.311032,1,141207
2020-02-08 17:54:43.255974,1,141207
2020-02-05 16:54:55.414236,2,85880
2020-02-05 17:29:13.414236,2,83438
2020-02-05 17:29:43.414236,2,83438
2020-02-05 16:54:55.417606,3,2099


#### __Search query example__

In [80]:
# The X5GON API is available at:
PLATFORM_URL = "https://platform.x5gon.org/api/v1/"
function = "search/"
parameter = "text="
keyword = "perceptron"
response = requests.get(PLATFORM_URL + function + "?" + parameter + keyword)
r_json = response.json()

# store query results in a dict of dicts {material_d: {material_data}}
# also remove duplicates by comparing material descriptions
materials = {}
for result in r_json["rec_materials"]:
    materials[result["material_id"]] = result
    materials[result["material_id"]].pop("material_id", None)

df_materials = pd.DataFrame(materials)
df_materials = df_materials.T
df_materials

Unnamed: 0,weight,title,description,creation_date,retrieved_date,type,mimetype,url,website,language,license,provider,content_ids
66731,31.1591,The Multi-layer Perceptron,This presentation describes the multilayer per...,2008-01-21T10:26:07.000Z,2018-08-02T11:32:58.041Z,text,application/pdf,http://hydro.ijs.si/v002/8c/rqh3wjwyfcx4j4mayz...,http://videolectures.net/epsrcws08_harison_tmp/,en,"{'short_name': 'by-nc-nd', 'typed_name': ['by'...","{'id': 1, 'name': 'videolectures.net', 'domain...",[173471]
65029,30.1091,The Projectron: a Bounded Kernel-Based Perceptron,We present a discriminative online algorithm w...,2008-07-08T11:30:00.000Z,2018-08-02T11:44:18.703Z,text,application/pdf,http://hydro.ijs.si/v002/4e/jzcm5wc7mniw2xc4ct...,http://videolectures.net/icml08_orabona_pbk/,en,"{'short_name': 'by-nc-nd', 'typed_name': ['by'...","{'id': 1, 'name': 'videolectures.net', 'domain...",[171743]
137065,30.0063,Fast learning of Document Ranking Functions wi...,,2008-02-11T13:45:00.000Z,2019-10-29T10:41:13.248Z,text,application/pdf,http://hydro.ijs.si/v002/40/iahak5ao5zin3gmtpx...,http://videolectures.net/wsdm08_elsas_fldr/,en,"{'short_name': 'by-nc-nd', 'typed_name': ['by'...","{'id': 1, 'name': 'videolectures.net', 'domain...",[278370]
11997,29.0144,The Projectron: a Bounded Kernel-Based Perceptron,We present a discriminative online algorithm w...,2008-07-08T11:30:00.000Z,2018-08-02T11:44:18.703Z,video,video/mp4,http://hydro.ijs.si/v002/d0/2ci2ggjinmnmra4g3j...,http://videolectures.net/icml08_orabona_pbk/,en,"{'short_name': 'by-nc-nd', 'typed_name': ['by'...","{'id': 1, 'name': 'videolectures.net', 'domain...","[28053, 28054, 28055, 28056, 28057, 28058, 280..."
8542,28.1756,The Multi-layer Perceptron,This presentation describes the multilayer per...,2008-01-21T10:26:07.000Z,2018-08-02T11:32:58.041Z,video,video/mp4,http://hydro.ijs.si/v002/0d/bxku65bxlnzaudo3ux...,http://videolectures.net/epsrcws08_harison_tmp/,en,"{'short_name': 'by-nc-nd', 'typed_name': ['by'...","{'id': 1, 'name': 'videolectures.net', 'domain...","[19374, 19375, 267102, 19376, 19377, 267103, 1..."
8343,27.6245,Fast learning of Document Ranking Functions wi...,,2008-02-11T13:45:00.000Z,2018-08-02T12:52:08.217Z,video,video/mp4,http://hydro.ijs.si/v002/4e/jyygewqam7awndgsph...,http://videolectures.net/wsdm08_elsas_fldr/,en,"{'short_name': 'by-nc-nd', 'typed_name': ['by'...","{'id': 1, 'name': 'videolectures.net', 'domain...","[18791, 18792, 18793, 18794, 18795, 18796, 187..."
79325,22.8812,Surrogate Functions for Maximizing Precision a...,The problem of maximizing precision at the top...,2015-07-08T13:44:45.000Z,2018-08-02T11:45:56.622Z,text,application/pdf,http://hydro.ijs.si/v012/4b/jnajx6667vfgohvwaz...,http://videolectures.net/icml2015_kar_surrogat...,en,"{'short_name': 'by-nc-nd', 'typed_name': ['by'...","{'id': 1, 'name': 'videolectures.net', 'domain...",[183869]
72072,22.7092,On-line learning algorithms: theory and practice,,2007-10-24T09:00:00.000Z,2020-01-13T13:40:23.592Z,text,application/pdf,http://hydro.ijs.si/v001/2b/fmyf4tqf6hu3rpokyu...,http://videolectures.net/aop07_cesa_bianchi_onl/,en,"{'short_name': 'by-nc-nd', 'typed_name': ['by'...","{'id': 1, 'name': 'videolectures.net', 'domain...","[308921, 308917, 308922, 308915, 308918, 30891..."
69429,22.6157,Online Similarity Prediction of Networked Data...,We consider online similarity prediction probl...,2013-06-13T15:00:00.000Z,2020-01-14T22:16:48.739Z,text,application/pdf,http://hydro.ijs.si/v00d/38/hckovwpuc6i5subzwu...,http://videolectures.net/colt2013_herbster_gra...,en,"{'short_name': 'by-nc-nd', 'typed_name': ['by'...","{'id': 1, 'name': 'videolectures.net', 'domain...","[316761, 316757, 316762, 316755, 316758, 31675..."
71650,22.6097,On-line Statistical Learning,,2007-01-31T00:00:00.000Z,2020-01-16T02:37:22.089Z,text,application/pdf,http://hydro.ijs.si/v001/13/cnjeui6xgpsje3gkcs...,http://videolectures.net/stw07_bianchi_lsl/,en,"{'short_name': 'by-nc-nd', 'typed_name': ['by'...","{'id': 1, 'name': 'videolectures.net', 'domain...","[327282, 327278, 327283, 327276, 327279, 32728..."


#### __Given a keyword search display users that accessed this content__

In [101]:
for key in df_materials.index:
    res = users[users.url == key].id
    if not res.empty:
        print(f"The material {key} - {materials[key]['title']} was accessed by:")
        print(users[users.url == key].id.values)
        print("")

The material 137065 - Fast learning of Document Ranking Functions with the Committee Perceptron was accessed by:
[ 23716  29126  29860  33138  40477  45497  58124  58443  59595  70982
  71626  71844  75332  76401  76617  79497  81867 109506 110838 113939
 124072 131623 155473 186374 199011 219287 222035 243853 255310 258679
 263422 266286 269037 275451 277236 282345 315127 323054 323296 329026
 331370 332471 343418 353591 365243 366218 368052 375670 380129 390219
 400401 410392 412957 416719 430858 435654 441594 449498 454717 454824
 462220 476546 477264 484710 499738 500666 508331 510213 519244 524031
 528652 544310 566773 573007 576325 576539 616087 628254 631428 645638
 645706 653542 656293 657222 670497 699266 700595 700828 705623 712729
 717632 718071 718345 720442 732453 740249 740911 744274 751993 760567
 765600 773311 774773 784267 792001 797865 812402 818701 828887 833030
 835923 846908 857156 865327 893555 918598 919284 927011 928014 930645
 953161 955825 959484 960344 965313

#### __Save the list of the content accessed, ordered by from access count__

In [None]:
count = []
for key in catalogue.index:
    res = users[users.url == key].index
    if not res.empty:
        count.append((len(res), catalogue[catalogue.index == key].title))

accessed = []
for c in count:
    accessed.append([c[0], c[1].keys()[0], c[1].values[0]])

test = pd.Series(accessed)
test = test.sort_values()
viewed = pd.DataFrame([[a[0], a[1], a[2]] for a in test])
viewed.to_csv("accessed.tsv", sep="\t", encoding="utf-8", index=False)