#### __Imports__

In [4]:
from collections import Counter

import pandas as pd  # for easy and effective catalogue manipulation
import numpy as np  # for mathematic stuff

from ipywidgets import widgets  # for easy implement UI directly in notebook

import plotly.graph_objs as go
import plotly.io as pio
from plotly.offline import init_notebook_mode, iplot  # for beautifull plot

pio.renderers.default = "colab"

import cufflinks as cf  # to directly bind pandas and plotly

import requests  # for dealing with API
import json  # to deal with json inputs/outputs
import pprint  # for more friendly console formatting
import operator  # often faster than lambda expression

import sklearn.metrics.pairwise as skdist
import statistics

cf.go_offline()  # set plotly to offline mode

#### __Download and rename datasets (catalogue and user data)__

In [2]:
! rm x5gon_catelogue.tsv*
! rm x5gon_user_data.psv*
! mkdir datasets
! wget https://gitlab.univ-nantes.fr/x5gon/x5gon-hackathon-datasets/raw/master/datasets/x5gon_catelogue.tsv
! wget https://gitlab.univ-nantes.fr/x5gon/x5gon-hackathon-datasets/raw/master/datasets/x5gon_user_data.psv
! mv x5gon_catelogue.tsv datasets/catalogue.tsv
! mv x5gon_user_data.psv datasets/user_data.psv

rm: cannot remove 'x5gon_catelogue.tsv*': No such file or directory
rm: cannot remove 'x5gon_user_data.psv*': No such file or directory
mkdir: cannot create directory ‘datasets’: File exists
--2020-02-25 10:48:47--  https://gitlab.univ-nantes.fr/x5gon/x5gon-hackathon-datasets/raw/master/datasets/x5gon_catelogue.tsv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving gitlab.univ-nantes.fr (gitlab.univ-nantes.fr)... 193.52.101.66
Connecting to gitlab.univ-nantes.fr (gitlab.univ-nantes.fr)|193.52.101.66|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 63850145 (61M) [text/plain]
Saving to: ‘x5gon_catelogue.tsv’


2020-02-25 10:48:54 (10.8 MB/s) - ‘x5gon_catelogue.tsv’ saved [63850145/63850145]

--2020-02-25 10:48:54--  https://gitlab.univ-nantes.fr/x5gon/x5gon-hackathon-datasets/raw/master/datasets/x5gon_user_data.psv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving gitlab.univ-nantes.fr (gitlab.univ-nantes.fr)... 193.52.101.66

#### __Preview catalogue__

In [15]:
list_parser = lambda x: x[1:-1].split(",")
catalogue = pd.read_csv(
    "datasets/catalogue.tsv",
    sep="\t",
    converters={"keywords": list_parser, "concepts": list_parser},
)
# This is added in case initial dataset hasn't the right columns names:
catalogue.columns = ["id", "title", "language", "type", "keywords", "concepts"]
catalogue.set_index("id", inplace=True)
catalogue.head(20)

Unnamed: 0_level_0,title,language,type,keywords,concepts
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
59260,C7 - Computing with Space,en,pdf,"[space, omicini, c7, omicini disi, disi un...","['http://en.wikipedia.org/wiki/Bologna', 'htt..."
3904,"Electromagnetic Fields, Forces, and Motion",en,pdf,"[forces motion, fields forces, electromagnet...",['http://en.wikipedia.org/wiki/Massachusetts_I...
4796,Uncertain Allies,en,pdf,"[north korea, korea, china, north, pyongya...","['http://en.wikipedia.org/wiki/North_Korea', ..."
5757,Statistics for Brain and Cognitive Science,en,pdf,"[pr, pr pr, probability, probability theory...",['http://en.wikipedia.org/wiki/Probability_the...
6930,Classification of Web Documents Using a Graph-...,en,mp4,"[graph, subgraph, document, contrast, clas...","['http://en.wikipedia.org/wiki/Hello', 'http:..."
8160,Advanced Fluid Dynamics of the Environment,en,pdf,"[fluid, mei, layer, temperature, water sur...","['http://en.wikipedia.org/wiki/Homework', 'ht..."
11812,Lecture 5 - Work-Energy Theorem and Law of Con...,en,mov,"[force, velocity, minus, energy, function,...",['http://en.wikipedia.org/wiki/Conservation_of...
23191,Medical Decision Support,en,pdf,"[clinical, database, db, risk, report, la...","['http://en.wikipedia.org/wiki/P-value', 'htt..."
40540,Distinguishing Causes from Effects using Nonli...,fr,mp4,"[causal, nonlinear, model, ica, disturbanc...",['http://en.wikipedia.org/wiki/Nonlinear_syste...
43621,Computer Graphics,en,pdf,"[ray, dir, intersection, y1, y2, traverse...","['http://en.wikipedia.org/wiki/K-d_tree', 'ht..."


#### __Preview users__

In [9]:
list_parser = lambda x: x[1:-1].split("|")
users = pd.read_csv(
    "datasets/user_data.psv",
    sep="|",
    converters={"keywords": list_parser, "concepts": list_parser},
)
# This is added in case initial dataset hasn't the right columns names:
users.columns = ["id", "url", "timestamp"]
users.set_index("timestamp", inplace=True)
users.head(20)

Unnamed: 0_level_0,id,url
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-05 16:54:55.407645,0,85079
2020-02-05 21:29:13.219211,0,85079
2020-02-05 16:54:55.410975,1,141207
2020-02-06 14:45:36.547601,1,141207
2020-02-07 00:49:01.311032,1,141207
2020-02-08 17:54:43.255974,1,141207
2020-02-05 16:54:55.414236,2,85880
2020-02-05 17:29:13.414236,2,83438
2020-02-05 17:29:43.414236,2,83438
2020-02-05 16:54:55.417606,3,2099
