In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np

%matplotlib inline

In [2]:
# French-Wikipedia clickstream dumps, one file per month, oldest first.
nov18f, dec18f, jan19f, feb19f, mar19f = (
    "data/france/tsv/clickstream-frwiki-2018-11.tsv",
    "data/france/tsv/clickstream-frwiki-2018-12.tsv",
    "data/france/tsv/clickstream-frwiki-2019-01.tsv",
    "data/france/tsv/clickstream-frwiki-2019-02.tsv",
    "data/france/tsv/clickstream-frwiki-2019-03.tsv",
)

# Month ids: each one is the position of that month in `months` and `paths`.
NOV18, DEC18, JAN19, FEB19, MAR19 = range(5)

# Display labels and file paths, both indexed by month id.
months = ["Nov '18", "Dec '18", "Jan '19", "Feb '19", "Mar '19"]
paths = [nov18f, dec18f, jan19f, feb19f, mar19f]

def explore_tsv(file):
    """Read one tab-separated clickstream dump into a DataFrame.

    The first line of `file` is consumed as a header row and its labels are
    then replaced, so the returned frame has the four columns
    'from', 'article_title', 'type' and 'count'.

    NOTE(review): if the dump files have no header line, `header=0` silently
    drops their first data row -- confirm against the raw files.

    Parameters
    ----------
    file : path or file-like object accepted by `pd.read_csv`.

    Returns
    -------
    pandas.DataFrame with the relabelled columns.
    """
    column_labels = ['from', 'article_title', 'type', 'count']
    clicks = pd.read_csv(file, sep='\t', header=0)
    # Assigning the labels raises if the file does not have exactly four columns.
    clicks.columns = column_labels
    return clicks

In [3]:
def top_n_articles_list(month_id, n):
    """Return the titles of the `n` most-visited 'other-search' articles of a month.

    Parameters
    ----------
    month_id : index into `paths` selecting which monthly dump to load.
    n : number of titles to keep, taken from the top of the ranking.

    Returns
    -------
    list of article titles, ordered by descending 'count'.
    """
    clicks = explore_tsv(paths[month_id])
    # Keep only the rows whose 'from' column is 'other-search'.
    from_search = clicks.loc[clicks['from'] == 'other-search']
    ranked = from_search.sort_values('count', ascending=False)
    return ranked['article_title'].iloc[:n].tolist()

In [4]:
# Top-5 'other-search' titles for each of Dec '18, Jan '19 and Feb '19.
# NOTE(review): Nov '18 and Mar '19 are not ranked here -- confirm that this
# is intended, since the later cells collect counts for all five months.
monthly_top_titles = []
for month_id in (DEC18, JAN19, FEB19):
    top_five = top_n_articles_list(month_id, n=5)
    monthly_top_titles.append(top_five)

In [5]:
# Union of every title that made a monthly top list; a set, so a title that
# ranks in several months is stored once.
tracked_titles = set()

# `i` numbers the printed lines across all months, `j` is the rank of the
# title inside its own month (0 = most visited).
i = 0
for top_titles_in_month in monthly_top_titles:
    tracked_titles.update(top_titles_in_month)
    for j, title in enumerate(top_titles_in_month):
        i += 1
        print(i, ":", j, "-", title)

1 : 0 - Mouvement_des_Gilets_jaunes
2 : 1 - Freddie_Mercury
3 : 2 - La_Vérité_sur_l'affaire_Harry_Quebert
4 : 3 - Emmanuel_Macron
5 : 4 - Mai_68
6 : 0 - Emiliano_Sala
7 : 1 - Bilal_Hassani
8 : 2 - Championnat_du_monde_masculin_de_handball_2019
9 : 3 - Christophe_Dettinger
10 : 4 - Che_Guevara
11 : 0 - Karl_Lagerfeld
12 : 1 - Emiliano_Sala
13 : 2 - Freddie_Mercury
14 : 3 - Umbrella_Academy_(série_télévisée)
15 : 4 - Saint-Valentin


In [6]:
# Number of distinct tracked titles; lower than the number of lines printed
# above whenever a title appears in the top list of more than one month.
print(len(tracked_titles))

13


In [7]:
# One empty list per tracked title; the monthly counts are appended to these
# lists later, one entry per month.
track_dictionary = {tracked: [] for tracked in tracked_titles}

In [8]:
# Collect, for every tracked title, its 'other-search' count in each month.
# Each list in `track_dictionary` ends up with one entry per month, in the
# order of `paths`; a month with no matching row is recorded as 'MISSING'.

# Start from empty lists so that re-running this cell does not append a
# second set of counts after the ones already collected.
for counts in track_dictionary.values():
    counts.clear()

for month_id in range(len(paths)):

    df = explore_tsv(paths[month_id])
    df_search = df[df['from'] == 'other-search']
    dftop = df_search.sort_values(['count'], ascending=False)

    for title in track_dictionary:

        matches = dftop.loc[dftop['article_title'] == title, 'count']
        if matches.empty:
            # No 'other-search' row for this title in this month.
            x = 'MISSING'
            print("month_id: {} | count: MISSING | title: {}".format(month_id, title))
        else:
            # Normally a single row matches, so the sum is just that row's
            # count. Summing also covers duplicated rows, which int() on the
            # Series could not convert and which used to be reported as
            # MISSING.
            x = int(matches.sum())
            print("month_id: {} | count: {:7d} | title: {}".format(month_id, x, title))

        track_dictionary[title].append(x)

month_id: 0 | count:    7145 | title: Championnat_du_monde_masculin_de_handball_2019
month_id: 0 | count:   39419 | title: Che_Guevara
month_id: 0 | count:    6002 | title: Saint-Valentin
month_id: 0 | count: MISSING | title: Bilal_Hassani
month_id: 0 | count:  592669 | title: La_Vérité_sur_l'affaire_Harry_Quebert
month_id: 0 | count:  189854 | title: Mouvement_des_Gilets_jaunes
month_id: 0 | count: MISSING | title: Umbrella_Academy_(série_télévisée)
month_id: 0 | count:  182124 | title: Emmanuel_Macron
month_id: 0 | count:   18262 | title: Karl_Lagerfeld
month_id: 0 | count:  730379 | title: Freddie_Mercury
month_id: 0 | count:   20974 | title: Emiliano_Sala
month_id: 0 | count:  213560 | title: Mai_68
month_id: 0 | count:      30 | title: Christophe_Dettinger
month_id: 1 | count:   24127 | title: Championnat_du_monde_masculin_de_handball_2019
month_id: 1 | count:   37704 | title: Che_Guevara
month_id: 1 | count:    6912 | title: Saint-Valentin
month_id: 1 | count: MISSING | title: Bi

In [9]:
for k, v in track_dictionary.items():
    print()
    print("title:", k)
    print("counts:", v)


title: Championnat_du_monde_masculin_de_handball_2019
counts: [7145, 24127, 372017, 5171, 2483]

title: Che_Guevara
counts: [39419, 37704, 359355, 47699, 45294]

title: Saint-Valentin
counts: [6002, 6912, 29266, 286882, 8229]

title: Bilal_Hassani
counts: ['MISSING', 'MISSING', 387874, 86033, 24062]

title: La_Vérité_sur_l'affaire_Harry_Quebert
counts: [592669, 268130, 20224, 9001, 8063]

title: Mouvement_des_Gilets_jaunes
counts: [189854, 356218, 78767, 14835, 'MISSING']

title: Umbrella_Academy_(série_télévisée)
counts: ['MISSING', 'MISSING', 'MISSING', 310754, 188480]

title: Emmanuel_Macron
counts: [182124, 262944, 147858, 87050, 85288]

title: Karl_Lagerfeld
counts: [18262, 18542, 32643, 1031709, 86292]

title: Freddie_Mercury
counts: [730379, 269269, 332669, 337720, 320180]

title: Emiliano_Sala
counts: [20974, 12091, 767670, 406180, 19597]

title: Mai_68
counts: [213560, 262693, 43006, 23730, 30227]

title: Christophe_Dettinger
counts: [30, 20, 365050, 21418, 5346]


In [10]:
# 'other-search' count of one title that is not in `tracked_titles`, looked up
# in the Mar '19 dump. The month is named explicitly instead of relying on the
# value `month_id` happened to keep after an earlier loop finished.
df = explore_tsv(paths[MAR19])
df_search = df[df['from'] == 'other-search']
dftop = df_search.sort_values(['count'], ascending=False)
# Last expression of the cell, so the matching rows are displayed as output.
dftop.loc[dftop['article_title'] == 'Agnès_Varda', 'count']

2108068    243420
Name: count, dtype: int64