# Average number of TPs per each website category (based on visited sites) with linear regression

    – input data: TP_occurence_per_visit.json - total presence of each TP on every visited site
    – output plot: scatter plot with harvest dates on the x-axis and the average number of unique TPs per category on the y-axis, shown as markers together with a linear regression line
    – purpose: Visualise the average number of TPs for each FP category over the harvesting period, together with a linear regression line. This makes it easier to analyse TP trends over time and to compare the average number of TPs between categories.

In [1]:
# Import
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
import pandas as pd
import json
import plotly.graph_objects as go
import numpy as np
from itertools import cycle

In [2]:
# Jupyter setup: initialise Plotly's offline notebook mode before any figure is shown with iplot()
init_notebook_mode(connected=True)

In [3]:
# Function for loading json file
def load_json(path, name):
    """Load and return the parsed content of the JSON file ``path + name``.

    Args:
        path: Directory prefix. It is concatenated with ``name`` as-is, so it
            must already end with a path separator.
        name: File name appended to ``path``.

    Returns:
        The deserialised JSON document.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(path + name, encoding='utf-8') as f:
        return json.load(f)

In [4]:
# Load data
# Directory and file name of the TP occurrence data; load_json concatenates the two as-is,
# so the directory has to keep its trailing slash.
f_path = '/home/ubuntu/data/processed/crawls/response_enriched/analysis_v.3/'
TPs_per_site_name = 'TP_occurence_per_visit.json'

# The processing cell below reads a 'date' and a 'TP_occurence_per_visit' entry from every element.
TPs_per_site = load_json(f_path, TPs_per_site_name)

In [5]:
# Creating empty dataframe
# The column labels are given as a list: a set is unordered and recent pandas
# versions refuse a set for `columns`.
df_total = pd.DataFrame(columns=['visit_id'])
df_total.head()

Unnamed: 0,visit_id


In [6]:
# Turn the loaded JSON into one wide table: one row per visited site (visit_id)
# and one column per harvest date holding the number of TPs seen on that site.
for harvest in TPs_per_site:
    occurrences = harvest['TP_occurence_per_visit']
    # Two parallel columns: the visit ids and the number of TPs recorded for each of them
    df = pd.DataFrame({
        'visit_id': list(occurrences),
        harvest['date']: [len(tps) for tps in occurrences.values()],
    })
    # Outer merge keeps sites that are missing from some harvests (they get NaN there)
    df_total = df_total.merge(df, how='outer', on='visit_id')
    df_total.set_index('visit_id', inplace=True)
    df_total.sort_index(inplace=True)
df_total.head()

Unnamed: 0_level_0,2019-05-29,2019-04-05,2019-02-21,2020-05-12,2018-04-17,2018-06-12,2019-06-14,2019-04-12,2018-04-09,2018-11-07,...,2019-10-29,2018-09-06,2019-08-05,2018-09-11,2019-11-28,2019-09-20,2018-02-09,2019-10-03,2019-03-27,2019-10-16
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,21.0,13.0,23.0,18.0,84.0,17.0,21.0,22.0,101.0,15.0,...,31.0,17.0,22.0,15.0,25.0,26.0,53.0,25.0,22.0,27.0
10000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,1.0,,1.0,,,1.0,,1.0,
10002,21.0,21.0,19.0,,15.0,15.0,21.0,21.0,15.0,17.0,...,21.0,16.0,21.0,16.0,21.0,22.0,15.0,22.0,21.0,21.0
10003,9.0,9.0,9.0,12.0,6.0,7.0,8.0,9.0,6.0,9.0,...,11.0,6.0,8.0,7.0,11.0,9.0,14.0,11.0,9.0,11.0
10004,9.0,7.0,7.0,9.0,7.0,7.0,9.0,7.0,7.0,7.0,...,9.0,7.0,9.0,7.0,8.0,9.0,7.0,8.0,7.0,9.0


In [7]:
# Load visited site info
site_cat_path = '/home/ubuntu/data/processed/'
site_cat_name = 'visitedSitesCat.csv'

# Read the site metadata and cast visit_id to a 64-bit integer in one go
df_site_cat = pd.read_csv(site_cat_path + site_cat_name).astype({'visit_id': 'int64'})
df_site_cat.head()

Unnamed: 0,visit_id,crawl_id,site_url,url_TLD,Country,Europe,PublicPrivate,SiteCategory,URLtype,TopLevelDomainLookUp
0,1,1,https://ekstrabladet.dk/nationen/,ekstrabladet.dk,Denmark,EU,Private,News,PrivateMedia EU,ekstrabladet.dk
1,2,2,http://nuzzel.com,nuzzel.com,United States,NotEurope,Private,News,NewsUser,nuzzel.com
2,3,3,https://www.her.ie/entertainment/dakota-johnso...,her.ie,Ireland,EU,Private,News,PrivateMedia,her.ie
3,4,4,https://www.government.is/ministries/locations...,government.is,Iceland,EEA,Public,Government,Ministry,government.is
4,5,2,https://www.bme.hu/kt,bme.hu,Hungary,EU,Public,Education,University,bme.hu


In [8]:
# Manipulate dataframe
# Turn visit_id back into a regular column, keep a copy of it as the index
# (named 'visit_ids') and cast the column itself to a 64-bit integer.
df_total = df_total.reset_index()
df_total = df_total.set_index(df_total['visit_id'].rename('visit_ids'))
df_total = df_total.astype({'visit_id': 'int64'})
df_total

Unnamed: 0_level_0,visit_id,2019-05-29,2019-04-05,2019-02-21,2020-05-12,2018-04-17,2018-06-12,2019-06-14,2019-04-12,2018-04-09,...,2019-10-29,2018-09-06,2019-08-05,2018-09-11,2019-11-28,2019-09-20,2018-02-09,2019-10-03,2019-03-27,2019-10-16
visit_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,21.0,13.0,23.0,18.0,84.0,17.0,21.0,22.0,101.0,...,31.0,17.0,22.0,15.0,25.0,26.0,53.0,25.0,22.0,27.0
10000,10000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,1.0,,1.0,,,1.0,,1.0,
10002,10002,21.0,21.0,19.0,,15.0,15.0,21.0,21.0,15.0,...,21.0,16.0,21.0,16.0,21.0,22.0,15.0,22.0,21.0,21.0
10003,10003,9.0,9.0,9.0,12.0,6.0,7.0,8.0,9.0,6.0,...,11.0,6.0,8.0,7.0,11.0,9.0,14.0,11.0,9.0,11.0
10004,10004,9.0,7.0,7.0,9.0,7.0,7.0,9.0,7.0,7.0,...,9.0,7.0,9.0,7.0,8.0,9.0,7.0,8.0,7.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,11.0,12.0,12.0,12.0,9.0,9.0,11.0,12.0,9.0,...,12.0,11.0,11.0,12.0,12.0,12.0,9.0,12.0,11.0,12.0
9996,9996,17.0,15.0,16.0,16.0,14.0,14.0,16.0,16.0,13.0,...,16.0,14.0,16.0,14.0,16.0,16.0,13.0,16.0,15.0,16.0
9997,9997,8.0,7.0,18.0,12.0,18.0,18.0,9.0,7.0,18.0,...,12.0,18.0,9.0,18.0,12.0,10.0,18.0,10.0,18.0,10.0
9998,9998,8.0,8.0,6.0,8.0,6.0,6.0,9.0,8.0,6.0,...,7.0,6.0,9.0,6.0,8.0,9.0,6.0,7.0,6.0,7.0


In [9]:
# Merge DFs
# Left join: every row of TP counts is kept and gets the site metadata with the
# same visit_id; the result is indexed and ordered by visit_id.
df_merged = (
    df_total.merge(df_site_cat, how='left', on='visit_id')
    .set_index('visit_id')
    .sort_index()
)
df_merged.head()

Unnamed: 0_level_0,2019-05-29,2019-04-05,2019-02-21,2020-05-12,2018-04-17,2018-06-12,2019-06-14,2019-04-12,2018-04-09,2018-11-07,...,2019-10-16,crawl_id,site_url,url_TLD,Country,Europe,PublicPrivate,SiteCategory,URLtype,TopLevelDomainLookUp
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,21.0,13.0,23.0,18.0,84.0,17.0,21.0,22.0,101.0,15.0,...,27.0,1,https://ekstrabladet.dk/nationen/,ekstrabladet.dk,Denmark,EU,Private,News,PrivateMedia EU,ekstrabladet.dk
3,18.0,23.0,23.0,16.0,34.0,31.0,18.0,23.0,34.0,26.0,...,18.0,3,https://www.her.ie/entertainment/dakota-johnso...,her.ie,Ireland,EU,Private,News,PrivateMedia,her.ie
4,14.0,14.0,14.0,18.0,11.0,11.0,14.0,14.0,11.0,13.0,...,15.0,4,https://www.government.is/ministries/locations...,government.is,Iceland,EEA,Public,Government,Ministry,government.is
5,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,2,https://www.bme.hu/kt,bme.hu,Hungary,EU,Public,Education,University,bme.hu
9,46.0,28.0,53.0,71.0,23.0,25.0,36.0,27.0,29.0,28.0,...,34.0,2,https://www.idnes.cz/,idnes.cz,Czech Republic,EU,Private,News,PrivateMedia EU,idnes.cz


In [10]:
# Create new column for merged categories
# The column is filled with category names (strings) afterwards, so it is
# created with object dtype. A plain float NaN column would have to be upcast
# on the first string assignment, which recent pandas versions deprecate.
df_merged['merged_cat'] = pd.Series(np.nan, index=df_merged.index, dtype='object')
df_merged.head()

Unnamed: 0_level_0,2019-05-29,2019-04-05,2019-02-21,2020-05-12,2018-04-17,2018-06-12,2019-06-14,2019-04-12,2018-04-09,2018-11-07,...,crawl_id,site_url,url_TLD,Country,Europe,PublicPrivate,SiteCategory,URLtype,TopLevelDomainLookUp,merged_cat
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,21.0,13.0,23.0,18.0,84.0,17.0,21.0,22.0,101.0,15.0,...,1,https://ekstrabladet.dk/nationen/,ekstrabladet.dk,Denmark,EU,Private,News,PrivateMedia EU,ekstrabladet.dk,
3,18.0,23.0,23.0,16.0,34.0,31.0,18.0,23.0,34.0,26.0,...,3,https://www.her.ie/entertainment/dakota-johnso...,her.ie,Ireland,EU,Private,News,PrivateMedia,her.ie,
4,14.0,14.0,14.0,18.0,11.0,11.0,14.0,14.0,11.0,13.0,...,4,https://www.government.is/ministries/locations...,government.is,Iceland,EEA,Public,Government,Ministry,government.is,
5,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,2,https://www.bme.hu/kt,bme.hu,Hungary,EU,Public,Education,University,bme.hu,
9,46.0,28.0,53.0,71.0,23.0,25.0,36.0,27.0,29.0,28.0,...,2,https://www.idnes.cz/,idnes.cz,Czech Republic,EU,Private,News,PrivateMedia EU,idnes.cz,


In [11]:
# Unifying categories
# Every rule is a (row mask, merged category) pair. The rules are applied from
# top to bottom, so for a row matching several rules the last one wins.
site_category = df_merged['SiteCategory']
url_type = df_merged['URLtype']
ownership = df_merged['PublicPrivate']

is_public = ownership.eq('Public')
is_private = ownership.eq('Private')
is_news = site_category.eq('News')
is_citizen_service = site_category.eq('CitizenService')
is_weather = is_citizen_service & url_type.eq('Weather')

category_rules = [
    (site_category.eq('Education'), 'University'),
    (site_category.eq('Entertainment'), 'Entertainment'),
    (site_category.eq('Government') | url_type.eq('CitizenSelfService'), 'Government'),
    (site_category.eq('LegalService'), 'LegalServices'),
    (is_news & is_public, 'NewsPublic'),
    (is_news & is_private, 'NewsPrivate'),
    (is_citizen_service & url_type.eq('PostalService'), 'PostalServices'),
    (is_citizen_service & url_type.eq('PublicTransport'), 'PublicTransport'),
    (is_weather & is_private, 'WeatherPrivate'),
    (is_weather & is_public, 'WeatherPublic'),
    (
        site_category.eq('Consumption')
        & url_type.isin(['Shopping', 'ShoppingUser', 'TravelUser', 'Pharmacy']),
        'ShoppingTravel',
    ),
]

for rule_mask, merged_label in category_rules:
    df_merged.loc[rule_mask, 'merged_cat'] = merged_label

In [12]:
# Remove the crawl_id column, which is not needed for the analysis
df_merged.drop(columns='crawl_id', inplace=True)
df_merged.head()

Unnamed: 0_level_0,2019-05-29,2019-04-05,2019-02-21,2020-05-12,2018-04-17,2018-06-12,2019-06-14,2019-04-12,2018-04-09,2018-11-07,...,2019-10-16,site_url,url_TLD,Country,Europe,PublicPrivate,SiteCategory,URLtype,TopLevelDomainLookUp,merged_cat
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,21.0,13.0,23.0,18.0,84.0,17.0,21.0,22.0,101.0,15.0,...,27.0,https://ekstrabladet.dk/nationen/,ekstrabladet.dk,Denmark,EU,Private,News,PrivateMedia EU,ekstrabladet.dk,NewsPrivate
3,18.0,23.0,23.0,16.0,34.0,31.0,18.0,23.0,34.0,26.0,...,18.0,https://www.her.ie/entertainment/dakota-johnso...,her.ie,Ireland,EU,Private,News,PrivateMedia,her.ie,NewsPrivate
4,14.0,14.0,14.0,18.0,11.0,11.0,14.0,14.0,11.0,13.0,...,15.0,https://www.government.is/ministries/locations...,government.is,Iceland,EEA,Public,Government,Ministry,government.is,Government
5,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,https://www.bme.hu/kt,bme.hu,Hungary,EU,Public,Education,University,bme.hu,University
9,46.0,28.0,53.0,71.0,23.0,25.0,36.0,27.0,29.0,28.0,...,34.0,https://www.idnes.cz/,idnes.cz,Czech Republic,EU,Private,News,PrivateMedia EU,idnes.cz,NewsPrivate


In [13]:
# Calculate the average of TPs per visited sites category
# numeric_only=True limits the mean to the numeric per-date TP counts; the
# remaining columns (URL, country, ...) hold text and cannot be averaged.
df_mean = df_merged.groupby('merged_cat').mean(numeric_only=True)
# Transpose so that every row is one harvest date and every column one category
df_mean = df_mean.T
df_mean.sort_index(inplace=True)
df_mean.reset_index(inplace=True)
# Running number (1, 2, 3, ...) of the date-sorted rows, used as the numeric
# x-value for the regression. It is derived from the row count so that it
# stays valid when the number of harvests changes.
df_mean['order'] = list(range(1, len(df_mean) + 1))
df_mean.head()

merged_cat,index,Entertainment,Government,LegalServices,NewsPrivate,NewsPublic,PostalServices,PublicTransport,ShoppingTravel,University,WeatherPrivate,WeatherPublic,order
0,2018-02-07,17.717241,3.893548,4.967944,31.568966,12.666967,7.46,8.405063,17.628639,7.573855,20.363636,10.709677,1
1,2018-02-09,17.982094,3.877551,4.940153,30.704271,12.187726,7.398671,8.151899,17.817467,7.558538,20.515152,9.290323,2
2,2018-02-14,17.788752,3.886022,4.938462,30.472364,12.41065,7.387417,8.151899,17.400315,7.488493,20.969697,9.935484,3
3,2018-03-21,18.527548,3.849785,5.068592,30.775093,12.261969,7.569536,9.24359,17.976544,7.702963,22.515152,11.4,4
4,2018-03-29,18.335188,3.860215,5.009259,30.954419,12.286745,7.44186,8.721519,18.153725,7.725055,21.666667,9.903226,5


In [14]:
# Process data for the format that Plotly express accept
categories = ['Entertainment', 'Government', 'LegalServices', 'NewsPrivate', 'NewsPublic', 'PostalServices', 'PublicTransport', 'ShoppingTravel', 'University', 'WeatherPrivate', 'WeatherPublic']

# One record per (harvest date, category) pair, ordered by date first and by
# category second. The means are looked up by column name rather than by
# position, and the records are collected in a list and converted once:
# DataFrame.append copied the whole frame on every call and was removed in
# pandas 2.0.
records = [
    {'date': date, 'category': category, 'mean': mean}
    for date, means in zip(df_mean['index'], df_mean[categories].to_numpy())
    for category, mean in zip(categories, means)
]
df_mean_new = pd.DataFrame(records, columns=['date', 'category', 'mean'])

# Keep the original date text in 'date2' and convert 'date' to timestamps.
# The unit is spelled out because pandas 2.0 rejects the unit-less 'datetime64'.
df_mean_new['date2'] = df_mean_new['date']
df_mean_new = df_mean_new.astype({'date': 'datetime64[ns]'})

df_mean_new.head()

Unnamed: 0,date,category,mean,date2
0,2018-02-07,Entertainment,17.717241,2018-02-07
1,2018-02-07,Government,3.893548,2018-02-07
2,2018-02-07,LegalServices,4.967944,2018-02-07
3,2018-02-07,NewsPrivate,31.568966,2018-02-07
4,2018-02-07,NewsPublic,12.666967,2018-02-07


In [15]:
# Make GDPR date bold
# NOTE(review): the row is addressed by position (9); presumably this is the
# 2018-05-25 harvest - verify whenever the set of harvest dates changes.
df_mean.at[9, 'index'] = '<b>2018-05-25</b>'

# Categories to draw, each paired with the marker symbol and colour at the same position
categories = [
    'NewsPrivate', 'Entertainment', 'ShoppingTravel', 'NewsPublic', 'WeatherPrivate', 'University',
    'PostalServices', 'WeatherPublic', 'PublicTransport', 'LegalServices', 'Government',
]
symbols = [
    'circle', 'x', 'star', 'cross', 'triangle-up', 'triangle-left', 'square',
    'pentagon', 'diamond', 'triangle-down', 'triangle-right',
]
colors = [
    '#FF6692', '#FFA15A', '#B68100', '#66AA00', '#2CA02C', '#00CC96', '#00A08B',
    '#17BECF', '#9467BD', '#AB63FA', '#E377C2',
]

# Build the scatter plot: per category one marker trace plus its regression line
fig = go.Figure()

for category, symbol, color in zip(categories, symbols, colors):
    # Plotly express is used only to compute the OLS trendline over the running
    # harvest number; its two traces are the raw means and the fitted values.
    px_fig = px.scatter(x=df_mean['order'], y=df_mean[category], trendline="ols")
    mean = px_fig.data[0]['y']  # mean
    trendline = px_fig.data[1]['y']  # ols

    fig.add_trace(go.Scatter(
        name=category,
        x=df_mean['index'],
        y=mean,
        mode='markers',
        marker_symbol=symbol,
        marker_color=color,
    ))
    fig.add_trace(go.Scatter(
        x=df_mean['index'],
        y=trendline,
        marker_color='orange',
        showlegend=False,
        line_color=color,
    ))

# Set y-axis title
fig.update_yaxes(title_text="Average number of <b>TPs</b>")

# Set x-axis title and make the x-axis categorical
fig.update_xaxes(title_text="Harvest date", type='category')

# Change legend layout: horizontal, centred, just above the plot area
fig.update_layout(legend={
    'orientation': "h",
    'yanchor': "bottom",
    'y': 1.01,
    'xanchor': "center",
    'x': 0.5,
})

# Add centered figure title, set the legend title and font
fig.update_layout(
    title={
        'text': "Average number of TPs per category with respect to the harvest date<br>",
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 20},
    },
    legend_title="Site category:",
    font={'family': "Courier New, monospace"},
)

# Set plot size - use when exporting
#fig.update_layout(
#    autosize=False,
#    width=1100,
#    height=600,
#    )

iplot(fig)

In [16]:
# Export the finished figure as a static PDF file
fig.write_image("/home/ubuntu/Plots/FIG_9.pdf")