# Visited sites: total, EU/EEA, and EU/EEA with TP communication

    – input data: visited_sites_total_combined.json - contains the number of total, EU/EEA, and EU/EEA with TP communication visited sites per harvest
    – output plot: bar chart with harvest date on x-axis and number of visited sites on y-axis
    – purpose: Visualise the number of visited sites per harvest based on three criteria: (i) the total number of visited sites, (ii) all visited sites with EU/EEA origin, and (iii) only visited sites with EU/EEA origin and a TP response. The number of requested sites is the same for every harvest: 12 778. However, the number of visited sites with at least one response fluctuates slightly over time. This plot does not directly explain why that happens, but it depicts the trend and opens the door to further analysis.

In [1]:
# Import
import pandas as pd
import json
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from itertools import cycle
import plotly.express as px

In [2]:
# Jupyter setup: initialise plotly's offline notebook mode before any figure
# is shown with iplot() further down in this notebook.
# NOTE(review): connected=True presumably makes the notebook load plotly.js
# from a CDN instead of embedding it - confirm against the plotly.offline docs.
init_notebook_mode(connected=True)

In [3]:
# Input location: the JSON file with the per-harvest visited-site counts and
# the directory it lives in (the directory string keeps its trailing slash so
# that plain concatenation yields a valid path)
TPs_name = 'visited_sites_total_combined.json'
f_path = '/home/ubuntu/data/processed/crawls/response_enriched/analysis_v.3/'

In [4]:
# Load TPs data
# The encoding is passed explicitly so that reading the JSON file does not
# depend on the locale default of the machine running the notebook.
with open(f_path + TPs_name, encoding='utf-8') as f:
    json_TPs = json.load(f)

# Flatten the loaded JSON into a DataFrame and order the rows by the 'date'
# column. The row labels are not reset, so they keep their pre-sort values.
# NOTE(review): the plotting cell takes x from df_dates and y from df_TPs, so
# both frames are presumably expected to be in the same date order - confirm.
df_TPs = pd.json_normalize(json_TPs)
df_TPs.sort_values('date', inplace=True)
df_TPs.head()

Unnamed: 0,date,visited_sites_EU_FP_TP,visited_sites_EU,visited_sites
34,2018-02-07,9547,10056,12719
55,2018-02-09,9544,10058,12722
15,2018-02-14,9543,10055,12720
20,2018-03-21,9506,10040,12707
30,2018-03-29,9516,10057,12731


In [5]:
# Add a column holding the number of sites requested per harvest; the scalar
# is broadcast, so every row gets the same value
TOTAL_REQUESTED_SITES = 12778
df_TPs['total_requested'] = TOTAL_REQUESTED_SITES

In [6]:
# Load harvest dates
dates_name = 'harvest_DATES.csv'
dates_path = '/home/ubuntu/data/processed/crawls/response_enriched/analysis_v.3/'
df_dates = pd.read_csv(f'{dates_path}{dates_name}')

# Drop the 'Unnamed: 0' column read from the CSV and renumber the rows from 0
df_dates.drop(columns=['Unnamed: 0'], inplace=True)
df_dates.reset_index(drop=True, inplace=True)

# Replace the entry at row label 9 with a <b>-wrapped date string; the
# plotting cell uses this column for the x-axis values and tick labels.
# NOTE(review): the row is addressed by a hard-coded label (9) - confirm it
# still points at the 2018-05-25 harvest if the CSV changes.
df_dates.at[9, 'date'] = '<b>2018-05-25</b>'
df_dates.head()

Unnamed: 0,date
0,2018-02-07
1,2018-02-09
2,2018-02-14
3,2018-03-21
4,2018-03-29


In [8]:
# Choose a color palette (one color is taken per bar trace, in order)
colors = ['#2CA02C', '#D62728', '#EECA3B']
palette = cycle(colors)

# Initialise figure
fig = go.Figure()

# Bar traces, described as (legend name, df_TPs column, extra Bar options).
# They are added in this order; the first one keeps the default bar width,
# the following ones are given explicit, narrower widths.
bar_specs = [
    ('All visited sites', 'visited_sites', {}),
    ('All EU/EEA visited sites', 'visited_sites_EU', {'width': 0.5}),
    ('EU/EEA visited sites with TP communication', 'visited_sites_EU_FP_TP', {'width': 0.35}),
]

for trace_name, column, extra in bar_specs:
    fig.add_trace(
        go.Bar(
            name=trace_name,
            x=df_dates['date'],
            y=df_TPs[column],
            marker_color=next(palette),
            **extra
        )
    )

# Add the constant number of requested sites as a black scatter trace
requested_trace = go.Scatter(
    name='All requested sites',
    x=df_dates['date'],
    y=df_TPs['total_requested'],
    marker_color='black',
)
fig.add_trace(requested_trace)

# Make the x-axis categorical and use the date values from the DF as ticks
fig.update_xaxes(type='category', tickvals=df_dates['date'])

# Set y-axis title
fig.update_yaxes(title_text="Number of <b>visited sites</b>")

# Centered figure title
plot_title = dict(
    text="Number of <i>total</i>, <i>EU/EEA</i>, and <i>EU/EEA TP visited sites</i> per harvest<br>",
    y=0.9,
    x=0.5,
    xanchor='center',
    yanchor='top',
    font=dict(size=20),
)

# Set the bar mode, the figure title and the font in a single layout update
fig.update_layout(
    barmode='overlay',
    title=plot_title,
    font=dict(family="Courier New, monospace"),
)

# Add the "GDPR" annotation
# NOTE(review): x=9 is given in x-axis coordinates; it presumably targets the
# same harvest that is bolded at row 9 of df_dates - confirm if the data changes.
gdpr_label_font = dict(
    family="Courier New, monospace",
    size=14,
    color="#ffffff",
)

fig.add_annotation(
    # Anchor point, in data coordinates of both axes
    x=9,
    y=12710,
    xref="x",
    yref="y",
    # Label text and its appearance
    text="GDPR",
    font=gdpr_label_font,
    align="center",
    bordercolor="#c7c7c7",
    borderwidth=2,
    borderpad=4,
    bgcolor="#ff7f0e",
    opacity=0.8,
    # Arrow from the label to the anchor point
    showarrow=True,
    arrowhead=2,
    arrowsize=1,
    arrowwidth=2,
    arrowcolor="#636363",
    ax=20,
    ay=-30,
)

# Change legend location: horizontal, anchored by its bottom-centre point
legend_layout = {
    'orientation': "h",
    'x': 0.5,
    'xanchor': "center",
    'y': 1.01,
    'yanchor': "bottom",
}
fig.update_layout(legend=legend_layout)

# Set y-axis range
fig.update_yaxes(range=[9000, 13170])


# Set plot size - use when exporting
#fig.update_layout(
#    autosize=False,
#    width=1100,
#    height=600,
#    )

# Plot
iplot(fig)

In [10]:
# Export the figure to a PDF file
fig_out_path = "/home/ubuntu/Plots/FIG_2.pdf"
fig.write_image(fig_out_path)