# Status codes of responses to the initial first-party request

    – input data: FP_status_codes_EU.json - HTTP response status codes of all first parties with EU/EEA origin
    – output plot:
        1. stacked bar chart with number of visited sites per grouped HTTP response status code on y-axis and harvest date on x-axis
        2. stacked bar chart with number of visited sites per specific HTTP response status code on y-axis and harvest date on x-axis
    – purpose: Visualise the ‘health’ of visited sites over the harvesting period. The harvesting period is almost 2.5 years and the same sites are being visited over and over, therefore, there is a high chance that some visited sites will become inactive or will redirect to another site.
        1. Investigate the grouped HTTP response codes.
        2. Investigate specific most populous HTTP response codes, besides 200 - OK.

In [1]:
# Import
import pandas as pd
import json
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from itertools import cycle
import plotly.express as px

In [2]:
# Jupyter setup
# Enable plotly's offline notebook rendering so that iplot() can display
# figures inline. connected=True asks plotly to load plotly.js from a CDN
# instead of embedding it in the notebook (see the plotly.offline docs).
init_notebook_mode(connected=True)

In [3]:
# Load data
# Location and file name of the pre-computed first-party status-code statistics
stat_path = '/home/ubuntu/data/processed/crawls/response_enriched/analysis_v.3/'
stat_name = 'FP_status_codes_EU.json'

# Parse the whole JSON document into stat_dict; the following cell treats its
# keys as harvest dates and its values as the per-date status-code data.
with open(stat_path + stat_name) as stat_file:
    stat_dict = json.loads(stat_file.read())

print('Complete')

Complete


In [4]:
# Prepare data for DF creation
# Build one record per harvest date: the date itself plus the nested
# status-code data stored under that date in stat_dict.
dict_data = [
    {'date': harvest_date, 'data': code_counts}
    for harvest_date, code_counts in stat_dict.items()
]

In [5]:
# Create a DF
# json_normalize flattens each record's nested 'data' dict into
# 'data.<status code>' columns, giving one row per harvest date.
df_errors = pd.json_normalize(dict_data)
df_errors.head()

Unnamed: 0,date,data.200,data.301,data.302,data.303,data.307,data.401,data.403,data.404,data.410,...,data.308,data.501,data.504,data.520,data.400,data.204,data.304,data.592,data.523,data.521
0,2019-05-29,5039,3628,894,39,43,10,11,361,9,...,,,,,,,,,,
1,2019-04-05,5311,3424,877,30,45,10,12,318,9,...,,,,,,,,,,
2,2019-02-21,5509,3253,890,32,35,10,11,281,11,...,,,,,,,,,,
3,2020-05-12,3967,4340,1080,42,33,12,29,443,11,...,12.0,1.0,2.0,3.0,,,,,,
4,2018-04-17,7708,1579,555,26,17,15,7,112,6,...,,,1.0,,1.0,,,,,


In [6]:
# Use the harvest date as the row index (the 'date' column is consumed)
df_errors = df_errors.set_index('date')
df_errors.head()

Unnamed: 0_level_0,data.200,data.301,data.302,data.303,data.307,data.401,data.403,data.404,data.410,data.500,...,data.308,data.501,data.504,data.520,data.400,data.204,data.304,data.592,data.523,data.521
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-05-29,5039,3628,894,39,43,10,11,361,9,9,...,,,,,,,,,,
2019-04-05,5311,3424,877,30,45,10,12,318,9,8,...,,,,,,,,,,
2019-02-21,5509,3253,890,32,35,10,11,281,11,10,...,,,,,,,,,,
2020-05-12,3967,4340,1080,42,33,12,29,443,11,8,...,12.0,1.0,2.0,3.0,,,,,,
2018-04-17,7708,1579,555,26,17,15,7,112,6,5,...,,,1.0,,1.0,,,,,


In [7]:
# Sort dataframe by index (harvest dates, ascending)
# At this point the index holds the harvest dates as YYYY-MM-DD strings, so
# the lexicographic sort is also chronological.
df_errors = df_errors.sort_index()
df_errors.head()

Unnamed: 0_level_0,data.200,data.301,data.302,data.303,data.307,data.401,data.403,data.404,data.410,data.500,...,data.308,data.501,data.504,data.520,data.400,data.204,data.304,data.592,data.523,data.521
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-02-07,8315,1122,512,27,17,5,3,35,7,4,...,,,2.0,,,,,,,
2018-02-09,8302,1128,525,28,16,5,3,38,7,4,...,,,,,,,,,,
2018-02-14,8276,1143,525,27,19,5,3,42,7,3,...,,,1.0,,,1.0,,,,
2018-03-21,7917,1408,543,27,17,15,2,94,5,6,...,,,3.0,,,,,,,
2018-03-29,7894,1457,528,24,16,15,8,91,5,3,...,,,8.0,,,,,2.0,,


In [8]:
# Transpose the DF
# Moves the 'data.<status code>' labels into the index so that they can be
# sorted with sort_index() in the next step.
df_errors = df_errors.T
df_errors.head()

date,2018-02-07,2018-02-09,2018-02-14,2018-03-21,2018-03-29,2018-04-09,2018-04-17,2018-05-05,2018-05-19,2018-05-25,...,2019-12-22,2020-01-13,2020-02-07,2020-02-25,2020-03-10,2020-03-24,2020-04-07,2020-05-12,2020-06-02,2020-06-19
data.200,8315.0,8302.0,8276.0,7917.0,7894.0,7819.0,7708.0,7542.0,7418.0,7301.0,...,4373.0,4357.0,4220.0,4184.0,4138.0,4102.0,4055.0,3967.0,3924.0,3891.0
data.301,1122.0,1128.0,1143.0,1408.0,1457.0,1502.0,1579.0,1724.0,1794.0,1847.0,...,4082.0,4099.0,4143.0,4178.0,4200.0,4271.0,4273.0,4340.0,4350.0,4407.0
data.302,512.0,525.0,525.0,543.0,528.0,555.0,555.0,575.0,588.0,626.0,...,979.0,984.0,1024.0,1046.0,1069.0,1052.0,1081.0,1080.0,1086.0,1093.0
data.303,27.0,28.0,27.0,27.0,24.0,26.0,26.0,28.0,27.0,28.0,...,39.0,39.0,39.0,39.0,35.0,35.0,34.0,42.0,42.0,31.0
data.307,17.0,16.0,19.0,17.0,16.0,16.0,17.0,21.0,21.0,28.0,...,24.0,24.0,24.0,34.0,31.0,33.0,32.0,33.0,34.0,31.0


In [9]:
# Sort dataframe by index (status codes)
# The index now holds the 'data.<status code>' labels. They are sorted as
# strings, which matches numeric order for the three-digit codes seen here.
df_errors = df_errors.sort_index()
df_errors.head()

date,2018-02-07,2018-02-09,2018-02-14,2018-03-21,2018-03-29,2018-04-09,2018-04-17,2018-05-05,2018-05-19,2018-05-25,...,2019-12-22,2020-01-13,2020-02-07,2020-02-25,2020-03-10,2020-03-24,2020-04-07,2020-05-12,2020-06-02,2020-06-19
data.200,8315.0,8302.0,8276.0,7917.0,7894.0,7819.0,7708.0,7542.0,7418.0,7301.0,...,4373.0,4357.0,4220.0,4184.0,4138.0,4102.0,4055.0,3967.0,3924.0,3891.0
data.204,,,1.0,,,,,,,,...,,1.0,,,1.0,,,,,
data.301,1122.0,1128.0,1143.0,1408.0,1457.0,1502.0,1579.0,1724.0,1794.0,1847.0,...,4082.0,4099.0,4143.0,4178.0,4200.0,4271.0,4273.0,4340.0,4350.0,4407.0
data.302,512.0,525.0,525.0,543.0,528.0,555.0,555.0,575.0,588.0,626.0,...,979.0,984.0,1024.0,1046.0,1069.0,1052.0,1081.0,1080.0,1086.0,1093.0
data.303,27.0,28.0,27.0,27.0,24.0,26.0,26.0,28.0,27.0,28.0,...,39.0,39.0,39.0,39.0,35.0,35.0,34.0,42.0,42.0,31.0


In [10]:
# Transpose the DF
# Transposes back: harvest dates are the rows again and the (now sorted)
# 'data.<status code>' labels are the columns.
df_errors = df_errors.T
df_errors.head()

Unnamed: 0_level_0,data.200,data.204,data.301,data.302,data.303,data.304,data.307,data.308,data.400,data.401,...,data.500,data.501,data.502,data.503,data.504,data.520,data.521,data.522,data.523,data.592
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-02-07,8315.0,,1122.0,512.0,27.0,,17.0,,,5.0,...,4.0,,,7.0,2.0,,,,,
2018-02-09,8302.0,,1128.0,525.0,28.0,,16.0,,,5.0,...,4.0,,1.0,1.0,,,,,,
2018-02-14,8276.0,1.0,1143.0,525.0,27.0,,19.0,,,5.0,...,3.0,,2.0,1.0,1.0,,,,,
2018-03-21,7917.0,,1408.0,543.0,27.0,,17.0,,,15.0,...,6.0,,,3.0,3.0,,,,,
2018-03-29,7894.0,,1457.0,528.0,24.0,,16.0,,,15.0,...,3.0,,,5.0,8.0,,,1.0,,2.0


In [11]:
# Define a list of status codes for the plot categories
# The two lists are parallel: status_codes_group holds the column keys used
# in df_error_cat, status_codes_name the matching legend labels for the plot.
status_codes_group = [200, 300, 400, 500]
status_codes_name = ['2xx', '3xx', '4xx', '5xx']

In [12]:
# Fill all NaN values with 0
# A NaN presumably means the status code was not observed on that harvest
# date; replacing it with 0 keeps the per-group sums below from becoming NaN.
df_errors = df_errors.fillna(0)
df_errors.head()

Unnamed: 0_level_0,data.200,data.204,data.301,data.302,data.303,data.304,data.307,data.308,data.400,data.401,...,data.500,data.501,data.502,data.503,data.504,data.520,data.521,data.522,data.523,data.592
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-02-07,8315.0,0.0,1122.0,512.0,27.0,0.0,17.0,0.0,0.0,5.0,...,4.0,0.0,0.0,7.0,2.0,0.0,0.0,0.0,0.0,0.0
2018-02-09,8302.0,0.0,1128.0,525.0,28.0,0.0,16.0,0.0,0.0,5.0,...,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-02-14,8276.0,1.0,1143.0,525.0,27.0,0.0,19.0,0.0,0.0,5.0,...,3.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2018-03-21,7917.0,0.0,1408.0,543.0,27.0,0.0,17.0,0.0,0.0,15.0,...,6.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0
2018-03-29,7894.0,0.0,1457.0,528.0,24.0,0.0,16.0,0.0,0.0,15.0,...,3.0,0.0,0.0,5.0,8.0,0.0,0.0,1.0,0.0,2.0


In [13]:
# Obtain a presence of status codes per status code group
# Every column of df_errors is labelled 'data.<status code>'. Instead of
# hard-coding which codes belong to which group, the numeric code is parsed
# from the label and assigned to its hundreds group (2xx, 3xx, 4xx, 5xx).
# This way a code that is absent from the data no longer raises a KeyError,
# and a code that is present but was never listed by hand is still counted
# in its group rather than being reported as a missing response later on.
column_codes = pd.to_numeric(
    [str(label).rsplit('.', 1)[-1] for label in df_errors.columns],
    errors='coerce',  # labels without a numeric code become NaN and match no group
)

df_error_cat = pd.DataFrame(index=df_errors.index)
for group_start in status_codes_group:
    # Boolean mask of the columns whose code lies in [group_start, group_start + 100)
    in_group = (column_codes >= group_start) & (column_codes < group_start + 100)
    df_error_cat[group_start] = df_errors.loc[:, in_group].sum(axis=1)
df_error_cat.head()

Unnamed: 0_level_0,200,300,400,500
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-02-07,8315.0,1678.0,50.0,13.0
2018-02-09,8302.0,1697.0,53.0,6.0
2018-02-14,8277.0,1714.0,57.0,7.0
2018-03-21,7917.0,1995.0,116.0,12.0
2018-03-29,7894.0,2025.0,119.0,19.0


In [14]:
# Define the number of all requested sites with EU/EEA origin
# The same constant is broadcast to every harvest date (row).
# NOTE(review): 10089 is hard-coded; confirm it still matches the number of
# EU/EEA sites in the crawl list whenever the input data changes.
df_error_cat['all_requested'] = 10089
df_error_cat.head()

Unnamed: 0_level_0,200,300,400,500,all_requested
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-02-07,8315.0,1678.0,50.0,13.0,10089
2018-02-09,8302.0,1697.0,53.0,6.0,10089
2018-02-14,8277.0,1714.0,57.0,7.0,10089
2018-03-21,7917.0,1995.0,116.0,12.0,10089
2018-03-29,7894.0,2025.0,119.0,19.0,10089


In [15]:
# Find the number of sites that have not returned a response:
# everything that was requested minus the sites counted in any of the four
# status-code groups. skipna=False keeps NaN propagation identical to a
# plain chain of subtractions.
df_error_cat['no_response'] = (
    df_error_cat['all_requested']
    - df_error_cat[[200, 300, 400, 500]].sum(axis=1, skipna=False)
)
df_error_cat.head()

Unnamed: 0_level_0,200,300,400,500,all_requested,no_response
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-02-07,8315.0,1678.0,50.0,13.0,10089,33.0
2018-02-09,8302.0,1697.0,53.0,6.0,10089,31.0
2018-02-14,8277.0,1714.0,57.0,7.0,10089,34.0
2018-03-21,7917.0,1995.0,116.0,12.0,10089,49.0
2018-03-29,7894.0,2025.0,119.0,19.0,10089,32.0


In [16]:
# Load harvest dates
dates_path = '/home/ubuntu/data/processed/crawls/response_enriched/analysis_v.3/'
dates_name = 'harvest_DATES.csv'
df_dates = pd.read_csv(dates_path + dates_name)

# Adjust and filter: rebuild a default integer index and discard both the
# helper 'index' column this creates and the 'Unnamed: 0' column from the CSV
df_dates = df_dates.reset_index().drop(columns=['Unnamed: 0', 'index'])

# Render the harvest date in row 9 (2018-05-25) in bold on the x-axis
df_dates.loc[9, 'date'] = '<b>2018-05-25</b>'
df_dates.head()

Unnamed: 0,date
0,2018-02-07
1,2018-02-09
2,2018-02-14
3,2018-03-21
4,2018-03-29


In [17]:
# Stacked bar chart: number of visited sites per status-code group and harvest date

# Colour palette, one colour per status-code group (cycled if there are more groups)
colors = ['#2CA02C', '#FF7F0E', '#D62728', '#7F7F7F']
palette = cycle(colors)

# One bar trace per status-code group ...
data = [
    go.Bar(
        name=group_name,
        x=df_dates['date'],
        y=df_error_cat[group_code],
        marker_color=bar_color,
    )
    for group_name, group_code, bar_color in zip(status_codes_name, status_codes_group, palette)
]

# ... plus one trace for the sites that returned no response at all
data.append(
    go.Bar(
        name='Missing responses',
        x=df_dates['date'],
        y=df_error_cat['no_response'],
        marker_color='black',
    )
)

# Initiate figure
fig = go.Figure(data)

# Categorical x-axis with one tick per harvest date taken from the DF
fig.update_xaxes(type='category', tickvals=df_dates['date'])

# Set y-axis title
fig.update_yaxes(title_text="Number of <b>visited sites</b> (first-parties)")

# Stack the bars, add a centered figure title, set the legend title and font
fig.update_layout(
    barmode='stack',
    title={
        'text': "Grouped HTTP response <i>status codes</i> for initializing first-party requests for<br>EU/EEA sites<br>",
        'y': 0.93,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 20},
    },
    legend_title="HTTP response status codes:",
    font={'family': "Courier New, monospace"},
)

# Add the GDPR annotation; x=9 presumably addresses the tenth category on the
# x-axis, i.e. the harvest date that is rendered in bold (2018-05-25)
fig.add_annotation(
    text="GDPR",
    x=9,
    y=10100,
    xref="x",
    yref="y",
    align="center",
    font={
        'family': "Courier New, monospace",
        'size': 14,
        'color': "#ffffff",
    },
    showarrow=True,
    arrowhead=2,
    arrowsize=1,
    arrowwidth=2,
    arrowcolor="#636363",
    ax=20,
    ay=-30,
    bgcolor="#ff7f0e",
    bordercolor="#c7c7c7",
    borderwidth=2,
    borderpad=4,
    opacity=0.8,
)

# Change legend location: horizontal, centered, just above the plot area
fig.update_layout(
    legend={
        'orientation': "h",
        'yanchor': "bottom",
        'y': 1.02,
        'xanchor': "center",
        'x': 0.5,
        'traceorder': 'normal',
    }
)

# Set plot size - use when exporting
#fig.update_layout(
#    autosize=False,
#    width=1100,
#    height=600,
#    )


iplot(fig)

In [19]:
# Export
# Writes the figure currently bound to `fig` to a PDF file.
# NOTE(review): `fig` is a notebook-global name that the second plot cell
# rebinds, so run this cell right after the grouped-status-code plot to make
# sure FIG_5.pdf contains that figure.
fig.write_image("/home/ubuntu/Plots/FIG_5.pdf")

In [18]:
# Stacked bar chart: number of visited sites for selected status codes per harvest date

# Define lists: the status codes to show and their legend labels (parallel lists)
status_codes_specific = [301, 302, 404]
status_codes_specific_legend = ['301 - Moved Permanently', '302 - Found', '404 - Not found']

# Colour palette, one colour per selected status code (cycled if there are more codes)
colors = ['#4C78A8', '#F58518', '#D62728']
palette = cycle(colors)

# One bar trace per selected status code, read from its 'data.<code>' column ...
data = [
    go.Bar(
        name=legend_label,
        x=df_dates['date'],
        y=df_errors[f'data.{status_code}'],
        marker_color=bar_color,
    )
    for legend_label, status_code, bar_color in zip(
        status_codes_specific_legend, status_codes_specific, palette
    )
]

# ... plus one trace for the sites that returned no response at all
data.append(
    go.Bar(
        name='Missing responses',
        x=df_dates['date'],
        y=df_error_cat['no_response'],
        marker_color='black',
    )
)

fig = go.Figure(data)

# Categorical x-axis with one tick per harvest date taken from the DF
fig.update_xaxes(type='category', tickvals=df_dates['date'])

# Set y-axis title
fig.update_yaxes(title_text="Number of <b>visited sites</b> (first-parties)")

# Stack the bars, add a centered figure title, set the legend title and font
fig.update_layout(
    barmode='stack',
    title={
        'text': "Specific HTTP response <i>status codes</i> for initializing first-party requests<br>for EU/EEA sites",
        'y': 0.93,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 20},
    },
    legend_title="HTTP response status codes:",
    font={'family': "Courier New, monospace"},
)

# Add the GDPR annotation; x=9 presumably addresses the tenth category on the
# x-axis, i.e. the harvest date that is rendered in bold (2018-05-25)
fig.add_annotation(
    text="GDPR",
    x=9,
    y=2700,
    xref="x",
    yref="y",
    align="center",
    font={
        'family': "Courier New, monospace",
        'size': 14,
        'color': "#ffffff",
    },
    showarrow=True,
    arrowhead=2,
    arrowsize=1,
    arrowwidth=2,
    arrowcolor="#636363",
    ax=20,
    ay=-30,
    bgcolor="#ff7f0e",
    bordercolor="#c7c7c7",
    borderwidth=2,
    borderpad=4,
    opacity=0.8,
)

# Change legend location: horizontal, centered, just above the plot area
fig.update_layout(
    legend={
        'orientation': "h",
        'yanchor': "bottom",
        'y': 1.02,
        'xanchor': "center",
        'x': 0.5,
        'traceorder': 'normal',
    }
)

# Set plot size - use when exporting
#fig.update_layout(
#    autosize=False,
#    width=1100,
#    height=600,
#    )


iplot(fig)

In [21]:
# Export
# Writes the figure currently bound to `fig` to a PDF file.
# NOTE(review): `fig` is a notebook-global name shared with the grouped plot
# cell, so run this cell right after the specific-status-code plot to make
# sure FIG_6.pdf contains that figure.
fig.write_image("/home/ubuntu/Plots/FIG_6.pdf")