# Development of TPs over harvest
Status of TPs over harvesting period:

   - **Active** - TPs which are present in the current harvest and are neither inactive nor present in all previous harvests
   - **Same** - TPs which are present in current and all previous harvests
   - **Inactive** - TPs which were present in any of the previous harvests but are not present in the current one    

    – input data: all unique responses per harvest (.csv)
    – output plot: combined bar and line chart with number of unique TPs on y-axis and harvest date on x-axis
    – purpose: Visualise the development of the number of TPs over harvests, including the TPs active in all previous harvests and the TPs that are no longer active. This offers a view on how much the total number of TPs fluctuates and whether it is mostly the same TPs that are present across harvests or whether they die out and new ones appear.

In [1]:
# Import
import pandas as pd
import os
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from itertools import cycle
import plotly.express as px

from plotly.subplots import make_subplots

In [2]:
# Jupyter setup
# Initialise Plotly's offline notebook mode; the figure is later rendered with iplot().
# NOTE(review): with connected=True plotly.js is expected to be loaded from a CDN,
# so rendering presumably needs internet access — confirm against the Plotly docs.
init_notebook_mode(connected=True)

In [3]:
# Location of the CSV that lists the harvest dates
dates_path = '/home/ubuntu/data/processed/crawls/response_enriched/analysis_v.3/'
dates_name = 'harvest_DATES.csv'

# Read the file and drop the 'Unnamed: 0' column, leaving the remaining
# column(s) on pandas' default integer index (no in-place mutation needed).
df_dates = pd.read_csv(dates_path + dates_name).drop(columns=['Unnamed: 0'])

# Preview the first rows
df_dates.head()

Unnamed: 0,date
0,2018-02-07
1,2018-02-09
2,2018-02-14
3,2018-03-21
4,2018-03-29


In [4]:
# Define path for data
f_path = '/home/ubuntu/data/processed/TPs/responses_EU/'

# Upper bound on the number of harvest-to-harvest comparisons, i.e. at most the
# first 59 harvest dates are processed.
# NOTE(review): confirm whether this cap is still wanted or all dates should be used.
MAX_COMPARISONS = 58

# Column layout of the resulting status table
status_columns = ['active', 'sum_inactive', 'sum_same_active', 'current_TPs']


def _load_harvest_TPs(date):
    """Return the first column of the header-less file 'EU-RES-<date>.csv' as a Series."""
    return pd.read_csv(f_path + 'EU-RES-' + str(date) + '.csv', header=None)[0]


# Load and process the first harvest: it has no predecessor, so all of its TPs
# are counted as "present in current and all previous harvests".
s_first = _load_harvest_TPs(df_dates['date'][0])
first_and_second_TPs_combined = s_first
harvest_labels = [df_dates['date'][0]]
harvest_rows = [[0, 0, len(s_first), len(s_first)]]

# TPs that are inactive as of the previously processed harvest
first_inactive_list = pd.Series(dtype='str')

# Loop through the harvests pairwise (previous vs. current). Bounding the range by
# the number of available dates avoids looking up a date past the end of df_dates.
n_comparisons = min(MAX_COMPARISONS, len(df_dates) - 1)
for i in range(n_comparisons):
    # Only the current harvest is read; the previous one is carried over in s_first
    date_second = df_dates['date'][i + 1]
    s_second = _load_harvest_TPs(date_second)

    # TPs present in the current harvest and in every harvest before it
    first_and_second_TPs_combined = set(first_and_second_TPs_combined) & set(s_second)
    sum_same_active = len(first_and_second_TPs_combined)

    # Size of the current harvest, and the part of it that was not present in
    # all previous harvests
    current_TPs = len(s_second)
    active = current_TPs - sum_same_active

    # TPs from the previous harvest that are missing in the current one
    second_inactive_list = s_first[~s_first.isin(s_second)]

    # ... of which those not already recorded as inactive
    new_inactive = second_inactive_list[~second_inactive_list.isin(first_inactive_list)]

    # Previously inactive TPs that did not reappear in the current harvest
    still_inactive = first_inactive_list[~first_inactive_list.isin(s_second)]

    # Merge new and still inactive (pd.concat instead of Series.append, which
    # no longer exists in pandas 2.x); this is the inactive list for the next iteration
    first_inactive_list = pd.concat([new_inactive, still_inactive], ignore_index=True)
    sum_inactive = len(first_inactive_list)

    # Record the status counts for this harvest
    harvest_labels.append(date_second)
    harvest_rows.append([active, sum_inactive, sum_same_active, current_TPs])

    # The current harvest becomes the previous one
    s_first = s_second

print('DONE')

# One row per harvest date, one column per status count
df_TPs = pd.DataFrame(harvest_rows, index=harvest_labels, columns=status_columns)
df_TPs.head()

DONE


Unnamed: 0,active,sum_inactive,sum_same_active,current_TPs
2018-02-07,0,0,3047,3047
2018-02-09,143,124,2923,3066
2018-02-14,236,230,2840,3076
2018-03-21,395,457,2641,3036
2018-03-29,441,541,2586,3027


In [5]:
# Number of TPs appearing in every harvest:
# after the loop, first_and_second_TPs_combined holds the running intersection
# of the TPs of all processed harvests, so its size is the count of TPs seen in each of them.
TPs_everywhere = len(first_and_second_TPs_combined)
TPs_everywhere

1265

In [6]:
# Create new column with the number of TPs appearing in all harvests.
# The scalar is broadcast to every row, so it can be plotted as a constant line.
df_TPs['TPs_everywhere'] = TPs_everywhere

In [7]:
# Emphasise the GDPR harvest on the x-axis by wrapping its date label in <b> tags.
# Gotcha: this overwrites the plain date string inside df_dates, so that row no
# longer matches the 'EU-RES-<date>.csv' file naming; run it only after the
# harvest files have been loaded.
gdpr_row = 9
df_dates.at[gdpr_row, 'date'] = '<b>2018-05-25</b>'

In [9]:
# Columns of df_TPs drawn as stacked bars ('current_TPs' is drawn as a line below),
# together with their legend labels
TPs_cat_2 = ['sum_same_active', 'active', 'sum_inactive']
TPs_cat_names = ['TPs appearing in current and all previous harvests', 
                 'Varying active TPs', 'Overall inactive TPs']

# Colour palette for the bars; cycled so it still works if categories are added
colors = ['#FF6692', '#B6E880', '#7F7F7F']

# One bar trace per status category
data = [
    go.Bar(name=label, x=df_dates['date'], y=df_TPs[column], marker_color=color)
    for column, label, color in zip(TPs_cat_2, TPs_cat_names, cycle(colors))
]

# Constant line: TPs present in every harvest
data.append(go.Scatter(name='TPs appearing in all harvests', x=df_dates['date'], y=df_TPs['TPs_everywhere'],
                       marker_color='black'))

# Line: total number of TPs in each harvest
data.append(go.Scatter(name='Number of TPs in current harvest', x=df_dates['date'], y=df_TPs['current_TPs'],
                       marker_color='blue'))

# Initiate figure (a single y-axis is used, so no subplot grid is needed)
fig = go.Figure(data)

# Make x-axis categorical and use the date values from the DF as ticks
fig.update_xaxes(type='category', tickvals=df_dates['date'])

# Set y-axis title
fig.update_yaxes(title_text="Number of unique <b>TPs</b>")

# Stack the bars, add and center the figure title, set the font and
# place a horizontal legend above the plot
fig.update_layout(
    barmode='stack',
    title={
        'text': "Development of TPs over harvests<br>",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font':dict(size=20)},
    font=dict(
        family="Courier New, monospace",
    ),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5, 
        traceorder='normal'
    )
)

# Add the GDPR annotation.
# NOTE(review): x=9 is presumably the position of the GDPR harvest on the
# categorical axis (the same row index whose label is made bold) — confirm.
fig.add_annotation(
        x=9,
        y=4050,
        xref="x",
        yref="y",
        text="GDPR",
        showarrow=True,
        font=dict(
            family="Courier New, monospace",
            size=14,
            color="#ffffff"
            ),
        align="center",
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax=20,
        ay=-30,
        bordercolor="#c7c7c7",
        borderwidth=2,
        borderpad=4,
        bgcolor="#ff7f0e",
        opacity=0.8
        )

# Set plot size - use when exporting
#fig.update_layout(
#    autosize=False,
#    width=1100,
#    height=600,
#    )


iplot(fig)

In [14]:
# Export
# Write the figure built in the plotting cell to a PDF file.
# NOTE(review): static export presumably requires an image export engine
# (e.g. kaleido) to be installed and the target directory to exist — confirm.
fig.write_image("/home/ubuntu/Plots/FIG_3.pdf")