In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.stats.proportion as smp
from ipywidgets import interact

In [19]:
# Dictionary to recode values (as on survey dataset)
  
# Create a dictionary of question names
questions_dict_path = 'https://raw.githubusercontent.com/lorena-gp/food-standards-agency/master/survey_guide_variables.csv'
questions_dict = pd.read_csv(questions_dict_path)
questions_dict = pd.Series(questions_dict.Label.values, index=questions_dict.Variable).to_dict()

# Create a nested dictionary of answer names
answers_dict_path = 'https://raw.githubusercontent.com/lorena-gp/food-standards-agency/master/survey_guide_values.csv'
answers_dict = pd.read_csv(answers_dict_path)
answers_dict['Label'] = (answers_dict['Label']
                         .replace({'Wave 1':2010, 'Wave 2':2012, 'Wave 3':2014, 'Wave 4':2016, 'Wave 5':2018})
                         .replace({'Married/Civil Partnership/Living with Partner':'Married/Partnership'})
                         .replace({'Single/Widowed/Divorced/Separated/Other':'Single/Other'}))
answers_dict = answers_dict.fillna(method='ffill')
answers_dict = answers_dict.groupby('Variable')[['Vlue', 'Label']].apply(lambda g: dict(g.values)).to_dict()
answers_dict['wimd_2014_quintile'] = {1: 1, 2: 2, # 1 is most deprived
                                      3: 3, 4: 4, 5: 5, # 5 is least deprived
                                      -8: "Don't know", -1:'Not applicable'}

In [20]:
# Load Food and You survey dataset
survey_path = 'https://raw.githubusercontent.com/lorena-gp/food-standards-agency/master/survey.csv'
survey_full_dataset = pd.read_csv(survey_path)
survey_full_dataset = pd.DataFrame(survey_full_dataset)

# Encode 'Not applicable', 'Refused' and 'Don't know' as NaN
survey_full_dataset = survey_full_dataset.replace([-9, -8, -1, 98], np.nan)
cols_5_NaN = ['q4_1_4', 'q4_1_5a', 'Q4_1_5_comb', 'q4_1_6', 'q4_1_7', 'q4_1_8a', 'q4_1_8b', 'sanspray', 'q4_1_11',
              'q4_1_12', 'q4_1_13', 'q4_1_14', 'q4_1_15', 'q4_1_16', 'q4_1_17', 'q4_1_18', 'q4_1_19']
survey_full_dataset[cols_5_NaN] = survey_full_dataset[cols_5_NaN].replace([5], np.nan)


In [21]:
# Define demographic variables of interest
demographic_variables = ['age_dv', 'marstat2', 'religion_dv', 'RespSex', 'wimd_2014_quintile', 'workstat2', 'Q6_1']

# To plot the survey answers (wave 4 and 5 only) for subpopulation groups for specified relevant questions
waves = [4,5]
survey_subpopulation = survey_full_dataset.loc[survey_full_dataset['surveyyear'].isin(waves)]
survey_subpopulation[demographic_variables] = survey_subpopulation[demographic_variables].replace(answers_dict)

# Combine demographics as desired, for example:
survey_subpopulation['combined_demographics'] = (survey_subpopulation['workstat2'] + '_' +
                                                 survey_subpopulation['RespSex'] + '_' +
                                                 survey_subpopulation['marstat2'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [22]:
# Define questions of interest
questions_of_interest = ['sanspray', 'q4_1_6', 'q4_1_7', 'q4_1_8a', 'q4_1_8b',   'Q4_37', 'Q4_38',
                         'bpoison', 'Q4_26b', 'Q4_19b', 'Q4_1_5_comb', 'eatoutev', 'fdsecst'] # etc

questions_of_interest_names = questions_of_interest.copy()
questions_of_interest_names[:] = [questions_dict.get(e,'') for e in questions_of_interest_names]

questions_dict_inv = {v: k for k, v in questions_dict.items()}

@interact
def plot(
    # provide an argument which has a default value
    # consisting of a list of values
    question=questions_of_interest_names
):
    temp = survey_subpopulation.copy()
    
    question = questions_dict_inv.get(question)
    
    temp = temp[temp['workstat2'] != 'Other'] 

    # Group according to the features of interest and calculate the % of answer types for the main question
    temp = temp.groupby(['combined_demographics', question])[question].agg(['count'])
    temp['total'] = temp.groupby('combined_demographics')['count'].transform('sum')
    lower, upper = smp.proportion_confint (temp['count'], temp['total'], alpha=0.05, method='normal')
    temp['CI_prop_upper'] = upper
    temp['CI_prop_lower'] = lower
    temp[question + ' (%)'] = temp['count'] / temp['total'] * 100
    temp['CI_perc_upper'] = temp['CI_prop_upper'] * 100
    temp['CI_perc_lower'] = temp['CI_prop_lower'] * 100
    temp['abs_err'] = temp[question + ' (%)'] - temp['CI_perc_lower']
    temp.reset_index(inplace=True) # To 'undo' the grouping
    temp = temp.round(1)

    # Recode the answers of the question of interest
    temp = temp.replace(answers_dict)

    abs_err = temp.pivot(index=question, columns='combined_demographics', values='abs_err')
        
    fig = temp.pivot(index=question, columns='combined_demographics', values=question +' (%)')\
          .plot(kind='barh', xerr=abs_err, width=0.75, figsize=(6,10))
    for item in ([fig.title, fig.xaxis.label, fig.yaxis.label] + fig.get_xticklabels() + fig.get_yticklabels()):
        item.set_fontsize(16)
    plt.title(questions_dict.get(question), x =0,  fontsize = 16)
    plt.xlabel(question + ' (%)')
    plt.ylabel('')
    fig.legend(loc='center left', bbox_to_anchor=(1.2, 0.8), fontsize = 14)
    plt.xlim(0, max(temp[question +' (%)'] + 10))
    fig.spines['top'].set_visible(False)
    fig.spines['right'].set_visible(False)
    fig.spines['left'].set_visible(False)

    # To plot the % number
    for p in fig.patches:
        width, height = p.get_width(), p.get_height()
        x, y = p.get_xy() 
        fig.annotate('{:}%'.format(width), (x + width + 8, y + 0.03), fontsize = 14)


# To show this as an independent (local) dashboard with Voila (https://github.com/voila-dashboards/voila):
# Install in terminal by: conda install -c conda-forge voila
# Start Voilà locally (cd in directory with this notebook) by running: voila Food-and-You-survey_risks.ipynb


# To give access to voila dashboard remotely using github and binder:
# Set up a public github repo with all data files needed (as csv),
# jupyter notebook with widgets(in which all csv files are read using the raw URLs from github)
# and a requirements.txt (listing  all the modules needed to run the notebook).
# Use binder (https://mybinder.org) and specify the path of the jupypter notebook (using voila/render),
# indicate it is a URL path.
# The new URL generated can be shared and is available any time.


# To give access to voila dashboard remotely using ngrok:
# (see https://voila.readthedocs.io/en/stable/deploy.html#sharing-voila-applications-with-ngrok)
# Install ngrok: https://ngrok.com/download, unzip file and, if using macOS, move executable file to /usr/local/bin
# Start Voilà locally (cd in directory with this notebook) by running: voila Food-and-You-survey_risks.ipynb
# In a new terminal window, start ngrok by running: ngrok http 8866 (check local host number actually used by
# the dashboard of interest, as, if running voila multiple times, this number will change)
# Copy the link from the ngrok terminal window (link looks like https://8bb6fded.ngrok.io) and use or send link.
# Website will take some time to load, as the jupyter noteook is running in the background.
# When using the ngrok link, the requests will be forwared to your local instance of Voilà.

# Census data is too heavy and cannot be ploted as a voula dashboard

interactive(children=(Dropdown(description='question', options=('(D) Use any antibacterial surface sanitising …