In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.stats.proportion as smp
from matplotlib.backends.backend_pdf import PdfPages


from ipywidgets import interact


In [5]:
# Dictionary to recode values (as on survey dataset)

survey_guide_path = 'https://github.com/lorena-gp/food-standards-agency/blob/master/survey_guide.xlsx'
#'/Users/garcial/My_documents/Personal documents/S2DS_course/FSA_project/Data/food survey/withGuide_Food+and+You+Waves+1-5+Data+User+Guide.xlsx'

# Create a dictionary of question names
questions_dict = pd.read_excel(survey_guide_path, 'Variables')
questions_dict = pd.Series(questions_dict.Label.values, index=questions_dict.Variable).to_dict() #OR: dict(zip(questions_dict.Variable, questions_dict.Label))
#print(questions_dict)

# Create a nested dictionary of answer names
answers_dict = pd.read_excel(survey_guide_path, 'Values')
answers_dict['Label'] = (answers_dict['Label']
                         .replace({'Wave 1':2010, 'Wave 2':2012, 'Wave 3':2014, 'Wave 4':2016, 'Wave 5':2018})
                         .replace({'Married/Civil Partnership/Living with Partner':'Married/Partnership'})
                         .replace({'Single/Widowed/Divorced/Separated/Other':'Single/Other'}))
answers_dict = answers_dict.fillna(method='ffill')
answers_dict = answers_dict.groupby('Variable')[['Vlue', 'Label']].apply(lambda g: dict(g.values)).to_dict()
answers_dict['wimd_2014_quintile'] = {1: 1, 2: 2, # 1 is most deprived
                                      3: 3, 4: 4, 5: 5, # 5 is least deprived
                                      -8: "Don't know", -1:'Not applicable'}
#print(answers_dict)

In [9]:
# Load Food and You survey dataset

survey_path = 'https://github.com/lorena-gp/food-standards-agency/blob/master/survey.csv'
#'/Users/garcial/My_documents/Personal documents/S2DS_course/FSA_project/Data/food survey/Food+and+You+Waves+1-5+Data (3).csv'
survey_full_dataset = pd.read_csv(survey_path)
survey_full_dataset = pd.DataFrame(survey_full_dataset)

# Encode 'Not applicable', 'Refused' and 'Don't know' as NaN
survey_full_dataset = survey_full_dataset.replace([-9, -8, -1, 98], np.nan)
cols_5_NaN = ['q4_1_4', 'q4_1_5a', 'Q4_1_5_comb', 'q4_1_6', 'q4_1_7', 'q4_1_8a', 'q4_1_8b', 'sanspray', 'q4_1_11',
              'q4_1_12', 'q4_1_13', 'q4_1_14', 'q4_1_15', 'q4_1_16', 'q4_1_17', 'q4_1_18', 'q4_1_19']
survey_full_dataset[cols_5_NaN] = survey_full_dataset[cols_5_NaN].replace([5], np.nan)

#survey_full_dataset.shape
survey_full_dataset.head()

Unnamed: 0,SerialNo,RespSex,age_dv,bhhsize2,below6,below16,marstat2,hhdinc,workstat2,region_dv,...,Label,FdAuthAct_MC1,FdAuthAct_MC2,FdAuthAct_MC3,FdAuthAct_MC4,FdAuthAct_MC5,FdAuthAct_MC6,FdAuthAct_MC7,FdAuthAct_MC8,FdAuthAct_MC9
0,100002,1,,,,2.0,2.0,4.0,,12,...,,,,,,,,,,
1,100006,1,,,,1.0,1.0,2.0,,7,...,,,,,,,,,,
2,100018,2,,,,2.0,1.0,4.0,,12,...,,,,,,,,,,
3,100019,1,,,,2.0,2.0,,,2,...,,,,,,,,,,
4,100027,2,,,,1.0,1.0,3.0,,10,...,,,,,,,,,,


In [5]:

demographic_variables = ['age_dv', 'marstat2', 'religion_dv', 'RespSex', 'wimd_2014_quintile', 'workstat2', 'Q6_1']

# To plot the survey answers (wave 4 and 5 only) for subpopulation groups for specified relevant questions
waves = [4,5]
survey_subpopulation = survey_full_dataset.loc[survey_full_dataset['surveyyear'].isin(waves)]
survey_subpopulation[demographic_variables] = survey_subpopulation[demographic_variables].replace(answers_dict)

# Combine demographics as desired, for example:
survey_subpopulation['combined_demographics'] = (survey_subpopulation['workstat2'] + '_' +
                                                 survey_subpopulation['RespSex'] + '_' +
                                                 survey_subpopulation['marstat2'])

# Define questions of interest
questions_of_interest = ['sanspray', 'q4_1_6', 'q4_1_7', 'q4_1_8a', 'q4_1_8b',   'Q4_37', 'Q4_38',
                         'bpoison', 'Q4_26b', 'Q4_19b', 'Q4_1_5_comb', 'eatoutev', 'fdsecst'] # etc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [10]:

questions_of_interest_names = questions_of_interest.copy()
questions_of_interest_names[:] = [questions_dict.get(e,'') for e in questions_of_interest_names]

questions_dict_inv = {v: k for k, v in questions_dict.items()}

@interact
def plot(
    # provide an argument which has a default value
    # consisting of a list of values
    question=questions_of_interest_names
):
    temp = survey_subpopulation.copy()
    
    question = questions_dict_inv.get(question)

    indexNames = temp[temp['workstat2'] == 'Other'].index
    temp.drop(indexNames , inplace=True)

    # Group according to the features of interest and calculate the % of answer types for the main question
    temp = temp.groupby(['combined_demographics', question])[question].agg(['count'])
    temp['total'] = temp.groupby('combined_demographics')['count'].transform('sum')
    lower, upper = smp.proportion_confint (temp['count'], temp['total'], alpha=0.05, method='normal')
    temp['CI_prop_upper'] = upper
    temp['CI_prop_lower'] = lower
    temp[question + ' (%)'] = temp['count'] / temp['total'] * 100
    temp['CI_perc_upper'] = temp['CI_prop_upper'] * 100
    temp['CI_perc_lower'] = temp['CI_prop_lower'] * 100
    temp['abs_err'] = temp[question + ' (%)'] - temp['CI_perc_lower']
    temp.reset_index(inplace=True) # To 'undo' the grouping
    temp = temp.round(1)
    temp

    # Recode the answers of the question of interest
    temp = temp.replace(answers_dict)

    abs_err = temp.pivot(index=question, columns='combined_demographics', values='abs_err')
        
    fig = temp.pivot(index=question, columns='combined_demographics', values=question +' (%)')\
          .plot(kind='barh', xerr=abs_err, width=0.75, figsize=(6,10))
    for item in ([fig.title, fig.xaxis.label, fig.yaxis.label] + fig.get_xticklabels() + fig.get_yticklabels()):
        item.set_fontsize(16)
    plt.xlabel(question + ' (%)')
    plt.ylabel('')
    fig.legend(loc='center left', bbox_to_anchor=(1.2, 0.8))
    plt.xlim(0, max(temp[question +' (%)'] + 10))
    fig.spines['top'].set_visible(False)
    fig.spines['right'].set_visible(False)
    fig.spines['left'].set_visible(False)


    # To plot the % number
    for p in fig.patches:
        width, height = p.get_width(), p.get_height()
        x, y = p.get_xy() 
        fig.annotate('{:}%'.format(width), (x + width + 8, y + 0.03), fontsize = 14)

    plt.figtext(.1,.9, questions_dict.get(question), va='center')


# To show this as an independent dashboard with Voila:

# Install in terminal by: conda install -c conda-forge voila
# Then cd to directory where this file is store and run in terminal:
# voila Food-and-You-survey_risks.ipynb

# Census data is too heavy and cannot be ploted as a voula dashboard

interactive(children=(Dropdown(description='question', options=('(D) Use any antibacterial surface sanitising …

In [None]:
#

for question in questions_of_interest:
    
    temp = survey_subpopulation.copy()
    
    indexNames = temp[temp['workstat2'] == 'Other'].index
    temp.drop(indexNames , inplace=True)
    
    # Group according to the features of interest and calculate the % of answer types for the main question
    temp = temp.groupby(['combined_demographics', question])[question].agg(['count'])
    temp['total'] = temp.groupby('combined_demographics')['count'].transform('sum')
    lower, upper = smp.proportion_confint (temp['count'], temp['total'], alpha=0.05, method='normal')
    temp['CI_prop_upper'] = upper
    temp['CI_prop_lower'] = lower
    temp[question + ' (%)'] = temp['count'] / temp['total'] * 100
    temp['CI_perc_upper'] = temp['CI_prop_upper'] * 100
    temp['CI_perc_lower'] = temp['CI_prop_lower'] * 100
    temp['abs_err'] = temp[question + ' (%)'] - temp['CI_perc_lower']
    temp.reset_index(inplace=True) # To 'undo' the grouping
    temp = temp.round(1)
    temp

    # Recode the answers of the question of interest
    temp = temp.replace(answers_dict)
    
    abs_err = temp.pivot(index=question, columns='combined_demographics', values='abs_err')
   
    fig = temp.pivot(index=question, columns='combined_demographics', values=question +' (%)')\
    .plot(kind='barh', xerr=abs_err, width=0.75, figsize=(6,10))
    for item in ([fig.title, fig.xaxis.label, fig.yaxis.label] + fig.get_xticklabels() + fig.get_yticklabels()):
        item.set_fontsize(16)
    plt.xlabel(question + ' (%)')
    plt.ylabel('')
    fig.legend(loc='center left', bbox_to_anchor=(1.2, 0.8))
    plt.xlim(0, max(temp[question +' (%)'] + 10))
    fig.spines['top'].set_visible(False)
    fig.spines['right'].set_visible(False)
    fig.spines['left'].set_visible(False)
    
    # To plot the % number
    for p in fig.patches:
        width, height = p.get_width(), p.get_height()
        x, y = p.get_xy() 
        fig.annotate('{:}%'.format(width), (x + width + 8, y + 0.03), fontsize = 14)
    
    plt.figtext(.5,.9, questions_dict.get(question), ha='center')
    plt.show()
    
    fig = fig.get_figure()
    fig.savefig('subpopulation_profiles_' + question + '.png', dpi=200, bbox_inches='tight')
    plt.close()