### USEFUL FUNCTIONS

Notebook with the helper functions, shared data frames and colour palettes used throughout `main_explainer.ipynb` and the other notebooks.

In [None]:
# Imports, plotting style and loading of the raw survey data frame:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import googletrans
from googletrans import Translator
# Plot style: keep the plain 'white' theme.
# sns.set_style('whitegrid')  # Please do not enable this, it messes with all the existing plots.
sns.set(style='white')


# Raw survey data; the file is semicolon-separated.
file_path = '../data/2017_ebsib_bcn_enquesta_benestar_subjectiu_infancia_barcelona.csv'
df = pd.read_csv(file_path, sep=';')

In [None]:
# Translated data frame (comma-separated).
# NOTE(review): this is an absolute path, unlike the relative '../data/...' paths
# used by the other loading cells — it only resolves where '/work' exists.
file_path_t = '/work/data/translated.csv'
df_t = pd.read_csv(file_path_t, sep=',')
# Drop the first column (presumably the index written out when the CSV was saved — TODO confirm).
df_t = df_t.drop(df_t.columns[0], axis=1)

In [None]:
# Add population and income (RFD index) statistics per district to the survey data.
# Import territorial demographics:
file_path = '../data/2017_distribucio_territorial_renda_familiar.csv'
df_inc = pd.read_csv(file_path, sep=',')

RFD_COL = 'Índex RFD Barcelona = 100'

# Population-weighted contribution of every row to the RFD index of its district.
df_inc['SumW'] = df_inc['Població'] * df_inc[RFD_COL]

# Per district: total population, plain mean of the RFD index and its
# population-weighted mean ('RFD-sumw' = sum(population * index) / sum(population)).
df_dist = df_inc.groupby(['Codi_Districte']).agg({'Població': 'sum', 'SumW': 'sum', RFD_COL: 'mean'})
df_dist['RFD-sumw'] = df_dist['SumW'] / df_dist['Població']
df_dist.pop('SumW')  # only needed as an intermediate for the weighted mean

# Per district: standard deviation and standard error of the mean of the RFD index.
df_dist2 = df_inc.groupby(['Codi_Districte']).agg({RFD_COL: 'std'})
df_dist3 = df_inc.groupby(['Codi_Districte']).agg({RFD_COL: 'sem'})

# Make the cell re-runnable: DataFrame.join raises on overlapping column names, so
# remove the columns a previous execution of this cell may already have added.
df_t = df_t.drop(columns=['Població', 'RFD-mean', 'RFD-sumw', 'RFD-std', 'RFD-sem'], errors='ignore')

# Attach the district-level measures to every survey row via its district code.
df_t = df_t.join(df_dist, on='Codi_Districte_Educatiu').rename(columns={RFD_COL: 'RFD-mean'})
df_t = df_t.join(df_dist2, on='Codi_Districte_Educatiu').rename(columns={RFD_COL: 'RFD-std'})
df_t = df_t.join(df_dist3, on='Codi_Districte_Educatiu').rename(columns={RFD_COL: 'RFD-sem'})

In [None]:
def dic_codi_dist(df_):
    """Build a ``{district code: district name}`` dictionary from ``df_``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame containing the columns 'Codi_Districte_Educatiu' and
        'Nom_Districte_Educatiu'.

    Returns
    -------
    dict
        District names keyed by district code, inserted in ascending code order.
    """
    code_col = 'Codi_Districte_Educatiu'
    name_col = 'Nom_Districte_Educatiu'

    # One row per distinct (code, name) pair, ordered by code.
    pairs = df_.loc[:, [code_col, name_col]].drop_duplicates(keep='first')
    pairs = pairs.sort_values(code_col)

    return pairs.set_index(code_col)[name_col].to_dict()

In [None]:
# Colour palettes shared by the plotting helpers.
import matplotlib.colors

# Incremental (sequential) colour scale; the RGBA list is stored in the reverse
# order of the hex list.
colors_rgba_hex = [
    '#EDF6CA', "#d9ed92", "#b5e48c", "#99d98c", "#76c893", "#52b69a",
    "#34a0a4", "#168aad", "#1a759f", "#1e6091", "#184e77", "#0E2C44",
]
colors_rgba = list(map(matplotlib.colors.to_rgba, reversed(colors_rgba_hex)))

# Categorical palette; the RGBA list is also stored in reverse order.
colors_hex = ['#00378F', '#FF595E', '#FFCA3A', '#8AC926', '#1982C4', '#6A4C93']
colors = list(map(matplotlib.colors.to_rgba, reversed(colors_hex)))


# Five-step palette, kept in the listed order.
colors2_hex = ["#FFCA3A", "#DAAB50", "#A3798B", "#7D5C9B", "#4D376C"]  # YELLOW - PURPLES
# colors2_hex= ["#0B59AD","#66ABF4","#EBEBD3","#F4D35E","#EE964B"]  # BLUE-ORANGE
colors2 = list(map(matplotlib.colors.to_rgba, colors2_hex))

In [None]:
# WIP: generate dict of districts and code
# df.groupby(['Codi_Districte_Educatiu','Nom_Districte_Educatiu']).toList()

In [None]:
# # Existing Columns

# {
#  'ID',
#  'Codi_Districte_Educatiu',
#  'Nom_Districte_Educatiu',
#  'Curs',
#  # Part 1?
#  'Sexe',
#  'Llar_BCN',
#  'ERFDbllp',
#  'Edat',
#  '4.1': 'amb_qui_vius',
#  '4.2': 'familia_altra_casa',
#  '4.3': 'frequencia_altra_familia',
#  '5.1.1': 'mare',
#  '5.1.2': 'pare',
#  '5.1.3': 'parella_mare',
#  '5.1.4': 'segona_mare',
#  '5.1.5': 'parella_pare',
#  '5.1.6': 'segon_pare',
#  '5.1.7': 'germans',
#  '5.1.8': 'avis',
#  '5.1.9': 'altres_infants',
#  '5.1.10': 'altres_adults',
#  '5.1.11': 'animals',
#  '5.2': 'germans_mateixa_llar',
#  '5.3': 'avis_mateixa_llar', # not sure
#  '6': 's_familia', # not sure
#  '7.1': 'f_preocupacio',
#  '7.2': 'f_suport',
#  '7.3': 'f_bejunts',
#  '7.4': 'f_seguretat',
#  '7.5': 'f_escolta',
#  '7.6': 'f_llibertat',
#  'on_vas_niexer', # not sure
#  'on_va_neixer_mare_progenitor1',# not sure
#  'on_va_neixer_pare_progenitor2',# not sure
#  'dificultat_fisica',# not sure
#  'dificultat_aprenentatge',# not sure
#  'dificultat_per_malaltia',# not sure
#  'ps_malcap',
#  'ps_panxa',
#  'ps_esquena',
#  'ps_insomni',
#  's_llar',
#  'll_estudi',
#  'll_joc',
#  'cotxe',
#  'habitacio',
#  'ordinadors',
#  'banys',
#  'rentaplats',
#  'vacances',
#  'preocupacio_diners',
#  'amb_qui_vius_feina',
#  's_bmaterials',
#  'bm_internet',
#  'bm_mobil',
#  'bm_sabates',
#  'bm_mescola',
#  'bm_maficions',
#  's_amics',
#  'a_suficients',
#  'a_tractebe',
#  'a_bejunts',
#  'a_suport',
#  'a_freq_foraescola',
#  's_vidaestudiant',
#  's_aprenentatges',
#  's_igualsclasse',
#  'temps_trajecte_escolar',
#  'seguretat_trajecte_escolar',
#  'e_preocupacio',
#  'e_suportmestres',
#  'e_suportiguals',
#  'e_discussions_inv',
#  'e_escolta',
#  'e_eleccio',
#  'e_seguretat',
#  'e_freq_baralles',
#  'e_vfisica_inv',
#  'e_vverbal_inv',
#  'e_vbuit_inv',
#  'e_afisic_inv',
#  'e_averbal_inv',
#  'e_abuit_inv',
#  's_barri',
#  'b_seguretat',
#  'b_joc',
#  'b_suport',
#  'b_amabilitat',
#  'b_llibertat',
#  'b_escolta',
#  'b_freq_baralles',
#  's_seguretat',
#  's_llibertat',
#  's_cos',
#  'contrastslssm_exps',
#  's_escoltaadulta',
#  's_salut',
#  'contrastslssm_olsm',
#  'SLSSm',
#  'SLSSmc',
#  'slssm_vabe',
#  'slssm_hauria',
#  'slssm_bona',
#  'slssm_excel',
#  'slssm_agrada',
#  'slssm_gaudi',
#  'slssm_feliç',
#  'f_felicitat',
#  'f_tristesa',
#  'f_calma',
#  'f_estres',
#  'f_energia',
#  'f_avorriment',
#  'contrastslssm_exp2',
#  's_ustemps',
#  's_tempslliure',
#  't_ajuda',
#  't_cuida',
#  't_treball',
#  't_classes',
#  't_deures',
#  't_tv',
#  't_esport',
#  't_familia',
#  't_airelliure',
#  't_xarxes',
#  't_videojocs',
#  't_resrepos',
#  'd_drets',
#  'd_convencio'
# }

In [None]:
def translate_svy(df, translator=None, src='ca'):
    """Return a copy of ``df`` whose column names have been machine-translated.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey frame whose column names should be translated. It is not modified.
    translator : object, optional
        Object exposing ``translate(text, src=...)`` that returns something with
        a ``.text`` attribute. Defaults to a fresh ``googletrans.Translator()``.
    src : str, optional
        Source language code passed to the translator (default ``'ca'``).

    Returns
    -------
    pandas.DataFrame
        New frame with the same data and translated column names. The target
        language is whatever the translator uses by default (English for
        googletrans, per the original comment in this notebook).
    """
    # NOTE(review): one translate() call is made per column, i.e. one web request
    # each with googletrans. Some googletrans releases expose translate() as a
    # coroutine — verify against the installed version.
    if translator is None:
        translator = Translator()

    # rename() without inplace returns a new frame, leaving the caller's df untouched.
    return df.rename(columns=lambda name: translator.translate(name, src=src).text)

In [None]:
# Basic statistics for a numerical LIST
def basic_stats(LIST, ratio=False, r=0):
    """Print size, variance, mean, median, maximum and minimum of ``LIST``.

    Parameters
    ----------
    LIST : sequence of numbers
        Values to summarise (anything supporting ``len``, ``max``, ``min`` and
        the NumPy reductions).
    ratio : bool, optional
        When True, also print ``len(LIST)`` as a percentage of ``r``.
    r : number, optional
        Denominator for the percentage. If it is 0 (the default) the percentage
        is omitted instead of dividing by zero.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    n = len(LIST)

    # Stats:
    if ratio and r:
        print("  N: ", n, f"({round(100*n/r,2)}%)")
    else:
        print("  N: ", n)

    # Nothing else is defined for an empty input (max()/min() would raise).
    if n == 0:
        return

    # np.var uses its default ddof=0, i.e. the population variance.
    print('  Variance', round(np.var(LIST),2))
    print('  Average:', round(np.mean(LIST),2))
    print('  Median:', np.median(LIST))
    print('  Max:', max(LIST))
    print('  Min:', min(LIST))

In [None]:
def plot_yes_no(plot_col, df=None):
    """Plot, per educational district, the share of 'Yes' and of 'No' answers.

    Two bar charts are drawn side by side (left: 'Yes', right: 'No'). Each bar
    is the number of respondents of a district giving that answer divided by
    the number of respondents of that district with a non-null answer.

    Parameters
    ----------
    plot_col : str
        Name of a column whose answers include the literal values 'Yes' / 'No'.
    df : pandas.DataFrame, optional
        Frame with the columns 'Nom_Districte_Educatiu' and ``plot_col``.
        Defaults to the notebook-level ``df_t``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    source = df_t if df is None else df
    plot_df = source[['Nom_Districte_Educatiu', plot_col]]

    # Districts in order of first appearance; rows without a district name
    # cannot be attributed to any bar.
    district_names = plot_df['Nom_Districte_Educatiu'].dropna().unique()

    # Rows = district, columns = answer, values = number of respondents.
    # Reindexing with fill_value=0 keeps districts in which an answer never
    # occurs (a direct lookup of the missing combination would raise KeyError).
    answer_counts = (
        plot_df.groupby(['Nom_Districte_Educatiu', plot_col])
        .size()
        .unstack(fill_value=0)
        .reindex(index=district_names, columns=['Yes', 'No'], fill_value=0)
    )

    # Number of non-null answers per district (the denominator of the shares).
    district_count = (
        plot_df.groupby('Nom_Districte_Educatiu')[plot_col]
        .count()
        .reindex(district_names)
    )

    shares = answer_counts.div(district_count, axis=0)

    fig, ax = plt.subplots(1, 2, figsize=(15, 5))
    for axis, answer in zip(ax, ['Yes', 'No']):
        axis.bar(district_names, shares[answer].to_numpy())
        axis.set_title(answer)
        axis.tick_params(axis='x', labelrotation=90)
    plt.show()

In [None]:
# plot_yes_no('ll_estudi')

In [None]:
def stacked_vars(df_t, name_var, colors, rename_var='', leg='top'):
    """Draw one horizontal stacked bar with the percentage of each value of a column.

    Parameters
    ----------
    df_t : pandas.DataFrame
        Frame containing the column ``name_var``.
    name_var : str
        Column whose value distribution is plotted.
    colors : sequence
        Colours for the segments; only the first ``n`` are used, where ``n`` is
        the number of distinct values.
    rename_var : str, optional
        Label shown for the bar. When empty, ``name_var`` is used.
    leg : str, optional
        Legend placement: 'top' puts it in one row above the bar, 'r' to the
        right. Any other value keeps the default legend of the pandas plot.

    Returns
    -------
    None
        The plot is drawn on a new figure created by pandas.
    """
    count = df_t[name_var].value_counts(normalize=True)*100
    piv_ = count.to_frame().T

    # Label the single row explicitly. Since pandas 2.0 the value_counts result
    # is named 'proportion' rather than after the column, so relying on that
    # name (or renaming it via a {name_var: ...} mapping) mislabels the bar.
    piv_.index = [rename_var if rename_var != '' else name_var]

    levels = piv_.shape[1]

    ax = piv_.plot.barh(stacked=True, figsize=(10,3), color=colors[:levels])

    if leg=='top':
        ax.legend(bbox_to_anchor=(0.98,0.9), loc="lower right", ncol=levels, frameon=False, fontsize='medium')
    elif leg=='r':
        ax.legend(bbox_to_anchor=(0.98,0.94), loc="upper left", frameon=False, fontsize='medium')

    ax.tick_params(labelsize=14, pad=6)

    # Keep only the bottom spine.
    for side in ("right", "left", "top"):
        ax.spines[side].set_visible(False)

    plt.tight_layout()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5cb1a2c0-2f56-40d6-8008-fedeaf8b6a17' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>