### USEFUL FUNCTIONS

Helper notebook: shared data loading, colour palettes, and data-preparation / plotting functions used by `main_explainer.ipynb` and the other notebooks.

In [1]:
# Importing and translating the dataframe:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import googletrans
from googletrans import Translator
# Global seaborn style for every plot produced with these helpers: plain white background.
# NOTE: do not switch to sns.set_style('whitegrid') here -- it messes with all the plots.
sns.set(style='white')


# Survey CSV (semicolon-separated), read relative to the current working directory.
file_path = './data/2017_ebsib_bcn_enquesta_benestar_subjectiu_infancia_barcelona.csv'
df = pd.read_csv(file_path, sep=';')

In [2]:
def translate_svy(df):
    """Return a copy of ``df`` whose column names are translated from Catalan.

    Each column label is passed to ``Translator().translate(label, src='ca')``
    and replaced by the ``.text`` of the result. No destination language is
    given, so googletrans' default (English) is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be translated. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with translated column labels.
    """
    translator = Translator()

    # Work on a copy so the caller's frame keeps its original labels.
    df_en = df.copy()
    df_en = df_en.rename(columns=lambda x: translator.translate(x, src='ca').text)

    # Previously the translated frame was built and then discarded (the
    # function implicitly returned None); hand it back to the caller.
    return df_en

In [3]:
# Translated data-frame:
# Load the already-translated survey from CSV (presumably saved from the output
# of the translation step -- confirm how this file is produced).
# NOTE(review): this is an absolute path, unlike the relative './data/...' paths
# used by the other cells, so it only resolves where '/work/data' exists.
file_path_t = '/work/data/translated.csv'
df_t = pd.read_csv(file_path_t, sep=',')
# Drop the first column by position (presumably the index column written when
# the CSV was saved -- confirm).
df_t = df_t.drop(df_t.columns[0], axis=1)

In [4]:
# Add population and income-index (RFD) aggregates to the translated survey `df_t`.
# NOTE(review): this cell reassigns `df_t` by joining new columns onto it; re-running it on the
# same kernel joins onto a frame that already has those columns, which pandas rejects
# (overlapping column names without a suffix). Re-run the cell that loads `df_t` first.
# Import Territorial Demographics (note: this rebinds `file_path` from the first cell):
file_path = './data/2017_distribucio_territorial_renda_familiar.csv'
df_inc = pd.read_csv(file_path, sep=',')

# Per district ('Codi_Districte'): total population, plain mean of the RFD index, and the
# population-weighted mean of the RFD index = sum(population * index) / sum(population).
# 'SumW' is the per-row numerator (it stays as a column on `df_inc`).
df_inc['SumW']= df_inc['Població'] * df_inc['Índex RFD Barcelona = 100']
df_dist = df_inc.groupby(['Codi_Districte']).agg({'Població':'sum','SumW':'sum', 'Índex RFD Barcelona = 100':'mean'})
df_dist['RFD-sumw']= df_dist['SumW'] / df_dist['Població']
# The summed numerator is only an intermediate; remove it from the district table.
df_dist.pop('SumW')

# Attach the district aggregates to the survey rows, matching the survey's
# 'Codi_Districte_Educatiu' against the district code index; the plain mean becomes 'RFD-mean'.
# NOTE(review): assumes both code columns use the same district numbering -- confirm.
df_t = df_t.join(df_dist, on='Codi_Districte_Educatiu').rename(columns={'Índex RFD Barcelona = 100':'RFD-mean'})

# Per-district standard deviation ('RFD-std') and standard error of the mean ('RFD-sem')
# of the RFD index, attached to the survey rows the same way.
df_dist2 = df_inc.groupby(['Codi_Districte']).agg({'Índex RFD Barcelona = 100':'std'})
df_t = df_t.join(df_dist2, on='Codi_Districte_Educatiu').rename(columns={'Índex RFD Barcelona = 100':'RFD-std'})
df_dist3 = df_inc.groupby(['Codi_Districte']).agg({'Índex RFD Barcelona = 100':'sem'})
df_t = df_t.join(df_dist3, on='Codi_Districte_Educatiu').rename(columns={'Índex RFD Barcelona = 100':'RFD-sem'})

In [5]:
def dic_codi_dist(df_):
    """Build a ``{district code: district name}`` lookup from a survey frame.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain the columns 'Codi_Districte_Educatiu' and
        'Nom_Districte_Educatiu'. It is not modified.

    Returns
    -------
    dict
        Keys are the distinct values of 'Codi_Districte_Educatiu' in ascending
        order, values the matching 'Nom_Districte_Educatiu'.
    """
    code_col = 'Codi_Districte_Educatiu'
    name_col = 'Nom_Districte_Educatiu'

    # One row per distinct (code, name) pair, ordered by code.
    pairs = df_.loc[:, [code_col, name_col]].drop_duplicates(keep='first', inplace=False)
    pairs = pairs.sort_values(code_col)

    # Index by code so the per-column dict is keyed by district code.
    by_code = pairs.set_index(code_col).to_dict()
    return by_code[name_col]

In [6]:
# COLOR IMPORTS
import matplotlib.colors

# Incremental (sequential) colour scale. The hex list runs from the lightest to the
# darkest shade; `colors_rgba` holds the same colours in reverse order as RGBA tuples.
colors_rgba_hex = ['#EDF6CA',"#d9ed92","#b5e48c","#99d98c","#76c893","#52b69a","#34a0a4","#168aad","#1a759f","#1e6091","#184e77", "#0E2C44"]
colors_rgba = list(map(matplotlib.colors.to_rgba, reversed(colors_rgba_hex)))

# Six distinct colours; `colors` holds them in reverse order as RGBA tuples.
colors_hex = ['#00378F', '#FF595E', '#FFCA3A', '#8AC926', '#1982C4', '#6A4C93']
colors = list(map(matplotlib.colors.to_rgba, reversed(colors_hex)))


# Five-step yellow -> purple scale, kept in the listed order.
colors2_hex= ["#FFCA3A","#DAAB50","#A3798B","#7D5C9B","#4D376C"] # YELLOW - PURPLES
# Alternative blue -> orange scale (disabled):
# colors2_hex= ["#0B59AD","#66ABF4","#EBEBD3","#F4D35E","#EE964B"]  # BLUE-ORANGE
colors2 = list(map(matplotlib.colors.to_rgba, colors2_hex))

In [18]:
# DISTRICTS: COLOR MAP

def bat_plot2(mtrx,lev_, att_, sec_mtrx, sec_lev, cmap, fig_tup=(5,10)):
    """Show two annotated heat maps side by side that share their row axis.

    The left panel draws ``mtrx`` with columns labelled by ``lev_``; the right
    panel draws ``sec_mtrx`` with columns labelled by ``sec_lev``. Rows are
    labelled by ``att_``. Every cell is annotated with its own value rounded
    to one decimal, and white grid lines separate the cells.

    Parameters
    ----------
    mtrx, sec_mtrx : 2-D sequences indexed as ``m[row][col]``
        Values for the left / right panel, one row per entry of ``att_``.
    lev_, sec_lev : sequences
        Column labels for the left / right panel.
    att_ : sequence
        Row labels (shared by both panels).
    cmap : colormap accepted by ``Axes.imshow``
    fig_tup : tuple, default (5, 10)
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    # The left panel gets 3.28x the width of the right one.
    fig, axes = plt.subplots(ncols=2, nrows=1, figsize=fig_tup, sharey=True,
                             gridspec_kw={'width_ratios': [3.28, 1]})
    main_ax, sec_ax = axes
    main_ax.imshow(mtrx, cmap=cmap)
    sec_ax.imshow(sec_mtrx, cmap=cmap)

    n_rows = len(att_)

    # Row ticks/labels are configured on the left panel (the y axis is shared).
    main_ax.set_yticks(np.arange(n_rows))
    main_ax.set_yticklabels(att_)
    # Minor ticks sit on the cell borders and only carry the white grid.
    # There must be one per ROW: using the number of columns here (as before)
    # misplaces the horizontal grid lines whenever the matrix is not square.
    main_ax.set_yticks(np.arange(n_rows) - .5, minor=True)

    for ax, data, col_labels in ((main_ax, mtrx, lev_), (sec_ax, sec_mtrx, sec_lev)):
        n_cols = len(col_labels)

        # Column ticks with their labels, plus minor ticks on the cell borders.
        ax.set_xticks(np.arange(n_cols))
        ax.set_xticklabels(col_labels, rotation=45, ha="left")
        ax.set_xticks(np.arange(n_cols) - .5, minor=True)

        # Column labels go on top; only the left panel shows row tick marks.
        ax.tick_params(top=True, bottom=False, left=(ax is main_ax),
                       labeltop=True, labelbottom=False, labelsize=14, pad=6)

        # Annotate every cell with its value.
        for i in range(n_rows):
            for j in range(n_cols):
                ax.text(j, i, round(data[i][j], 1),
                        ha="center", va="center", color="w", fontsize=16, fontweight='bold')

        # Despine and draw the white cell grid on the minor ticks.
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.grid(which="minor", color="w", linestyle='-', linewidth=1.5)
        ax.tick_params(which="minor", bottom=False, left=False)

    fig.tight_layout()
    plt.show()

In [7]:
# FEELINGS DISTRIBUTIONS - STACKED BAR PLOT
# Function for plotting the feelings results as a stacked horizontal bar chart:
def f_sbar_plot(dic):
    """Draw one stacked horizontal bar per feeling, one segment per level.

    Parameters
    ----------
    dic : dict
        Maps a level label to the values of that level for every feeling.
        Segments are stacked in the dict's iteration order.

    Notes
    -----
    Reads the module-level names ``feelings``, ``feelings_names``, ``levels``
    and ``colors_rgba``. The first three are not defined in this notebook, so
    they must already exist in the namespace where this function is called.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(18, 10))

    # Running left edge of the next segment, one entry per feeling.
    left_edge = np.zeros(len(feelings))
    for idx, (level, row_data) in enumerate(dic.items()):
        plt.barh(feelings_names, row_data, left=left_edge,
                 label=level, color=colors_rgba[idx])
        left_edge += row_data

    ax = plt.gca()
    for side in ("right", "left", "top"):
        ax.spines[side].set_visible(False)
    ax.set_xticks(np.arange(0,101,20))

    # The legend title is padded with spaces so that "Never" / "All the time"
    # end up over the two ends of the legend row.
    plt.legend(
        bbox_to_anchor=(0.5,0.95), loc="lower center", ncol=len(levels),
        frameon=False, fontsize='medium',
    title = "                    Never                                                                                                                                                           All the time")

    plt.xlabel("Ratio of answers per level", fontsize=16)
    plt.suptitle("How often did you have this feeling during the last two weeks?", fontsize=22)

    plt.show()

In [8]:
# FEELINGS DISTRIBUTIONS - PLOT POSITIVE - NEGATIVE
def f_dist_plot(df_):
    """Plot the feelings distributions as line charts in two side-by-side panels.

    The columns of ``df_`` are first renamed with the module-level mapping
    ``f_labels`` (not defined in this notebook; it must exist in the calling
    namespace). The left panel then shows HAPPY / CALM / ENERGETIC and the
    right panel SAD / STRESSED / BORED.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame whose columns, after renaming, include the six feeling names.
        The caller's frame is not modified.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    df_ = df_.rename(columns=f_labels)
    feelings_names = ["SAD", "STRESSED", "BORED","HAPPY","CALM", "ENERGETIC"]
    negative, positive = feelings_names[:3], feelings_names[3:]

    fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,6), sharex=True, sharey=True)
    plt.subplots_adjust(wspace=0.6, hspace=0.01)

    # Left: positive feelings. Right: negative feelings.
    positive_styles = {'CALM': '--c', 'ENERGETIC': ':c', 'HAPPY': 'b'}
    negative_styles = {'BORED': ':c', 'STRESSED': '--c', 'SAD': 'b'}
    df_.loc[:, positive].plot(ax=axs[0], lw=2.5, fontsize=12, style=positive_styles)
    df_.loc[:, negative].plot(ax=axs[1], lw=2.5, fontsize=12, style=negative_styles)

    for panel in axs:
        for side in ("right", "left", "top"):
            panel.spines[side].set_visible(False)

        panel.grid(axis='y', color='#eee9e9', linestyle='dotted', linewidth=1.5)
        # Legend sits just outside the panel's top-right corner.
        panel.legend(bbox_to_anchor=(1,1), loc="upper left", frameon=False, fontsize='medium')
        panel.set_xlabel("Frequency levels", fontsize=14)
        panel.set_ylabel("Ratio of answers (%)", fontsize=14)

    plt.suptitle("Feelings Frequency Distribution", fontsize=18)

    plt.tight_layout()
    plt.show()

In [9]:
# CORRELATIONS ACCESS TO MATERIALS - DATA PREP
def am_dataprep(df_, att_name, levels, no=True):
    """Percentage of 'No' (or 'Yes') answers per attribute value and question.

    Rows with a missing value in ``att_name``, any of ``levels`` or 'RFD-sumw'
    are dropped. The remaining rows are sorted by 'RFD-sumw', so the attribute
    values come out in order of their first appearance in that sorted frame.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain ``att_name``, every column in ``levels`` and 'RFD-sumw'.
        It is not modified.
    att_name : str
        Column whose distinct values define the rows of the matrix. For
        'ERFDbllp' the values 'Neighborhoods outside the city Barcelona' and
        'NC' are excluded.
    levels : sequence of str
        Question columns; they define the columns of the matrix.
    no : bool, default True
        If True report the share of 'No' answers, otherwise of 'Yes' answers.

    Returns
    -------
    (mtrx, att_) : (list of list of float, array)
        ``mtrx[i][j]`` is the percentage of the chosen answer among the
        answers to ``levels[j]`` for attribute value ``att_[i]``. It is 0.0
        when that answer never occurs in the group.
    """
    levels = list(levels)
    cols = [att_name] + levels + ['RFD-sumw']
    df_am = df_[cols].sort_values('RFD-sumw').dropna().drop(columns='RFD-sumw')

    att_ = df_am[att_name].unique()
    if att_name == 'ERFDbllp':
        values = ['Neighborhoods outside the city Barcelona', 'NC']
        for value in values:
            j, = np.where(att_ == value)
            att_ = np.delete(att_, j)

    answer = 'No' if no else 'Yes'

    mtrx = []
    for att in att_:
        # Filter the group once instead of once per question.
        group = df_am.loc[df_am[att_name] == att]
        row = []
        for lev in levels:
            shares = group[lev].value_counts(normalize=True) * 100
            # A group in which nobody gave this answer has no such entry in
            # value_counts; that is a 0 % share, not an error.
            row.append(shares.get(answer, 0.0))
        mtrx.append(row)
    return (mtrx, att_)

In [10]:
# CORRELATIONS ACCESS TO MATERIALS - PLOT CONSOLIDATED

def sat_plot2(mtrx,lev_, att_, sec_mtrx, sec_lev,cmap, fig_tup=(5,10)):
    """Show two annotated heat maps side by side that share their row axis.

    The left panel draws ``mtrx`` with columns labelled by ``lev_``; the right
    panel draws ``sec_mtrx`` with columns labelled by ``sec_lev``. Rows are
    labelled by ``att_``. Every cell is annotated with its own value rounded
    to one decimal, and white grid lines separate the cells.

    Parameters
    ----------
    mtrx, sec_mtrx : 2-D sequences indexed as ``m[row][col]``
        Values for the left / right panel, one row per entry of ``att_``.
    lev_, sec_lev : sequences
        Column labels for the left / right panel.
    att_ : sequence
        Row labels (shared by both panels).
    cmap : colormap accepted by ``Axes.imshow``
    fig_tup : tuple, default (5, 10)
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    # The left panel gets 3.28x the width of the right one.
    fig, axes = plt.subplots(ncols=2, nrows=1, figsize=fig_tup, sharey=True,
                             gridspec_kw={'width_ratios': [3.28, 1]})
    main_ax, sec_ax = axes
    main_ax.imshow(mtrx, cmap=cmap)
    sec_ax.imshow(sec_mtrx, cmap=cmap)

    n_rows = len(att_)

    # Row ticks/labels are configured on the left panel (the y axis is shared).
    main_ax.set_yticks(np.arange(n_rows))
    main_ax.set_yticklabels(att_)
    # Minor ticks sit on the cell borders and only carry the white grid.
    # There must be one per ROW: using the number of columns here (as before)
    # misplaces the horizontal grid lines whenever the matrix is not square.
    main_ax.set_yticks(np.arange(n_rows) - .5, minor=True)

    # Each panel is paired with ITS OWN matrix and column labels. Previously
    # both panels were annotated with the values (and column count) of `mtrx`,
    # so the right panel showed numbers that did not belong to `sec_mtrx`.
    for ax, data, col_labels in ((main_ax, mtrx, lev_), (sec_ax, sec_mtrx, sec_lev)):
        n_cols = len(col_labels)

        # Column ticks with their labels, plus minor ticks on the cell borders.
        ax.set_xticks(np.arange(n_cols))
        ax.set_xticklabels(col_labels, rotation=45, ha="left")
        ax.set_xticks(np.arange(n_cols) - .5, minor=True)

        # Column labels go on top; only the left panel shows row tick marks.
        ax.tick_params(top=True, bottom=False, left=(ax is main_ax),
                       labeltop=True, labelbottom=False, labelsize=14, pad=6)

        # Despine and draw the white cell grid on the minor ticks.
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.grid(which="minor", color="w", linestyle='-', linewidth=1.5)
        ax.tick_params(which="minor", bottom=False, left=False)

        # Annotate every cell with its value.
        for i in range(n_rows):
            for j in range(n_cols):
                ax.text(j, i, round(data[i][j], 1),
                        ha="center", va="center", color="w", fontsize=16, fontweight='bold')

    fig.tight_layout()
    plt.show()

In [11]:
# CORRELATIONS ACCESS TO MATERIALS - PLOT
def sat_plot(mtrx,lev_, att_,fig_tup=(5,10), col = 'whitesmoke', cmap='Purples'):
    """Show one annotated heat map of ``mtrx``.

    Columns are labelled by ``lev_`` (on top, rotated 45 degrees) and rows by
    ``att_``. Every cell is annotated with its value rounded to one decimal,
    and white grid lines separate the cells.

    Parameters
    ----------
    mtrx : 2-D sequence indexed as ``mtrx[row][col]``
        One row per entry of ``att_``, one column per entry of ``lev_``.
    lev_ : sequence
        Column labels.
    att_ : sequence
        Row labels.
    fig_tup : tuple, default (5, 10)
        Figure size passed to ``plt.subplots``.
    col : color, default 'whitesmoke'
        Color of the cell annotations.
    cmap : colormap accepted by ``Axes.imshow``, default 'Purples'
        Added as an optional argument; the default keeps the previous look.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    n_rows, n_cols = len(att_), len(lev_)

    fig, ax = plt.subplots(figsize=fig_tup)
    ax.imshow(mtrx, cmap=cmap)

    # Major ticks: one per cell centre, labelled with the given names.
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticklabels(lev_, rotation=45, ha="left")
    ax.set_yticklabels(att_)

    # Minor ticks sit on the cell borders and only carry the white grid.
    # The y direction needs one per ROW: using the number of columns (as
    # before) misplaces the grid lines whenever the matrix is not square.
    ax.set_xticks(np.arange(n_cols) - .5, minor=True)
    ax.set_yticks(np.arange(n_rows) - .5, minor=True)
    ax.grid(which="minor", color="w", linestyle='-', linewidth=1.5)
    ax.tick_params(which="minor", bottom=False, left=False)

    # Let the horizontal axis labelling appear on top.
    ax.tick_params(top=True, bottom=False, left=True,
                   labeltop=True, labelbottom=False, labelsize=14, pad=6)

    # Despine
    for spine in ax.spines.values():
        spine.set_visible(False)

    # Annotate every cell with its value.
    for i in range(n_rows):
        for j in range(n_cols):
            ax.text(j, i, round(mtrx[i][j], 1),
                    ha="center", va="center", color=col, fontsize=16, fontweight='bold')

    fig.tight_layout()
    plt.show()

In [12]:
# LIFE SATISFACTION: DATA PREP
def sat_dataprep(df_, att_name):
    """Percentage of each life-satisfaction level per value of an attribute.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain ``att_name`` and 'SLSSmc'. Rows with a missing value in
        either column are ignored. The frame is not modified.
    att_name : str
        Column whose distinct values define the rows of the matrix.

    Returns
    -------
    (mtrx, lab_new, att_) : (list of list of float, list of str, array)
        ``att_`` holds the sorted distinct values of ``att_name``.
        ``mtrx[i][k]`` is the percentage of answers with level ``lab_new[k]``
        among the rows whose attribute equals ``att_[i]``. Every row has one
        entry per level, in the order of ``lab_new``; levels that do not
        occur for an attribute value are reported as 0.0.

    Raises
    ------
    ValueError
        From ``Series.replace`` when 'SLSSmc' does not contain exactly as many
        distinct values as there are labels in ``lab_new``.
    """
    df_sat = df_[[att_name, 'SLSSmc']].dropna()

    lab = list(df_sat['SLSSmc'].unique())
    lab_new = ['Very','Fairly', 'Slightly', 'Not at all', 'NC']

    # NOTE(review): the original answers are mapped to the new labels by their
    # order of FIRST APPEARANCE in the data, not by their meaning. This is only
    # right if the data happens to list the answers in the order of `lab_new`
    # -- confirm, or replace with an explicit {old: new} mapping.
    satisf = df_sat['SLSSmc'].replace(lab, lab_new)

    att_ = np.sort(df_sat[att_name].unique())

    mtrx = []
    for att in att_:
        shares = satisf[df_sat[att_name] == att].value_counts(normalize=True)
        # value_counts orders by frequency and omits absent levels; align the
        # shares with `lab_new` so that column k always means level lab_new[k].
        shares = shares.reindex(lab_new, fill_value=0.0) * 100
        mtrx.append(list(shares))

    return (mtrx, lab_new, att_)

In [13]:
# LIFE SATISFACTION: STACKED BAR PLOT

def s_bar_plot(mtrx, level, attr, attr_label='', fig_tup=(16, 10), title = False):
    """Draw one stacked horizontal bar per attribute, one segment per level.

    Parameters
    ----------
    mtrx : 2-D array-like
        ``mtrx[i][k]`` is the value for attribute ``attr[i]`` and level
        ``level[k]``; it is transposed so that one level is drawn at a time.
    level : sequence
        Level labels (legend entries), in stacking order.
    attr : sequence
        Bar labels.
    attr_label : str, default ''
        Label of the y axis.
    fig_tup : tuple, default (16, 10)
        Figure size.
    title : bool, default False
        If True, add the life-satisfaction question as the plot title.

    Notes
    -----
    Segment colours are taken in order from the module-level ``colors2``.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=fig_tup)

    # Running left edge of the next segment, one entry per attribute.
    offsets = np.zeros(len(attr))
    for idx, (lev_, lev_data) in enumerate(zip(level, np.transpose(mtrx))):
        plt.barh(attr, lev_data, left=offsets, label=lev_, color=colors2[idx])
        offsets += lev_data

    ax = plt.gca()
    for side in ("right", "left", "top"):
        ax.spines[side].set_visible(False)
    ax.set_xticks(np.arange(0,101,20))
    # Single-row legend centred above the bars.
    ax.legend(bbox_to_anchor=(0.5,0.95), loc="lower center", ncol=len(level),
              frameon=False, fontsize='medium')
    plt.tick_params(labelsize=14, pad=6)

    plt.ylabel(attr_label, fontsize=16)
    plt.xlabel("Ratio of answers per level", fontsize=16)
    if title:
        plt.title("To what extent are you satisfied with your life?", fontsize=22, y=1.03)

    plt.tight_layout()
    plt.show()

In [14]:
# LIFE SATISFACTION: ATTRIBUTE COUNTS
def att_bar_plot(df_, att_name, att_, att_lab='', fig_tup = (12,8)):
    """Horizontal bar chart of how many rows of ``df_`` have each attribute value.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame containing the column ``att_name``.
    att_name : str
        Column to count values in.
    att_ : sequence
        Attribute values to show, one bar each (in this order).
    att_lab : str, default ''
        Text used as the figure's super-title.
    fig_tup : tuple, default (12, 8)
        Figure size.

    Notes
    -----
    Bars use the last colour of the module-level ``colors2``.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=fig_tup)

    # Number of rows per attribute value.
    column = df_[att_name]
    att_count = [(column == value).sum() for value in att_]

    ax.barh(att_, att_count, color=colors2[-1])
    ax.tick_params(labelsize=14, pad=6)

    for side in ("right", "left", "top"):
        ax.spines[side].set_visible(False)

    ax.set_xlabel('Counts', size=16, labelpad = 12)
    plt.suptitle(att_lab, fontsize=20, y=0.94, x=0.6)
    plt.tight_layout()
    plt.show()


def m_att_resize(m,lev,att):
    """Drop the rows of ``m`` that have fewer entries than there are levels.

    The label in ``att`` at the same position as a dropped row is dropped too.
    Rows and labels are matched by position, so duplicate rows or labels and
    plain-list labels are handled correctly (the previous value-based removal
    left list labels untouched and could remove the wrong duplicate).

    Parameters
    ----------
    m : sequence of sequences
        Rows to filter; not modified.
    lev : sequence
        Levels; a row is kept when ``len(row) >= len(lev)``.
    att : sequence or numpy.ndarray
        One label per row of ``m``; not modified.

    Returns
    -------
    (m_, att_) : (list, numpy.ndarray)
        The kept rows (same row objects, new list) and their labels.
    """
    att_ = np.array(att)
    keep = np.ones(len(att_), dtype=bool)

    m_ = []
    for i, row in enumerate(m):
        if len(lev) > len(row):
            keep[i] = False
        else:
            m_.append(row)

    return(m_, att_[keep])

In [15]:
# Basic statistics for a numerical LIST
def basic_stats(LIST, ratio=False, r=0):
    """Print the size, variance, mean, median, maximum and minimum of ``LIST``.

    Parameters
    ----------
    LIST : sequence of numbers
    ratio : bool, default False
        If True, also print the size as a percentage of ``r``.
    r : number, default 0
        Reference total for the percentage. When it is 0 the percentage is
        left out (previously this raised ``ZeroDivisionError``).

    Returns
    -------
    None. Everything is printed. For an empty ``LIST`` only the size line is
    printed, because the remaining statistics are undefined.
    """
    n = len(LIST)

    # Stats:
    if ratio and r:
        print("  N: ", n, f"({round(100*n/r,2)}%)")
    else:
        print("  N: ", n)

    if n == 0:
        return

    print('  Variance', round(np.var(LIST),2))
    print('  Average:', round(np.mean(LIST),2))
    print('  Median:', np.median(LIST))
    print('  Max:', max(LIST))
    print('  Min:', min(LIST))

In [16]:
def plot_yes_no(plot_col):
    """Bar charts of the share of 'Yes' and of 'No' answers per district.

    Reads the module-level frame ``df_t``. For every district name the share
    is the number of rows answering 'Yes' (resp. 'No') in ``plot_col`` divided
    by the number of non-missing answers of that district.

    Parameters
    ----------
    plot_col : str
        Column of ``df_t`` holding the answers.

    Notes
    -----
    A district without any 'Yes' (or 'No') answer is drawn with a share of 0
    instead of raising ``KeyError``; rows without a district name are ignored.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    code_col = 'Codi_Districte_Educatiu'
    name_col = 'Nom_Districte_Educatiu'
    plot_df = df_t[[code_col, name_col, plot_col]]

    # Districts in order of first appearance.
    district_names = plot_df[name_col].dropna().unique()

    # Non-missing answers per district (denominator of the shares).
    district_count = (plot_df.groupby(name_col)[plot_col].count()
                      .reindex(district_names, fill_value=0)
                      .to_numpy(dtype=float))

    # Rows per (district, answer), laid out as a district x answer table.
    # Missing combinations are filled with 0 rather than being absent.
    answer_counts = (plot_df.groupby([name_col, plot_col])[code_col].count()
                     .unstack(fill_value=0)
                     .reindex(index=district_names, columns=['Yes', 'No'], fill_value=0))

    def _share(answer):
        # Share of `answer` per district; 0 where a district has no answers.
        hits = answer_counts[answer].to_numpy(dtype=float)
        return np.divide(hits, district_count, out=np.zeros_like(hits),
                         where=district_count > 0)

    yes_count = _share('Yes')
    no_count = _share('No')

    fig, ax = plt.subplots(1, 2, figsize=(15, 5))

    ax[0].bar(district_names, yes_count)
    ax[0].set_title('Yes')

    ax[1].bar(district_names, no_count)
    ax[1].set_title('No')

    ax[0].tick_params(axis='x', labelrotation=90)
    ax[1].tick_params(axis='x', labelrotation=90)
    plt.show()

In [17]:
def stacked_vars(df_t, name_var, colors, rename_var='', leg='top'):
    """Draw a single stacked horizontal bar with the answer shares of a column.

    Parameters
    ----------
    df_t : pandas.DataFrame
        Frame containing the column ``name_var``.
    name_var : str
        Column whose value shares (in %) are stacked, in ``value_counts``
        order.
    colors : sequence of colors
        The first ``n`` entries are used, ``n`` being the number of distinct
        values.
    rename_var : str, default ''
        If not empty, label of the bar instead of ``name_var``.
    leg : {'top', 'r'}, default 'top'
        'top' puts a single-row legend above the bar, 'r' puts it to the
        right. Any other value keeps the legend created by pandas.

    Returns
    -------
    None. The figure is left open for the caller (no ``plt.show()`` here).
    """
    count = df_t[name_var].value_counts(normalize=True)*100
    # Name the single row explicitly. Since pandas 2.0 the value_counts result
    # is called 'proportion' rather than after the column, which made the bar
    # label 'proportion' and turned the rename below into a no-op.
    piv_ = count.to_frame(name=name_var).T
    levels = piv_.shape[1]

    if rename_var!='':
        piv_ = piv_.rename(index={name_var: rename_var})

    piv_.plot.barh(stacked=True, figsize=(10,3), color = colors[:levels])

    if leg=='top':
        plt.legend(bbox_to_anchor=(0.98,0.9), loc="lower right", ncol=levels, frameon=False, fontsize='medium')
    elif leg=='r':
        plt.legend(bbox_to_anchor=(0.98,0.94), loc="upper left", frameon=False, fontsize='medium')

    plt.tick_params(labelsize=14, pad=6)

    ax = plt.gca()
    for side in ("right", "left", "top"):
        ax.spines[side].set_visible(False)

    plt.tight_layout()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5cb1a2c0-2f56-40d6-8008-fedeaf8b6a17' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>