<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Revisão-Pandas" data-toc-modified-id="Revisão-Pandas-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Revisão Pandas</a></span><ul class="toc-item"><li><span><a href="#Carregando-Dados" data-toc-modified-id="Carregando-Dados-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Carregando Dados</a></span><ul class="toc-item"><li><span><a href="#Limpando-os-nomes-de-colunas" data-toc-modified-id="Limpando-os-nomes-de-colunas-1.1.1"><span class="toc-item-num">1.1.1&nbsp;&nbsp;</span>Limpando os nomes de colunas</a></span></li></ul></li><li><span><a href="#Limpeza-de-Informações" data-toc-modified-id="Limpeza-de-Informações-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Limpeza de Informações</a></span><ul class="toc-item"><li><span><a href="#(Re)Construção-da-coluna-region" data-toc-modified-id="(Re)Construção-da-coluna-region-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>(Re)Construção da coluna <code>region</code></a></span></li><li><span><a href="#Limpeza-de-Colunas-NA" data-toc-modified-id="Limpeza-de-Colunas-NA-1.2.2"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Limpeza de Colunas NA</a></span></li></ul></li><li><span><a href="#Análise-de-Agregados" data-toc-modified-id="Análise-de-Agregados-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Análise de Agregados</a></span><ul class="toc-item"><li><span><a href="#Agregado-Regional" data-toc-modified-id="Agregado-Regional-1.3.1"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span>Agregado Regional</a></span></li><li><span><a href="#Relação-País-Região" data-toc-modified-id="Relação-País-Região-1.3.2"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Relação País-Região</a></span></li><li><span><a href="#Evolução-Anual-de-Felicidade-por-País" data-toc-modified-id="Evolução-Anual-de-Felicidade-por-País-1.3.3"><span class="toc-item-num">1.3.3&nbsp;&nbsp;</span>Evolução Anual de Felicidade por 
País</a></span></li></ul></li></ul></li></ul></div>

# Revisão Pandas

In [None]:
import os
import re

import numpy as np
import pandas as pd

## Carregando Dados

Os dados vêm do World Happiness Report, com um arquivo CSV por ano.

Fonte: https://www.kaggle.com/unsdsn/world-happiness

### Limpando os nomes de colunas

In [None]:
# Load every CSV found in the data folder into its own DataFrame.
# os.listdir returns entries in arbitrary order and may include non-CSV
# entries (e.g. hidden checkpoint folders), so keep only *.csv names and
# sort them to get a deterministic list.
path = 'data/dados_felicidade/'
csv_files = sorted(name for name in os.listdir(path) if name.lower().endswith('.csv'))
lista_df = [pd.read_csv(os.path.join(path, name)) for name in csv_files]

In [None]:
# Show the raw header of each loaded file so the naming differences are visible.
for frame in lista_df:
    print(frame.columns)

In [None]:
# Normalise every raw header: lowercase it, then collapse each run of spaces,
# pipes, dots and parentheses into a single underscore.
pattern = r'[ |\.()]+'
column_names = [
    re.sub(pattern, '_', column.lower())
    for frame in lista_df
    for column in frame.columns
]
# Build the set once; it lists the distinct spellings that still need a
# canonical name.
unique_column_names = set(column_names)
print(unique_column_names)
print(len(unique_column_names))

In [None]:
# Maps each normalised header (key) to the canonical column name (value).
# Several keys share one value on purpose, e.g. 'score' and 'happiness_score'.
# Key order is kept as originally written so the displayed dict is unchanged.
column_name_dict = {
    'health_life_expectancy_': 'health_life_expectancy',
    'country_or_region': 'country',
    'upper_confidence_interval': 'upper_confidence_interval',
    'perceptions_of_corruption': 'corruption',
    'whisker_high': 'whisker_high',
    'region': 'region',
    'generosity': 'generosity',
    'country': 'country',
    'gdp_per_capita': 'gdp_per_capita',
    'freedom_to_make_life_choices': 'freedom',
    'standard_error': 'standard_error',
    'happiness_score': 'happiness_score',
    'whisker_low': 'whisker_low',
    'happiness_rank': 'happiness_rank',
    'lower_confidence_interval': 'lower_confidence_interval',
    'overall_rank': 'happiness_rank',
    'dystopia_residual': 'dystopia',
    'trust_government_corruption_': 'corruption',
    'freedom': 'freedom',
    'score': 'happiness_score',
    'social_support': 'social_support',
    'healthy_life_expectancy': 'health_life_expectancy',
    'economy_gdp_per_capita_': 'gdp_per_capita',
    'family': 'family',
}

In [None]:
# Display the finished mapping from normalised header to canonical column name.
column_name_dict

In [None]:
# Reload the files, harmonising the schema of each one before they are stacked:
# tag the rows with the year taken from the file name, normalise the header
# spelling with `pattern`, then map the variants to one canonical name.
lista_df = []
path = 'data/dados_felicidade/'
# os.listdir order is arbitrary and the folder may hold non-CSV entries;
# sort and filter so the load is deterministic and read_csv only sees CSVs.
for csv in sorted(os.listdir(path)):
    if not csv.lower().endswith('.csv'):
        continue
    print(f'Carregando {csv}')
    # File stem, kept as a string (presumably a year such as '2015' -- a later
    # cell casts it to int).
    ano = os.path.splitext(csv)[0]
    df = pd.read_csv(os.path.join(path, csv))
    df['year'] = ano
    df.columns = [re.sub(pattern, '_', column.lower()) for column in df.columns]
    df = df.rename(columns=column_name_dict)
    print(df.columns)
    lista_df.append(df)

In [None]:
# Stack the harmonised frames row-wise; columns missing from a given file are
# filled with NaN and the row index is rebuilt from 0.
tb_happy = pd.concat(objs=lista_df, axis='index', ignore_index=True)
tb_happy

## Limpeza de Informações

### (Re)Construção da coluna `region`

In [None]:
# Build a country -> region lookup from the rows that do carry a region,
# keeping the first region seen for each country.
tb_paises = (
    tb_happy[['country', 'region']]
    .dropna()
    .groupby('country')
    .first()
    .rename(columns={'region': 'region_cons'})
)
tb_paises

In [None]:
# Attach the consolidated region to every row. A left join keeps the rows of
# countries that have no entry in tb_paises (their region_cons becomes NaN)
# instead of silently discarding them, as the default inner join would.
tb_happy = pd.merge(tb_happy, tb_paises, on='country', how='left')
tb_happy

### Limpeza de Colunas NA

In [None]:
# The original region column is superseded by region_cons, so remove it.
tb_happy = tb_happy.drop(columns='region')
# Keep only the columns that have at least 30% of their rows filled in.
thresh = int(0.3 * len(tb_happy))
tb_happy = tb_happy.dropna(axis='columns', thresh=thresh)


In [None]:
# Summary statistics (count, mean, std, quartiles) of the table's columns.
tb_happy.describe()

## Análise de Agregados

### Agregado Regional

In [None]:
# Mean happiness score and mean life expectancy per region, sorted from the
# lowest to the highest mean happiness.
(
    tb_happy
    .groupby('region_cons')
    .agg(
        mu_happy=('happiness_score', 'mean'),
        mu_le=('health_life_expectancy', 'mean'),
    )
    .sort_values(by='mu_happy')
)

In [None]:
# Regional means, stored (indexed by region_cons) so they can be merged back
# onto the country-level rows.
regional_groups = tb_happy.groupby('region_cons')
tb_reg = regional_groups.agg(
    mu_reg_happy=('happiness_score', 'mean'),
    mu_reg_le=('health_life_expectancy', 'mean'),
)

### Relação País-Região

In [None]:
# Put the regional means on every row, then express each score as a ratio of
# its region's mean score (1.0 means exactly at the regional mean).
tb_happy = tb_happy.merge(tb_reg, on='region_cons')
tb_happy['prop_reg_country'] = tb_happy['happiness_score'].div(tb_happy['mu_reg_happy'])
# Average ratio per country, lowest first.
(
    tb_happy
    .groupby('country')
    .agg(avg_prop_regcou=('prop_reg_country', 'mean'))
    .sort_values(by='avg_prop_regcou')
)

### Evolução Anual de Felicidade por País

In [None]:
def _attach_score_at(summary, scores, year_col, prefix):
    """Add the score and rank each country had in the year held in `year_col`.

    Parameters
    ----------
    summary : DataFrame
        One row per country; must have a `country` column and `year_col`.
    scores : DataFrame
        Columns `country`, `year`, `happiness_score` and `happiness_rank`.
    year_col : str
        Column of `summary` holding the year to look up in `scores`.
    prefix : str
        Prefix of the two new columns, `<prefix>_score` and `<prefix>_rank`.

    Returns
    -------
    DataFrame
        `summary` inner-joined with the matching rows of `scores`, without
        the helper `year` column brought in by the join.
    """
    renamed = scores.rename(columns={
        'happiness_score': f'{prefix}_score',
        'happiness_rank': f'{prefix}_rank',
    })
    merged = pd.merge(
        summary,
        renamed,
        left_on=['country', year_col],
        right_on=['country', 'year'])
    # `year` duplicates `year_col` after the join; dropping it here also
    # avoids year_x / year_y suffixes when the helper is applied twice.
    return merged.drop(columns='year')


# One row per country: first and last year observed and the mean score.
# reset_index turns `country` into a regular column so the joins below use
# plain column keys on both sides.
tb_happy_per = tb_happy.groupby('country').agg(
    first_year=('year', 'min'),
    last_year=('year', 'max'),
    avg_happy=('happiness_score', 'mean')).reset_index()

scores_by_year = tb_happy[['country', 'year', 'happiness_score', 'happiness_rank']]
tb_happy_per = _attach_score_at(tb_happy_per, scores_by_year, 'first_year', 'first')
tb_happy_per = _attach_score_at(tb_happy_per, scores_by_year, 'last_year', 'last')

In [None]:
# Inspect the per-country summary (first/last year with their scores and ranks).
tb_happy_per

In [None]:
# Years are converted to integers so the span can be computed arithmetically.
first_year_num = tb_happy_per['first_year'].astype(int)
last_year_num = tb_happy_per['last_year'].astype(int)

# Total change in score between the first and the last observed year.
tb_happy_per['score_var'] = tb_happy_per['last_score'].sub(tb_happy_per['first_score'])
# Number of calendar years covered, both ends included.
tb_happy_per['num_anos'] = last_year_num - first_year_num + 1
# Change per year, in absolute terms and relative to the country's mean score.
tb_happy_per['score_var_anual'] = tb_happy_per['score_var'].div(tb_happy_per['num_anos'])
tb_happy_per['per_score_var_anual'] = tb_happy_per['score_var_anual'].div(tb_happy_per['avg_happy'])

In [None]:
# Countries ordered from the largest relative yearly drop to the largest gain.
tb_happy_per.sort_values(by='per_score_var_anual', ascending=True)

In [None]:
# Split the countries into four equal-sized groups (quartiles) of relative
# yearly change and label them from worst to best.
quartile_labels = ['Muito Ruim', 'Ruim', 'Bom', 'Muito Bom']
tb_happy_per['classif_var'] = pd.qcut(
    tb_happy_per['per_score_var_anual'],
    q=4,
    labels=quartile_labels,
)
tb_happy_per['classif_var']

In [None]:
# Look up the summary row(s) for Brazil.
is_brazil = tb_happy_per['country'].eq('Brazil')
tb_happy_per.loc[is_brazil]