# Portugal's mortality stats

Data acquired from http://evm.min-saude.pt/

_Assignment: try to edit the markdown to explain the commands..._


In [None]:
#!pip3 install dateparser
import dateparser

import pandas as pd
import numpy as np

## Read the data

The files data/portugal_mortality/causeXXXX.csv (one per year XXXX) contain the data associated with the death causes for each day

In [None]:
def read_data(prefix, years=range(2014, 2023)):
    """Load the yearly mortality CSV files for ``prefix`` into one DataFrame.

    One file per year is read from
    ``./data/portugal_mortality/{prefix}{year}.csv``. The ``'Data (mm-dd)'``
    column of each file holds the day without the year, so the year taken
    from the file name is prepended before the value is parsed (with
    Portuguese locale rules) by ``dateparser``.

    Parameters
    ----------
    prefix : str
        File-name prefix, e.g. ``'cause'`` or ``'grupo_etario'``.
    years : iterable of int, optional
        Years to load. Defaults to 2014 through 2022 inclusive.

    Returns
    -------
    pandas.DataFrame
        All years stacked with a fresh 0..n-1 index. The ``'Data (mm-dd)'``
        column is replaced by a datetime ``'date'`` column, placed last.
    """
    date_col = 'Data (mm-dd)'

    frames = []
    dates = []
    for year in years:
        filename = f'./data/portugal_mortality/{prefix}{year}.csv'
        # Read the day column as plain text and parse it ourselves:
        # read_csv(date_parser=...) is deprecated since pandas 2.0.
        year_df = pd.read_csv(filename, dtype={date_col: str})
        dates.extend(
            dateparser.parse(f'{year}-{day}', languages=['pt'])
            for day in year_df[date_col]
        )
        frames.append(year_df.drop(columns=date_col))

    # Concatenate once at the end; concatenating inside the loop re-copies
    # all previously loaded rows on every iteration.
    df = pd.concat(frames, ignore_index=True)
    # Unparseable days come back from dateparser as None and become NaT here.
    df['date'] = pd.to_datetime(dates)
    return df
        
# Load the two daily mortality tables. The argument is the file-name prefix:
# 'cause' -> cause<year>.csv, 'grupo_etario' -> grupo_etario<year>.csv
# (the latter is used below as the per-age-group table).
cause_df = read_data('cause')
age_df = read_data('grupo_etario')

In [None]:
cause_df.info()

In [None]:
cause_df.head()

In [None]:
age_df.info()

In [None]:
age_df.head()

In [None]:
age_df.tail()

In [None]:
# Time series of the three cause categories, indexed by date
(
    cause_df
    .set_index('date')
    [['Morte natural', 'Causa externa', 'Sujeito a investigação']]
    .plot(figsize=(30, 5))
)

In [None]:
age_df.columns

In [None]:
# Recompute '> 75 anos' where it is missing, as '75-84 anos' + '≥ 85 anos'.
# NOTE(review): presumably the newer files report the two finer groups instead
# of '> 75 anos' -- confirm against the CSV headers.
mask = age_df['> 75 anos'].isna()
age_df.loc[mask, '> 75 anos'] = age_df.loc[mask, '75-84 anos'] + age_df.loc[mask, '≥ 85 anos']

# Plot every age group over time (the two finer groups are kept alongside the
# recomputed '> 75 anos' column).
age_df[['< 1 ano', '1-4 anos', '5-14 anos', '15-24 anos', '25-34 anos', '35-44 anos', '45-54 anos',
        '55-64 anos', '65-74 anos', '> 75 anos', '75-84 anos', '≥ 85 anos', 'date']].set_index('date').plot(figsize=(30, 5))

Let's:
 * add the total number of deaths
 * join the data from the cause dataframe with the age dataframe

In [None]:
# Inner-join the cause and age tables on their shared 'date' column
df = pd.merge(cause_df, age_df, on='date')

# Total deaths per day: the three cause categories added together
# (a missing value in any category leaves the total missing as well)
df['total'] = (
    df['Morte natural']
    .add(df['Causa externa'])
    .add(df['Sujeito a investigação'])
)
df.tail()

Plot the daily total and the > 75 age group over time (logarithmic y-axis)

In [None]:
# Log-scale time series of the oldest age group against the overall total
(
    df
    .set_index('date')
    [['> 75 anos', 'total']]
    .plot(logy=True, figsize=(30, 10))
)

## Joining the temperature (from Lisbon)

Now, we are going to join the temperature from Lisbon to the mortality data and check for correlations

In [None]:
# Read the Lisbon weather file. YEARMODA holds the day written as YYYYMMDD, so
# it is read as text and converted with an explicit format. This replaces
# read_csv(date_parser=...), which is deprecated since pandas 2.0.
temp_df = pd.read_csv('data/portugal_meteo/meteo_lx.csv', dtype={'YEARMODA': str})
temp_df['YEARMODA'] = pd.to_datetime(temp_df['YEARMODA'], format='%Y%m%d')
temp_df.tail()

In [None]:
temp_df.info()

In [None]:
def F2C(d):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    ``d`` may be a scalar or anything that supports element-wise arithmetic
    with floats (e.g. a pandas Series); the result has the same shape.
    """
    freezing_point_f = 32.
    celsius_per_fahrenheit = 5. / 9.
    return (d - freezing_point_f) * celsius_per_fahrenheit

# Keep only what the analysis needs: the date, the daily max/min temperature
# passed through F2C (Fahrenheit -> Celsius), and a flag that is True when the
# recorded precipitation is above zero.
meteo_df = pd.DataFrame({
    'date': temp_df['YEARMODA'],
    'max': F2C(temp_df['MAX']),
    'min': F2C(temp_df['MIN']),
    'rained': temp_df['PRCP'] > 0,
})

In [None]:
meteo_df.describe()

Are there strange values? Maybe in the MAX temperature!?

In [None]:
# Rank by maximum temperature and show only the top rows: any suspicious
# readings sit at the head of the ranking, so the full frame is not needed.
meteo_df.sort_values(by='max', ascending=False).head(10)

Which rows are problematic?

In [None]:
# Index labels of the rows whose maximum temperature is above 50 (degrees C).
# NOTE(review): 50 is used as a plausibility ceiling; presumably the offending
# rows carry a missing-value sentinel from the source file -- confirm.
problematic_idx = meteo_df.loc[meteo_df['max'] > 50].index
problematic_idx

In [None]:
# Remove the rows listed in problematic_idx. errors='ignore' keeps the cell
# re-runnable: on a second run the labels are already gone, and a plain drop
# would raise KeyError.
meteo_df = meteo_df.drop(index=problematic_idx, errors='ignore')

In [None]:
# Attach the weather columns to the mortality data (inner join on 'date').
# NOTE: df is overwritten with the merged result, so re-running only this cell
# merges the weather columns a second time.
df = pd.merge(df, meteo_df, on='date')
df.tail()

In [None]:

# Raw daily deaths and max/min temperature on one shared axis
df.loc[:, ['total', 'max', 'min']].plot()

Maybe it is better if the data is normalized. Furthermore, it seems that high temperatures are the worst, so let's plot only the maximum temperature

In [None]:
# Scale each series by its own maximum so that its largest value becomes 1
df['total_n'] = df['total'].div(df['total'].max())
df['max_n'] = df['max'].div(df['max'].max())

df.loc[:, ['total_n', 'max_n']].plot(figsize=(25, 10))

In [None]:
# One point per day: maximum temperature against total deaths
df.plot.scatter(x='max', y='total', figsize=(15, 10))

## Group data
To group the data by month, we can start by adding new columns with the year and month

In [None]:
# Split the date into calendar components to group by later
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
# pandas numbers the weekdays from Monday=0 to Sunday=6
df['day_week'] = df['date'].dt.dayofweek

In [None]:
# Average of every column for each day of the week
df.groupby('day_week').mean()

We can see that apparently the day of the week has almost no influence (look at the scale!)

In [None]:
# Mean total deaths per day of the week
df.groupby('day_week')[['total']].mean().plot(figsize=(20, 5))

In [None]:
# Mean deaths per day of the week, one line per age group
(
    df
    .groupby('day_week')[[
        '< 1 ano', '1-4 anos',
        '5-14 anos', '15-24 anos', '25-34 anos', '35-44 anos', '45-54 anos', '55-64 anos',
        '65-74 anos', '> 75 anos', 'Desconhecido',
    ]]
    .mean()
    .plot(figsize=(20, 5))
)

What happens if we look at the number of deaths by month?

In [None]:
# Mean total deaths per calendar month
df.groupby('month')[['total']].mean()

Maybe there is something...?

In [None]:
# Same monthly averages, drawn as a bar chart
df.groupby('month')[['total']].mean().plot(kind='bar')

Does the rain have anything to do with it?

In [None]:
# Average of every column, split by whether it rained that day
df.groupby('rained').mean()

In [None]:
# Mean total deaths on dry vs. rainy days. 'total' is selected before
# aggregating so the other columns are not averaged just to be discarded.
df.groupby('rained')['total'].mean().plot(kind='bar')