In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn import preprocessing
import seaborn as sns

In [2]:
# Locations of the two semicolon-delimited inputs: the economy table is
# loaded into `X`, the health-and-poverty table into `y`.
path_data = 'data/Economy_Data.csv'
path_target = 'data/HealthAndPoverty_Data.csv'

X, y = (pd.read_csv(csv_path, sep=';') for csv_path in (path_data, path_target))

# Limpando dados

In [3]:
# Keep only the rows that carry a measurement in 'Value'
# (other columns are not checked for missing data).
X = X.loc[X['Value'].notna()]
y = y.loc[y['Value'].notna()]

In [4]:
# Preview which indicator series the remaining rows belong to.
X.loc[:, 'SeriesName']

5        Adjusted net national income (constant 2010 US$)
7        Adjusted net national income (constant 2010 US$)
10             Adjusted net national income (current US$)
11             Adjusted net national income (current US$)
12             Adjusted net national income (current US$)
                               ...                       
86490                Use of IMF credit (DOD, current US$)
86491                Use of IMF credit (DOD, current US$)
86492                Use of IMF credit (DOD, current US$)
86493                Use of IMF credit (DOD, current US$)
86494                Use of IMF credit (DOD, current US$)
Name: SeriesName, Length: 60348, dtype: object

## Primeira ideia:
* Feature selection (ver a coluna `SeriesName`, para depois agrupar por ano)

## Nova ideia:
* Utilizar os dados em X pra prever cada `SeriesName` de forma separada.
* Aparentemente há poucos em `y`


## ATENÇÃO
Adicionei abaixo uma transformação das features em one-hot, mas ainda não sei como combinar todas no `agg` (talvez eu mantenha em `sum`).

In [5]:
# Expand 'SeriesName' into one indicator column per series and replace the
# text column with those indicators (joined back on the row index).
# NOTE(review): this rebinds `X` without 'SeriesName'; cells that later filter
# by series name need the frame as it was before this step.
one_hot = pd.get_dummies(X['SeriesName'])
X = X.drop(columns=['SeriesName']).join(one_hot)

In [6]:
# Disabled experiment: the same one-hot expansion applied to the target frame `y`.
# NOTE(review): presumably left off because a later cell still filters `y` by
# 'SeriesName' - confirm, then delete this cell if it is no longer needed.
# one_hot = pd.get_dummies(y['SeriesName'])
# y.drop('SeriesName', axis = 1, inplace=True)
# y = y.join(one_hot)

In [7]:
# First five rows after the one-hot expansion.
X.head(5)

Unnamed: 0,SeriesCode,CountryName,CountryCode,Year,Value,Adjusted net national income (annual % growth),Adjusted net national income (constant 2010 US$),Adjusted net national income (current US$),Adjusted net national income per capita (annual % growth),Adjusted net national income per capita (constant 2010 US$),...,"Total reserves (includes gold, current US$)",Total reserves in months of imports,Total reserves minus gold (current US$),Trade (% of GDP),Trade in services (% of GDP),"Transport services (% of service exports, BoP)","Transport services (% of service imports, BoP)","Travel services (% of service exports, BoP)","Travel services (% of service imports, BoP)","Use of IMF credit (DOD, current US$)"
5,NY.ADJ.NNTY.KD,Brazil,BRA,1970.0,391897400000.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,NY.ADJ.NNTY.KD,India,IND,1970.0,191533500000.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,NY.ADJ.NNTY.CD,Brazil,BRA,1970.0,37860210000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
11,NY.ADJ.NNTY.CD,China,CHN,1970.0,85255610000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
12,NY.ADJ.NNTY.CD,India,IND,1970.0,57767200000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


>- Como ideia aqui sugiro contar o número de análises que foram aplicadas aos países. Contar a incidência de dummies para cada ano, e a partir daí ver a evolução das análises empregadas ao longo do tempo.
>- Outra ideia seria montar um dataframe único onde são agregados os dados de `X` e `y`, cruzando índices históricos de cada nação a fim de definir possíveis relações entre as variáveis.

In [8]:
# Per-year mean of every numeric column, restricted to Brazil.
# For the one-hot columns of `X` the mean is the share of that year's rows that
# belong to each series.
# numeric_only=True is required because both frames still contain text columns
# (series code, country name/code); without it pandas >= 2.0 raises a TypeError
# instead of silently dropping them as older versions did.
X_by_year = (
    X.loc[X['CountryName'] == 'Brazil']
    .groupby('Year')
    .mean(numeric_only=True)
    .reset_index()
)
y_by_year = (
    y.loc[y['CountryName'] == 'Brazil']
    .groupby('Year')
    .mean(numeric_only=True)
    .reset_index()
)

In [9]:
# First five yearly rows: 'Year', mean 'Value', then the per-series shares.
X_by_year.head(5)

Unnamed: 0,Year,Value,Adjusted net national income (annual % growth),Adjusted net national income (constant 2010 US$),Adjusted net national income (current US$),Adjusted net national income per capita (annual % growth),Adjusted net national income per capita (constant 2010 US$),Adjusted net national income per capita (current US$),"Adjusted net savings, excluding particulate emission damage (% of GNI)","Adjusted net savings, excluding particulate emission damage (current US$)",...,"Total reserves (includes gold, current US$)",Total reserves in months of imports,Total reserves minus gold (current US$),Trade (% of GDP),Trade in services (% of GDP),"Transport services (% of service exports, BoP)","Transport services (% of service imports, BoP)","Travel services (% of service exports, BoP)","Travel services (% of service imports, BoP)","Use of IMF credit (DOD, current US$)"
0,1970.0,43882010000.0,0.0,0.005319,0.005319,0.0,0.005319,0.005319,0.0,0.0,...,0.005319,0.0,0.005319,0.005319,0.0,0.0,0.0,0.0,0.0,0.005319
1,1971.0,48012890000.0,0.005181,0.005181,0.005181,0.005181,0.005181,0.005181,0.0,0.0,...,0.005181,0.0,0.005181,0.005181,0.0,0.0,0.0,0.0,0.0,0.005181
2,1972.0,53409860000.0,0.005102,0.005102,0.005102,0.005102,0.005102,0.005102,0.0,0.0,...,0.005102,0.0,0.005102,0.005102,0.0,0.0,0.0,0.0,0.0,0.005102
3,1973.0,61581970000.0,0.005128,0.005128,0.005128,0.005128,0.005128,0.005128,0.0,0.0,...,0.005128,0.0,0.005128,0.005128,0.0,0.0,0.0,0.0,0.0,0.005128
4,1974.0,67995760000.0,0.005102,0.005102,0.005102,0.005102,0.005102,0.005102,0.0,0.0,...,0.005102,0.0,0.005102,0.005102,0.0,0.0,0.0,0.0,0.0,0.005102


In [10]:
# Turn the per-series shares (every column after 'Year' and 'Value') into
# presence flags: 1 when the share is positive, i.e. the series has at least
# one observation in that year, otherwise 0.
# A vectorised comparison replaces the element-wise `applymap`, which is
# deprecated in recent pandas; NaN compares as False and therefore maps to 0,
# exactly like the original lambda.
indicator_cols = X_by_year.columns[2:]
X_by_year[indicator_cols] = (X_by_year[indicator_cols] > 0).astype(int)

In [11]:
# First five yearly rows after converting the shares into 0/1 flags.
X_by_year.head(5)

Unnamed: 0,Year,Value,Adjusted net national income (annual % growth),Adjusted net national income (constant 2010 US$),Adjusted net national income (current US$),Adjusted net national income per capita (annual % growth),Adjusted net national income per capita (constant 2010 US$),Adjusted net national income per capita (current US$),"Adjusted net savings, excluding particulate emission damage (% of GNI)","Adjusted net savings, excluding particulate emission damage (current US$)",...,"Total reserves (includes gold, current US$)",Total reserves in months of imports,Total reserves minus gold (current US$),Trade (% of GDP),Trade in services (% of GDP),"Transport services (% of service exports, BoP)","Transport services (% of service imports, BoP)","Travel services (% of service exports, BoP)","Travel services (% of service imports, BoP)","Use of IMF credit (DOD, current US$)"
0,1970.0,43882010000.0,0,1,1,0,1,1,0,0,...,1,0,1,1,0,0,0,0,0,1
1,1971.0,48012890000.0,1,1,1,1,1,1,0,0,...,1,0,1,1,0,0,0,0,0,1
2,1972.0,53409860000.0,1,1,1,1,1,1,0,0,...,1,0,1,1,0,0,0,0,0,1
3,1973.0,61581970000.0,1,1,1,1,1,1,0,0,...,1,0,1,1,0,0,0,0,0,1
4,1974.0,67995760000.0,1,1,1,1,1,1,0,0,...,1,0,1,1,0,0,0,0,0,1


In [12]:

# Target series: adolescent fertility rate, averaged per country and year.
# numeric_only=True keeps the aggregation working on pandas >= 2.0, where
# averaging the remaining text columns (series name/code, country code) raises
# a TypeError instead of dropping them.
y_target = (
    y.loc[y['SeriesName'] == 'Adolescent fertility rate (births per 1,000 women ages 15-19)']
    .groupby(['CountryName', 'Year'])
    .mean(numeric_only=True)
    .reset_index()
)



In [13]:
# Join Brazil's target series with the yearly economy features on 'Year'.
# Both sides have a 'Value' column, so the merge renames them with the default
# suffixes: 'Value_x' comes from `y_target` (left), 'Value_y' from `X_by_year`
# (right).
data = y_target[y_target['CountryName'] == 'Brazil'].merge(X_by_year, on='Year')

# Select by column name rather than by position. The positional version
# (`iloc[:, 3]` as target, `iloc[:, 2]` copied into the features) used the
# economy mean as the target and leaked the health series into the features -
# assuming `y_target` has the columns CountryName, Year, Value; TODO confirm.
y = data['Value_x']
# Features: every series presence flag plus the mean economy value. Building a
# new frame (instead of assigning a column on a slice of `data`) also avoids
# pandas' SettingWithCopyWarning.
X = (
    data
    .drop(columns=['CountryName', 'Year', 'Value_x'])
    .rename(columns={'Value_y': 'Value'})
)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import auc
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor as RF

try:
    # These helpers were removed in scikit-learn 1.2; nothing in this cell uses
    # them, so a missing import must not stop the cell from running.
    from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve
except ImportError:
    pass

# One seed for the split, the forest and the search so a rerun reproduces the
# same result.
SEED = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Result accumulators. Only `feature_importances` is filled in this cell; the
# others are kept because cells outside this view may still refer to them.
tprs = []
aucs = []
score = []
ap = []
prcs = []
feature_importances = []
mean_fpr = np.linspace(0, 1, 100)

# Number of hyper-parameter combinations sampled by the random search.
n = 10
# NOTE(review): the importances printed below are all zero. Large
# `min_samples_split` values combined with small `max_samples` fractions may
# leave no node with enough rows to split on this small yearly table - confirm
# and narrow the ranges if so.
param_grid = {
    'n_estimators': list(range(2, 100)),
    'min_samples_split': list(range(8, 51)),
    'min_samples_leaf': list(range(1, 31)),
    'max_depth': list(range(2, 20)),
    'max_samples': [round(0.1 * i, 2) for i in range(1, 10)],
}

random_grid_search = RandomizedSearchCV(
    # The forest gets its own seed; previously only the search was seeded, so
    # repeated runs could still differ.
    estimator=RF(max_features='sqrt', random_state=SEED),
    param_distributions=param_grid,
    n_iter=n,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
    random_state=SEED,
)
random_grid_search.fit(X_train, y_train)

feature_importances.append(random_grid_search.best_estimator_.feature_importances_)

    


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [15]:
# Importances of the forest refitted with the best sampled hyper-parameters.
best_forest = random_grid_search.best_estimator_
best_forest.feature_importances_

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

## Resultados
* O regressor ficou horrível dessa forma (talvez tenhamos que fazer de maneira mais simples), e.g.:
    - selecionar um `SeriesName` de X e um de y e criar uma hipótese em cima

In [5]:
# Brazil-only economy observations with a non-missing 'Value'.
# Rebuilt from the CSV rather than from `X`: on a top-to-bottom run `X` has
# already been rebound to the model feature matrix, which has neither
# 'CountryCode' nor 'SeriesName', so filtering it here raises a KeyError.
economy_raw = pd.read_csv(path_data, sep=';')
X_br = economy_raw[economy_raw['Value'].notna() & (economy_raw['CountryCode'] == 'BRA')].copy()

In [6]:
# Brazil-only health/poverty observations with a non-missing 'Value'.
# Rebuilt from the CSV rather than from `y`: on a top-to-bottom run `y` has
# already been rebound to the model target column, so it can no longer be
# filtered by 'CountryCode'.
health_raw = pd.read_csv(path_target, sep=';')
y_br = health_raw[health_raw['Value'].notna() & (health_raw['CountryCode'] == 'BRA')].copy()

In [7]:
# Display the Brazil-only slice of the target table (pandas truncates the repr).
y_br

Unnamed: 0,SeriesName,SeriesCode,CountryName,CountryCode,Year,Value
0,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,Brazil,BRA,1970.0,77.118400
15,Age dependency ratio (% of working-age populat...,SP.POP.DPND,Brazil,BRA,1970.0,83.980449
20,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,Brazil,BRA,1970.0,6.318839
25,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,Brazil,BRA,1970.0,77.661610
55,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,Brazil,BRA,1970.0,35.116000
...,...,...,...,...,...,...
69210,"Prevalence of HIV, total (% of population ages...",SH.DYN.AIDS.ZS,Brazil,BRA,2019.0,0.500000
69305,Probability of dying among adolescents ages 10...,SH.DYN.1014,Brazil,BRA,2019.0,1.400000
69310,Probability of dying among adolescents ages 15...,SH.DYN.1519,Brazil,BRA,2019.0,5.800000
69315,Probability of dying among children ages 5-9 y...,SH.DYN.0509,Brazil,BRA,2019.0,1.000000


In [8]:
# Number of yearly observations available per series for Brazil:
# `test` counts the target (health) series, `test2` the economy series.
test, test2 = (frame['SeriesName'].value_counts() for frame in (y_br, X_br))

In [42]:
# Target series that have exactly 20 observations for Brazil.
test.loc[test.eq(20)].index

Index(['Antiretroviral therapy coverage (% of people living with HIV)', 'Sex ratio at birth (male births per female births)'], dtype='object')

In [40]:
# Economy series that have at least 50 observations for Brazil.
test2.loc[test2.ge(50)].index

Index(['Gross capital formation (current US$)',
       'Total reserves (includes gold, current US$)',
       'Gross national expenditure (% of GDP)',
       'Changes in inventories (constant LCU)', 'GDP (current LCU)',
       'Households and NPISHs Final consumption expenditure (current LCU)',
       'Total reserves minus gold (current US$)',
       'Gross capital formation (% of GDP)',
       'Net primary income (Net income from abroad) (current US$)',
       'Final consumption expenditure (constant 2010 US$)',
       ...
       'Final consumption expenditure (current US$)',
       'Final consumption expenditure (current LCU)',
       'Multilateral debt service (% of public and publicly guaranteed debt service)',
       'Net financial flows, IBRD (NFL, current US$)',
       'External balance on goods and services (% of GDP)',
       'Gross fixed capital formation (current US$)',
       'Gross fixed capital formation (constant LCU)',
       'Gross value added at basic prices (GVA) (cur

In [12]:
# Boolean row mask over `y_br` marking the antiretroviral-coverage series.
# NOTE(review): unlike the neighbouring cells this stores the mask itself, not
# the filtered rows - confirm that is what later code expects.
cobert_hiv_treatment = y_br['SeriesName'].eq('Antiretroviral therapy coverage (% of people living with HIV)')

In [25]:
# Brazil's gross value added series (current US$).
# .copy() makes `gva` an independent frame, so adding the normalised column to
# it later does not trigger pandas' SettingWithCopyWarning; this also matches
# how `infant_deaths` is built.
gva = X_br[X_br['SeriesName'] == 'Gross value added at basic prices (GVA) (current US$)'].copy()


In [27]:
# First five yearly GVA observations.
gva.head(5)

Unnamed: 0,SeriesName,SeriesCode,CountryName,CountryCode,Year,Value
775,Gross value added at basic prices (GVA) (curre...,NY.GDP.FCST.CD,Brazil,BRA,1970.0,35567520000.0
2505,Gross value added at basic prices (GVA) (curre...,NY.GDP.FCST.CD,Brazil,BRA,1971.0,42144480000.0
4235,Gross value added at basic prices (GVA) (curre...,NY.GDP.FCST.CD,Brazil,BRA,1972.0,49863910000.0
5965,Gross value added at basic prices (GVA) (curre...,NY.GDP.FCST.CD,Brazil,BRA,1973.0,67933200000.0
7695,Gross value added at basic prices (GVA) (curre...,NY.GDP.FCST.CD,Brazil,BRA,1974.0,91373940000.0


In [13]:
# Brazil's 'Number of infant deaths' series, copied so it can be extended safely.
is_infant_deaths = y_br['SeriesName'].eq('Number of infant deaths')
infant_deaths = y_br.loc[is_infant_deaths].copy()

In [14]:
# Min-max scaler with its default output range, spelled out explicitly.
norm = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [33]:
# Scale each series to [0, 1] on its own. `fit_transform` refits the shared
# scaler every time, so the two series do not influence each other; the scaler
# expects a 2-D input, hence the single-column reshape.
deaths_column = infant_deaths['Value'].to_numpy().reshape(-1, 1)
infant_deaths_norm = norm.fit_transform(deaths_column)

gva_column = gva['Value'].to_numpy().reshape(-1, 1)
gva_norm = norm.fit_transform(gva_column)

In [34]:
# Attach the scaled values as a 'norm' column.
# `assign` returns a new frame instead of writing into the existing object, so
# no SettingWithCopyWarning is raised even when `gva` is a filtered slice of
# another frame; `ravel()` flattens the (n, 1) scaler output to one value per row.
infant_deaths = infant_deaths.assign(norm=infant_deaths_norm.ravel())
gva = gva.assign(norm=gva_norm.ravel())



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [17]:
# First five rows, now including the normalised column.
infant_deaths.head(5)

Unnamed: 0,SeriesName,SeriesCode,CountryName,CountryCode,Year,Value,norm
580,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1970.0,333509.0,1.0
1970,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1971.0,327356.0,0.979315
3360,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1972.0,321921.0,0.961043
4750,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1973.0,317374.0,0.945757
6140,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1974.0,313484.0,0.93268


In [18]:
# `gni` is not defined by any visible cell, so this cell fails with a NameError
# on a fresh kernel. Rebuild it from the series shown in this cell's saved
# output.
# NOTE(review): that series is gross national *expenditure* (% of GDP), which
# the name `gni` does not suggest - confirm the intended series.
gni = X_br[X_br['SeriesName'] == 'Gross national expenditure (% of GDP)'].copy()
gni.head()

Unnamed: 0,SeriesName,SeriesCode,CountryName,CountryCode,Year,Value
710,Gross national expenditure (% of GDP),NE.DAB.TOTL.ZS,Brazil,BRA,1970.0,100.420317
2440,Gross national expenditure (% of GDP),NE.DAB.TOTL.ZS,Brazil,BRA,1971.0,101.724673
4170,Gross national expenditure (% of GDP),NE.DAB.TOTL.ZS,Brazil,BRA,1972.0,101.584894
5900,Gross national expenditure (% of GDP),NE.DAB.TOTL.ZS,Brazil,BRA,1973.0,101.229908
7630,Gross national expenditure (% of GDP),NE.DAB.TOTL.ZS,Brazil,BRA,1974.0,105.870612


In [19]:
# Same preview as before: first five rows of the infant-deaths series.
infant_deaths.head(5)

Unnamed: 0,SeriesName,SeriesCode,CountryName,CountryCode,Year,Value,norm
580,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1970.0,333509.0,1.0
1970,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1971.0,327356.0,0.979315
3360,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1972.0,321921.0,0.961043
4750,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1973.0,317374.0,0.945757
6140,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1974.0,313484.0,0.93268


In [35]:
# Compare the two min-max normalised series year by year as bars.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=infant_deaths['Year'],
        y=infant_deaths['norm'],
        name='infant_deaths',
        hovertemplate="Deaths: %{y}<extra></extra>",
    )
)
fig.add_trace(go.Bar(x=gva['Year'], y=gva['norm'], name='GVA'))
# Title and axis labels so the figure can be read on its own; the y axis shows
# scaled values, not raw counts or US$. `update_layout` returns the figure, so
# it is still what the cell displays.
fig.update_layout(
    title='Brazil: infant deaths vs. gross value added (min-max normalized)',
    xaxis_title='Year',
    yaxis_title='Normalized value (0-1)',
)

Unnamed: 0,SeriesName,SeriesCode,CountryName,CountryCode,Year,Value,norm
580,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1970.0,333509.0,1.0
1970,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1971.0,327356.0,0.979315
3360,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1972.0,321921.0,0.961043
4750,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1973.0,317374.0,0.945757
6140,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1974.0,313484.0,0.93268
7530,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1975.0,310366.0,0.922197
8920,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1976.0,307526.0,0.91265
10310,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1977.0,304396.0,0.902127
11700,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1978.0,300558.0,0.889225
13090,Number of infant deaths,SH.DTH.IMRT,Brazil,BRA,1979.0,295510.0,0.872254
