In [15]:
import duckdb
import os, sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import utils 

# Data exploration

Groupping nationalities that have less than 0.5% of the total population (from 2018 to 2020) in a feature called Others.

In [16]:
def data_exploration(df,x = 1):
    """
    Shows the nationalities that have more than `x`% of the total population
    the rest are shown in a new feature called Others.
    """
    cols = list(range(22, df.shape[1]))
    cols[0] =  df.columns.get_loc('Madrid_section') 

    nat = df.iloc[:,cols]
    nat.set_index('Madrid_section', inplace = True)
    nat.loc['Total (%)'] = round((nat.iloc[:,1:].sum()/(nat.iloc[:,1:].sum()).sum()) * 100,3)

    dfn = (nat.loc[:, nat.loc['Total (%)'] >= x])
    dfn['Others'] = (nat.loc[:, nat.loc['Total (%)'] < x]).sum(axis = 1)

    return dfn

con = duckdb.connect('../integration.duckdb')
df = con.execute(f'SELECT * FROM integratedTable').df()
con.close()

dfn = data_exploration(df, 1)
dfn

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nat.loc['Total (%)'] = round((nat.iloc[:,1:].sum()/(nat.iloc[:,1:].sum()).sum()) * 100,3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfn['Others'] = (nat.loc[:, nat.loc['Total (%)'] < x]).sum(axis = 1)


Unnamed: 0_level_0,China,Españoles,Rumanía,Others
Madrid_section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
079601001,7.000,975.000,1.000,186.000
079601002,36.000,668.000,4.000,194.000
079601003,106.000,1265.000,9.000,365.000
079601004,0.000,1059.000,9.000,182.000
079601006,12.000,1390.000,5.000,289.000
...,...,...,...,...
079621030,3.000,1428.000,34.000,103.000
079621031,6.000,2231.000,18.000,138.000
079621032,4.000,1638.000,13.000,142.000
079621033,1.000,1797.000,11.000,188.000


In [17]:
total = dfn.iloc[:-1,:-1].sum(axis = 1)
total.describe()
dfn.describe()

Unnamed: 0,China,Españoles,Rumanía,Others
count,7330.0,7330.0,7330.0,7330.0
mean,15.981199,1149.41934,18.155574,156.690954
std,31.44113,429.087567,20.548638,106.401686
min,0.0,85.762,0.0,3.0
25%,3.0,840.25,5.0,81.0
50%,8.0,1051.0,12.0,131.0
75%,18.0,1353.0,25.0,207.0
max,412.0,4040.0,391.0,931.0


# Updating integratedTable
Feature engineering: we aggregate a foreign variable, which is the variable target. Therefore we eliminate the other nationalities, since sum(nationalities) = foreign

In [18]:
data = utils.DBtable_to_df('../integration.duckdb', 'integratedTable')
data = data.astype('int32')
data['Extrangeros'] = (data.iloc[:,24:].drop(columns=['Españoles'])).sum(axis=1)
data = data.drop(data.iloc[:,24:-1],axis = 1)
utils.df_to_DBtable('../integration.duckdb', data, 'integratedTable')

In [19]:
data

Unnamed: 0,Madrid_section,single_women_aged_16_to_64,single_men_aged_16_to_64,single_women_aged_65_or_over,single_men_aged_65_or_over,adult_women_with_one_or_more_minors,adult_men_with_one_or_more_minors,two_adults_from_16_to_64_and_without_minors,two_adults_one_at_least_65_and_without_minors,two_adults_and_one_minor,...,three_adults_and_0_or_more_minors,two_adults_over_35_and_two_adults_from_16_to_34,two_adults_over_35_and_two_adults_from_16_to_34_and_one_minor,two_adults_over_35_and_two_adults_from_16_to_34_and_two_or_more_minors,four_adults_and_0_or_more_minors,five_adults_and_0_or_more_minors,fifteen_or_more_inhabitants,only_minors,Year,Extrangeros
0,79601001,67,88,62,16,9,4,101,69,15,...,35,11,4,2,13,17,0,1,2019,194
1,79601002,99,89,36,15,5,3,84,29,11,...,41,11,1,0,13,12,0,0,2019,234
2,79601003,147,151,71,42,7,1,144,75,23,...,54,16,5,1,20,35,0,2,2019,480
3,79601004,99,95,55,20,9,1,88,71,23,...,43,14,1,0,15,17,1,0,2019,191
4,79601006,142,174,75,42,12,1,151,80,33,...,61,13,3,1,12,22,0,0,2019,306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7324,79621029,32,29,6,5,20,12,45,19,53,...,29,49,7,2,8,12,0,0,2018,74
7325,79621030,37,46,29,11,19,6,68,52,44,...,39,41,3,0,14,16,0,0,2018,140
7326,79621031,88,91,15,6,39,9,156,31,153,...,28,21,6,2,10,16,0,0,2018,162
7327,79621032,65,71,17,4,27,13,82,23,68,...,21,30,5,0,7,6,1,1,2018,159
