In [1]:
import duckdb
import shutil
import os, sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import utils 
import warnings
warnings.filterwarnings('ignore')

## Join two sources into one table

In [2]:

nationalites = utils.DBtable_to_df('../integration.duckdb', 'nationalitiesClean_Madrid')
households = utils.DBtable_to_df('../integration.duckdb','householdClean_Madrid')

result = households.merge(nationalites, how='inner', on=["Madrid_section","Year"])

utils.df_to_DBtable('../integration.duckdb',result, 'integratedTable')

### Test

In [3]:
# check if all tables in integration.duckdb are created
utils.get_tables('../integration.duckdb')

0    nationalitiesClean_Madrid
1        householdClean_Madrid
2              integratedTable
Name: table_name, dtype: object

## Removing redundacies from integrated table

### Removing columns with zeros in all rows

In [4]:
integratedDF = utils.DBtable_to_df('../integration.duckdb', 'integratedTable')
for (columnName, columnData) in integratedDF.items():
    if(integratedDF[columnName] == 0).all():
        integratedDF.drop(columnName, axis=1, inplace=True)
        print(columnName)
display(integratedDF)


Brunei
Maldivas
Santa_Sede
Swazilandia
Vanuatu


Unnamed: 0,Madrid_section,single_women_aged_16_to_64,single_men_aged_16_to_64,single_women_aged_65_or_over,single_men_aged_65_or_over,adult_women_with_one_or_more_minors,adult_men_with_one_or_more_minors,two_adults_from_16_to_64_and_without_minors,two_adults_one_at_least_65_and_without_minors,two_adults_and_one_minor,...,Túnez,Ucrania,Uganda,Uruguay,Uzbekistán,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
0,079601001,72.0,86.0,63.0,16.0,10.0,4.0,82.0,63.0,20.0,...,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0
1,079601002,100.0,91.0,36.0,19.0,7.0,0.0,89.0,28.0,13.0,...,0.0,0.0,0.0,3.0,0.0,9.0,0.0,0.0,0.0,0.0
2,079601003,154.0,179.0,74.0,40.0,13.0,2.0,141.0,77.0,23.0,...,0.0,6.0,0.0,5.0,0.0,21.0,0.0,0.0,0.0,0.0
3,079601004,92.0,95.0,53.0,17.0,10.0,2.0,100.0,64.0,20.0,...,0.0,0.0,1.0,2.0,0.0,14.0,0.0,0.0,0.0,0.0
4,079601006,140.0,179.0,77.0,38.0,14.0,1.0,167.0,78.0,31.0,...,0.0,5.0,0.0,2.0,0.0,13.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7324,079621029,35.0,33.0,7.0,5.0,21.0,9.0,36.0,24.0,58.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
7325,079621030,33.0,59.0,26.0,8.0,16.0,4.0,70.0,49.0,49.0,...,0.0,6.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
7326,079621031,99.0,94.0,14.0,7.0,41.0,8.0,155.0,35.0,164.0,...,0.0,4.0,0.0,0.0,0.0,36.0,0.0,1.0,0.0,0.0
7327,079621032,72.0,67.0,18.0,6.0,33.0,13.0,94.0,25.0,62.0,...,0.0,0.0,0.0,2.0,0.0,6.0,0.0,0.0,0.0,0.0


### Checking if there is section code with all zeros in remaining columns

In [5]:

display(integratedDF[integratedDF.columns[1:]].loc[(integratedDF[integratedDF.columns[1:]]== 0).all(axis=1)])
# none are found

Unnamed: 0,single_women_aged_16_to_64,single_men_aged_16_to_64,single_women_aged_65_or_over,single_men_aged_65_or_over,adult_women_with_one_or_more_minors,adult_men_with_one_or_more_minors,two_adults_from_16_to_64_and_without_minors,two_adults_one_at_least_65_and_without_minors,two_adults_and_one_minor,two_adults_and_two_minors,...,Túnez,Ucrania,Uganda,Uruguay,Uzbekistán,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe


# Feature engineering

Groupping nationalities that have less than 0.5% of the total population (from 2018 to 2020) in a feature called Others.

In [6]:
cols = list(range(22, integratedDF.shape[1]))
cols[0] =  integratedDF.columns.get_loc('Madrid_section') # nationalities - year- madrid section - extranjeros - total nacionalidades

nat = integratedDF.iloc[:,cols]
nat.set_index('Madrid_section', inplace = True)
nat.astype('int64')

Unnamed: 0_level_0,Year,Afganistán,Albania,Alemania,Andorra,Angola,Antigua_y_Barbuda,Arabia_Saudita,Argelia,Argentina,...,Túnez,Ucrania,Uganda,Uruguay,Uzbekistán,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
Madrid_section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
079601001,2018,0,0,2,0,0,0,0,0,2,...,0,0,0,0,0,6,0,0,0,0
079601002,2018,0,0,3,0,0,0,0,0,5,...,0,0,0,3,0,9,0,0,0,0
079601003,2018,0,0,5,0,0,0,0,2,15,...,0,6,0,5,0,21,0,0,0,0
079601004,2018,0,0,7,0,0,0,0,1,3,...,0,0,1,2,0,14,0,0,0,0
079601006,2018,0,0,18,0,1,0,0,0,8,...,0,5,0,2,0,13,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
079621029,2019,0,0,3,0,0,0,0,0,4,...,0,0,0,0,0,3,0,0,0,0
079621030,2019,0,0,2,0,0,0,0,0,2,...,0,6,0,0,0,10,0,0,0,0
079621031,2019,0,0,3,0,0,0,0,0,3,...,0,4,0,0,0,36,0,1,0,0
079621032,2019,0,0,5,0,0,0,0,0,2,...,0,0,0,2,0,6,0,0,0,0


In [7]:
nat.loc['Total (%)'] = round((nat.iloc[:,1:].sum()/(nat.iloc[:,1:].sum()).sum()) * 100,3)

natDef = (nat.loc[:, nat.loc['Total (%)'] >= 0.5])
natDef['Others'] = (nat.loc[:, nat.loc['Total (%)'] < 0.5]).sum(axis = 1)
natDef['Year'] = nat.Year

natDef.drop('Total (%)', inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nat.loc['Total (%)'] = round((nat.iloc[:,1:].sum()/(nat.iloc[:,1:].sum()).sum()) * 100,3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  natDef['Others'] = (nat.loc[:, nat.loc['Total (%)'] < 0.5]).sum(axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  natDef['Year'] = nat.Year
A value is trying to be set on a copy of a slice from a DataFrame

See t

In [8]:
integratedDF = households.merge(natDef, how='inner', on=["Madrid_section","Year"])

### Updating integrated table 

In [9]:
utils.df_to_DBtable('../integration.duckdb',integratedDF, 'integratedTable')

## Saving copy of integrated table to analysis folder, together with utils needed for analysis

In [10]:
shutil.copy("../integration.duckdb","../../2.Data Analysis Backbone/")
print('Copied')

Copied
