# Assessing BBS data: Provided in WB1 folder

- This notebook is created to assess the quality and type of data provided by Bangladesh Bureau of Statistics. 
- This specific notebook is used to assess data as provided in the form of "WB1" directory [data/bbs/census2011_svrs_first_sample/wb1].
- There were specified file output.

## Getting necessary packages

In [2]:
import os
import glob
import tzlocal
import numpy as np
import pandas as pd
from scipy import io
from dbfread import DBF
import matplotlib.pyplot as plt
import rpy2.robjects as robjects
from rpy2.robjects import packages
from rpy2.robjects import pandas2ri

## Functions

In [3]:
def import_stata(path):
    """
    path: directory where STATA files are stored
    pattern: file extension pattern
    returns list of pandas dataframes
    """
    stata_files = glob.glob(os.path.join(path, '*.dta'))
    output = dict()
    for file in stata_files:
        data = pd.read_stata(file)
        output[file] = data
    return output
def import_sav(file):
    """
    file: full path to a SPSS file to be imported
    """
    foreign = packages.importr('foreign')
    pandas2ri.activate()
    df = foreign.read_spss(file, reencode=False)
    return pd.DataFrame(dict(zip(df.names, map(list,list(df)))))

def get_names(dbf_data):
    variable_names = list()
    tafsil_names = list()
    for num, names in enumerate(dbf_data):
        variable_names = variable_names + list(dbf_data[names].columns)
        tafsil_names = tafsil_names + ([names] * len(list(dbf_data[names].columns)))
    return pd.DataFrame(np.column_stack([variable_names, tafsil_names]), columns = ['variables', 'dataset'])

## Importing data

### Importing STATA files from WB1 directory

In [4]:
WD = "/Users/edinhamzic/Symphony/wb_bangladesh/"
print(os.getcwd())
os.chdir()

/Users/edinhamzic/Symphony/wb_bangladesh/fe/bbs


FileNotFoundError: [Errno 2] No such file or directory: '5PCRT'

In [5]:
hous2011 = import_sav('5PCRT/hous2011.sav')
hous2011.head()

Unnamed: 0,D_R,hh_rec,division,zila,upzila,union,mauza,village,ea,rmo,...,floating,hh_type,no_house,type_hou,tenancy,dwaterso,toilet_f,electry,ethenpop,ethn_cod
0,,2,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,2.0,1.0,3.0,3.0,1.0,2.0,2.0,1.0,2.0,0.0
1,,2,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,2.0,1.0,2.0,3.0,1.0,2.0,2.0,1.0,2.0,0.0
2,,2,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,2.0,1.0,2.0,3.0,1.0,2.0,2.0,2.0,2.0,0.0
3,,2,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,2.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,0.0
4,,2,10.0,4.0,9.0,1.0,567.0,0.0,1.0,2.0,...,2.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,0.0


In [17]:
hous2011.columns

Index(['D_R', 'hh_rec', 'division', 'zila', 'upzila', 'union', 'mauza',
       'village', 'ea', 'rmo', 'hhno', 'serial_no', 'hhsize', 'floating',
       'hh_type', 'no_house', 'type_hou', 'tenancy', 'dwaterso', 'toilet_f',
       'electry', 'ethenpop', 'ethn_cod'],
      dtype='object')

In [14]:
hous2011['floating'].value_counts()

2.0    1608678
1.0        410
Name: floating, dtype: int64

In [6]:
for variable in hous2011.columns:
    print("#################################################")
    print(f"Summary for the variable: {variable}")
    print("#################################################")
    tmp = hous2011.copy(deep=True)
    print(tmp[variable].value_counts())
    print(len(tmp[variable].unique()))

#################################################
Summary for the variable: D_R
#################################################
     1609088
Name: D_R, dtype: int64
1
#################################################
Summary for the variable: hh_rec
#################################################
2    1609088
Name: hh_rec, dtype: int64
1
#################################################
Summary for the variable: division
#################################################
30.0    542628
20.0    281399
50.0    224376
55.0    190915
40.0    187025
10.0     93175
60.0     89570
Name: division, dtype: int64
7
#################################################
Summary for the variable: zila
#################################################
26.0    139359
15.0     76617
61.0     57780
19.0     52688
93.0     43513
10.0     43363
33.0     41337
85.0     36015
27.0     35798
88.0     35754
67.0     33793
41.0     32826
64.0     32795
81.0     31693
48.0     31372
32.0     30618
91.0     29812

6.027687e+11    4
6.027719e+11    3
6.027710e+11    3
6.028761e+11    3
6.027736e+11    3
6.028770e+11    3
6.027683e+11    2
6.027737e+11    2
6.025931e+11    2
6.027693e+11    2
6.027711e+11    2
6.027707e+11    2
6.028783e+11    2
6.027687e+11    2
6.027707e+11    2
6.031849e+11    2
6.027741e+11    2
6.028759e+11    2
3.002014e+11    2
6.008559e+11    2
6.028096e+11    2
6.028751e+11    2
6.028769e+11    2
6.027709e+11    2
6.027697e+11    2
6.029541e+11    2
6.027707e+11    2
6.027697e+11    2
6.027723e+11    2
6.027672e+11    2
               ..
6.008196e+11    1
6.013049e+11    1
6.011290e+11    1
6.015467e+11    1
6.021442e+11    1
6.016626e+11    1
6.012990e+11    1
6.030964e+11    1
6.030413e+11    1
1.506795e+11    1
6.007116e+11    1
6.003574e+11    1
6.008286e+11    1
1.500597e+11    1
6.003994e+11    1
6.031886e+11    1
6.000811e+11    1
6.006856e+11    1
6.010564e+11    1
6.000826e+11    1
6.011386e+11    1
6.005353e+11    1
6.014249e+11    1
6.027152e+11    1
6.030956e+

In [None]:
tmp[].value_counts()

In [16]:
popu2011.columns

Index(['D_R', 'pop_rec', 'division', 'zila', 'upzila', 'union', 'mauza',
       'village', 'ea', 'rmo', 'hhno', 'serial_no', 'mem_serial', 'age',
       'relation', 'sex', 'mar_stat', 'religion', 'disable', 'student',
       'h_class', 'edu_fild', 'literacy', 'activity', 'emply_fl'],
      dtype='object')

In [11]:
popu2011.head()

Unnamed: 0,D_R,pop_rec,division,zila,upzila,union,mauza,village,ea,rmo,...,sex,mar_stat,religion,disable,student,h_class,edu_fild,literacy,activity,emply_fl
0,,1,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,2.0,1.0,1.0,0.0,2.0,9.0,1.0,1.0,3.0,0.0
1,,1,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,1.0,2.0,1.0,0.0,1.0,10.0,1.0,1.0,4.0,0.0
2,,1,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,1.0,2.0,2.0,0.0,2.0,5.0,1.0,1.0,1.0,1.0
3,,1,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,2.0,2.0,2.0,0.0,2.0,0.0,4.0,2.0,3.0,0.0
4,,1,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,1.0,1.0,2.0,0.0,2.0,7.0,1.0,1.0,1.0,3.0


In [9]:
for variable in popu2011.columns:
    print("#################################################")
    print(f"Summary for the variable: {variable}")
    print("#################################################")
    tmp = popu2011.copy(deep=True)
    print(tmp[variable].value_counts())
    print(len(tmp[variable].unique()))

#################################################
Summary for the variable: D_R
#################################################
     7205720
Name: D_R, dtype: int64
1
#################################################
Summary for the variable: pop_rec
#################################################
1    7205720
Name: pop_rec, dtype: int64
1
#################################################
Summary for the variable: division
#################################################
30.0    2375225
20.0    1422571
50.0     924436
55.0     788027
40.0     784807
60.0     496480
10.0     414174
Name: division, dtype: int64
7
#################################################
Summary for the variable: zila
#################################################
26.0    604771
15.0    381686
19.0    269546
61.0    255902
93.0    180487
91.0    170924
10.0    170510
33.0    170413
88.0    155780
75.0    155491
27.0    149305
67.0    148185
48.0    145373
85.0    142894
12.0    142426
41.0    138874
64.0 

1.510473e+11    1694
6.000265e+11    1200
6.017988e+11     700
6.005197e+11     637
6.020937e+11     586
6.003112e+11     549
6.000351e+11     520
6.001125e+11     506
6.005729e+11     497
6.002422e+11     496
6.007508e+11     491
6.003363e+11     477
1.505044e+11     460
6.016985e+11     436
6.008699e+11     423
6.000253e+11     406
6.017143e+11     404
3.001610e+11     380
6.000306e+11     370
1.512155e+11     340
3.002573e+11     303
1.502264e+11     300
6.034957e+11     288
1.501258e+11     280
6.021137e+11     277
6.026669e+11     272
6.009047e+11     272
6.021257e+11     261
1.509229e+11     258
6.015403e+11     250
                ... 
6.026217e+11       1
6.016200e+11       1
6.025496e+11       1
1.506104e+11       1
6.009285e+11       1
6.027260e+11       1
6.015771e+11       1
6.002641e+11       1
6.034841e+11       1
1.508000e+11       1
6.027732e+11       1
6.031840e+11       1
1.506926e+11       1
6.006998e+11       1
6.009093e+11       1
6.018032e+11       1
6.001734e+11 

In [7]:
popu2011 = import_sav('5PCRT/popu2011.sav')
popu2011.head()

Unnamed: 0,D_R,pop_rec,division,zila,upzila,union,mauza,village,ea,rmo,...,sex,mar_stat,religion,disable,student,h_class,edu_fild,literacy,activity,emply_fl
0,,1,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,2.0,1.0,1.0,0.0,2.0,9.0,1.0,1.0,3.0,0.0
1,,1,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,1.0,2.0,1.0,0.0,1.0,10.0,1.0,1.0,4.0,0.0
2,,1,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,1.0,2.0,2.0,0.0,2.0,5.0,1.0,1.0,1.0,1.0
3,,1,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,2.0,2.0,2.0,0.0,2.0,0.0,4.0,2.0,3.0,0.0
4,,1,10.0,4.0,9.0,1.0,165.0,0.0,1.0,2.0,...,1.0,1.0,2.0,0.0,2.0,7.0,1.0,1.0,1.0,3.0


## Assessing the data quality, geo and variable availablility

In [17]:
stata_data.keys()
print(stata_data['wb1/HH2011.dta'].shape)
print(stata_data['wb1/POP2011.dta'].shape)

dict_keys([])

In [5]:
stata_data['wb1/HH2011.dta'].head()
len(stata_data['wb1/HH2011.dta']['ea'].unique())


100

In [6]:
stata_data['wb1/POP2011.dta'].head()

Unnamed: 0,pop_rec,division,zila,upzila,union,mauza,village,ea,rmo,hhno,...,sex,mar_stat,religion,disable,student,h_class,edu_fild,literacy,activity,emply_fl
0,1,10,4,9,1,165,0,1,2,30,...,1,1,2,0,2,7,1,1,1,3
1,1,10,4,9,1,165,0,1,2,50,...,1,2,1,0,2,5,1,1,1,1
2,1,10,4,9,1,165,0,1,2,70,...,2,2,1,0,2,0,4,2,1,1
3,1,10,4,9,1,567,0,1,2,16,...,2,2,1,0,2,15,1,1,1,1
4,1,10,4,9,1,567,0,1,2,56,...,1,1,1,0,2,0,0,0,0,0


In [8]:
stata_data['wb1/POP2011.dta'].describe()

Unnamed: 0,division,zila,upzila,union,mauza,village,ea,rmo,hhno,serial_no,...,sex,mar_stat,religion,disable,student,h_class,edu_fild,literacy,activity,emply_fl
count,1441144.0,1441144.0,1441144.0,1441144.0,1441144.0,1441144.0,1441144.0,1441144.0,1441144.0,1441144.0,...,1441144.0,1441144.0,1441144.0,1441144.0,1441144.0,1441144.0,1441144.0,1441144.0,1441144.0,1441144.0
mean,35.33226,48.49055,49.40345,44.11159,501.1679,1.546885,5.348701,1.837044,82.02562,569704600000.0,...,1.500021,1.563885,1.115607,0.04779536,1.772258,3.709634,1.729096,1.25275,2.225191,0.5618932
std,14.383,27.33173,27.00182,27.9852,285.1623,2.525678,9.190261,2.16766,448.7823,112443600000.0,...,0.5000002,0.5796373,0.4027394,0.4332478,0.4193754,4.207713,1.484373,0.7060981,1.504943,1.002492
min,10.0,1.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0,150000000000.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,26.0,27.0,19.0,255.0,1.0,1.0,1.0,27.0,600654800000.0,...,1.0,1.0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0
50%,30.0,48.0,49.0,42.0,500.0,1.0,2.0,1.0,56.0,601488800000.0,...,2.0,2.0,1.0,0.0,2.0,2.0,1.0,1.0,3.0,0.0
75%,50.0,73.0,72.0,69.0,748.0,1.0,5.0,1.0,87.0,602503800000.0,...,2.0,2.0,1.0,0.0,2.0,7.0,4.0,2.0,4.0,1.0
max,60.0,94.0,96.0,98.0,999.0,66.0,99.0,9.0,9999.0,603495800000.0,...,2.0,4.0,5.0,6.0,2.0,18.0,4.0,2.0,4.0,3.0


In [42]:
print(stata_data['wb1/POP2011.dta']['pop_rec'].astype('int').sum())
get_names(stata_data).to_csv("Census_Variables.csv")

1441144


- Number of variables
- Number of rows
- Create a hierarchical sample for Census 2010 data