# Processing SVRS data

```
!pip install --upgrade pip
!pip install requests_html
```

```
!wget -r -np -nH http://203.112.218.66/webtestapplication/userfiles/image/Census2011/
```

- SVRS 2002: .dbf extension
- SVRS 2004: .dbf extension
- SVRS 2006: .dbf extension
- SVRS 2007: .dbf extension
- SVRS 2009: .dbf extension
- SVRS 2012: .dbf extension
- SVRS 2013: .sav extension
- SVRS 2014: .sav extension
- SVRS 2015: .dbf extension
- SVRS 2017: .dta extension
- **Type of files:**
    - .dbf: used in dBase II and continued through to dBase Version IV
    - .sav: SPSS file
    - .dta: STATA file

In [3]:
import os 
import glob
import numpy as np
import pandas as pd
from simpledbf import Dbf5
import tzlocal
import rpy2.robjects as robjects
from rpy2.robjects import packages
from rpy2.robjects import pandas2ri

In [5]:
os.getcwd()
os.chdir('/Users/edinhamzic/Symphony/wb_bangladesh/')
SVRS_PATH = 'data/bbs/svrs/*'
os.getcwd()


'/Users/edinhamzic/Symphony/wb_bangladesh'

In [73]:
data_directories = glob.glob(os.path.join(SVRS_PATH), recursive=True)
data_directories

['data/bbs/svrs/SVRS_04',
 'data/bbs/svrs/SVRS_02',
 'data/bbs/svrs/raw_data',
 'data/bbs/svrs/SVRS_17',
 'data/bbs/svrs/SVRS_07',
 'data/bbs/svrs/SVRS_09',
 'data/bbs/svrs/SVRS_06',
 'data/bbs/svrs/SVRS_15',
 'data/bbs/svrs/SVRS_12',
 'data/bbs/svrs/SVRS_13',
 'data/bbs/svrs/SVRS_14']

In [25]:
def import_svrs(path):
    """
    path: directory where dbf/sav/dta files are stored
    pattern: file extension pattern
    returns list of pandas dataframes
    """
    svrs_files = list()
    svrs_files.append(glob.glob(os.path.join(path, '*.dbf')))
    svrs_files.append(glob.glob(os.path.join(path, '*.sav')))
    svrs_files.append(glob.glob(os.path.join(path, '*.dta')))
    input_files = [files for files in svrs_files if files][0]
    output = dict()
    print(all(['.dbf' in check for check in input_files]))
    if all(['.dbf' in check for check in input_files]):
        print("Input files are dbf")
        for file in input_files:
            print(file)
            df = Dbf5(file)
            df = df.to_dataframe()
            output[file] = df
    elif all(['.sav' in check for check in input_files]):
        print("Input files are sav")
        foreign = packages.importr('foreign')
        pandas2ri.activate()
        for file in input_files:
            print(file)
            df = foreign.read_spss(file, reencode=False)
            tmp_dict = dict()
            for name in list(df.names):
                tm = np.array(df.rx(str(name)))
                tmp_dict[name] = tm.flatten()
            output[file] = pd.DataFrame.from_dict(tmp_dict)            
    elif all(['.dta' in check for check in input_files]):
        print("Input files are dta")
        for file in input_files:
            print(file)
            df = pd.read_stata(file)
            output[file] = df
    return output

```
foreign = packages.importr('foreign')
pandas2ri.activate()
df = foreign.read_spss('data/bbs/svrs/SVRS_13/tafsil5_14-05-2015.sav', reencode=False)
pd.DataFrame(dict(zip(df.names, map(list,list(df)))))
```

In [26]:
SVRS_PATH = 'data/bbs/svrs/'
svrs_directories = [os.path.join(SVRS_PATH,directory) for directory in os.listdir(SVRS_PATH) if 'SVRS' in directory]
svrs_directories

['data/bbs/svrs/SVRS_04',
 'data/bbs/svrs/SVRS_02',
 'data/bbs/svrs/SVRS_17',
 'data/bbs/svrs/SVRS_07',
 'data/bbs/svrs/SVRS_09',
 'data/bbs/svrs/SVRS_06',
 'data/bbs/svrs/SVRS_15',
 'data/bbs/svrs/SVRS_12',
 'data/bbs/svrs/SVRS_13',
 'data/bbs/svrs/SVRS_14']

In [28]:
svrs_data = dict()
for directory in svrs_directories:
    print(directory)
    svrs_data[directory] = import_svrs(directory)

data/bbs/svrs/SVRS_04
True
Input files are dbf
data/bbs/svrs/SVRS_04/tafsil-2p.dbf
data/bbs/svrs/SVRS_04/tafsil-9.dbf
data/bbs/svrs/SVRS_04/tafsil-8.dbf
data/bbs/svrs/SVRS_04/tafsil-3.dbf
data/bbs/svrs/SVRS_04/tafsil-2h.dbf
data/bbs/svrs/SVRS_04/tafsil-6.dbf
data/bbs/svrs/SVRS_04/tafsil-7.dbf
data/bbs/svrs/SVRS_04/tafsil-5.dbf
data/bbs/svrs/SVRS_04/tafsil-4.dbf
data/bbs/svrs/SVRS_02
True
Input files are dbf
data/bbs/svrs/SVRS_02/tafsil-9.dbf
data/bbs/svrs/SVRS_02/tafsil-8.dbf
data/bbs/svrs/SVRS_02/ABC.dbf
data/bbs/svrs/SVRS_02/tafsl-2p.dbf
data/bbs/svrs/SVRS_02/tafsil-3.dbf
data/bbs/svrs/SVRS_02/tafsl-10.dbf
data/bbs/svrs/SVRS_02/tafsil-6.dbf
data/bbs/svrs/SVRS_02/tafsil-7.dbf
data/bbs/svrs/SVRS_02/tafsil-5.dbf
data/bbs/svrs/SVRS_02/tafsil-4.dbf
data/bbs/svrs/SVRS_02/tafsl-2h.dbf
data/bbs/svrs/SVRS_17
False
Input files are dta
data/bbs/svrs/SVRS_17/tafsil-8.dta
data/bbs/svrs/SVRS_17/tafsil-9.dta
data/bbs/svrs/SVRS_17/tafsl-2p.dta
data/bbs/svrs/SVRS_17/tafsil-3.dta
data/bbs/svrs/SVRS_17



data/bbs/svrs/SVRS_13/tafsil8_14-05-2015.sav
data/bbs/svrs/SVRS_13/tafsil9_14-05-2015.sav
data/bbs/svrs/SVRS_13/HH_POP_14-05-2015.sav
data/bbs/svrs/SVRS_13/tafsil6_14-05-2015.sav
data/bbs/svrs/SVRS_13/tafsil4_14-05-2015.sav
data/bbs/svrs/SVRS_13/tafsil10_14-05-2015.sav
data/bbs/svrs/SVRS_13/tafsil3_14-05-2015.sav
data/bbs/svrs/SVRS_13/tafsil5_14-05-2015.sav
data/bbs/svrs/SVRS_13/tafsil11_14-05-2015.sav
data/bbs/svrs/SVRS_13/pop_hh_14-05-2015.sav
data/bbs/svrs/SVRS_14
False
Input files are sav
data/bbs/svrs/SVRS_14/tafsil2P_12-06-2015.sav
data/bbs/svrs/SVRS_14/tafsil5_12-06-2015.sav
data/bbs/svrs/SVRS_14/tafsil11_12-06-2015.sav
data/bbs/svrs/SVRS_14/tafsil3_12-06-2015.sav
data/bbs/svrs/SVRS_14/tafsil4_12-06-2015.sav
data/bbs/svrs/SVRS_14/tafsil2H_12-06-2015.sav
data/bbs/svrs/SVRS_14/tafsil10_12-06-2015.sav
data/bbs/svrs/SVRS_14/tafsil6_12-06-2015.sav
data/bbs/svrs/SVRS_14/tafsil9_12-06-2015.sav
data/bbs/svrs/SVRS_14/tafsil8_12-06-2015.sav
data/bbs/svrs/SVRS_14/tafsil7_12-06-2015.sav


In [None]:
## List of variables per dataset per year
## List of shapes per dataset per year

In [31]:
svrs_data.keys()

dict_keys(['data/bbs/svrs/SVRS_04', 'data/bbs/svrs/SVRS_02', 'data/bbs/svrs/SVRS_17', 'data/bbs/svrs/SVRS_07', 'data/bbs/svrs/SVRS_09', 'data/bbs/svrs/SVRS_06', 'data/bbs/svrs/SVRS_15', 'data/bbs/svrs/SVRS_12', 'data/bbs/svrs/SVRS_13', 'data/bbs/svrs/SVRS_14'])

In [38]:
svrs_data.keys()
svrs_data[key].keys()
columns

dict_keys(['data/bbs/svrs/SVRS_04', 'data/bbs/svrs/SVRS_02', 'data/bbs/svrs/SVRS_17', 'data/bbs/svrs/SVRS_07', 'data/bbs/svrs/SVRS_09', 'data/bbs/svrs/SVRS_06', 'data/bbs/svrs/SVRS_15', 'data/bbs/svrs/SVRS_12', 'data/bbs/svrs/SVRS_13', 'data/bbs/svrs/SVRS_14'])

In [78]:
def list_svrs_variables(svrs_dict):
    dict_output = {'svrs_year': [], 'svrs_schedule': [], 'num_variables': [], 'num_rows': [], 'variables': []}
    for key_year, items_year in svrs_dict.items():
        for key_schedule, item_schedule in svrs_dict[key_year].items():
            dict_output['svrs_year'].append(key_year)
            dict_output['svrs_schedule'].append(key_schedule)
            dict_output['num_variables'].append(svrs_dict[key_year][key_schedule].shape[1])
            dict_output['num_rows'].append(svrs_dict[key_year][key_schedule].shape[0])
            dict_output['variables'].append(list(svrs_dict[key_year][key_schedule].columns))
    return pd.DataFrame.from_dict(dict_output,orient='columns')
svrs_info = list_svrs_variables(svrs_data)
svrs_info.head(100)


Unnamed: 0,svrs_year,svrs_schedule,num_variables,num_rows,variables
0,data/bbs/svrs/SVRS_04,data/bbs/svrs/SVRS_04/tafsil-2p.dbf,21,1023489,"[PSU_NO, DIV, ZILA, UPZA, UNION, MAUZA, RMO, H..."
1,data/bbs/svrs/SVRS_04,data/bbs/svrs/SVRS_04/tafsil-9.dbf,51,181190,"[PSU_NO, DUP, DIV, ZILA, UPZA, UNION, MAUZA, R..."
2,data/bbs/svrs/SVRS_04,data/bbs/svrs/SVRS_04/tafsil-8.dbf,16,34866,"[PSU_NO, ZILA, DUP, UPZA, UNION, MAUZA, RMO, H..."
3,data/bbs/svrs/SVRS_04,data/bbs/svrs/SVRS_04/tafsil-3.dbf,29,20667,"[PSU_NO, DIV, ZILA, UPZA, UNION, MAUZA, OLD_RM..."
4,data/bbs/svrs/SVRS_04,data/bbs/svrs/SVRS_04/tafsil-2h.dbf,30,218738,"[PSU_NO, DIV, ZILA, UPZA, UNION, MAUZA, RMO, H..."
5,data/bbs/svrs/SVRS_04,data/bbs/svrs/SVRS_04/tafsil-6.dbf,21,971,"[PSU_NO, ZILA, UPZA, UNION, MAUZA, RMO, HH_NO,..."
6,data/bbs/svrs/SVRS_04,data/bbs/svrs/SVRS_04/tafsil-7.dbf,16,34588,"[PSU_NO, ZILA, DUP, UPZA, UNION, MAUZA, RMO, H..."
7,data/bbs/svrs/SVRS_04,data/bbs/svrs/SVRS_04/tafsil-5.dbf,18,12889,"[PSU_NO, DIV, ZILA, UPZA, UNION, MAUZA, RMO, H..."
8,data/bbs/svrs/SVRS_04,data/bbs/svrs/SVRS_04/tafsil-4.dbf,26,5775,"[PSU_NO, DIV, ZILA, UPZA, UNION, MAUZA, OLD_RM..."
9,data/bbs/svrs/SVRS_02,data/bbs/svrs/SVRS_02/tafsil-9.dbf,53,175905,"[PSU_NO, REMARK, DIV, ZILA, UPZA, UNION, MAUZA..."


In [81]:
for key_year, items_year in svrs_data.items():
    print("#######################################################")
    print(key_year)
    for key_schedule, item_schedule in svrs_data[key_year].items():
        print("#######################################################")
        print(svrs_data[key_year][key_schedule].columns)


#######################################################
data/bbs/svrs/SVRS_04
#######################################################
Index(['PSU_NO', 'DIV', 'ZILA', 'UPZA', 'UNION', 'MAUZA', 'RMO', 'HH_NO',
       'Q_9', 'DUP', 'Q_11', 'Q_12', 'Q_13', 'Q_14', 'Q_15', 'Q_16', 'Q_17',
       'Q_18', 'Q_19', 'Q_20', 'Q_21'],
      dtype='object')
#######################################################
Index(['PSU_NO', 'DUP', 'DIV', 'ZILA', 'UPZA', 'UNION', 'MAUZA', 'RMO',
       'HH_NO', 'Q92_HAGE', 'Q92_HEDU', 'Q92_HECO', 'Q92_WAGE', 'Q92_WEDU',
       'Q92_WECO', 'Q93', 'Q94_1', 'Q94_2', 'Q94_3', 'Q94_4', 'Q94_5', 'Q94_6',
       'Q94_7', 'Q94_8', 'Q94_9', 'Q94_10', 'Q94_11', 'Q94_12', 'Q94_13',
       'Q94_14', 'Q94_15', 'Q94_88', 'Q94_99', 'Q95', 'Q96_1', 'Q96_2',
       'Q96_3', 'Q96_4', 'Q96_5', 'Q96_6', 'Q96_7', 'Q96_8', 'Q96_9', 'Q96_10',
       'Q96_11', 'Q96_12', 'Q96_13', 'Q96_14', 'Q96_15', 'Q96_88', 'Q96_99'],
      dtype='object')
###########################################

In [90]:
svrs_data['data/bbs/svrs/SVRS_02']['data/bbs/svrs/SVRS_02/tafsil-9.dbf'].head()

Unnamed: 0,PSU_NO,REMARK,DIV,ZILA,UPZA,UNION,MAUZA,RMO,SMA,HH_NO,...,Q96_8,Q96_9,Q96_10,Q96_11,Q96_12,Q96_13,Q96_14,Q96_15,Q96_88,Q96_99
0,1,*,10.0,6,2,47,367,1,,1,...,0,0,0,0,0,0,0,15,0,0
1,1,,10.0,6,2,47,367,1,,2,...,0,0,0,0,0,0,0,0,0,0
2,1,,10.0,6,2,47,367,1,,3,...,0,0,0,0,0,0,0,0,0,0
3,1,,10.0,6,2,47,367,1,,4,...,0,0,0,0,0,0,0,0,0,0
4,1,,10.0,6,2,47,367,1,,5,...,0,0,0,0,0,0,0,0,0,0


In [101]:
svrs_data['data/bbs/svrs/SVRS_02']['data/bbs/svrs/SVRS_02/tafsl-2p.dbf'].columns

Index(['PSU_NO', 'DIV', 'ZILA', 'UPZA', 'UNION', 'MAUZA', 'OLD_RMO', 'RMO',
       'HH_NO', 'POP1', 'Q_9', 'Q_11', 'Q_12', 'Q_13', 'Q_14', 'Q_15', 'Q_16',
       'Q_17', 'Q_18', 'Q_19', 'Q_20', 'Q_21', 'YEAR'],
      dtype='object')

In [103]:
svrs_data['data/bbs/svrs/SVRS_02']['data/bbs/svrs/SVRS_02/tafsl-2p.dbf']['Q_9'].value_counts()

01    206803
02    201471
03    184304
04    149071
05    101707
06     61312
07     34603
08     15921
09      9385
10      5652
11      3503
12      2239
13      1445
14       933
15       614
16       420
17       292
18       197
19       138
20        95
21        61
22        43
23        31
24        27
25        19
26         9
27         7
28         6
29         4
30         3
31         1
Name: Q_9, dtype: int64