# dp_svrs_metadata

```
%pip install dbf
%pip install rpy2
%pip install dbfread
%pip install simpledbf
```

In [1]:
import re
import os
import numpy as np
import pandas as pd
from dbfread import DBF
from simpledbf import Dbf5
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

## To-do list:
- List all files within the directory that have a ".dbf", ".dta" or ".sav" extension
- For each file, record the file name and the folder above it (its parent folder)

In [2]:
# Locations used by this notebook; both are relative paths, so they resolve
# against the kernel's current working directory.
DATA_DIR = '../../data/bbs/svrs/'
OUT_DIR = '../../output/bbs/'
# Display the working directory so the relative paths above can be sanity-checked.
os.getcwd()

'/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/fe/bbs'

In [3]:
def read_dbf(file):
    """Load a dBASE (``.dbf``) file into a pandas DataFrame.

    ``simpledbf.Dbf5`` is tried first. If it raises for any reason, the file
    is read again with ``dbfread.DBF`` and the records are materialised into
    a DataFrame.

    Parameters
    ----------
    file : str
        Path to the ``.dbf`` file.

    Returns
    -------
    pandas.DataFrame
        The table contents as produced by whichever reader succeeded.

    Raises
    ------
    Exception
        Whatever ``dbfread.DBF`` / ``pandas.DataFrame`` raise if the fallback
        reader fails as well.
    """
    try:
        return Dbf5(file).to_dataframe()
    except Exception:
        # The handler must name a real exception class: an undefined name here
        # would turn every Dbf5 failure into a NameError and the fallback
        # below would never run.
        return pd.DataFrame(iter(DBF(file)))


def read_sav(file):
    """Read an SPSS ``.sav`` file via R's ``foreign`` package into a DataFrame.

    The file is loaded with ``foreign::read.spss`` (``reencode=False``), then
    each named element of the R result is pulled out with ``rx``, converted
    to a NumPy array and flattened to become one DataFrame column.

    Parameters
    ----------
    file : str
        Path to the ``.sav`` file.

    Returns
    -------
    pandas.DataFrame
        One column per name in the R object, in the order R reports them.
    """
    r_foreign = importr('foreign')
    pandas2ri.activate()
    spss_data = r_foreign.read_spss(file, reencode=False)
    # Keys are the R names as returned; the lookup uses their string form.
    columns = {
        name: np.array(spss_data.rx(str(name))).flatten()
        for name in list(spss_data.names)
    }
    return pd.DataFrame.from_dict(columns)


def read_dta(file_path):
    """Load a Stata ``.dta`` file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str
        Path to the ``.dta`` file.

    Returns
    -------
    pandas.DataFrame
        The result of ``pandas.read_stata`` with default options.
    """
    return pd.read_stata(file_path)

def svrs_metadata(data_path):
    output = {'variable': [], 'file_name': [], 'file_path': [], 'year': [], 'folder': []}
    for path, subdirs, files in os.walk(data_path):
        files_str = [file for file in files if 
                     bool(re.search(".dbf", file)) or 
                     bool(re.search(".dta", file)) or
                     bool(re.search(".sav", file))]
        for file in files_str:
            y = "20" + os.path.split(path)[1][-2:]
            if bool(re.search(r".dbf|.DBF", file)):
                df = read_dbf(os.path.join(path,file))
                print("#"*20 + f" Year: {y} DBF (dBASE) File: {file} " + "#" * 20)
            elif bool(re.search(r".dta|.DTA", file)):
                df = read_dta(os.path.join(path,file))
                print("#"*20 + f" Year: {y} DTA (STATA) File: {file} " + "#" * 20)
            elif bool(re.search(r".sav|.SAV", file)):
                df = read_sav(os.path.join(path,file))
                print("#"*20 + f" Year: {y} SAV (SPSS) File: {file} " + "#" * 20)
            output['variables'] = output['variables'] + list(df.columns)
            output['file_name'] = output['file_name'] + [file]*df.shape[1]
            output['file_path'] = output['file_path'] + [os.path.join(path,file)]*df.shape[1]
            output['year'] = output['year'] + ["20" + os.path.split(path)[1][-2:]]*df.shape[1]
            output['folder'] =  output['folder'] + ["20" + os.path.split(path)[1]]*df.shape[1]
            else:
                print(f"Check the structure of file {file}!")
    return pd.DataFrame.from_dict(output)


In [13]:
# Scratch check of an extension pattern. This returns None: the lowercase
# alternative "dbf" does not match "DBF", and ".DBF" needs one extra character
# in front of "DBF" for the "." to consume.
re.search(r"dbf|.DBF", "DBF")

In [4]:
# Use the DATA_DIR constant from the configuration cell instead of repeating
# the path literal, so the data location is defined in exactly one place.
svrs_variables = svrs_metadata(DATA_DIR)

#################### Year: 2004 DBF (dBASE) File: tafsil-2p.dbf ####################
#################### Year: 2004 DBF (dBASE) File: tafsil-9.dbf ####################
#################### Year: 2004 DBF (dBASE) File: tafsil-8.dbf ####################
#################### Year: 2004 DBF (dBASE) File: tafsil-3.dbf ####################
#################### Year: 2004 DBF (dBASE) File: tafsil-2h.dbf ####################
#################### Year: 2004 DBF (dBASE) File: tafsil-6.dbf ####################
#################### Year: 2004 DBF (dBASE) File: tafsil-7.dbf ####################
#################### Year: 2004 DBF (dBASE) File: tafsil-5.dbf ####################
#################### Year: 2004 DBF (dBASE) File: tafsil-4.dbf ####################
#################### Year: 2002 DBF (dBASE) File: tafsil-9.dbf ####################
#################### Year: 2002 DBF (dBASE) File: tafsil-8.dbf ####################
#################### Year: 2002 DBF (dBASE) File: ABC.dbf ################

#################### Year: 2014 SAV (SPSS) File: tafsil3_12-06-2015.sav ####################
#################### Year: 2014 SAV (SPSS) File: tafsil4_12-06-2015.sav ####################
#################### Year: 2014 SAV (SPSS) File: tafsil2H_12-06-2015.sav ####################
#################### Year: 2014 SAV (SPSS) File: tafsil10_12-06-2015.sav ####################
#################### Year: 2014 SAV (SPSS) File: tafsil6_12-06-2015.sav ####################
#################### Year: 2014 SAV (SPSS) File: tafsil9_12-06-2015.sav ####################
#################### Year: 2014 SAV (SPSS) File: tafsil8_12-06-2015.sav ####################
#################### Year: 2014 SAV (SPSS) File: tafsil7_12-06-2015.sav ####################


In [7]:
# svrs_metadata() already returns a DataFrame, so no conversion is needed;
# show a short preview rather than rendering the whole inventory.
svrs_variables.head(10)

Unnamed: 0,file_name,file_path,year,folder,variables
0,tafsil-2p.dbf,../../data/bbs/svrs/SVRS_04/tafsil-2p.dbf,2004,20SVRS_04,PSU_NO
1,tafsil-2p.dbf,../../data/bbs/svrs/SVRS_04/tafsil-2p.dbf,2004,20SVRS_04,DIV
2,tafsil-2p.dbf,../../data/bbs/svrs/SVRS_04/tafsil-2p.dbf,2004,20SVRS_04,ZILA
3,tafsil-2p.dbf,../../data/bbs/svrs/SVRS_04/tafsil-2p.dbf,2004,20SVRS_04,UPZA
4,tafsil-2p.dbf,../../data/bbs/svrs/SVRS_04/tafsil-2p.dbf,2004,20SVRS_04,UNION
5,tafsil-2p.dbf,../../data/bbs/svrs/SVRS_04/tafsil-2p.dbf,2004,20SVRS_04,MAUZA
6,tafsil-2p.dbf,../../data/bbs/svrs/SVRS_04/tafsil-2p.dbf,2004,20SVRS_04,RMO
7,tafsil-2p.dbf,../../data/bbs/svrs/SVRS_04/tafsil-2p.dbf,2004,20SVRS_04,HH_NO
8,tafsil-2p.dbf,../../data/bbs/svrs/SVRS_04/tafsil-2p.dbf,2004,20SVRS_04,Q_9
9,tafsil-2p.dbf,../../data/bbs/svrs/SVRS_04/tafsil-2p.dbf,2004,20SVRS_04,DUP
