# dp_svrs_metadata

```
!pip install dbf
!pip install rpy2
!pip install dbfread
!pip install simpledbf
```

In [3]:
import sys
import os
import re
import time
import logging
import numpy as np
import pandas as pd
from dbfread import DBF
from simpledbf import Dbf5
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

In [4]:
# Presumably a quick smoke test that NumPy imported correctly; log10(1) evaluates to 0.0.
np.log10(1)

0.0

## TO DO LIST:
- List all files within the data directory that have a ".dbf", ".dta" or ".sav" extension
- For each file, record the file name and the name of its parent folder

In [5]:
# Directory that is walked for survey files; passed to svrs_metadata as data_path.
DATA_DIR = '../../data/bbs/svrs/'
# Output directory; passed to svrs_metadata as out_path.
OUT_DIR = '../../output/bbs/'
# Show the current working directory, because both relative paths above resolve against it.
os.getcwd()

'/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/utils/dp_svrs_metadata'

In [6]:
def sumarise_var(df, var):
    """Return the distinct values of a single column.

    Args:
        df: pandas DataFrame holding the column.
        var: Label of the column to inspect.

    Returns:
        The result of ``Series.unique()`` for that column, i.e. the distinct
        values in order of first appearance.
    """
    column = df[var]
    return column.unique()
    
def read_dbf(file):
    """Load a dBASE (.dbf) file into a pandas DataFrame.

    The file is first parsed with ``simpledbf.Dbf5``. If that raises for any
    reason, it is parsed again with ``dbfread.DBF`` and the records are
    collected into a DataFrame.

    Args:
        file: Path of the .dbf file to read.

    Returns:
        pandas.DataFrame with the file's contents.

    Raises:
        Exception: Whatever ``dbfread.DBF`` raises when the fallback parser
            cannot read the file either.
    """
    try:
        return Dbf5(file).to_dataframe()
    except Exception:
        # Caught broadly on purpose: any failure of the first parser should
        # trigger the second one rather than abort the whole directory scan.
        records = DBF(file)
        return pd.DataFrame(iter(records))


def read_sav(file):
    """Load an SPSS (.sav) file into a pandas DataFrame through R.

    The R package ``foreign`` is imported via rpy2 and its ``read_spss`` is
    called with ``reencode=False``. Every named element of the returned R
    object is extracted, converted to a NumPy array and flattened to one
    dimension before the DataFrame is assembled.

    Args:
        file: Path of the .sav file to read.

    Returns:
        pandas.DataFrame with one column per named element of the R object.
    """
    r_foreign = importr('foreign')
    pandas2ri.activate()
    spss_data = r_foreign.read_spss(file, reencode=False)
    columns = {
        col: np.array(spss_data.rx(str(col))).flatten()
        for col in list(spss_data.names)
    }
    return pd.DataFrame.from_dict(columns)


def read_dta(file_path):
    """Load a Stata (.dta) file into a pandas DataFrame.

    Args:
        file_path: Path of the .dta file to read.

    Returns:
        The DataFrame produced by ``pandas.read_stata``.
    """
    return pd.read_stata(file_path)


def svrs_metadata(data_path, out_path):
    """Build a catalogue of the columns in every survey file under ``data_path``.

    The directory tree is walked recursively. Each file whose extension is
    ``.dbf``, ``.dta`` or ``.sav`` (case-insensitive) is loaded with the
    matching reader, and one catalogue row is emitted per column of that file.
    Files with any other extension are ignored.

    Args:
        data_path: Root directory to scan.
        out_path: Currently unused; kept so existing callers keep working.

    Returns:
        pandas.DataFrame with the columns ``variable``, ``file_name``,
        ``file_path``, ``year`` and ``folder``. ``year`` is "20" followed by
        the last two characters of the containing folder's name; ``folder``
        is that folder's name.
    """
    output = {'variable': [], 'file_name': [], 'file_path': [], 'year': [], 'folder': []}
    for path, _subdirs, files in os.walk(data_path):
        folder = os.path.split(path)[1]
        year = "20" + folder[-2:]
        for file in files:
            # Decide on the real extension; a substring/regex test would also
            # accept names such as "xdbf.txt" and miss upper-case extensions.
            extension = os.path.splitext(file)[1].lower()
            file_path = os.path.join(path, file)
            if extension == ".dbf":
                df, label = read_dbf(file_path), "DBF (dBASE)"
            elif extension == ".dta":
                df, label = read_dta(file_path), "DTA (STATA)"
            elif extension == ".sav":
                df, label = read_sav(file_path), "SAV (SPSS)"
            else:
                # Not a survey data file: skip it instead of cataloguing a
                # frame left over from a previous iteration.
                continue
            print("#" * 20 + f" Year: {year} {label} File: {file} " + "#" * 20)

            columns = list(df.columns)
            n_columns = len(columns)
            # extend() appends in place; rebuilding the lists with "+" on
            # every file would copy them again and again.
            output['variable'].extend(columns)
            output['file_name'].extend([file] * n_columns)
            output['file_path'].extend([file_path] * n_columns)
            output['year'].extend([year] * n_columns)
            # Plain folder name: the "20" prefix belongs to the year only.
            output['folder'].extend([folder] * n_columns)
    return pd.DataFrame.from_dict(output)

```
svrs_variables = svrs_metadata(data_path = DATA_DIR, out_path=OUT_DIR)
print(svrs_variables.shape)
print(svrs_variables.drop_duplicates().shape)
svrs_variables.sort_values(by=['year','file_name', 'variable'])
```

In [7]:
# Load the saved variable catalogue from CSV.
# NOTE(review): presumably an export of svrs_metadata() with an added
# variable_definition column; confirm how this file is produced.
variables_assessment = pd.read_csv("../../output/bbs/svrs/dp_svrs_metadata.csv")
# Preview the first rows.
variables_assessment.head()

Unnamed: 0,variable,variable_definition,file_name,year,file_path,folder
0,DIV,,ABC.dbf,2002,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,SVRS_02
1,DUP,,ABC.dbf,2002,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,SVRS_02
2,HH_NO,,ABC.dbf,2002,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,SVRS_02
3,MAUZA,,ABC.dbf,2002,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,SVRS_02
4,MOTH_LIN,,ABC.dbf,2002,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,SVRS_02


In [8]:
def sum_variables(input_df):
    """Log the distinct values and value counts of every catalogued variable.

    For each distinct entry in ``input_df['variable']``, every file listed for
    it in ``input_df['file_path']`` is loaded with the reader matching its
    extension (``.dbf``, ``.dta`` or ``.sav``, case-insensitive). The
    variable's unique values and the counts of its string-cast values are
    written to the log.

    Logging is configured at INFO level to a file named
    ``var_logs-<YYYY-MM-DD>.log`` in the working directory; that configuration
    only takes effect if the root logger has no handlers yet.

    Files with an unsupported extension are reported on stdout and skipped.
    A variable that is absent from a file is logged as a warning and skipped.

    Args:
        input_df: DataFrame with at least the columns ``variable`` and
            ``file_path``.

    Returns:
        None.
    """
    logging.basicConfig(level=logging.INFO, filename=time.strftime("var_logs-%Y-%m-%d.log"))
    for var in input_df['variable'].unique():
        # Centre the title in a banner roughly 100 characters wide.
        title = f' Checking variables: {var} '
        padding = '#' * int(50 - round(len(title) / 2, 0))
        logging.info('#' * 100)
        logging.info(padding + title + padding)
        logging.info('#' * 100)

        rows_for_var = input_df[input_df['variable'] == var]
        for file in rows_for_var['file_path'].unique():
            # Look at the real extension only; matching a pattern anywhere in
            # the path can be fooled by directory names.
            extension = os.path.splitext(file)[1].lower()
            if extension == ".dbf":
                tmp_df = read_dbf(file)
            elif extension == ".dta":
                tmp_df = read_dta(file)
            elif extension == ".sav":
                tmp_df = read_sav(file)
            else:
                print(f"Check the structure of file {file}!")
                # Skip: there is no frame for this file, and summarising the
                # one loaded for the previous file would be misleading.
                continue

            logging.info('#' * 50)
            logging.info(f"The file: {file}!")
            logging.info("Summary:")
            if var not in tmp_df.columns:
                # The catalogue can be out of date relative to the data file.
                logging.warning(f"Variable {var} not found in {file}; skipped.")
                continue
            values = tmp_df[var]
            logging.info(values.unique())
            logging.info(values.astype(str).value_counts())

In [9]:
# Log value summaries for every variable listed in the loaded catalogue.
sum_variables(input_df=variables_assessment)