# Data Collection

## Configure Environment 

In [1]:
# If running this module for the first time, you may need to install the required third-party packages.
# Steps:
# 1. Uncomment the line below and run the cell to install the packages.
# 2. Check the log to confirm successful installation.
# 3. Comment the line back out and restart the module.

# !pip install -r ../requirements.txt

### Import libraries

In [2]:
import os
from pathlib import Path
import json
import zipfile
import pandas as pd
import time

### Utility Functions

In [3]:
def unzip_files(zip_path: str, extract_to: str):
    """
    Unzips all files from the specified zip archive to a given directory.

    :param zip_path: Path to the zip file.
    :param extract_to: Destination directory for extracted files.
    """
    # Validate if the provided path points to a valid zip file
    if not zipfile.is_zipfile(zip_path):
        raise ValueError(f"{zip_path} is not a valid zip file.")
    
    # Open the zip file and extract its contents to the target directory
    with zipfile.ZipFile(file=zip_path, mode="r") as my_zip_file:
        my_zip_file.extractall(path=extract_to)

    # Confirmation of successful extraction
    print(f"Extracted all files to {extract_to}")

In [4]:
def make_mydir(dirname: str) -> Path:
    """
    Creates a directory, including parent directories if they do not exist.

    :param dirname: The path of the directory to create.
    :return: Path object representing the created directory.
    """
    path = Path(dirname)
    # Create the directory, along with any necessary parent directories
    path.mkdir(parents=True, exist_ok=True)
    return path

## Process 

### Read data

#### Target Country: Eygpt

In [5]:
# Set the target country for data processing
country = "EG"

# Create a directory to store raw data files, including parent directories if they don't exist
raw_data_dir = make_mydir('../resources/raw_data')

In [6]:
# Prompt the user to provide the path to the zipped raw survey data
# In the current investigation, we use the following as folder as a source raw data: 
# EG_2014_DHS_07092024_715_216139.zip
raw_data_file = input("Enter the name of the source data zip folder name:")

# Construct the full path to the user-provided zip file
raw_data_path = f'../user_input/{raw_data_file}'

# Create a directory for extracting the raw data, specific to the target country
extract_to = make_mydir(f"{raw_data_dir}/extracted/all/{country}/")

# Unzip the provided raw data file to the specified directory
unzip_files(zip_path=raw_data_path, extract_to=extract_to)

Enter the name of the source data zip folder name: EG_2014_DHS_07092024_715_216139.zip


Extracted all files to ..\resources\raw_data\extracted\all\EG


### Extract variables metadata

In [7]:
def preprocess_input_data(directory_path: str, var_output_file_path: str,
                          var_level_output_file_path: str, var_type_output_file_path: str,
                          data_output_file_dir: str):
    """
    Processes raw survey data files from a specified directory and extracts metadata (variable names, levels, and types). 
    Converts data from STATA format to CSV.

    :param directory_path: Path to the directory containing the raw files.
    :param var_output_file_path: Path to the output file where variable names will be saved.
    :param var_level_output_file_path: Path to the output file for variable levels.
    :param var_type_output_file_path: Path to the output file for variable types.
    :param data_output_file_dir: Path to the directory where converted CSV files will be saved.
    """
    DELIMIT = ","  # Delimiter for output files
    file_count = 0  # Tracks number of files processed
    start_time = time.time()  # To calculate processing time

    try:
        print("Processing raw data files...")
        
        # Initialize output files with headers
        with open(var_output_file_path, 'w+') as var_out_file:
            var_out_file.write(f"File Name{DELIMIT}Variable Id{DELIMIT}Variable Name\n")

        with open(var_level_output_file_path, 'w+') as var_level_out_file:
            var_level_out_file.write(f"File Name{DELIMIT}Variable Id{DELIMIT}Variable Level{DELIMIT}Variable Level Description\n")

        with open(var_type_output_file_path, 'w+') as var_type_out_file:
            var_type_out_file.write(f"File Name{DELIMIT}Variable Id{DELIMIT}Data Type\n")
        
        # Iterate through the directory and process the files
        for path, subdirs, files in os.walk(directory_path):
            for name in files:
                file_count += 1  # Count the files processed
                is_open = False
                
                # Process .DO files to extract variable names and levels
                if name.endswith('.DO'):
                    file_path = os.path.join(path, name)
                    print(file_path)
                    
                    with open(file_path, 'r') as in_file:
                        for line in in_file:
                            # Handle 'label variable' lines to extract variable names
                            if line.startswith('label variable'):
                                with open(var_output_file_path, 'a') as var_out_file:
                                    words = [w.replace("\n", "") for w in line.split(" ") if w not in ['label', 'variable', '']]
                                    variable_id = words[0]
                                    variable_name = " ".join(words[1:])
                                    var_out_file.write(f"{name}{DELIMIT}{variable_id}{DELIMIT}{variable_name}\n")
                            
                            # Begin processing 'label define' lines for variable levels
                            elif line.startswith('label define'):
                                is_open = True
                                words = line.split(" ")
                                variable_id = words[2].replace("\n", "")
                            
                            # Capture variable levels
                            elif is_open and not line.startswith(';'):
                                with open(var_level_output_file_path, 'a') as var_level_out_file:
                                    words = [w.replace("\n", "") for w in line.split(" ") if w]
                                    variable_level = words[0]
                                    variable_level_description = " ".join(words[1:])
                                    var_level_out_file.write(f"{name}{DELIMIT}{variable_id}{DELIMIT}{variable_level}{DELIMIT}{variable_level_description}\n")
                            
                            # End capturing variable levels
                            elif is_open and line.startswith(';'):
                                is_open = False
                
                # Process .DCT files to extract variable types
                elif name.endswith('.DCT'):
                    file_path = os.path.join(path, name)
                    print(file_path)
                    
                    with open(file_path, 'r') as in_file:
                        for line in in_file:
                            # Skip unnecessary lines
                            if line.startswith("infix") or line.startswith("1 lines") or line.startswith("}"):
                                continue
                            else:
                                with open(var_type_output_file_path, 'a') as var_type_out_file:
                                    words = [w for w in line.split(" ") if w not in ['', '\n']]
                                    variable_id = words[1]
                                    variable_type = words[0]
                                    var_type_out_file.write(f"{name}{DELIMIT}{variable_id}{DELIMIT}{variable_type}\n")
                
                # Process .DTA files and convert them to CSV
                elif name.endswith('.DTA'):
                    file_path = os.path.join(path, name)
                    print(file_path)
                    
                    # Read and convert .DTA file to CSV
                    in_df = pd.read_stata(file_path, convert_categoricals=False)
                    out_file_path = os.path.join(data_output_file_dir, f"{name.replace('.DTA', '')}.csv")
                    in_df.to_csv(out_file_path, index=False)
        
        # Print summary of the process
        print(f"Total number of files processed: {file_count}")
        print(f"Variable names written to: {var_output_file_path}")
        print(f"Variable levels written to: {var_level_output_file_path}")
        print(f"Variable data types written to: {var_type_output_file_path}")
        print(f"CSV data saved to: {data_output_file_dir}")
        
        elapsed_time = time.time() - start_time  # Calculate and display elapsed time
        print(f"Elapsed time in seconds: {elapsed_time:.3f}")

    except Exception as e:
        print(f"An error occurred: {e}")
        raise e

#### Target Country: Eygpt

In [8]:
# Create directories for raw data extraction and output files for variables and data
directory_path = make_mydir(f"{raw_data_dir}/extracted/all/{country}/")

# Set paths for output CSV files to store variable names, levels, and data types
var_output_file_path = os.path.join(make_mydir(f"{raw_data_dir}/extracted/variables/{country}/"),
                                    "variables_name.csv")
var_leve_output_file_path = os.path.join(make_mydir(f"{raw_data_dir}/extracted/variables/{country}/"),
                                         "variables_level.csv")
var_type_output_file_path = os.path.join(make_mydir(f"{raw_data_dir}/extracted/variables/{country}/"),
                                         "variables_datatype.csv")

# Set directory for saving the converted data files (from STATA to CSV)
data_output_file_dir = make_mydir(f"{raw_data_dir}/extracted/data/{country}/")

# Preprocess the input data by parsing variable information and converting files
preprocess_input_data(directory_path=directory_path, 
                      var_output_file_path=var_output_file_path, 
                      var_level_output_file_path=var_leve_output_file_path,
                      var_type_output_file_path=var_type_output_file_path,
                      data_output_file_dir=data_output_file_dir
                     )

Processing raw data files...
..\resources\raw_data\extracted\all\EG\EGBR61DT\EGBR61FL.DCT
..\resources\raw_data\extracted\all\EG\EGBR61DT\EGBR61FL.DO
..\resources\raw_data\extracted\all\EG\EGBR61DT\EGBR61FL.DTA
..\resources\raw_data\extracted\all\EG\EGHR61DT\EGHR61FL.DCT
..\resources\raw_data\extracted\all\EG\EGHR61DT\EGHR61FL.DO
..\resources\raw_data\extracted\all\EG\EGHR61DT\EGHR61FL.DTA
..\resources\raw_data\extracted\all\EG\EGIR61DT\EGIR61FL.DCT
..\resources\raw_data\extracted\all\EG\EGIR61DT\EGIR61FL.DO
..\resources\raw_data\extracted\all\EG\EGIR61DT\EGIR61FL.DTA
..\resources\raw_data\extracted\all\EG\EGKR61DT\EGKR61FL.DCT
..\resources\raw_data\extracted\all\EG\EGKR61DT\EGKR61FL.DO
..\resources\raw_data\extracted\all\EG\EGKR61DT\EGKR61FL.DTA
..\resources\raw_data\extracted\all\EG\EGPR61DT\EGPR61FL.DCT
..\resources\raw_data\extracted\all\EG\EGPR61DT\EGPR61FL.DO
..\resources\raw_data\extracted\all\EG\EGPR61DT\EGPR61FL.DTA
Total number of files processed: 31
Variable names written to

### Extract target input data

In [9]:
# After inspecting the extracted variable information, we selected 230 items (questionnaire items)
# as independent variables from an initial pool of approximately 3,000 items.

# Prompt the user to provide the path to the file containing the selected variable IDs.
# In the current investigation, we use the following file which contains the selected variable IDs:
# EG_selected_variables_id.csv
file_name = input("Enter the name of the selected variables file name:")
selected_var_path = f'../user_input/{file_name}'

# Read the selected variable IDs from the CSV file.
selected_var_ids = [id[0] for id in pd.read_csv(selected_var_path).values]

# Output the count and a sample of the selected variables.
print(f"Number of selected variables: {len(selected_var_ids)}")
print(f"Sample of selected variables: {selected_var_ids[:5]}")

Enter the name of the selected variables file name: EG_selected_variables_id.csv


Number of selected variables: 235
Sample of selected variables: ['hhid', 'hv009', 'hv025', 'hv201', 'hv204']


In [10]:
# Define the survey type to filter relevant data files.
survey_type = 'HR'

# Retrieve the path of the data file matching the survey type.
for path, subdirs, files in os.walk(f'{raw_data_dir}/extracted/data/{country}/'):
    for file in files:
        if survey_type in file:  # Check if the file contains the survey type.
            data_file_path = os.path.join(path, file)  # Construct the full file path.
            break

# Load the data from the CSV file into a DataFrame.
in_df = pd.read_csv(data_file_path)

# Select only the columns corresponding to the chosen variable IDs.
in_df = in_df[selected_var_ids]

# Display the first few rows of the DataFrame and its size.
display(in_df.head())
print(f'Size of DataFrame: {in_df.shape}')

# Prepare the output directory and file path for the extracted data.
out_file_dir = make_mydir("../resources/data/input/")
out_file_path = os.path.join(out_file_dir, 'input_data.csv')

# Save the filtered DataFrame to a CSV file.
in_df.to_csv(out_file_path, index=False)
print(f"Extracted input data saved to file: {out_file_path}")

Unnamed: 0,hhid,hv009,hv025,hv201,hv204,hv205,hv206,hv207,hv208,hv209,...,sh29_22,sh29_23,sh29_24,sh29_25,sh29_26,sh29_27,sh29_28,sh29_29,sh29_30,sh29_31
0,10706 5,5,1,11.0,996.0,11.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
1,10706 15,3,1,11.0,996.0,11.0,1.0,0.0,1.0,1.0,...,,,,,,,,,,
2,10706 24,8,1,11.0,996.0,11.0,1.0,0.0,1.0,1.0,...,,,,,,,,,,
3,10706 34,4,1,11.0,996.0,11.0,1.0,0.0,1.0,1.0,...,,,,,,,,,,
4,10706 43,3,1,11.0,996.0,11.0,1.0,0.0,1.0,1.0,...,,,,,,,,,,


Size of DataFrame: (28175, 235)
Extracted input data saved to file: ..\resources\data\input\input_data.csv


# END