In [1]:
import pandas as pd
import gc # Garbage collector

from utils.feature_scope import get_feature_scope
from utils.transformation_functions import *

# PNS 2019 Cleaning

In [2]:
# Load the staged PNS 2019 extract.
# The path is written with forward slashes: they resolve on every OS, and they
# avoid the invalid backslash escape sequences ("\.", "\d", "\s", "\P") that a
# non-raw string literal with Windows separators contains.
df = pd.read_parquet("../../data/staged/PNS_2019.parquet")

print(f"Count of df: {len(df)}")
df.head()

Count of df: 293726


Unnamed: 0,V0001,V0024,UPA_PNS,V0006_PNS,V0015,V0020,V0022,V0026,V0031,V0025A,...,VDE002,VDE014,VDF002,VDF003,VDF004,VDL001,VDM001,VDP001,VDR001,VDDATA
0,11,1110011,110000016,1,1,2019,6,1,1,1,...,1.0,6.0,2098,350,2,,,,,20220504
1,11,1110011,110000016,1,1,2019,6,1,1,0,...,,,2098,350,2,,,,,20220504
2,11,1110011,110000016,1,1,2019,6,1,1,0,...,1.0,4.0,2098,350,2,,,,,20220504
3,11,1110011,110000016,1,1,2019,6,1,1,9,...,,,2098,350,2,,,,,20220504
4,11,1110011,110000016,1,1,2019,6,1,1,9,...,,,2098,350,2,,,,,20220504


## Rows to filter

In [3]:
# Only females: boolean mask of the rows whose C006 equals "2"
only_females = df['C006'] == "2"

# Apply filters: keep the masked rows, then discard every row whose C008,
# once surrounding whitespace is stripped, is just "." or empty
df_filtered = df.loc[only_females]
c008_stripped = df_filtered['C008'].str.strip()
df_filtered = df_filtered[(c008_stripped != ".") & (c008_stripped != "")]

print(f"Count of df_filtered: {len(df_filtered)}")
df_filtered.head()

Count of df_filtered: 144940


Unnamed: 0,V0001,V0024,UPA_PNS,V0006_PNS,V0015,V0020,V0022,V0026,V0031,V0025A,...,VDE002,VDE014,VDF002,VDF003,VDF004,VDL001,VDM001,VDP001,VDR001,VDDATA
0,11,1110011,110000016,1,1,2019,6,1,1,1,...,1.0,6.0,2098,350,2,,,,,20220504
4,11,1110011,110000016,1,1,2019,6,1,1,9,...,,,2098,350,2,,,,,20220504
5,11,1110011,110000016,1,1,2019,6,1,1,9,...,,,2098,350,2,,,,,20220504
6,11,1110011,110000016,2,1,2019,4,1,1,0,...,1.0,6.0,1000,250,2,,,,,20220504
8,11,1110011,110000016,2,1,2019,4,1,1,0,...,,,1000,250,2,,,,,20220504


## Columns to filter

In [4]:
# Walk the feature scope (an iterable of modules, each an iterable of
# mappings) and keep every key that exists as a column of df_filtered,
# preserving the order in which the scope declares them.
columns_to_select = [
    key
    for modulo in get_feature_scope()
    for column in modulo
    for key in column.keys()
    if key in df_filtered.columns
]

# Narrow the frame down to the in-scope columns only
df_filtered = df_filtered[columns_to_select]

## Columns to clean

In [5]:
def _blank_to_placeholder(value):
    """Return "<None>" for a string that is only whitespace or a lone "."; return any other value unchanged."""
    if isinstance(value, str) and value.strip() in ('.', ''):
        return "<None>"
    return value


# Replace every cell value (not whole columns) that is blank or "." by "<None>",
# going through the frame one column at a time
df_cleaned = df_filtered.apply(lambda series: series.map(_blank_to_placeholder))
df_cleaned

Unnamed: 0,V0020,C008,C009,R00101,R010,R011,R012,R025,R028,R031,...,Q121010,Q121011,Q121012,Q121013,Q121014,Q121015,Q12201,Q124,Q125,Q128
0,2019,055,1,4,2,<None>,<None>,12,1,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,2,<None>,2
4,2019,006,4,<None>,<None>,<None>,<None>,<None>,<None>,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>
5,2019,004,2,<None>,<None>,<None>,<None>,<None>,<None>,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>
6,2019,033,1,<None>,<None>,<None>,<None>,<None>,<None>,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>
8,2019,015,4,<None>,<None>,<None>,<None>,<None>,<None>,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293713,2019,032,4,4,2,<None>,<None>,11,<None>,1,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,2,<None>,2
293716,2019,004,4,<None>,<None>,<None>,<None>,<None>,<None>,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>
293719,2019,042,4,<None>,<None>,<None>,<None>,<None>,<None>,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>
293721,2019,049,2,<None>,<None>,<None>,<None>,<None>,<None>,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>,<None>


## Columns to transform

In [6]:
# Work on a copy so df_cleaned stays untouched
df_transformed = df_cleaned.copy()

# The notebook's global namespace is where the star-imported
# transformation functions live; look them up there by name.
namespace = globals()

# For each column, run the function called "transform_<column>" if one exists.
# Each function receives the whole frame and its return value becomes the new frame.
for column in df_transformed.columns:
    transform_function_name = f"transform_{column}"

    if transform_function_name not in namespace:
        continue

    df_transformed = namespace[transform_function_name](df_transformed)

df_transformed

Unnamed: 0,V0020,C008,C009,R00101,R010,R011,R012,R025,R028,R031,...,Q121010,Q121011,Q121012,Q121013,Q121014,Q121015,Q12201,Q124,Q125,Q128
0,2019,55,1,4,0,<None>,0,12,1,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,0,99,2
4,2019,6,4,<None>,99,<None>,0,0,99,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,99,99,<None>
5,2019,4,2,<None>,99,<None>,0,0,99,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,99,99,<None>
6,2019,33,1,<None>,99,<None>,0,0,99,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,99,99,<None>
8,2019,15,4,<None>,99,<None>,0,0,99,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,99,99,<None>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293713,2019,32,4,4,0,<None>,0,11,99,1,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,0,99,2
293716,2019,4,4,<None>,99,<None>,0,0,99,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,99,99,<None>
293719,2019,42,4,<None>,99,<None>,0,0,99,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,99,99,<None>
293721,2019,49,2,<None>,99,<None>,0,0,99,<None>,...,<None>,<None>,<None>,<None>,<None>,<None>,<None>,99,99,<None>


## Write to stage path

In [12]:
# Persist the transformed frame to the staged data folder.
# Forward slashes keep the relative path valid on every OS and avoid the
# invalid backslash escape sequences ("\.", "\d", "\s", "\P") of the
# Windows-style, non-raw string literal.
df_transformed.to_parquet("../../data/staged/PNS_2019_transformed.parquet")

In [None]:
# Drop the names bound to every frame built in this notebook; none is needed
# once the result has been written out
del df, df_filtered, df_cleaned, df_transformed

# Trigger a garbage-collection pass right away to free up memory
gc.collect()