In [None]:
import pandas as pd
import gc # Garbage collector

from utils.feature_scope import get_feature_scope
from utils.transformation_functions import *

# PNS 2019 Cleaning

In [None]:
# Load the staged PNS 2019 extract.
# The relative path uses forward slashes so it resolves on Windows, Linux and
# macOS alike; a backslash-separated literal only works on Windows and depends
# on unrecognised string escapes that newer Python versions warn about.
df = pd.read_parquet("../../data/staged/PNS_2019.parquet")

# Quick sanity check: row count plus a preview of the first rows
print(f"Count of df: {len(df)}")
df.head()

## Rows to filter

In [None]:
# Only Females (C006 coded as the string "2")
only_females = df['C006'] == "2"

# C008 entries that are just "." or empty once stripped are treated as missing
c008_stripped = df['C008'].str.strip()
c008_present = (c008_stripped != ".") & (c008_stripped != "")

# Apply both row filters with a single combined boolean mask
df_filtered = df[only_females & c008_present]

print(f"Count of df_filtered: {len(df_filtered)}")
df_filtered.head()

## Columns to filter

In [None]:
# Walk the feature scope (modules -> column specs -> keys) and keep, in
# declaration order, every key that exists as a column of the filtered frame
columns_to_select = [
    feature_name
    for module in get_feature_scope()
    for feature_spec in module
    for feature_name in feature_spec.keys()
    if feature_name in df_filtered.columns
]

# Restrict the frame to the in-scope columns only
df_filtered = df_filtered[columns_to_select]

## Columns to clean

In [None]:
def _blank_to_placeholder(value):
    """Return "<None>" for strings that strip down to "." or "", else the value unchanged."""
    if isinstance(value, str) and value.strip() in ('.', ''):
        return "<None>"
    return value


# Replace every cell that holds only spaces and/or a lone "." by "<None>",
# column by column; non-string values pass through untouched
df_cleaned = df_filtered.apply(lambda series: series.map(_blank_to_placeholder))
df_cleaned

## Columns to transform

In [None]:
import json

# Value substituted for the "<None>" placeholder before numeric/float casts
DEFAULT_NUMBER = 0

# Read feature type inference from JSON file
with open("../../data/schema/feature_type_inference.json", "r", encoding="utf-8") as f:
    feature_types_infered = json.load(f)

# Work on a copy so df_cleaned stays intact if this cell is re-run
df_transformed = df_cleaned.copy()

for column in df_transformed.columns:

    # A dedicated `transform_<column>` function found in the notebook
    # namespace takes precedence over the generic type-based casts below
    transform_function_name = f"transform_{column}"

    if transform_function_name in globals():
        transform_function = globals()[transform_function_name]
        df_transformed = transform_function(df_transformed)
        continue

    # No custom transform: fall back to the inferred type, if there is one
    if column not in feature_types_infered:
        continue

    column_type = feature_types_infered[column]["type"]
    column_values = df_transformed[column]

    # Cast only the affected Series; calling DataFrame.astype({column: ...})
    # here would copy the whole frame once per column
    if column_type == "numeric":
        column_values = column_values.replace("<None>", DEFAULT_NUMBER).astype(int)
    elif column_type == "float":
        column_values = column_values.replace("<None>", DEFAULT_NUMBER).astype(float)
    elif column_type == "category":
        column_values = column_values.astype('category')
    elif column_type == "boolean":
        # Only the literal string "1" maps to 1; anything else becomes 0
        column_values = (column_values == "1").astype(int)
    else:
        # Unrecognised inferred type: leave the column untouched
        continue

    df_transformed[column] = column_values

df_transformed

## Write to stage path

In [None]:
# Persist the transformed frame to the staged data folder.
# Forward slashes keep the relative path portable across operating systems and
# avoid unrecognised backslash escapes inside the string literal.
df_transformed.to_parquet("../../data/staged/PNS_2019_transformed.parquet")

In [None]:
# Drop the references to every DataFrame created by this notebook
del df, df_filtered, df_cleaned, df_transformed

# Trigger a garbage collection pass so the released memory is reclaimed now
gc.collect()