# Data Preprocessing

In this step, we conduct data preprocessing tasks such as imputing missing data and feature engineering. Utilize the `Data_Preprocessing.ipynb` notebook to impute missing values and engineer features that might enhance the predictive model's performance.

## Importing the Libraries

In [19]:
import pandas as pd
import os
import utils as ui
from sklearn.impute import SimpleImputer
from sklearn.decomposition import FactorAnalysis
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
import numpy as np

## Defining the Imputation Methods

### 1. PPCA Imputation

In [20]:
def PPCA_Imputation(df,filename,split,n_components=2):
    """Impute missing values from a low-rank latent-variable reconstruction and save the result.

    Missing cells are first seeded with their column mean, a ``FactorAnalysis``
    model is fitted on the seeded frame, and the frame is reconstructed from the
    latent scores. Only cells that were missing in ``df`` take the reconstructed
    value; observed values are kept as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with missing values to impute.
    filename : str
        Label used as the first part of the output CSV file name.
    split : str
        Label used as the second part of the output CSV file name
        (the notebook passes "train" or "test").
    n_components : int, default 2
        Number of latent variables of the ``FactorAnalysis`` model.

    Returns
    -------
    pandas.DataFrame
        ``df`` with its missing cells filled in. The same frame is written to
        ``./Datasets/With Imputation/PPCA Imputation/{filename}_{split}_PPCA.csv``.

    Raises
    ------
    ValueError
        If some column still contains missing values after mean-filling
        (e.g. a column with no observed value), since the model cannot be
        fitted on such data.
    """
    # Seed every gap with its column mean so the model can be fitted.
    df_filled = df.fillna(df.mean())

    # A column without any observed value has a NaN mean and therefore stays
    # unfilled; fail here with the column names instead of deep inside sklearn.
    unfilled_columns = df_filled.columns[df_filled.isna().any()].tolist()
    if unfilled_columns:
        raise ValueError(
            f"Cannot impute: columns {unfilled_columns} still contain missing "
            "values after mean-filling."
        )

    ppca_model = FactorAnalysis(n_components=n_components)
    latent_scores = ppca_model.fit_transform(df_filled)

    # Map the latent scores back to the original feature space.
    df_imputed = pd.DataFrame(
        np.dot(latent_scores, ppca_model.components_) + ppca_model.mean_,
        columns=df.columns,
        index=df.index,
    )

    # Keep observed values; take the reconstruction only where df was missing.
    df_result = df.combine_first(df_imputed)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    directory = "./Datasets/With Imputation/PPCA Imputation"
    os.makedirs(directory, exist_ok=True)

    csv_file_path = os.path.join(directory, f"{filename}_{split}_PPCA.csv")
    df_result.to_csv(csv_file_path)

    return df_result

In [None]:
def et_reg(df,filename,split,random_state=0):
    """Impute missing values with an iterative ExtraTrees-based imputer and save the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with missing values to impute.
    filename : str
        Label used as the first part of the output CSV file name.
    split : str
        Label used as the second part of the output CSV file name
        (the notebook passes "train" or "test").
    random_state : int or None, default 0
        Seed handed to both the ``ExtraTreesRegressor`` and the
        ``IterativeImputer`` so that re-running the notebook reproduces the
        same imputed values. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The imputed values, carrying the columns and index of ``df``. The same
        frame is written to
        ``./Datasets/With Imputation/ET_Regressor Imputation/{filename}_{split}_ET.csv``.
    """
    # The regressor is seeded explicitly as well as the imputer, so that both
    # sources of randomness are pinned.
    imputer = IterativeImputer(
        estimator=ExtraTreesRegressor(n_jobs=1, random_state=random_state),
        max_iter=10,
        random_state=random_state,
    )

    # fit_transform returns a plain array; restore the labels of the input frame.
    df_et = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    directory = "./Datasets/With Imputation/ET_Regressor Imputation"
    os.makedirs(directory, exist_ok=True)

    csv_file_path = os.path.join(directory, f"{filename}_{split}_ET.csv")
    df_et.to_csv(csv_file_path)

    return df_et

## Creating the Imputed Datasets

In [None]:
def preprocess_csv(filepath,filename,imputation="PPCA"):
    """Load a dataset, split it into train and test parts, and impute each part.

    Parameters
    ----------
    filepath : str
        Path handed to ``ui.load_csv``.
    filename : str
        Label forwarded to the imputation function, which uses it in the name
        of the CSV file it writes.
    imputation : {"PPCA", "ET_Regressor"}, default "PPCA"
        Which imputation function to apply: ``PPCA_Imputation`` or ``et_reg``.

    Returns
    -------
    tuple
        ``(df_train, df_test)`` as returned by the chosen imputation function.

    Raises
    ------
    ValueError
        If ``imputation`` is not one of the supported method names.
    """
    imputers = {
        "PPCA": PPCA_Imputation,
        "ET_Regressor": et_reg,
    }

    # Reject an unknown method before any I/O; previously a typo silently
    # returned the un-imputed split.
    if imputation not in imputers:
        raise ValueError(
            f"Unknown imputation method {imputation!r}; "
            f"expected one of {sorted(imputers)}."
        )
    impute = imputers[imputation]

    df = ui.load_csv(filepath)

    df_train, df_test = ui.train_test_split(df)

    # NOTE(review): each split is imputed by a model fitted on that split alone,
    # so the test split is filled using its own statistics - confirm this is intended.
    df_train = impute(df_train, filename, split="train")
    df_test = impute(df_test, filename, split="test")

    return df_train, df_test

In [24]:
# Forward slashes keep the path portable and avoid the invalid "\D", "\W" and "\F" escape sequences of the backslash form.
df_train,df_test=preprocess_csv("./Datasets/Without Imputation/Final_Dataset_Ghaziabad.csv","Ghaziabad")