# Data Preprocessing

In this step, we preprocess the data by imputing missing values and engineering features. Use the `Data_Preprocessing.ipynb` notebook to impute missing values and engineer features that might improve the predictive model's performance.

## Importing the Libraries

In [29]:
import pandas as pd
import os

from sklearn.impute import SimpleImputer
from sklearn.decomposition import FactorAnalysis
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
import numpy as np

## Defining the Imputation Methods

### 1. PPCA Imputation

In [30]:
def PPCA_Imputation(df,filename,num_latent_variables=2):
    """Impute missing values from a latent-factor reconstruction and save the result.

    Missing cells are first seeded with their column mean, a ``FactorAnalysis``
    model is fitted on the seeded data, and the data are reconstructed from the
    latent scores (``scores @ components_ + mean_``). Only cells that are
    missing in ``df`` receive the reconstructed value; observed values are
    returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data containing NaNs. It is not modified.
    filename : str
        Prefix of the output file. The result is written to
        ``./Datasets/With Imputation/PPCA Imputation/{filename}_PPCA.csv``.
    num_latent_variables : int, default 2
        Number of latent factors passed to ``FactorAnalysis``.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``df``. Columns that have no observed value
        at all cannot be imputed and remain NaN.
    """
    # A column without a single observation has no mean to seed from, so it
    # would leave NaNs in the matrix and make the model fit fail. Fit on the
    # columns that have at least one observed value.
    observed = df.dropna(axis=1, how='all')

    # Replace NaN values with the mean for each column
    df_filled = observed.fillna(observed.mean())

    # Fit the latent-variable model and project the data onto the factors
    ppca_model = FactorAnalysis(n_components=num_latent_variables)
    latent_scores = ppca_model.fit_transform(df_filled)

    # Transform the latent scores back to the original feature space
    reconstructed = np.dot(latent_scores, ppca_model.components_) + ppca_model.mean_
    df_imputed = pd.DataFrame(reconstructed, columns=observed.columns, index=observed.index)

    # Fill only the missing cells; fillna keeps df's own row and column order.
    df_result = df.fillna(df_imputed)

    # Specifying the file path where you want to save the CSV file
    directory="./Datasets/With Imputation/PPCA Imputation"

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(directory, exist_ok=True)

    csv_file_path = os.path.join(directory, f'{filename}_PPCA.csv')

    # Writing the DataFrame to a CSV file
    df_result.to_csv(csv_file_path)

    return df_result

### 2. Extra Trees Regressor Using Iterative Imputer

In [31]:
def et_reg(df,filename,random_state=0):
    """Impute missing values with ``IterativeImputer`` driven by extra-trees regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data containing NaNs. It is not modified.
    filename : str
        Prefix of the output file. The result is written to
        ``./Datasets/With Imputation/ET_Regressor Imputation/{filename}_ET.csv``.
    random_state : int or None, default 0
        Seed given to both the regressor and the imputer so that repeated runs
        produce the same imputed values. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Imputed data with the index and columns of ``df``. Columns that have no
        observed value at all cannot be imputed and remain NaN.
    """
    # The imputer discards features with no observed value, which would make
    # the array narrower than df.columns. Impute the usable columns only.
    observed = df.dropna(axis=1, how='all')

    imputer = IterativeImputer(
        estimator=ExtraTreesRegressor(n_jobs=1, random_state=random_state),
        max_iter=10,
        random_state=random_state,
    )
    imputed_values = imputer.fit_transform(observed) # Fit the imputer to the data and transform it
    df_et = pd.DataFrame(imputed_values, columns=observed.columns, index=df.index)

    # Restore any all-NaN columns in their original position.
    if len(observed.columns) != len(df.columns):
        df_et = df_et.reindex(columns=df.columns)

    # Specifying the file path where you want to save the CSV file
    directory="./Datasets/With Imputation/ET_Regressor Imputation"

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(directory, exist_ok=True)

    csv_file_path = os.path.join(directory, f'{filename}_ET.csv')

    # Writing the DataFrame to a CSV file
    df_et.to_csv(csv_file_path)

    return df_et

### 3. Mean Imputation

In [32]:
def mean_impute(df,filename):
    """Replace missing values with the column mean and save the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data containing NaNs. It is not modified.
    filename : str
        Prefix of the output file. The result is written to
        ``./Datasets/With Imputation/Mean Imputation/{filename}_Mean.csv``.

    Returns
    -------
    pandas.DataFrame
        Imputed data with the index and columns of ``df``. Columns that have no
        observed value at all have no mean and remain NaN.
    """
    # SimpleImputer discards features with no observed value, which would make
    # the array narrower than df.columns. Impute the usable columns only.
    observed = df.dropna(axis=1, how='all')

    imputer = SimpleImputer(strategy='mean')
    imputed_values = imputer.fit_transform(observed)
    df_mean = pd.DataFrame(imputed_values, columns=observed.columns, index=df.index)

    # Restore any all-NaN columns in their original position.
    if len(observed.columns) != len(df.columns):
        df_mean = df_mean.reindex(columns=df.columns)

    # Specifying the file path where you want to save the CSV file
    directory="./Datasets/With Imputation/Mean Imputation"

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(directory, exist_ok=True)

    csv_file_path = os.path.join(directory, f'{filename}_Mean.csv')

    # Writing the DataFrame to a CSV file
    df_mean.to_csv(csv_file_path)

    return df_mean

### 4. Median Imputation

In [33]:
def median_impute(df,filename):
    """Replace missing values with the column median and save the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data containing NaNs. It is not modified.
    filename : str
        Prefix of the output file. The result is written to
        ``./Datasets/With Imputation/Median Imputation/{filename}_Median.csv``.

    Returns
    -------
    pandas.DataFrame
        Imputed data with the index and columns of ``df``. Columns that have no
        observed value at all have no median and remain NaN.
    """
    # SimpleImputer discards features with no observed value, which would make
    # the array narrower than df.columns. Impute the usable columns only.
    observed = df.dropna(axis=1, how='all')

    imputer = SimpleImputer(strategy='median')
    imputed_values = imputer.fit_transform(observed)
    df_median = pd.DataFrame(imputed_values, columns=observed.columns, index=df.index)

    # Restore any all-NaN columns in their original position.
    if len(observed.columns) != len(df.columns):
        df_median = df_median.reindex(columns=df.columns)

    # Specifying the file path where you want to save the CSV file
    directory="./Datasets/With Imputation/Median Imputation"

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(directory, exist_ok=True)

    csv_file_path = os.path.join(directory, f'{filename}_Median.csv')

    # Writing the DataFrame to a CSV file
    df_median.to_csv(csv_file_path)

    return df_median

### 5. Backfill Imputation

In [34]:
def bfill_impute(df,filename):
    """Fill each missing value with the next valid observation below it and save the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing NaNs, ordered so that "next row" is meaningful.
        It is not modified.
    filename : str
        Prefix of the output file. The result is written to
        ``./Datasets/With Imputation/Back Fill Imputation/{filename}_Back_Fill.csv``.

    Returns
    -------
    pandas.DataFrame
        Back-filled copy of ``df``. Missing values at the end of a column have
        no later observation to copy from and therefore remain NaN.
    """
    # DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
    df_bfill = df.bfill()

    # Specifying the file path where you want to save the CSV file
    directory="./Datasets/With Imputation/Back Fill Imputation"

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(directory, exist_ok=True)

    csv_file_path = os.path.join(directory, f'{filename}_Back_Fill.csv')

    # Writing the DataFrame to a CSV file
    df_bfill.to_csv(csv_file_path)

    return df_bfill

### 6. Forward Fill Imputation

In [35]:
def ffill_impute(df,filename):
    """Fill each missing value with the last valid observation above it and save the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing NaNs, ordered so that "previous row" is meaningful.
        It is not modified.
    filename : str
        Prefix of the output file. The result is written to
        ``./Datasets/With Imputation/Forward Fill Imputation/{filename}_Forward_Fill.csv``.

    Returns
    -------
    pandas.DataFrame
        Forward-filled copy of ``df``. Missing values at the start of a column
        have no earlier observation to copy from and therefore remain NaN.
    """
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
    df_ffill = df.ffill()

    # Specifying the file path where you want to save the CSV file
    directory="./Datasets/With Imputation/Forward Fill Imputation"

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(directory, exist_ok=True)

    csv_file_path = os.path.join(directory, f'{filename}_Forward_Fill.csv')

    # Writing the DataFrame to a CSV file
    df_ffill.to_csv(csv_file_path)

    return df_ffill

## Creating the Imputed Datasets

In [36]:
def preprocess_csv(filepath,filename,imputation="PPCA"):
    """Load a raw CSV, index it by ``Datetime`` and apply the chosen imputation.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read. The literal string ``None`` in the file
        is treated as a missing value.
    filename : str
        Prefix forwarded to the imputation function, which uses it to name the
        CSV file it writes.
    imputation : str, default "PPCA"
        One of ``"PPCA"``, ``"ET_Regressor"``, ``"Mean"``, ``"Median"``,
        ``"BFill"`` or ``"FFill"``.

    Returns
    -------
    pandas.DataFrame
        The imputed data indexed by ``Datetime``. If ``imputation`` is not one
        of the recognised names, a warning is emitted and the data is returned
        without imputation.
    """
    # Local import: only this function needs the warnings module.
    import warnings

    df = pd.read_csv(filepath,na_values='None')

    # Set the Date Time column as the index (without mutating in place)
    df = df.set_index('Datetime')

    if imputation=="PPCA":
        df=PPCA_Imputation(df,filename)

    elif imputation=="ET_Regressor":
        df=et_reg(df,filename)

    elif imputation=="Mean":
        df=mean_impute(df,filename)

    elif imputation=="Median":
        df=median_impute(df,filename)

    elif imputation=="BFill":
        df=bfill_impute(df,filename)

    elif imputation=="FFill":
        df=ffill_impute(df,filename)

    else:
        # A misspelt method name used to fall through silently, handing the
        # caller data that still contains missing values. Make that visible.
        warnings.warn(
            f"Unknown imputation method {imputation!r}; returning the data without imputation. "
            "Expected one of: PPCA, ET_Regressor, Mean, Median, BFill, FFill.",
            stacklevel=2,
        )

    return df

In [37]:
# Build the path with os.path.join: a backslash-separated literal only works on
# Windows and contains invalid escape sequences ("\D", "\W", "\F").
df=preprocess_csv(
    os.path.join("Datasets", "Without Imputation", "Final_Dataset_Ghaziabad.csv"),
    "Ghaziabad",
)
df.describe()



Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,SO2,CO,Ozone,Benzene,Toluene,Temp,RH,WS,WD
count,33217.0,33217.0,33217.0,33217.0,33217.0,33217.0,33217.0,33217.0,33217.0,33217.0,33217.0,33217.0,33217.0,33217.0,33217.0
mean,124.629377,248.149939,23.261935,55.323391,45.764546,39.957938,20.498369,1.473976,42.049,3.335691,25.111851,29.119051,66.443967,1.705485,163.729579
std,123.484154,169.218485,53.98544,41.29864,56.168605,20.44126,23.141897,1.353074,35.97178,3.646394,21.510822,2.709916,18.374013,3.429617,90.433769
min,0.25,3.0,-20.039319,0.03,0.0,0.03,0.1,0.0,0.1,0.0,0.0,14.5,7.0,0.3,1.0
25%,43.5,125.0,1.88,24.52,13.53,27.3,6.7,0.71,15.83,0.95,10.0,27.7,53.5,0.48,108.5
50%,83.0,214.0,4.45,47.4,27.57,37.8,13.15,1.07,31.75,2.632402,22.12,29.68,67.5,1.18,164.321124
75%,161.75,324.5,16.3,75.4,53.3,48.65,24.0,1.66,54.92,4.248627,31.58,30.88,81.0,1.83,240.25
max,995.0,999.25,496.4,463.55,498.5,462.4,200.0,15.42,199.6,238.53,305.57,59.75,100.0,50.0,354.0


In [38]:
# Directory containing your CSV files (joined portably; a backslash-separated
# literal only works on Windows and contains the invalid escape "\W")
directory = os.path.join('Datasets', 'Without Imputation')

filepaths=[]

# Iterate through each file in the directory; os.listdir returns entries in
# arbitrary order, so sort them to get the same listing on every run
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".csv"):
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        # Collect the path so it can be preprocessed later
        filepaths.append(file_path)
        print(file_path)

Datasets\Without Imputation\Final_Dataset_Aotizhongxin.csv
Datasets\Without Imputation\Final_Dataset_Ghaziabad.csv
