# Step 2: Data  Cleaning and Data Processing

---

** Table of content


# Data Cleaning

### Imports and environment setup

In [1]:
# Imports and environment setup

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler


plt.style.use('ggplot')

### Preprocessing

This is a summarize of what we have done in the EDA

In [2]:
# Load datasets
df_train = pd.read_csv("../data/data_train.csv")  # Train dataset

df_test = pd.read_csv("../data/data_test.csv")    # Test dataset

df_train_cleaned = df_train.copy()
df_test_cleaned = df_test.copy()

# Remove duplications (if exist)
df_train_cleaned.drop_duplicates(inplace=True)
df_test_cleaned.drop_duplicates(inplace=True)

# Dropping unused columns
unused_columns = ['Id', 'AnyHealthcare', 'NoDocbcCost', 'Education']
df_train_cleaned = df_train.drop(columns=unused_columns, inplace=True)  # Unused col
df_test_cleaned = df_test.drop(columns=unused_columns, inplace=True)    # Unused col

# Engineering Feature
df_train_cleaned['BP_Chol_Interaction'] = df_train_cleaned['HighBP'] * df_train_cleaned['HighChol']
df_train_cleaned = df_train_cleaned.drop(columns=['HighBP', 'HighChol']) # Feature engineering

df_test_cleaned['BP_Chol_Interaction'] = df_test_cleaned['HighBP'] * df_test_cleaned['HighChol']
df_test_cleaned = df_test_cleaned.drop(columns=['HighBP', 'HighChol']) # Feature engineering


# Remove duplications (if exist). We do this a second time because we just dropped two columns, and more duplications might show up
df_train_cleaned.drop_duplicates(inplace=True)
df_test.drop_duplicates(inplace=True)

# Convert any infinite value into NaN
df_train_cleaned.replace([np.inf, -np.inf], np.nan, inplace=True)
df_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Convert categorical feature into category data type

categorical_features = [
    'BP_Chol_Interaction', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
    'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 
    'DiffWalk', 'Sex', 'Age', 'Income'
]

for column in categorical_features:
    df_train_cleaned[column] = df_train_cleaned[column].astype('category')
    df_test[column] = df_test[column].astype('category')

## Normalize/Standarize numerical feature

scaler = StandardScaler()
numerical_features = ['BMI', 'MentHlth', 'PhysHlth', 'GenHtlth', 'ExtraMedTest', 'ExtraAlcoholTest']

scaler.fit(df_train_cleaned[numerical_features])  # Fit on training data

df_train_cleaned[numerical_features] = scaler.transform(df_train_cleaned[numerical_features]) # Transform both training datasets
df_test_cleaned[numerical_features] = scaler.transform(df_test_cleaned[numerical_features]) # Transform both test datasets

# Outliers 

for column in numerical_features:
    Q1 = df_train_cleaned[column].quantile(0.25)
    Q3 = df_train_cleaned[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter out the outliers from each column
    df_train_cleaned = df_train_cleaned[(df_train_cleaned[column] >= lower_bound) & (df_train_cleaned[column] <= upper_bound)]

df_train_cleaned = df_train_cleaned.dropna()






IndentationError: unexpected indent (3964323863.py, line 55)

In [None]:
# Save the cleaned and processed data into a new file 


