In [8]:
# Imports for loading and preprocessing the data
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [16]:
# Read the raw CSV (file name: heart_2022_with_nans.csv) into a DataFrame.
# NOTE(review): this is an absolute path on the author's machine; it must be
# adjusted before the notebook can run anywhere else.
raw_csv_path = "/Users/moiz/Downloads/archive (2)/2022/heart_2022_with_nans.csv"
df = pd.read_csv(raw_csv_path)

In [22]:
def clean_data(df, binary_features=None, outlier_std=3, correlation_threshold=0.8):
    """Return a cleaned copy of ``df``; the input frame itself is not modified.

    The steps run in this order:

    1. Impute missing values: the column mean for int64/float64 columns and
       the most frequent value for object columns.
    2. Drop duplicate rows (the original index labels are kept).
    3. Clip every numeric column to ``mean +/- outlier_std * std``.
    4. Recode the columns named in ``binary_features`` from 'Yes'/'No' to 1/0.
    5. Print every pair of numeric columns whose absolute correlation is
       greater than ``correlation_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data to clean.
    binary_features : list of str, optional
        Names of 'Yes'/'No' columns to recode. Names that are not present in
        ``df`` are ignored. Defaults to the list of column names built into
        this function.
    outlier_std : float, default 3
        Number of standard deviations around the mean used as clipping bounds.
    correlation_threshold : float, default 0.8
        Absolute correlation above which a pair of columns is reported.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy. Columns that were int64/float64 on input are
        returned as float64.
    """
    if binary_features is None:
        binary_features = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking',
                           'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']

    # Work on a copy so the caller's frame stays untouched.
    df_cleaned = df.copy()

    # 1. Handle missing values.
    numeric_features = df_cleaned.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = df_cleaned.select_dtypes(include=['object']).columns

    # Imputation is done column by column with pandas: an empty column
    # selection is then simply a no-op, and a column that is entirely NaN is
    # left as it is instead of breaking the assignment.
    for col in numeric_features:
        # Cast first so the (float) mean can be stored in integer columns.
        as_float = df_cleaned[col].astype('float64')
        col_mean = as_float.mean()
        if pd.notna(col_mean):
            as_float = as_float.fillna(col_mean)
        df_cleaned[col] = as_float

    for col in categorical_features:
        # mode() returns its values sorted, so ties resolve to the smallest one.
        modes = df_cleaned[col].mode(dropna=True)
        if not modes.empty:
            df_cleaned[col] = df_cleaned[col].fillna(modes.iloc[0])

    # 2. Remove duplicates.
    df_cleaned = df_cleaned.drop_duplicates()

    # 3. Address outliers by capping at `outlier_std` standard deviations.
    for col in numeric_features:
        mean = df_cleaned[col].mean()
        std = df_cleaned[col].std()
        if pd.isna(mean) or pd.isna(std):
            # Fewer than two usable values: there is no spread to clip against.
            continue
        df_cleaned[col] = df_cleaned[col].clip(lower=mean - outlier_std * std,
                                               upper=mean + outlier_std * std)

    # 4. Convert binary categorical variables to numeric.
    yes_no_codes = {'Yes': 1, 'No': 0}
    for col in binary_features:
        if col not in df_cleaned.columns:
            continue
        present = df_cleaned[col].dropna()
        # Recode only genuine Yes/No columns. Mapping a column that holds any
        # other value (including one that is already 0/1) would silently turn
        # those values into NaN, so such a column is left unchanged.
        if not present.empty and present.isin(list(yes_no_codes)).all():
            df_cleaned[col] = df_cleaned[col].map(yes_no_codes)

    # 5. Check for multicollinearity (only for numeric columns).
    numeric_cols = df_cleaned.select_dtypes(include=[np.number]).columns
    correlation_matrix = df_cleaned[numeric_cols].corr()
    above_threshold = correlation_matrix.abs().to_numpy() > correlation_threshold
    # The strict upper triangle lists each unordered pair once and skips the
    # diagonal (every column correlates perfectly with itself).
    row_idx, col_idx = np.where(np.triu(above_threshold, k=1))
    high_correlation = [(correlation_matrix.index[r], correlation_matrix.columns[c])
                        for r, c in zip(row_idx, col_idx)]

    print("Highly correlated features:")
    for feat1, feat2 in high_correlation:
        print(f"{feat1} and {feat2}")

    return df_cleaned

if __name__ == "__main__":
    # Script entry point: load the raw CSV, clean it, write the result next to
    # the current working directory and print a short report.

    # Load your data
    # NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
    raw_csv_path = '/Users/moiz/Downloads/archive (2)/2022/heart_2022_with_nans.csv'
    df = pd.read_csv(raw_csv_path)

    # Clean the data
    cleaned_df = clean_data(df)

    # Save the cleaned data to a new CSV file
    save_path = os.path.join(os.getcwd(), 'cleaned_data.csv')
    cleaned_df.to_csv(save_path, index=False)
    print(f"Cleaned data saved to: {save_path}")

    # Display the first few rows of the cleaned data
    print("\nFirst few rows of cleaned data:")
    print(cleaned_df.head())

    # Display info about the cleaned dataset.
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would add a stray "None" line).
    print("\nInfo about the cleaned dataset:")
    cleaned_df.info()

    # Display summary statistics
    print("\nSummary statistics of the cleaned data:")
    print(cleaned_df.describe())

Highly correlated features:
WeightInKilograms and BMI
Cleaned data saved to: /Users/moiz/Desktop/IntrotoAI/cleaned_data.csv

First few rows of cleaned data:
     State     Sex GeneralHealth  PhysicalHealthDays  MentalHealthDays  \
0  Alabama  Female     Very good                 0.0               0.0   
1  Alabama  Female     Excellent                 0.0               0.0   
2  Alabama  Female     Very good                 2.0               3.0   
3  Alabama  Female     Excellent                 0.0               0.0   
4  Alabama  Female          Fair                 2.0               0.0   

                                     LastCheckupTime PhysicalActivities  \
0  Within past year (anytime less than 12 months ...                 No   
1  Within past year (anytime less than 12 months ...                 No   
2  Within past year (anytime less than 12 months ...                Yes   
3  Within past year (anytime less than 12 months ...                Yes   
4  Within past year (an