<a href="https://colab.research.google.com/github/neeshanth/EDA-TH-DA-1/blob/main/mod_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from scipy.stats import zscore

# Read the Hepatitis C dataset from a local CSV export.
# NOTE(review): this is an absolute Windows path; it presumably will not
# resolve when the notebook is opened on Colab -- confirm where the file lives.
csv_path = "C:/VIT/sem_7/EDA Lab L59+L60/EDA Theory DA 1/HepatitisCdata.csv"
data = pd.read_csv(csv_path)

# Missing-value diagnosis: count the NaN cells as loaded, then count again
# after every literal 0 in the frame has been turned into NaN.
na_before = data.isna().sum().sum()
print("Total NA values:", na_before)

data = data.replace(0, np.nan)
na_after = data.isna().sum().sum()
print("Total NA values after replacing 0 with NA:", na_after)

# Outliers
def find_outliers(column, threshold=3):
    """Return the positions of values whose absolute z-score exceeds a limit.

    Missing values (NaN) are left out of the mean / standard-deviation
    computation and are never reported as outliers.  Without that, a single
    NaN would turn every z-score into NaN and hide all outliers.

    Args:
        column: One-dimensional array-like of numbers (list, NumPy array or
            pandas Series).
        threshold: Absolute z-score above which a value counts as an
            outlier.  Defaults to 3.

    Returns:
        A NumPy integer array with the 0-based positions, relative to
        ``column`` as it was passed in, of the outlying values.  Empty when
        there are none or when ``column`` is empty.
    """
    if len(column) == 0:
        # Nothing to score; avoid computing statistics of an empty sample.
        return np.array([], dtype=np.intp)
    z_scores = zscore(column, nan_policy="omit")
    # Comparisons against NaN are False, so missing entries are never flagged.
    return np.where(np.abs(z_scores) > threshold)[0]

# For every numeric column, record which rows hold z-score outliers.
outliers_idx = {}
for col in data.select_dtypes(include="number").columns:
    observed = data[col].dropna()
    # find_outliers reports positions inside `observed`; once NaN rows have
    # been dropped those positions no longer line up with the rows of `data`,
    # so translate them back to the frame's own index labels.
    outliers_idx[col] = observed.index[find_outliers(observed)].to_numpy()

print("Outliers indices:", outliers_idx)

# Scatter every CHOL measurement against its row position.
# The column is selected by name: the outlier arrays in `outliers_idx` hold
# row indices and must not be used as column positions for `iloc`.
chol_vector = data['CHOL']
x_axis = np.arange(len(chol_vector))
plt.scatter(x_axis, chol_vector, c='red', s=6)
plt.xlabel("Index")
plt.ylabel("CHOL")
plt.title("Outliers in CHOL")
plt.show()

print("Dimensions of data:", data.shape)

# Relationship visualization: Age on the x-axis against CHOL on the y-axis.
x = data['Age']
# Select CHOL by name; the outlier arrays in `outliers_idx` are row indices,
# not column positions, so they cannot be fed to `iloc` to pick the column.
y = data['CHOL']
sns.scatterplot(x=x, y=y)
plt.xlabel("Age")
plt.ylabel("CHOL")
plt.title("Age vs CHOL")
plt.show()

# Imputing missing values with the column mean.
# Columns listed here are left untouched.
skip_cols = {'col1', 'col3', 'col9'}  # Replace with actual column names
imputer = SimpleImputer(strategy="mean")
for col in data.columns:
    if col in skip_cols:
        continue
    col_data = data[col]
    # A mean only exists for numeric data with at least one observed value;
    # text columns and completely empty columns are reported and skipped
    # instead of being passed to the mean imputer.
    if not pd.api.types.is_numeric_dtype(col_data) or not col_data.notna().any():
        print(f"Skipping mean imputation for {col}: no numeric values to average")
        continue
    print(f"Number of missing values before imputing {col}: {col_data.isna().sum()}")

    # fit_transform returns an (n_rows, 1) array; flatten it before storing
    # it back as a single column.
    data[col] = imputer.fit_transform(data[[col]]).ravel()

    print(f"Number of missing values after imputing {col}: {data[col].isna().sum()}")

# Replace whatever NaN cells are still left with 0, then report how many
# zeros the CHOL column contains.  CHOL is selected by name; the outlier
# arrays in `outliers_idx` are row indices, not column positions.
data.fillna(0, inplace=True)
print("Number of zeros in CHOL vector:", (data['CHOL'] == 0).sum())

# Outlier treatment: for each column that had outliers recorded earlier,
# blank out the values whose |z-score| is above 3 and refill them with the
# mean of the remaining values.
for name, flagged in outliers_idx.items():
    if len(flagged) == 0:
        continue
    print(f"Attributes with outliers: {name}")
    series = data[name]
    # The z-scores are recomputed on the current column contents.
    is_outlier = np.abs(zscore(series)) > 3
    data[name] = np.where(is_outlier, np.nan, series)
    mean_filler = SimpleImputer(strategy="mean")
    data[name] = mean_filler.fit_transform(data[[name]])

# Descriptive statistics of the cleaned frame.
summary = data.describe()
print(summary)

# Standardise CHOL with a z-score and keep the result next to the raw column.
chol_values = data['CHOL']
chol_transformed = zscore(chol_values)
data['CHOL_transformed'] = chol_transformed

# Scatter the standardised CHOL values against Age on the current axes.
ax = plt.gca()
ax.scatter(data['Age'], chol_transformed, c='red', s=6)
ax.set_xlabel("Age")
ax.set_ylabel("Transformed CHOL")
ax.set_title("Age vs Transformed CHOL")
plt.show()

# Binning Age: cut the Age range into three intervals, one per label.
age_labels = ["Low", "Medium", "High"]
data['Age_bin'] = pd.cut(data['Age'], bins=len(age_labels), labels=age_labels)
bin_counts = data['Age_bin'].value_counts()
print(bin_counts)

# Bar-style histogram showing how many rows fall into each Age bin.
ax = sns.histplot(data['Age_bin'], kde=False)
ax.set_title("Age Bins Distribution")
ax.set_xlabel("Age Bin")
ax.set_ylabel("Count")
plt.show()