## Importing Libraries

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy.stats import zscore


## Data Loading

In [None]:

# Load the four input tables from CSV files in the working directory.
# `df` and `df1` are read with pandas' default parsing.
df = pd.read_csv("census_2011.csv")
df1 = pd.read_csv('hospitals.csv')
# skiprows=1 discards the first line of the file before parsing, so the
# column names come from the file's second line.
df2 = pd.read_csv('government_hospitals.csv', skiprows=1)
# header=1 takes the column names from the file's second line, and
# index_col=0 makes the first column the row index.
# NOTE(review): presumably both files start with a title/banner line - confirm.
df3 = pd.read_csv('Employees State Insurance Corporation.csv', header=1, index_col=0)

# Display first 10 rows of census data
df.head(10)


## Columns of Census Data

In [None]:
# Show the column labels of the census frame.
df.columns

## Summary Statistics of Census Data

In [None]:
# Descriptive statistics for the census frame, using describe()'s default column selection.
df.describe()

## Handling Missing Values in Census Data

In [None]:

# Count the missing values in each census column: isnull() flags every
# missing cell and sum() totals the flags column by column.
df.isnull().sum()


## Heatmap of Null Values

In [None]:

# Columns included in the missing-value map (every column here; narrow the
# selection if the plot becomes too dense).
cols_to_visualize = df.columns
census_null_mask = df[cols_to_visualize].isnull()

# One heatmap cell per (row, column) entry of the boolean null mask.
null_fig, null_ax = plt.subplots(figsize=(20, 10))
sns.heatmap(census_null_mask, yticklabels=False, cbar=False, cmap="viridis", ax=null_ax)
plt.show()


## Removing Null Values and Checking with Graphs

In [None]:

def plot_missing_values(df4, title, before_imputation=True):
    """Draw a bar chart of the number of missing values in each column.

    Parameters
    ----------
    df4 : pandas.DataFrame
        Frame whose columns are counted with ``isnull().sum()``.
    title : str
        Text placed above the chart.
    before_imputation : bool, default True
        Picks the colour palette: ``'inferno'`` when true, ``'Greens'``
        otherwise.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    null_counts = df4.isnull().sum()

    if before_imputation:
        bar_palette = 'inferno'
    else:
        bar_palette = 'Greens'

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x=null_counts.index, y=null_counts.values, palette=bar_palette, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Columns')
    ax.set_ylabel('Number of Missing Values')
    # Turn the column-name tick labels vertical and reserve room for them
    # at the bottom of the figure.
    plt.xticks(rotation=90)
    fig.subplots_adjust(bottom=0.2)
    plt.show()

# Bar chart of missing values per column before any filling.
plot_missing_values(df, "Pre- Interpolation Census Data: Missing Value Analysis", before_imputation=True)

# Fill gaps in the numeric columns only. This is a forward fill (carry the
# previous row's value down) followed by a backward fill (which covers any
# leading gaps the forward fill could not reach) - it is not numerical
# interpolation. Non-numeric columns are left untouched.
numeric_columns = df.select_dtypes(include=np.number).columns
df[numeric_columns] = df[numeric_columns].ffill().bfill()

# Bar chart of missing values per column after filling.
plot_missing_values(df, "Post- Interpolation Census Data: Analysis of Null Values", before_imputation=False)

# Re-draw the null heatmap on its own figure, at the same size as the
# pre-fill heatmap, so the two can be compared side by side.
plt.figure(figsize=(20, 10))
sns.heatmap(df[cols_to_visualize].isnull(), yticklabels=False, cbar=False, cmap="viridis")
plt.title("Post- Interpolation Census Data: Null Value Heatmap")
plt.show()

# Remaining missing values per column (displayed as the cell's output).
df.isnull().sum()


## Working on Other CSV Files

In [None]:

# A notebook cell renders only its final bare expression, so the earlier
# previews are passed to display() (available in every IPython kernel) to
# make sure each one actually appears in the output.
display(df1.head())
display(df2.head())
display(df3.head())

display(df1.describe())
display(df2.describe())
display(df3.describe())

# Missing values per column in the hospitals table (shown as the cell output).
df1.isnull().sum()


## Handling Missing Values in Hospitals Data

In [None]:

# Coerce these columns to numbers; entries that cannot be parsed become NaN
# (errors='coerce') and are handled by the fill step in this section.
hospital_count_columns = ['PHC', 'CHC', 'SDH', 'DH', 'Hospitals', 'HospitalBeds']
df1[hospital_count_columns] = df1[hospital_count_columns].apply(pd.to_numeric, errors='coerce')

def heatmaps(df1, title, before_imputation=True):
    """Show a heatmap of the null mask of ``df1`` under the given title.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame whose ``isnull()`` mask is plotted (row labels are hidden).
    title : str
        Title drawn above the heatmap.
    before_imputation : bool, default True
        Accepted for call-site symmetry; the body does not read it.
    """
    null_mask = df1.isnull()
    heat_ax = sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap="viridis")
    heat_ax.set_title(title)
    plt.show()

# Null map of the hospitals table before filling.
heatmaps(df1, "Pre- Interpolation Hospitals Data: Missing Value Analysis", before_imputation=True)

# Forward-fill, then backward-fill, the numeric columns only.
numeric_columns = df1.select_dtypes(include=np.number).columns
df1[numeric_columns] = (
    df1[numeric_columns]
    .ffill()
    .bfill()
)

# Null map of the same table after filling.
heatmaps(df1, "Post- Interpolation Hospitals Data: Analysis of Null Values", before_imputation=False)


## Handling Outliers in Census Data

In [None]:

def outliers_zscore_quantile(df, columns, threshold=3):
    """Drop rows that are outliers in any of ``columns``.

    Each column is processed in turn, on the rows that survived the previous
    columns:

    1. IQR fence - keep rows whose value lies within
       ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (inclusive). Rows with a missing
       value in the column fail the comparison and are dropped.
    2. Z-score - of the remaining rows, keep those whose absolute
       population z-score (``ddof=0``) is strictly below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; filtering produces new frames.
    columns : iterable of column labels
        Numeric columns to screen. If empty, ``df`` is returned unchanged.
    threshold : float, default 3
        Z-score cut-off used in step 2.

    Returns
    -------
    pandas.DataFrame
        The rows that passed both filters for every listed column.
    """
    for col in columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Positional boolean filtering keeps exactly one copy of each row,
        # even when the index contains repeated labels.
        within_fence = (values >= lower_bound) & (values <= upper_bound)
        df = df.loc[within_fence.to_numpy()]

        kept = df[col]
        spread = kept.std(ddof=0)
        # A zero spread (constant column) or NaN spread (no rows left) makes
        # the z-score undefined; there is nothing to flag, so keep the rows
        # instead of letting NaN comparisons discard all of them.
        if not spread > 0:
            continue

        z_scores = (kept - kept.mean()) / spread
        df = df.loc[(z_scores.abs() < threshold).to_numpy()]
    return df

# Box plots of the three household-size columns, to inspect their outliers.
household_size_columns = [
    'Household_size_1_person_Households',
    'Household_size_2_persons_Households',
    'Household_size_3_persons_Households',
]
box_fig, box_ax = plt.subplots(figsize=(16, 10))
sns.boxplot(data=df[household_size_columns], ax=box_ax)
box_ax.set_title("Boxplot for Household Size Columns")
plt.show()
