# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
# Task 1: Data Cleaning
# Read the raw patient records from the CSV file on disk.

data_file = 'Dataset.csv'
df_patients = pd.read_csv(data_file)

# Preview the top of the table as a quick sanity check of the load.

print("First 5 rows of the dataset:", df_patients.head(), sep="\n")

# Report the table dimensions as (rows, columns).

print(f"Dataset Shape: {df_patients.shape}")

# Identify and handle missing values

# Count the missing values pandas already recognises (before '?' is mapped).

print("Missing values in each column (before replacement):")
print(df_patients.isnull().sum())

# The data marks missing entries with '?'. Map them to NaN so that
# isnull()/fillna() treat them as missing.

df_patients.replace('?', float('nan'), inplace=True)

# A numeric column that contained '?' is loaded by read_csv as text (object
# dtype). Convert such a column back to numbers when every remaining value
# parses cleanly; otherwise it would be skipped by the mean imputation below
# and filled with a string mode instead.

for col in df_patients.select_dtypes(include=['object']).columns:
    as_number = pd.to_numeric(df_patients[col], errors='coerce')
    # Equal non-null counts mean coercion did not lose any real value.
    if as_number.notna().sum() == df_patients[col].notna().sum():
        df_patients[col] = as_number

# Verify the missing values after replacement

print("Missing values in each column (after replacing '?'):")
print(df_patients.isnull().sum())

# Impute missing values separately for numeric and categorical columns

# Numeric columns: fill missing values with the column mean.

numeric_cols = df_patients.select_dtypes(include=['float64', 'int64']).columns
df_patients[numeric_cols] = df_patients[numeric_cols].fillna(df_patients[numeric_cols].mean())

# Categorical columns: fill missing values with the mode (most frequent value).

for col in df_patients.select_dtypes(include=['object']).columns:
    modes = df_patients[col].mode()
    # mode() is empty when the whole column is missing; there is nothing to
    # impute with, so leave the column as it is instead of raising IndexError.
    if not modes.empty:
        df_patients[col] = df_patients[col].fillna(modes.iloc[0])


# Verify that missing values have been handled

print("Missing values after imputation:")
print(df_patients.isnull().sum())


# Outlier Detection and Removal using IQR

# Only genuine numeric features take part in outlier detection. Identifier,
# target and categorical-code columns are skipped (the same columns the
# normalisation step leaves alone): applying the IQR rule to them would drop
# rows just for belonging to a less frequent class.

non_feature_cols = ['index', 'ICU', 'SEX', 'CLASIFFICATION_FINAL']
numeric_cols = (
    df_patients.select_dtypes(include=['float64', 'int64'])
    .columns
    .difference(non_feature_cols, sort=False)
)
print("Numeric columns for outlier detection:", numeric_cols)

# Calculate the first (Q1) and third (Q3) quartiles and the interquartile range
Q1 = df_patients[numeric_cols].quantile(0.25)
Q3 = df_patients[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier if ANY of its features lies outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
is_outlier = (
    (df_patients[numeric_cols] < (Q1 - 1.5 * IQR))
    | (df_patients[numeric_cols] > (Q3 + 1.5 * IQR))
).any(axis=1)

# .copy() gives an independent frame, so the later column assignments and
# drops do not trigger SettingWithCopyWarning.
df_patients = df_patients[~is_outlier].copy()

# Check the new shape after outlier removal
print(f"Shape after removing outliers: {df_patients.shape}")

# Normalise the Features

# Scale every numeric feature column (e.g. AGE) to the [0, 1] range.
# Identifier, target and categorical-code columns are left untouched.

not_scaled = {'index', 'ICU', 'SEX', 'CLASIFFICATION_FINAL'}
numeric_cols = [
    col
    for col in df_patients.select_dtypes(include=['int64', 'float64']).columns
    if col not in not_scaled
]

print("Numeric columns to normalize:", numeric_cols)

# preprocessing.normalize() rescales each ROW to unit norm, which mixes the
# features of one patient together and does not put the columns on a common
# scale. Feature normalisation needs a per-COLUMN scaler, so min-max scaling
# is used instead. Skipped when there is nothing to scale, because the scaler
# rejects empty input.

if numeric_cols and not df_patients.empty:
    df_patients[numeric_cols] = preprocessing.minmax_scale(df_patients[numeric_cols])
print(df_patients[numeric_cols])

# Print a few rows to check that normalization worked

print("Data after normalization:")
print(df_patients.head())

# Final check: print the final shape of the dataset

print(f"Final dataset shape after normalization: {df_patients.shape}")

# Verifying Data Types and Removing Irrelevant Columns

# Check the data types

print("Data types before adjustment:")
print(df_patients.dtypes)


# Remove irrelevant columns (e.g. index)

columns_to_remove = ['index']  # Add any other irrelevant columns to this list if needed.

# errors='ignore' keeps this step working when a listed column is absent;
# the rest of the script already treats 'index' as optional. Reassigning
# (rather than inplace=True) avoids mutating a filtered view of the data.
df_patients = df_patients.drop(columns=columns_to_remove, errors='ignore')

# Verify the data types and the list of columns after removal

print("Data types after adjustment:")
print(df_patients.dtypes)
print("Remaining columns:")
print(df_patients.columns)

# Save the cleaned and normalized dataset to a CSV file for later use.
# Done after the column removal so the file matches the final frame.

df_patients.to_csv('cleaned_patients.csv', index=False)

# In [None]:

# Task 2: Data Visualisation
# Force the two plotted columns to numeric; unparseable entries become NaN.
df_patients[['ICU', 'AGE']] = df_patients[['ICU', 'AGE']].apply(
    pd.to_numeric, errors='coerce'
)

# Plot 1: how many rows fall under each value of the target variable 'ICU'
plt.figure(figsize=(8, 6))
sns.countplot(x='ICU', data=df_patients)
plt.gca().set(
    title='Distribution of ICU Cases',
    xlabel='ICU (Indicator)',
    ylabel='Count',
)
plt.grid(True)
plt.show()

# Plot 2: Count of number of ICU cases against age.
# Rows with ICU == 2.0 are treated as ICU admissions.
# NOTE(review): confirm against the data dictionary that code 2 means
# "admitted to ICU".
# NOTE(review): AGE appears to have been rescaled during cleaning, so the
# x-axis is likely in scaled units rather than years - confirm.
ICU_Cases = df_patients[df_patients['ICU'] == 2.0]

plt.figure(figsize=(12, 8))
sns.histplot(data=ICU_Cases, x='AGE', bins=30, kde=False)
plt.title('ICU Cases vs. Age')
plt.xlabel('AGE')
plt.ylabel('ICU Count')
plt.grid(True)
plt.show()

# A bare describe() in the middle of a cell is evaluated and thrown away;
# print it so the summary statistics are actually shown.
print(df_patients.describe())
df_patients.to_csv("cleaned_patients.csv", index=False)

# Plot 3: Count of ICU admissions versus classification
# Count the rows flagged as ICU admissions (ICU == 2.0, the same definition
# used for the other ICU plots) within each classification. Summing the raw
# ICU column would add up code values, which is not a count of admissions.
# Summing the boolean mask keeps classifications with zero admissions.
ICU_Counts = (
    (df_patients['ICU'] == 2.0)
    .groupby(df_patients['CLASIFFICATION_FINAL'])
    .sum()
)

plt.figure(figsize=(10, 6))
ICU_Counts.plot(kind='bar')
plt.title('Count of ICU Admissions by classification')
plt.xlabel('Classification')
plt.ylabel('ICU Admissions')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Plot 4:
# Show the scatter matrix plot and the correlation matrix.
# Constant columns are dropped: they carry no information for either plot.
numeric_cols2 = df_patients.select_dtypes(include=['float64', 'int64']).columns
numeric_cols2 = [col for col in numeric_cols2 if df_patients[col].nunique() > 1]

# PairGrid takes a long time, so sample the data. The sample size is capped
# at the number of available rows because sample() raises ValueError when
# asked for more rows than exist (without replacement).
sample_size = min(500, len(df_patients))
sample_df = df_patients[numeric_cols2].sample(n=sample_size, random_state=26)

# Scatter Matrix: scatter plots above the diagonal, density contours below,
# histograms on the diagonal.
graph = sns.PairGrid(sample_df)
graph.map_upper(sns.scatterplot)
graph.map_lower(sns.kdeplot, cmap='Blues_d')
graph.map_diag(sns.histplot)
# y > 1 lifts the title above the top row of panels instead of over them.
plt.suptitle('Scatter Matrix', y=1.02)
plt.show()

# Correlation matrix over the full (unsampled) data
correlation_matrix = df_patients[numeric_cols2].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

# Other plots that may be useful
# Plot 5: ICU Case vs Sex
# Rows with ICU == 2.0 are treated as ICU admissions.
# NOTE(review): confirm against the data dictionary that code 2 means
# "admitted to ICU".
ICU_Cases = df_patients[df_patients['ICU'] == 2.0]
plt.figure(figsize=(8, 6))
sns.countplot(data=ICU_Cases, x='SEX')
plt.title('ICU Cases vs. Sex')
plt.xlabel('Sex')
plt.ylabel('ICU Count')
plt.grid(True)
plt.show()
# No describe()/to_csv() here: the bare describe() result was discarded, and
# df_patients has not been modified since it was last written to
# cleaned_patients.csv after Plot 2, so re-saving would rewrite the same file.

# Plot 6: Medical Unit vs. Age
# Box plot used to inspect how MEDICAL_UNIT relates to AGE. The author
# observed a negative correlation between the two; that cannot be verified
# from the code alone.
plt.figure(figsize=(12, 8))
sns.boxplot(x='AGE', y='MEDICAL_UNIT', data=df_patients)
plt.gca().set(
    title='Medical Unit vs. Age',
    xlabel='Age',
    ylabel='Medical Unit',
)
plt.grid(True)
plt.show()
