
# Exploratory Data Analysis (EDA) + Preprocessing


## --- 1. Imports and Loading Data ---

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Resolve the dataset location for the current environment.
# Importing google.colab unconditionally raises ImportError anywhere but
# Colab, so the local path is selected automatically instead of by
# commenting lines in and out.
try:
    from google.colab import drive
except ImportError:
    # Local run (e.g. VS Code): path is relative to the notebook's working directory.
    data_path = "../server/app/ml/data/dummy_organ_data.csv"
else:
    # Colab run: the CSV lives on the mounted Google Drive.
    drive.mount('/content/drive')
    data_path = "/content/drive/MyDrive/organ_matching_project/dummy_organ_data.csv"

# Read the raw dataset and report its size as a quick sanity check.
df = pd.read_csv(data_path)
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset loaded: 400 rows, 20 columns


## --- 3. Initial Data Overview ---

In [7]:
# Peek at the first rows to sanity-check column names and value formats.
print("\nFirst 5 rows:")
print(df.head())

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() appends a stray "None" line.
print("\nDataset Info:")
df.info()

# Per-column count of missing entries.
print("\nMissing Values:")
print(df.isnull().sum())



First 5 rows:
  organ_type donor_blood_type  donor_age donor_gender hla_a hla_b hla_c  \
0       Lung                O         50         Male    A2    B7   Cw4   
1     Kidney               AB         30         Male    A1    B8   Cw3   
2      Liver               AB         46         Male    A3    B8   Cw5   
3      Liver               AB         24         Male    A2    B8   Cw4   
4      Liver                B         31       Female    A2    B8   Cw5   

  hla_drb1 hla_dqb1 organ_needed blood_type  age  gender  priority_status  \
0      DR1      DQ2         Lung          B   39    Male                3   
1      DR1      DQ2         Lung         AB   13    Male                4   
2      DR7      DQ2        Liver         AB   39  Female                4   
3      DR4      DQ3        Liver          A   52    Male                2   
4      DR1      DQ2         Lung          A   53  Female                1   

  hla_a_p hla_b_p hla_c_p hla_drb1_p hla_dqb1_p  score  
0      A1     

## --- 4. Basic Statistics ---

In [None]:
# Summary statistics for every column; include='all' covers the
# non-numeric columns as well as the numeric ones.
summary_table = df.describe(include='all')

print("\nBasic Statistics:")
print(summary_table)

## --- 5. Visualizations ---

In [None]:
# Frequency bar charts for the categorical columns, one figure per column.
for column, heading in (
    ('organ_type', 'Distribution of Organ Types'),
    ('donor_blood_type', 'Distribution of Donor Blood Types'),
):
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x=column)
    plt.title(heading)
    plt.show()

# Age histograms (20 bins) with a KDE overlay, one figure per column.
for column, heading in (
    ('donor_age', 'Distribution of Donor Age'),
    ('age', 'Distribution of Patient Age'),
):
    plt.figure(figsize=(6, 4))
    sns.histplot(df[column], bins=20, kde=True)
    plt.title(heading)
    plt.show()

# Correlation heatmap (numerical only)
numeric_corr = df.corr(numeric_only=True)
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_corr, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap')
plt.show()

## --- 6. Handle Missing Values ---

In [None]:
# Impute missing values per column type.
# Filling the whole frame with the string 'Unknown' would also write text
# into numeric columns, silently turning them into object dtype, so the
# placeholder is restricted to the non-numeric columns.
fill_values = {
    col: 'Unknown' for col in df.select_dtypes(exclude='number').columns
}

# Numeric columns are filled with their own median so they stay numeric.
# NOTE(review): median is a generic default -- confirm it is appropriate for
# every numeric column in this dataset.
fill_values.update(df.select_dtypes(include='number').median().to_dict())

# Rebinding (rather than inplace=True) keeps the cell safe to re-run.
df = df.fillna(value=fill_values)

## --- 7. Encode Categorical Features ---

In [10]:
from sklearn.preprocessing import LabelEncoder

# Columns whose values are replaced by integer codes.
categorical_cols = [
    "organ_type", "donor_blood_type", "donor_gender",
    "hla_a", "hla_b", "hla_c", "hla_drb1", "hla_dqb1",
    "organ_needed", "blood_type", "gender",
    "hla_a_p", "hla_b_p", "hla_c_p", "hla_drb1_p", "hla_dqb1_p",
]

# One independent encoder per column, keyed by column name so the fitted
# mapping for each column can be looked up later.
encoders = {name: LabelEncoder() for name in categorical_cols}

# Fit each encoder on its own column and overwrite that column with the codes.
for name, encoder in encoders.items():
    df[name] = encoder.fit_transform(df[name])


## --- 8. Save Preprocessed Data ---

In [11]:

# Destination of the encoded dataset on the mounted Drive.
processed_path = "/content/drive/MyDrive/organ_matching_project/processed_organ_data.csv"

# Create the target folder if it does not exist yet, then write the frame
# without the index column.
output_dir = os.path.dirname(processed_path)
os.makedirs(output_dir, exist_ok=True)
df.to_csv(processed_path, index=False)

print(f"\nProcessed dataset saved to: {processed_path}")


Processed dataset saved to: /content/drive/MyDrive/organ_matching_project/processed_organ_data.csv


### --- 9. Save Encoders (Optional) ---

In [12]:

import pickle

# Persist the per-column encoders next to the processed dataset.
# Pickle files should only ever be loaded back from a trusted location.
encoder_path = "/content/drive/MyDrive/organ_matching_project/label_encoders.pkl"
with open(encoder_path, 'wb') as encoder_file:
    pickle.dump(encoders, encoder_file)

print(f"Encoders saved to: {encoder_path}")

Encoders saved to: /content/drive/MyDrive/organ_matching_project/label_encoders.pkl
