In [1]:
# 📌 Install necessary libraries (if not pre-installed in Kaggle)
!pip install -q opencv-python pydicom matplotlib seaborn 

# 📌 Import all required libraries for data processing, visualization, and deep learning
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import pydicom
import tensorflow as tf
import torch

# 📌 Print library versions to ensure compatibility
print("✅ Libraries successfully loaded!")
print(f"TensorFlow Version: {tf.__version__}")
print(f"PyTorch Version: {torch.__version__}")


✅ Libraries successfully loaded!
TensorFlow Version: 2.17.1
PyTorch Version: 2.5.1+cu121


# Load and Explore the Dataset

In [2]:
# 📌 Load the dataset and display its structure
import pandas as pd
import os

# 📌 Define the dataset path (Kaggle automatically mounts datasets in `/kaggle/input/`)
data_path = "/kaggle/input/data"  # Update if needed

# 📌 Load the metadata CSV file containing image labels and patient information
csv_file = os.path.join(data_path, "Data_Entry_2017.csv")
df = pd.read_csv(csv_file)

# 📌 Display the first five rows of the dataset
df.head()


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [3]:
# 📌 Display basic dataset information
print("✅ Dataset Overview:")
print(f"Total Records: {df.shape[0]}")
print(f"Total Unique Patients: {df['Patient ID'].nunique()}")
print(f"Total Unique Conditions: {df['Finding Labels'].nunique()}")

# 📌 Count occurrences of each disease
print("\n✅ Disease Distribution:")
disease_counts = df['Finding Labels'].value_counts()
print(disease_counts)

# 📌 Check for missing values
print("\n✅ Missing Values Check:")
print(df.isnull().sum())


✅ Dataset Overview:
Total Records: 112120
Total Unique Patients: 30805
Total Unique Conditions: 836

✅ Disease Distribution:
Finding Labels
No Finding                                                                      60361
Infiltration                                                                     9547
Atelectasis                                                                      4215
Effusion                                                                         3955
Nodule                                                                           2705
                                                                                ...  
Consolidation|Edema|Effusion|Mass|Nodule                                            1
Edema|Infiltration|Mass|Pneumonia|Pneumothorax                                      1
Consolidation|Effusion|Infiltration|Mass|Nodule|Pleural_Thickening|Pneumonia        1
Consolidation|Mass|Nodule|Pneumothorax                                              1


In [4]:
# 📌 Drop the unnecessary column "Unnamed: 11"
df_cleaned = df.drop(columns=["Unnamed: 11"], errors="ignore")

# 📌 Filter dataset to include only Pneumonia vs. No Finding cases
df_filtered = df_cleaned[df_cleaned["Finding Labels"].isin(["Pneumonia", "No Finding"])]

# 📌 Display the class distribution after filtering
print("✅ Updated Disease Distribution (After Filtering for Pneumonia vs. No Finding):")
print(df_filtered["Finding Labels"].value_counts())

# 📌 Display the first 5 rows of the cleaned dataset
df_filtered.head()


✅ Updated Disease Distribution (After Filtering for Pneumonia vs. No Finding):
Finding Labels
No Finding    60361
Pneumonia       322
Name: count, dtype: int64


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
13,00000005_000.png,No Finding,0,5,69,F,PA,2048,2500,0.168,0.168
14,00000005_001.png,No Finding,1,5,69,F,AP,2500,2048,0.168,0.168
15,00000005_002.png,No Finding,2,5,69,F,AP,2500,2048,0.168,0.168
16,00000005_003.png,No Finding,3,5,69,F,PA,2992,2991,0.143,0.143
