In [1]:
import pandas as pd
import numpy as np
import sys
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

print("Python interpreter:", sys.executable)


Python interpreter: /Users/hayden/Developer/Lung-Cancer-Prediction-Model-2025/.venv/bin/python


In [7]:

# Load the dataset
DATA_PATH = "../data/lung_cancer_data.csv"
df = pd.read_csv(DATA_PATH)
print(f"Loaded data shape: {df.shape}")
df.head()


Loaded data shape: (1000, 26)


Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [9]:
# Clean and normalize column names
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("-", "_")
)

# Preview columns
print("Processed - Normalized columns:")
print(df.columns.tolist())

Processed - Normalized columns:
['index', 'patient_id', 'age', 'gender', 'air_pollution', 'alcohol_use', 'dust_allergy', 'occupational_hazards', 'genetic_risk', 'chronic_lung_disease', 'balanced_diet', 'obesity', 'smoking', 'passive_smoker', 'chest_pain', 'coughing_of_blood', 'fatigue', 'weight_loss', 'shortness_of_breath', 'wheezing', 'swallowing_difficulty', 'clubbing_of_finger_nails', 'frequent_cold', 'dry_cough', 'snoring', 'level']


In [10]:
# Preview and check for missing values
print(df.info())
print("\nMissing values:\n", df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   index                     1000 non-null   int64 
 1   patient_id                1000 non-null   object
 2   age                       1000 non-null   int64 
 3   gender                    1000 non-null   int64 
 4   air_pollution             1000 non-null   int64 
 5   alcohol_use               1000 non-null   int64 
 6   dust_allergy              1000 non-null   int64 
 7   occupational_hazards      1000 non-null   int64 
 8   genetic_risk              1000 non-null   int64 
 9   chronic_lung_disease      1000 non-null   int64 
 10  balanced_diet             1000 non-null   int64 
 11  obesity                   1000 non-null   int64 
 12  smoking                   1000 non-null   int64 
 13  passive_smoker            1000 non-null   int64 
 14  chest_pain               

In [11]:
# Encode categorical variables
# Identify object-type columns (assume these are categorical)
cat_cols = df.select_dtypes(include='object').columns.tolist()
print("Categorical columns:", cat_cols)

label_encoders = {}
for col in cat_cols:
    df[col] = df[col].astype(str).str.strip().str.lower()
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoder for future decoding if needed

Categorical columns: ['patient_id', 'level']


In [12]:
# Scale numeric columns
if 'age' in df.columns:
    scaler = StandardScaler()
    df['age'] = scaler.fit_transform(df[['age']])
    print("Scaled age column.")

Scaled age column.


In [13]:
# Optional: Visualize class balance (if target column exists)
target_col = 'lung_cancer'  # Adjust if different
if target_col in df.columns:
    sns.countplot(x=target_col, data=df)
    plt.title("Target Variable Distribution")
    plt.show()
else:
    print(f"Target column '{target_col}' not found. Skipping target visualization.")


Target column 'lung_cancer' not found. Skipping target visualization.


In [16]:
# Save preprocessed dataset
OUTPUT_PATH = "../data/processed_lung_cancer_data.csv"
df.to_csv(OUTPUT_PATH, index=False)
print(f"Preprocessed data saved to: {OUTPUT_PATH}")


Preprocessed data saved to: ../data/processed_lung_cancer_data.csv
