<a href="https://colab.research.google.com/github/mariaR5/lung-cancer-survival-prediction/blob/main/Lung_Cancer_Survival.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load Dataset

In [2]:
import pandas as pd

data_dir = '/content/drive/MyDrive/Unified_Mentor_Projects/Lung_Cancer'
df = pd.read_csv(f'{data_dir}/dataset_med.csv')
df.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Male,Sweden,2016-04-05,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,2,50.0,Female,Netherlands,2023-04-20,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,3,65.0,Female,Hungary,2023-04-05,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,4,51.0,Female,Belgium,2016-02-05,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,5,37.0,Male,Luxembourg,2023-11-29,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


# Data Preprocessing

In [3]:
# Drop unnecessary columns
df = df.drop(columns=['id', 'diagnosis_date', 'end_treatment_date'])

# Drop missing targets
df = df.dropna(subset=['survived'])

# Remove duplicates
df = df.drop_duplicates()

In [4]:
# Fill missing numeric columns with median
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill categorical columns with mode
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Stratified sample
df_small = df.groupby('survived', group_keys=False).apply(lambda x: x.sample(frac=0.15, random_state=42)).reset_index(drop=True)

print("Reduced dataset shape:", df_small.shape)

Reduced dataset shape: (133499, 14)


  df_small = df.groupby('survived', group_keys=False).apply(lambda x: x.sample(frac=0.15, random_state=42)).reset_index(drop=True)


Encode Categorical Values

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in df_small.select_dtypes(include='object').columns:
    df_small[col] = le.fit_transform(df_small[col])

X = df_small.drop('survived', axis=1)
y = df_small['survived']

Handle Class Imbalance (SMOTE)

In [6]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y)

# Model Training and Evaluation

Train - Test Split

In [7]:
from sklearn.model_selection import train_test_split

# Final split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

Feature Scaling

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Model Training

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model = RandomForestClassifier(n_estimators=24, random_state=42)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7406340057636888

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.77      0.75     20820
           1       0.75      0.72      0.73     20820

    accuracy                           0.74     41640
   macro avg       0.74      0.74      0.74     41640
weighted avg       0.74      0.74      0.74     41640


Confusion Matrix:
 [[15951  4869]
 [ 5931 14889]]


# Save Model

In [10]:
import joblib
joblib.dump(model, 'model.pkl', compress=3)
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

Exporting to Google Drive

In [11]:
import shutil

shutil.copy('model.pkl', data_dir)
shutil.copy('scaler.pkl', data_dir)

'/content/drive/MyDrive/Unified_Mentor_Projects/Lung_Cancer/scaler.pkl'