In [None]:
#Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV

In [None]:
# Load dataset
file_path = "cleaned_dataset.csv"
df = pd.read_csv(file_path)

# Fill missing values
for col in ['Radiation recode', 'Chemotherapy recode', 'Radiation sequence with surgery']:
    df[col].fillna(-1, inplace=True)

# Drop rows with missing survival status
df.dropna(subset=['stutus_5_years'], inplace=True)
print(df.shape)


(24929, 15)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(-1, inplace=True)


In [None]:
# Define features and target variables
X = df.drop(columns=['Radiation recode', 'Chemotherapy recode', 'Radiation sequence with surgery'])
y_radiation = df['Radiation recode']
y_chemotherapy = df['Chemotherapy recode']
y_sequence = df['Radiation sequence with surgery']

# Convert string columns in X to numerical using Label Encoding
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Split data into training and testing sets (80-20)
X_train, X_test, y_radiation_train, y_radiation_test = train_test_split(X, y_radiation, test_size=0.2, random_state=42)
X_train, X_test, y_chemotherapy_train, y_chemotherapy_test = train_test_split(X, y_chemotherapy, test_size=0.2, random_state=42)
X_train, X_test, y_sequence_train, y_sequence_test = train_test_split(X, y_sequence, test_size=0.2, random_state=42)

In [None]:
# Train Random Forest models
rf_radiation = RandomForestClassifier(n_estimators=100, random_state=42)
rf_radiation.fit(X_train, y_radiation_train)
accuracy_rf_radiation = accuracy_score(y_radiation_test, rf_radiation.predict(X_test))

rf_chemotherapy = RandomForestClassifier(n_estimators=100, random_state=42)
rf_chemotherapy.fit(X_train, y_chemotherapy_train)
accuracy_rf_chemotherapy = accuracy_score(y_chemotherapy_test, rf_chemotherapy.predict(X_test))

rf_sequence = RandomForestClassifier(n_estimators=100, random_state=42)
rf_sequence.fit(X_train, y_sequence_train)
accuracy_rf_sequence = accuracy_score(y_sequence_test, rf_sequence.predict(X_test))

In [None]:
print(f"Random Forest - Radiation: {accuracy_rf_radiation:.4f}, Chemotherapy: {accuracy_rf_chemotherapy:.4f}, Sequence: {accuracy_rf_sequence:.4f}")

Random Forest - Radiation: 0.6254, Chemotherapy: 0.7900, Sequence: 0.6819


In [None]:
# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

In [None]:
# RandomizedSearchCV for Radiation
rf_radiation = RandomForestClassifier(random_state=42)
rf_search_radiation = RandomizedSearchCV(rf_radiation, param_grid, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
rf_search_radiation.fit(X_train_scaled, y_radiation_train)

In [None]:
# Best model for Radiation
best_rf_radiation = rf_search_radiation.best_estimator_
accuracy_rf_radiation = accuracy_score(y_radiation_test, best_rf_radiation.predict(X_test_scaled))

In [None]:
# RandomizedSearchCV for Chemotherapy
rf_chemotherapy = RandomForestClassifier(random_state=42)
rf_search_chemotherapy = RandomizedSearchCV(rf_chemotherapy, param_grid, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
rf_search_chemotherapy.fit(X_train_scaled, y_chemotherapy_train)

# Best model for Chemotherapy
best_rf_chemotherapy = rf_search_chemotherapy.best_estimator_
accuracy_rf_chemotherapy = accuracy_score(y_chemotherapy_test, best_rf_chemotherapy.predict(X_test_scaled))


In [None]:
# RandomizedSearchCV for Sequence
rf_sequence = RandomForestClassifier(random_state=42)
rf_search_sequence = RandomizedSearchCV(rf_sequence, param_grid, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
rf_search_sequence.fit(X_train_scaled, y_sequence_train)

# Best model for Sequence
best_rf_sequence = rf_search_sequence.best_estimator_
accuracy_rf_sequence = accuracy_score(y_sequence_test, best_rf_sequence.predict(X_test_scaled))




In [None]:
# Print accuracy
print(f"Optimized Random Forest Accuracy - Radiation: {accuracy_rf_radiation:.4f}, Chemotherapy: {accuracy_rf_chemotherapy:.4f}, Sequence: {accuracy_rf_sequence:.4f}")

Optimized Random Forest Accuracy - Radiation: 0.6484, Chemotherapy: 0.7970, Sequence: 0.7128


In [None]:
# Decode encoded data of every column
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    le.fit(X[col])
    X[col] = le.inverse_transform(X[col])




In [None]:
#Print 5 rows of dataset
df.head()


Unnamed: 0,Age at diagnosis,Regional nodes positive (1988+),Total number of in situ/malignant tumors for patient,Radiation recode,Chemotherapy recode,Radiation sequence with surgery,ER Status Recode Breast Cancer (1990+),PR Status Recode Breast Cancer (1990+),CS tumor size (2004-2015),Derived HER2 Recode (2010+),Regional nodes examined (1988+),COD to site recode,Race recode,interva_years,stutus_5_years
0,72,19,1,Beam radiation,No/Unknown,Radiation after surgery,Positive,Positive,46,Negative,19.0,Alive,White,5.0,Alive
1,42,2,1,None/Unknown,Yes,No radiation and/or cancer-directed surgery,Positive,Positive,17,Negative,98.0,Alive,White,6.0,Alive
2,45,0,1,None/Unknown,Yes,No radiation and/or cancer-directed surgery,Positive,Positive,20,Negative,5.0,Alive,White,6.0,Alive
3,40,7,1,Beam radiation,Yes,Radiation after surgery,Positive,Positive,25,Positive,10.0,Alive,White,5.0,Alive
4,55,0,1,Beam radiation,Yes,Radiation after surgery,Positive,Positive,22,Positive,3.0,Alive,White,6.0,Alive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
file_path = "cleaned_dataset.csv"  # Update with the actual file path
df = pd.read_csv(file_path)

# Encode categorical variables
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define target variables
df["Combined Treatment"] = df["Radiation recode"].astype(str) + "_" + df["Chemotherapy recode"].astype(str)  # Combine both treatments

y_treatment = df["Combined Treatment"]  # Predicting overall treatment type (Radiation + Chemotherapy)
y_sequence = df["Radiation sequence with surgery"]  # Predicting sequence of treatment

# Define features (excluding target columns)
X = df.drop(columns=["Radiation recode", "Chemotherapy recode", "Radiation sequence with surgery", "Combined Treatment"])

# Split the dataset into training and testing sets
X_train, X_test, y_train_treatment, y_test_treatment = train_test_split(
    X, y_treatment, test_size=0.2, random_state=42
)
X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(
    X, y_sequence, test_size=0.2, random_state=42
)

# Define hyperparameter grid
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

# Grid Search for treatment type prediction (Radiation + Chemotherapy)
grid_search_treatment = GridSearchCV(
    RandomForestClassifier(random_state=42), param_grid, cv=3, n_jobs=-1
)
grid_search_treatment.fit(X_train, y_train_treatment)

# Grid Search for treatment sequence prediction
grid_search_sequence = GridSearchCV(
    RandomForestClassifier(random_state=42), param_grid, cv=3, n_jobs=-1
)
grid_search_sequence.fit(X_train_seq, y_train_seq)

# Get best models
best_rf_treatment = grid_search_treatment.best_estimator_
best_rf_sequence = grid_search_sequence.best_estimator_

# Evaluate the tuned models
y_pred_treatment_best = best_rf_treatment.predict(X_test)
y_pred_sequence_best = best_rf_sequence.predict(X_test_seq)

accuracy_treatment_best = accuracy_score(y_test_treatment, y_pred_treatment_best)
accuracy_sequence_best = accuracy_score(y_test_seq, y_pred_sequence_best)

print(f"Optimized Treatment Type (Radiation + Chemotherapy) Prediction Accuracy: {accuracy_treatment_best:.4f}")
print(f"Optimized Treatment Sequence Prediction Accuracy: {accuracy_sequence_best:.4f}")




Optimized Treatment Type (Radiation + Chemotherapy) Prediction Accuracy: 0.5275
Optimized Treatment Sequence Prediction Accuracy: 0.7100


In [None]:
# Function to take user input and predict treatment
def predict_treatment():
    user_data = {}

    print("\nEnter patient details:")

    for col in X.columns:
        if col in label_encoders:  # Categorical columns
            categories = label_encoders[col].classes_
            print(f"\nOptions for {col}: {list(categories)}")
            value = input(f"Enter {col}: ").strip()
            if value in categories:
                user_data[col] = label_encoders[col].transform([value])[0]
            else:
                print(f"Invalid input for {col}. Try again.")
                return
        else:  # Numerical columns
            try:
                user_data[col] = float(input(f"Enter {col}: "))
            except ValueError:
                print(f"Invalid input for {col}. Try again.")
                return

    # Convert input into DataFrame
    user_df = pd.DataFrame([user_data])

    # Make predictions
    treatment_pred = best_rf_treatment.predict(user_df)[0]
    sequence_pred = best_rf_sequence.predict(user_df)[0]

    # Decode predictions
    treatment_pred_decoded = treatment_pred
    sequence_pred_decoded = label_encoders["Radiation sequence with surgery"].inverse_transform([sequence_pred])[0]

    print("\n--- Predicted Treatment Plan ---")
    print(f"Treatment Type (Radiation + Chemotherapy): {treatment_pred_decoded}")
    print(f"Treatment Sequence: {sequence_pred_decoded}")

# Run prediction function
predict_treatment()



Enter patient details:
Enter Age at diagnosis: 45
Enter Regional nodes positive (1988+): 23
Enter Total number of in situ/malignant tumors for patient: 2

Options for ER Status Recode Breast Cancer (1990+): ['Borderline', 'Negative', 'Posi', 'Positive']
Enter ER Status Recode Breast Cancer (1990+): Negative

Options for PR Status Recode Breast Cancer (1990+): ['Borderline', 'Negative', 'Positive']
Enter PR Status Recode Breast Cancer (1990+): Positive

Options for CS tumor size (2004-2015): ['0', '1', '10', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '11', '110', '112', '114', '115', '116', '117', '118', '119', '12', '120', '121', '123', '124', '125', '126', '127', '129', '13', '130', '131', '132', '134', '135', '136', '137', '14', '140', '145', '147', '15', '150', '153', '155', '159', '16', '160', '161', '162', '165', '166', '17', '170', '172', '175', '18', '180', '181', '185', '19', '190', '197', '198', '2', '20', '200', '202', '21', '210', '22', '220', '23

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
file_path = "cleaned_dataset.csv"  # Update with correct path if needed
df = pd.read_csv(file_path)

# Encode categorical variables (keeping numerical ones unchanged)
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for later decoding

# Define targets
y_radiation = df["Radiation recode"]  # Predict Radiation separately
y_chemotherapy = df["Chemotherapy recode"]   # Predict Chemotherapy separately
y_sequence = df["Radiation sequence with surgery"]  # Predict Treatment Sequence

# Define features (excluding target columns)
X = df.drop(columns=["Radiation recode", "Chemotherapy recode", "Radiation sequence with surgery"])

# Split data
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_radiation, test_size=0.2, random_state=42)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_chemotherapy, test_size=0.2, random_state=42)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X, y_sequence, test_size=0.2, random_state=42)

# Hyperparameter tuning
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

def train_best_model(X_train, y_train):
    grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Train separate models
best_rf_radiation = train_best_model(X_train_r, y_train_r)
best_rf_chemotherapy = train_best_model(X_train_c, y_train_c)
best_rf_sequence = train_best_model(X_train_s, y_train_s)

# Model Accuracy
y_pred_r = best_rf_radiation.predict(X_test_r)
y_pred_c = best_rf_chemotherapy.predict(X_test_c)
y_pred_s = best_rf_sequence.predict(X_test_s)

accuracy_r = accuracy_score(y_test_r, y_pred_r)
accuracy_c = accuracy_score(y_test_c, y_pred_c)
accuracy_s = accuracy_score(y_test_s, y_pred_s)

print(f"Model Accuracy for Radiation Prediction: {accuracy_r:.4f}")
print(f"Model Accuracy for Chemotherapy Prediction: {accuracy_c:.4f}")
print(f"Model Accuracy for Treatment Sequence Prediction: {accuracy_s:.4f}")

# Function to take user input and predict treatment
def predict_treatment():
    user_data = {}

    print("\nEnter patient details:")

    for col in X.columns:
        if col in label_encoders:  # Categorical columns
            categories = label_encoders[col].classes_
            print(f"\nOptions for {col}: {list(categories)}")
            value = input(f"Enter {col}: ").strip()
            if value in categories:
                user_data[col] = label_encoders[col].transform([value])[0]
            else:
                print(f"Invalid input for {col}. Try again.")
                return
        else:  # Numerical columns (e.g., CS Tumor Size)
            try:
                user_data[col] = float(input(f"Enter {col}: "))
            except ValueError:
                print(f"Invalid input for {col}. Try again.")
                return

    # Convert input into DataFrame
    user_df = pd.DataFrame([user_data])

    # Make predictions
    radiation_pred = best_rf_radiation.predict(user_df)[0]
    chemotherapy_pred = best_rf_chemotherapy.predict(user_df)[0]
    sequence_pred = best_rf_sequence.predict(user_df)[0]

    # Decode predictions
    radiation_decoded = label_encoders["Radiation recode"].inverse_transform([radiation_pred])[0]
    chemotherapy_decoded = label_encoders["Chemotherapy recode"].inverse_transform([chemotherapy_pred])[0]
    sequence_decoded = label_encoders["Radiation sequence with surgery"].inverse_transform([sequence_pred])[0]

    # Prepare output
    required_treatments = []
    if radiation_decoded != "No Radiation":
        required_treatments.append("Radiation")
    if chemotherapy_decoded != "No Chemotherapy":
        required_treatments.append("Chemotherapy recode")

    print("\n--- Predicted Treatment Plan ---")
    if required_treatments:
        print(f"Required Treatment(s): {', '.join(required_treatments)}")
    else:
        print("No Treatment Required")

    print(f"Treatment Sequence: {sequence_decoded}")

# Run prediction function
predict_treatment()




Model Accuracy for Radiation Prediction: 0.6432
Model Accuracy for Chemotherapy Prediction: 0.7944
Model Accuracy for Treatment Sequence Prediction: 0.7034

Enter patient details:
Enter Age at diagnosis: 23
Enter Regional nodes positive (1988+): 12
Enter Total number of in situ/malignant tumors for patient: 2

Options for ER Status Recode Breast Cancer (1990+): ['Borderline', 'Negative', 'Posi', 'Positive']
Enter ER Status Recode Breast Cancer (1990+): Negative

Options for PR Status Recode Breast Cancer (1990+): ['Borderline', 'Negative', 'Positive']
Enter PR Status Recode Breast Cancer (1990+): Negative

Options for CS tumor size (2004-2015): ['0', '1', '10', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '11', '110', '112', '114', '115', '116', '117', '118', '119', '12', '120', '121', '123', '124', '125', '126', '127', '129', '13', '130', '131', '132', '134', '135', '136', '137', '14', '140', '145', '147', '15', '150', '153', '155', '159', '16', '160', '161', 

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
file_path = "cleaned_dataset.csv"  # Update with correct path if needed
df = pd.read_csv(file_path)

# Encode categorical variables (keeping numerical ones unchanged)
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for later decoding

# Define targets
y_radiation = df["Radiation recode"]  # Predict Radiation separately
y_chemotherapy = df["Chemotherapy recode"]   # Predict Chemotherapy separately
y_sequence = df["Radiation sequence with surgery"]  # Predict Treatment Sequence

# Define features (excluding target columns)
X = df.drop(columns=["Radiation recode", "Chemotherapy recode", "Radiation sequence with surgery"])

# Split data
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_radiation, test_size=0.2, random_state=42)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_chemotherapy, test_size=0.2, random_state=42)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X, y_sequence, test_size=0.2, random_state=42)

# Train separate models
rf_radiation = RandomForestClassifier(random_state=42)
rf_chemotherapy = RandomForestClassifier(random_state=42)
rf_sequence = RandomForestClassifier(random_state=42)

rf_radiation.fit(X_train_r, y_train_r)
rf_chemotherapy.fit(X_train_c, y_train_c)
rf_sequence.fit(X_train_s, y_train_s)

# Model Accuracy
y_pred_r = rf_radiation.predict(X_test_r)
y_pred_c = rf_chemotherapy.predict(X_test_c)
y_pred_s = rf_sequence.predict(X_test_s)

accuracy_r = accuracy_score(y_test_r, y_pred_r)
accuracy_c = accuracy_score(y_test_c, y_pred_c)
accuracy_s = accuracy_score(y_test_s, y_pred_s)

print(f"Model Accuracy for Radiation Prediction: {accuracy_r:.4f}")
print(f"Model Accuracy for Chemotherapy Prediction: {accuracy_c:.4f}")
print(f"Model Accuracy for Treatment Sequence Prediction: {accuracy_s:.4f}")

print("Classification Reports")
print("Radiation Prediction:")
print(classification_report(y_test_r, y_pred_r))

print("Classification Reports")
print("Radiation Prediction:")
print(classification_report(y_test_c, y_pred_c))

print("Classification Reports")
print("Radiation Prediction:")
print(classification_report(y_test_s, y_pred_s))

Model Accuracy for Radiation Prediction: 0.6139
Model Accuracy for Chemotherapy Prediction: 0.7898
Model Accuracy for Treatment Sequence Prediction: 0.6841
Classification Reports
Radiation Prediction:
              precision    recall  f1-score   support

           0       0.61      0.70      0.65      2354
           1       0.14      0.05      0.07       110
           2       0.07      0.02      0.03        92
           3       0.00      0.00      0.00         9
           4       0.09      0.02      0.03        65
           5       0.63      0.60      0.61      2356

    accuracy                           0.61      4986
   macro avg       0.26      0.23      0.23      4986
weighted avg       0.59      0.61      0.60      4986

Classification Reports
Radiation Prediction:
              precision    recall  f1-score   support

           0       0.82      0.83      0.82      2970
           1       0.74      0.73      0.74      2016

    accuracy                           0.79    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
