<a href="https://colab.research.google.com/github/naidu199/BrainTumor_Survival_Time_Analysis/blob/master/Brain_Tumor_Survival_Time_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the brain-tumor dataset (path presumably points at a mounted Google Drive folder).
csv_path = "/content/drive/MyDrive/BrainTumarData/BrainTumor.csv"
df = pd.read_csv(csv_path)


In [None]:
# Preview the first five rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,Patient ID,Age,Gender,Tumor Type,Tumor Grade,Tumor Location,Treatment,Treatment Outcome,Time to Recurrence (months),Recurrence Site,Survival Time (months)
0,1,45,Male,Glioblastoma,IV,Frontal lobe,Surgery,Partial response,10.0,Temporal lobe,18
1,2,55,Female,Meningioma,I,Parietal lobe,Surgery,Complete response,,,36
2,3,60,Male,Astrocytoma,III,Occipital lobe,Surgery + Chemotherapy,Progressive disease,14.0,Frontal lobe,22
3,4,50,Female,Glioblastoma,IV,Temporal lobe,Surgery + Radiation therapy,Complete response,,,12
4,5,65,Male,Astrocytoma,II,Frontal lobe,Surgery + Radiation therapy,Partial response,24.0,Frontal lobe,48


In [None]:
# Show column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Patient ID                   2000 non-null   int64  
 1   Age                          2000 non-null   int64  
 2   Gender                       2000 non-null   object 
 3   Tumor Type                   2000 non-null   object 
 4   Tumor Grade                  2000 non-null   object 
 5   Tumor Location               2000 non-null   object 
 6   Treatment                    2000 non-null   object 
 7   Treatment Outcome            2000 non-null   object 
 8   Time to Recurrence (months)  1438 non-null   float64
 9   Recurrence Site              1438 non-null   object 
 10  Survival Time (months)       2000 non-null   int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 172.0+ KB


# Data Preprocessing

In [None]:
# Count the missing (null) values in each column.
df.isnull().sum()

Patient ID                       0
Age                              0
Gender                           0
Tumor Type                       0
Tumor Grade                      0
Tumor Location                   0
Treatment                        0
Treatment Outcome                0
Time to Recurrence (months)    562
Recurrence Site                562
Survival Time (months)           0
dtype: int64

In [None]:
# Impute missing values (no rows are removed).
# The result of fillna() is assigned back to the column: calling
# fillna(..., inplace=True) on a column selection is chained assignment, which
# emits a FutureWarning in pandas 2.x and silently does nothing under
# copy-on-write.

# NOTE(review): a missing 'Time to Recurrence (months)' presumably means that no
# recurrence was observed; mean imputation hides that - confirm this is intended.
df['Time to Recurrence (months)'] = df['Time to Recurrence (months)'].fillna(
    df['Time to Recurrence (months)'].mean()
)

# 'Recurrence Site' is deliberately left with its NaNs here.

# Fill 'Age' with the median age.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill each categorical column with its most frequent value (the mode).
categorical_columns = ['Gender', 'Tumor Type', 'Tumor Grade', 'Tumor Location', 'Treatment', 'Treatment Outcome']
for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Re-count missing values per column after imputation.
df.isnull().sum()

Patient ID                       0
Age                              0
Gender                           0
Tumor Type                       0
Tumor Grade                      0
Tumor Location                   0
Treatment                        0
Treatment Outcome                0
Time to Recurrence (months)      0
Recurrence Site                562
Survival Time (months)           0
dtype: int64

**One-Hot Encoding** is useful when the model does not handle categorical data natively and you want to avoid implying an ordinal relationship between categories.

**Label Encoding** is a simpler approach, but it introduces an ordinal relationship where none may exist.

In [None]:
#Libraries for the encoding the data
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [None]:
# Encode Gender as integer class labels (fit first, then transform).
label_encoder = LabelEncoder()
label_encoder.fit(df['Gender'])
df['Gender_encoded'] = label_encoder.transform(df['Gender'])

In [None]:
# Tumor types have no ordinal relationship, so they are one-hot encoded
# (first category dropped to avoid a redundant column).
# The encoder's default sparse output is densified with .toarray() rather than
# passing `sparse=False`: that keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so this form runs on old and new releases.
encoder = OneHotEncoder(drop='first')
encoded_tumor_type = encoder.fit_transform(df[['Tumor Type']]).toarray()
df_encoded_tumor_type = pd.DataFrame(
    encoded_tumor_type,
    columns=encoder.get_feature_names_out(['Tumor Type']),
    index=df.index,  # keep rows aligned with df when the frames are concatenated
)




In [None]:
#Tumor Grade encoding label encoding
from sklearn.preprocessing import OrdinalEncoder

# Ordinal codes for the tumor grade. With no explicit `categories`, the encoder
# orders the observed labels by sorting them.
# NOTE(review): that matches clinical order only if the labels are I, II, III, IV - confirm.
encoder = OrdinalEncoder()
grade_codes = encoder.fit_transform(df[['Tumor Grade']])
df['Tumor Grade_encoded'] = grade_codes

In [None]:
# One-hot encode the tumor location (first category dropped).
# .toarray() densifies the default sparse output; the old `sparse=False`
# keyword no longer exists in scikit-learn >= 1.4.
encoder = OneHotEncoder(drop='first')
encoded_tumor_location = encoder.fit_transform(df[['Tumor Location']]).toarray()
df_encoded_tumor_location = pd.DataFrame(
    encoded_tumor_location,
    columns=encoder.get_feature_names_out(['Tumor Location']),
    index=df.index,  # keep rows aligned with df when the frames are concatenated
)




In [None]:
# One-hot encode the treatment (first category dropped).
# .toarray() densifies the default sparse output; the old `sparse=False`
# keyword no longer exists in scikit-learn >= 1.4.
# NOTE(review): the data contains both 'Surgery + Radiation' and
# 'Surgery + Radiation therapy'; these look like one treatment under two
# labels - confirm and normalise before encoding if so.
encoder = OneHotEncoder(drop='first')
encoded_treatment = encoder.fit_transform(df[['Treatment']]).toarray()
df_encoded_treatment = pd.DataFrame(
    encoded_treatment,
    columns=encoder.get_feature_names_out(['Treatment']),
    index=df.index,  # keep rows aligned with df when the frames are concatenated
)




In [None]:
# One-hot encode the treatment outcome (first category dropped).
# .toarray() densifies the default sparse output; the old `sparse=False`
# keyword no longer exists in scikit-learn >= 1.4.
encoder = OneHotEncoder(drop='first')
encoded_treatment_outcome = encoder.fit_transform(df[['Treatment Outcome']]).toarray()
df_encoded_treatment_outcome = pd.DataFrame(
    encoded_treatment_outcome,
    columns=encoder.get_feature_names_out(['Treatment Outcome']),
    index=df.index,  # keep rows aligned with df when the frames are concatenated
)




In [None]:
import pandas as pd

# Assemble the modelling table from df and the one-hot encoded frames
# (df_encoded_tumor_type, df_encoded_tumor_location, df_encoded_treatment,
# df_encoded_treatment_outcome).

# Drop the identifier, the raw categorical columns (replaced by their encoded
# counterparts) and 'Recurrence Site', which is not used as a feature.
df_non_categorical = df.drop(columns=[
    'Patient ID',
    'Gender',
    'Tumor Type',
    'Tumor Grade',
    'Tumor Location',
    'Treatment',
    'Treatment Outcome',
    'Recurrence Site'
])

# df_non_categorical still carries 'Gender_encoded', 'Tumor Grade_encoded' and
# 'Survival Time (months)', so nothing needs to be re-added after the concat.
df_encoded = pd.concat([
    df_non_categorical,
    df_encoded_tumor_type,
    df_encoded_tumor_location,
    df_encoded_treatment,
    df_encoded_treatment_outcome
], axis=1)



In [None]:
# Summarise the encoded table: its dimensions, then its column names.
encoded_shape = df_encoded.shape
encoded_columns = df_encoded.columns.tolist()
print("Shape of the encoded DataFrame:", encoded_shape)
print("Columns in the encoded DataFrame:", encoded_columns)

Shape of the encoded DataFrame: (2000, 19)
Columns in the encoded DataFrame: ['Age', 'Time to Recurrence (months)', 'Survival Time (months)', 'Gender_encoded', 'Tumor Grade_encoded', 'Tumor Type_Glioblastoma', 'Tumor Type_Meningioma', 'Tumor Location_Occipital lobe', 'Tumor Location_Parietal lobe', 'Tumor Location_Temporal lobe', 'Treatment_Chemotherapy + Radiation', 'Treatment_Radiation', 'Treatment_Surgery', 'Treatment_Surgery + Chemotherapy', 'Treatment_Surgery + Radiation', 'Treatment_Surgery + Radiation therapy', 'Treatment Outcome_Partial response', 'Treatment Outcome_Progressive disease', 'Treatment Outcome_Stable disease']


In [None]:
# Keep only the first occurrence of every column name.
is_repeated_column = df_encoded.columns.duplicated()
df_encoded = df_encoded.loc[:, ~is_repeated_column]

In [None]:
# Separate the target (Y) from the feature columns (X).
target_column = 'Survival Time (months)'
Y = df_encoded[target_column]
X = df_encoded.drop(columns=[target_column])


In [None]:
# Confirm that features and target have the same number of rows.
X.shape, Y.shape

((2000, 18), (2000,))

In [None]:
# Standardise every feature so that all columns share a comparable scale.
# NOTE(review): the scaler is fitted on the full dataset before any train/test
# split, so held-out rows influence the scaling statistics - consider fitting
# it inside a per-fold pipeline instead.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled array in a DataFrame that reuses the original column names.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)

In [None]:

# Check that every feature column is numeric before modelling.
print(X.dtypes)


Age                                        int64
Time to Recurrence (months)              float64
Gender_encoded                             int64
Tumor Grade_encoded                      float64
Tumor Type_Glioblastoma                  float64
Tumor Type_Meningioma                    float64
Tumor Location_Occipital lobe            float64
Tumor Location_Parietal lobe             float64
Tumor Location_Temporal lobe             float64
Treatment_Chemotherapy + Radiation       float64
Treatment_Radiation                      float64
Treatment_Surgery                        float64
Treatment_Surgery + Chemotherapy         float64
Treatment_Surgery + Radiation            float64
Treatment_Surgery + Radiation therapy    float64
Treatment Outcome_Partial response       float64
Treatment Outcome_Progressive disease    float64
Treatment Outcome_Stable disease         float64
dtype: object


In [None]:
!pip install tqdm




Training and testing the models with K-fold cross-validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

`cross_validate_model` trains and tests a given model with K-fold cross-validation, so the same procedure can be reused for different models.

In [None]:


def cross_validate_model(model, X, y, n_splits=5, random_state=42):
    """
    Score a regression model with shuffled K-Fold cross-validation.

    The model is refitted on the training part of every fold and scored on the
    held-out part. Per-fold and averaged metrics are printed as they are computed.

    Parameters:
    - model: Regressor exposing fit(X, y) and predict(X).
    - X: Feature DataFrame (rows are selected with .iloc).
    - y: Target Series (rows are selected with .iloc).
    - n_splits: Number of folds.
    - random_state: Seed for the fold shuffling.

    Returns:
    - (avg_mse, avg_r2): mean of the per-fold Mean Squared Errors and R² scores.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_metrics = []  # one (mse, r2) pair per fold

    folds = tqdm(splitter.split(X), total=splitter.get_n_splits(), desc="Cross-Validation Progress")
    for train_idx, test_idx in folds:
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        fold_predictions = model.predict(X.iloc[test_idx])
        fold_truth = y.iloc[test_idx]

        fold_mse = mean_squared_error(fold_truth, fold_predictions)
        fold_r2 = r2_score(fold_truth, fold_predictions)
        fold_metrics.append((fold_mse, fold_r2))

        print(f"Fold Mean Squared Error: {fold_mse:.4f}")
        print(f"Fold R² Score: {fold_r2:.4f}")

    avg_mse = np.mean([mse for mse, _ in fold_metrics])
    avg_r2 = np.mean([r2 for _, r2 in fold_metrics])

    print(f"Average Mean Squared Error: {avg_mse:.4f}")
    print(f"Average R² Score: {avg_r2:.4f}")

    return avg_mse, avg_r2


LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression


# Baseline: ordinary least squares, cross-validated on the scaled features.
print("Testing Linear Regression")
linear_model = LinearRegression()
result = cross_validate_model(model=linear_model, X=X_scaled_df, y=Y)

print("Linear Regression", result)






Testing Linear Regression


Cross-Validation Progress: 100%|██████████| 5/5 [00:00<00:00, 65.75it/s]

Fold Mean Squared Error: 52.8005
Fold R² Score: 0.3079
Fold Mean Squared Error: 50.7001
Fold R² Score: 0.3076
Fold Mean Squared Error: 49.7379
Fold R² Score: 0.3802
Fold Mean Squared Error: 52.7643
Fold R² Score: 0.2891
Fold Mean Squared Error: 43.7026
Fold R² Score: 0.3307
Average Mean Squared Error: 49.9411
Average R² Score: 0.3231
Linear Regression (49.94107611082522, 0.32311284140427554)





DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Cross-validate a decision tree on the scaled features.
# random_state is fixed so the scores are reproducible between runs,
# consistent with the seeded models used later in this notebook.
print("\nTesting Decision Tree Regressor")
decision_tree_model = DecisionTreeRegressor(random_state=42)
result=cross_validate_model(decision_tree_model, X_scaled_df, Y)

print(" DecisionTreeRegressor",  result)


Testing Decision Tree Regressor


Cross-Validation Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Fold Mean Squared Error: 16.6434
Fold R² Score: 0.7818
Fold Mean Squared Error: 13.5396
Fold R² Score: 0.8151
Fold Mean Squared Error: 33.7256
Fold R² Score: 0.5797
Fold Mean Squared Error: 21.0281
Fold R² Score: 0.7167


Cross-Validation Progress: 100%|██████████| 5/5 [00:00<00:00, 104.46it/s]

Fold Mean Squared Error: 21.9702
Fold R² Score: 0.6635
Average Mean Squared Error: 21.3814
Average R² Score: 0.7114
 DecisionTreeRegressor (21.381383646026563, 0.711381611255397)





 RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Cross-validate a random forest on the scaled features.
# random_state is fixed so the scores are reproducible between runs,
# consistent with the seeded models used later in this notebook.
print("\nTesting Random Forest Regressor")
random_forest_model = RandomForestRegressor(random_state=42)
result= cross_validate_model(random_forest_model, X_scaled_df, Y)


print("RandomForestRegressor",  result)


Testing Random Forest Regressor


Cross-Validation Progress:  20%|██        | 1/5 [00:00<00:01,  2.10it/s]

Fold Mean Squared Error: 13.8259
Fold R² Score: 0.8188


Cross-Validation Progress:  40%|████      | 2/5 [00:00<00:01,  2.00it/s]

Fold Mean Squared Error: 8.1990
Fold R² Score: 0.8880


Cross-Validation Progress:  60%|██████    | 3/5 [00:01<00:01,  1.75it/s]

Fold Mean Squared Error: 18.1865
Fold R² Score: 0.7734


Cross-Validation Progress:  80%|████████  | 4/5 [00:02<00:00,  1.53it/s]

Fold Mean Squared Error: 15.0813
Fold R² Score: 0.7968


Cross-Validation Progress: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]

Fold Mean Squared Error: 12.9058
Fold R² Score: 0.8024
Average Mean Squared Error: 13.6397
Average R² Score: 0.8159
RandomForestRegressor (13.639694796395656, 0.8158690522908936)





SVM

In [None]:
from sklearn.svm import SVR
# Cross-validate a support vector regressor (default settings) on the scaled features.
svm_model = SVR()
print("\nTesting Support Vector Machine (SVM) Regressor")
result = cross_validate_model(model=svm_model, X=X_scaled_df, y=Y)

print("SVM", result)


Testing Support Vector Machine (SVM) Regressor


Cross-Validation Progress:  20%|██        | 1/5 [00:00<00:02,  1.87it/s]

Fold Mean Squared Error: 39.4563
Fold R² Score: 0.4828


Cross-Validation Progress:  40%|████      | 2/5 [00:01<00:01,  1.65it/s]

Fold Mean Squared Error: 35.7213
Fold R² Score: 0.5122


Cross-Validation Progress:  60%|██████    | 3/5 [00:01<00:01,  1.71it/s]

Fold Mean Squared Error: 35.9486
Fold R² Score: 0.5520


Cross-Validation Progress:  80%|████████  | 4/5 [00:02<00:00,  1.89it/s]

Fold Mean Squared Error: 33.7515
Fold R² Score: 0.5453


Cross-Validation Progress: 100%|██████████| 5/5 [00:02<00:00,  2.06it/s]

Fold Mean Squared Error: 27.8406
Fold R² Score: 0.5736
Average Mean Squared Error: 34.5437
Average R² Score: 0.5332
SVM (34.54365016654972, 0.5331882730486335)





Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Cross-validate ridge regression (L2 penalty strength alpha=1.0) on the scaled features.
ridge_alpha = 1.0
ridge_model = Ridge(alpha=ridge_alpha)

print("Testing Ridge Regression")
result = cross_validate_model(model=ridge_model, X=X_scaled_df, y=Y)
print("Ridge Regression", result)

Testing Ridge Regression


Cross-Validation Progress: 100%|██████████| 5/5 [00:00<00:00, 87.59it/s]

Fold Mean Squared Error: 52.8082
Fold R² Score: 0.3078
Fold Mean Squared Error: 50.6998
Fold R² Score: 0.3076
Fold Mean Squared Error: 49.7367
Fold R² Score: 0.3802
Fold Mean Squared Error: 52.7547
Fold R² Score: 0.2892
Fold Mean Squared Error: 43.6919
Fold R² Score: 0.3309
Average Mean Squared Error: 49.9383
Average R² Score: 0.3232
Ridge Regression (49.93825768890666, 0.3231550701386642)





Lasso

In [None]:
from sklearn.linear_model import Lasso

# Cross-validate lasso regression (L1 penalty strength alpha=0.1) on the scaled features.
lasso_alpha = 0.1
lasso_model = Lasso(alpha=lasso_alpha)

print("Testing Lasso Regression")
result = cross_validate_model(model=lasso_model, X=X_scaled_df, y=Y)
print("Lasso Regression", result)

Testing Lasso Regression


Cross-Validation Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Fold Mean Squared Error: 53.3078
Fold R² Score: 0.3013
Fold Mean Squared Error: 51.1643
Fold R² Score: 0.3013
Fold Mean Squared Error: 49.7447
Fold R² Score: 0.3801
Fold Mean Squared Error: 52.4335
Fold R² Score: 0.2935


Cross-Validation Progress: 100%|██████████| 5/5 [00:00<00:00, 49.95it/s]

Fold Mean Squared Error: 43.6122
Fold R² Score: 0.3321
Average Mean Squared Error: 50.0525
Average R² Score: 0.3217
Lasso Regression (50.052505989393424, 0.32166631634070997)





Elastic Net

In [None]:
from sklearn.linear_model import ElasticNet

# Cross-validate Elastic Net (equal mix of L1 and L2 penalties) on the scaled features.
elastic_net_model = ElasticNet(alpha=0.1, l1_ratio=0.5)

# Capture the return value: without the assignment, `result` still holds the
# previous cell's scores and the summary line reports another model's numbers.
print("Testing Elastic Net")
result = cross_validate_model(elastic_net_model, X_scaled_df, Y)
print("Elastic Net",  result)

Testing Elastic Net


Cross-Validation Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Fold Mean Squared Error: 53.5827
Fold R² Score: 0.2977
Fold Mean Squared Error: 51.1638
Fold R² Score: 0.3013
Fold Mean Squared Error: 50.1894
Fold R² Score: 0.3746


Cross-Validation Progress: 100%|██████████| 5/5 [00:00<00:00, 42.39it/s]

Fold Mean Squared Error: 52.2537
Fold R² Score: 0.2960
Fold Mean Squared Error: 43.4239
Fold R² Score: 0.3350
Average Mean Squared Error: 50.1227
Average R² Score: 0.3209
Elastic Net (50.052505989393424, 0.32166631634070997)





Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Cross-validate gradient boosting on the scaled features.
# random_state is fixed so the scores are reproducible between runs,
# consistent with the seeded models used later in this notebook.
gradient_boosting_model = GradientBoostingRegressor(random_state=42)

# Perform cross-validation
print("Testing Gradient Boosting Regressor")
result= cross_validate_model(gradient_boosting_model, X_scaled_df, Y)
print("Gradient Boosting Regressor",  result)

Testing Gradient Boosting Regressor


Cross-Validation Progress:  20%|██        | 1/5 [00:00<00:01,  3.40it/s]

Fold Mean Squared Error: 30.6087
Fold R² Score: 0.5988


Cross-Validation Progress:  40%|████      | 2/5 [00:00<00:00,  3.85it/s]

Fold Mean Squared Error: 28.6367
Fold R² Score: 0.6089


Cross-Validation Progress:  60%|██████    | 3/5 [00:00<00:00,  3.47it/s]

Fold Mean Squared Error: 27.4085
Fold R² Score: 0.6585


Cross-Validation Progress:  80%|████████  | 4/5 [00:01<00:00,  3.02it/s]

Fold Mean Squared Error: 30.2496
Fold R² Score: 0.5924


Cross-Validation Progress: 100%|██████████| 5/5 [00:01<00:00,  3.27it/s]

Fold Mean Squared Error: 25.7451
Fold R² Score: 0.6057
Average Mean Squared Error: 28.5297
Average R² Score: 0.6129
Gradient Boosting Regressor (28.529703330328278, 0.6128705666406167)





Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# Degree-2 polynomial regression: expand the features, then fit ordinary least squares.
# NOTE(review): the recorded run shows two folds with astronomically large
# errors; presumably the expanded design matrix is ill-conditioned (squares of
# one-hot columns add no information) - consider a regularised final step.
poly_features = PolynomialFeatures(degree=2)
ols_step = LinearRegression()
poly_reg_model = make_pipeline(poly_features, ols_step)

print("Testing Polynomial Regression")
result = cross_validate_model(model=poly_reg_model, X=X_scaled_df, y=Y)
print("Polynomial Regression", result)

Testing Polynomial Regression


Cross-Validation Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Fold Mean Squared Error: 35.0727
Fold R² Score: 0.5403


Cross-Validation Progress:  60%|██████    | 3/5 [00:00<00:00,  7.47it/s]

Fold Mean Squared Error: 33.7262
Fold R² Score: 0.5394
Fold Mean Squared Error: 131668240494012416000.0000
Fold R² Score: -1640720754745525760.0000


Cross-Validation Progress: 100%|██████████| 5/5 [00:00<00:00,  7.59it/s]

Fold Mean Squared Error: 8745919820517106253824.0000
Fold R² Score: -117836199417864929280.0000
Fold Mean Squared Error: 32.7282
Fold R² Score: 0.4988
Average Mean Squared Error: 1775517612202223730688.0000
Average R² Score: -23895384034522091520.0000
Polynomial Regression (1.7755176122022237e+21, -2.389538403452209e+19)





Finally, we try XGBoost.


In [None]:
!pip install xgboost




In [None]:
import xgboost as xgb

# XGBoost regressor with squared-error objective, cross-validated on the scaled features.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
)

print("Testing XGBoost Regressor")
result = cross_validate_model(model=xgb_model, X=X_scaled_df, y=Y)
print("XGBoost Regressor", result)

Testing XGBoost Regressor


Cross-Validation Progress:  20%|██        | 1/5 [00:00<00:00,  8.85it/s]

Fold Mean Squared Error: 15.7515
Fold R² Score: 0.7935
Fold Mean Squared Error: 13.0189
Fold R² Score: 0.8222


Cross-Validation Progress:  60%|██████    | 3/5 [00:00<00:00, 10.30it/s]

Fold Mean Squared Error: 19.0026
Fold R² Score: 0.7632


Cross-Validation Progress: 100%|██████████| 5/5 [00:00<00:00, 10.72it/s]

Fold Mean Squared Error: 17.9170
Fold R² Score: 0.7586
Fold Mean Squared Error: 15.0751
Fold R² Score: 0.7691
Average Mean Squared Error: 16.1530
Average R² Score: 0.7813
XGBoost Regressor (16.153035197094855, 0.7813370052241736)





XGBoost with a train/test split

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Hold-out evaluation of XGBoost: features are X_scaled_df, target is Y.

# Step 1: 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_df, Y, test_size=0.2, random_state=42
)

# Step 2: configure the regressor.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror', n_estimators=100, max_depth=6, learning_rate=0.1
)

# Step 3: fit on the training part.
xgb_model.fit(X_train, y_train)

# Step 4: predict the held-out part.
predictions = xgb_model.predict(X_test)

# Step 5: score the predictions and report.
mse_score = mean_squared_error(y_test, predictions)
r2_score_value = r2_score(y_test, predictions)
print(f"Mean Squared Error: {mse_score:.4f}")
print(f"R² Score: {r2_score_value:.4f}")

# Feature importances, listed from most to least important.
importances = xgb_model.feature_importances_
feature_names = X_scaled_df.columns
sorted_idx = np.argsort(importances)[::-1]

print("Feature Importances:")
for name, score in zip(feature_names[sorted_idx], importances[sorted_idx]):
    print(f"{name}: {score:.4f}")


Mean Squared Error: 15.7515
R² Score: 0.7935
Feature Importances:
Tumor Type_Glioblastoma: 0.2903
Time to Recurrence (months): 0.1355
Treatment Outcome_Stable disease: 0.0999
Tumor Location_Temporal lobe: 0.0798
Tumor Location_Occipital lobe: 0.0760
Tumor Grade_encoded: 0.0543
Treatment Outcome_Progressive disease: 0.0403
Age: 0.0343
Gender_encoded: 0.0324
Treatment_Surgery + Radiation therapy: 0.0308
Treatment_Surgery + Chemotherapy: 0.0305
Tumor Location_Parietal lobe: 0.0280
Treatment_Surgery + Radiation: 0.0234
Treatment Outcome_Partial response: 0.0224
Treatment_Radiation: 0.0110
Treatment_Surgery: 0.0105
Treatment_Chemotherapy + Radiation: 0.0007
Tumor Type_Meningioma: 0.0000


In [None]:
from sklearn.tree import DecisionTreeRegressor
# Hold-out evaluation of a decision tree, reusing the X_train/X_test split
# created in the XGBoost train/test cell.
# random_state is fixed so the score is reproducible between runs.
print("\nTesting Decision Tree Regressor")
decision_tree_model = DecisionTreeRegressor(random_state=42)

decision_tree_model.fit(X_train, y_train)
y_pred= decision_tree_model.predict(X_test)


mse_score =mean_squared_error(y_test,y_pred)
r2_score_value = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse_score:.4f}")
print(f"R² Score: {r2_score_value:.4f}")



Testing Decision Tree Regressor
Mean Squared Error: 16.7912
R² Score: 0.7799


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Hold-out evaluation of a seeded random forest on the existing X_train/X_test split.
print("\nTesting Random Forest Regressor")
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    random_state=42,
)

# Fit on the training part, then predict the held-out part.
rf_model.fit(X_train, y_train)
predictions = rf_model.predict(X_test)

# Score the predictions and report.
mse_score = mean_squared_error(y_test, predictions)
r2_score_value = r2_score(y_test, predictions)
print(f"Mean Squared Error: {mse_score:.4f}")
print(f"R² Score: {r2_score_value:.4f}")

# Feature importances, listed from most to least important.
importances = rf_model.feature_importances_
feature_names = X_scaled_df.columns
sorted_idx = np.argsort(importances)[::-1]

print("Feature Importances:")
for name, score in zip(feature_names[sorted_idx], importances[sorted_idx]):
    print(f"{name}: {score:.4f}")


Testing Random Forest Regressor
Mean Squared Error: 14.3399
R² Score: 0.8120
Feature Importances:
Time to Recurrence (months): 0.3637
Age: 0.2123
Tumor Location_Temporal lobe: 0.0679
Tumor Grade_encoded: 0.0655
Tumor Location_Occipital lobe: 0.0541
Gender_encoded: 0.0395
Treatment Outcome_Stable disease: 0.0363
Tumor Type_Glioblastoma: 0.0283
Tumor Location_Parietal lobe: 0.0230
Treatment_Surgery + Chemotherapy: 0.0215
Treatment Outcome_Progressive disease: 0.0206
Treatment Outcome_Partial response: 0.0192
Treatment_Surgery + Radiation: 0.0182
Tumor Type_Meningioma: 0.0148
Treatment_Surgery: 0.0059
Treatment_Surgery + Radiation therapy: 0.0050
Treatment_Radiation: 0.0044
Treatment_Chemotherapy + Radiation: 0.0000


In [None]:
# Features whose random-forest importance is below this cut-off are dropped.
importance_threshold = 0.02

# Read the importances directly from the fitted random forest rather than from
# the `importances` / `feature_names` globals: more than one cell assigns those
# names, so their content depends on which cell happened to run last.
rf_importances = pd.Series(rf_model.feature_importances_, index=X_scaled_df.columns)
unimportant_features = rf_importances[rf_importances < importance_threshold].index.tolist()

# Print unimportant features
print("Unimportant Features:")
for feature in unimportant_features:
    print(feature)

# Drop the unimportant features from the scaled feature table
X_scaled_df_reduced = X_scaled_df.drop(columns=unimportant_features)

# Show the first rows of the reduced table to verify the change
print("\nNew DataFrame with Reduced Features:")
print(X_scaled_df_reduced.head())

# Compare the feature counts before and after the reduction
print(f"\nOriginal number of features: {X_scaled_df.shape[1]}")
print(f"Reduced number of features: {X_scaled_df_reduced.shape[1]}")


Unimportant Features:
Tumor Type_Meningioma
Treatment_Chemotherapy + Radiation
Treatment_Radiation
Treatment_Surgery
Treatment_Surgery + Radiation
Treatment_Surgery + Radiation therapy
Treatment Outcome_Partial response

New DataFrame with Reduced Features:
        Age  Time to Recurrence (months)  Gender_encoded  Tumor Grade_encoded  \
0 -1.835289                -2.300221e+00        1.007025             1.242512   
1 -0.189733                -1.339799e-15       -0.993024            -1.136260   
2  0.633046                -7.917419e-01        1.007025             0.449588   
3 -1.012511                -1.339799e-15       -0.993024             1.242512   
4  1.455824                 2.979457e+00        1.007025            -0.343336   

   Tumor Type_Glioblastoma  Tumor Location_Occipital lobe  \
0                 1.462777                      -0.565802   
1                -0.683631                      -0.565802   
2                -0.683631                       1.767402   
3          

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Assuming X_scaled_df_reduced and y are your feature matrix and target variable

# Function to perform train-test split and model evaluation
def evaluate_model(model, X, y):
    """Fit `model` on an 80/20 split of (X, y) and score it on the held-out 20%.

    The split uses random_state=42, so every call sees the same partition.

    Returns:
    - (mse, r2): Mean Squared Error and R² score on the held-out rows.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    model.fit(X_train, y_train)
    held_out_predictions = model.predict(X_test)

    return (
        mean_squared_error(y_test, held_out_predictions),
        r2_score(y_test, held_out_predictions),
    )

# Candidate models, keyed by display name (all seeded for reproducibility).
models = dict([
    ("Random Forest", RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)),
    ("Decision Tree", DecisionTreeRegressor(max_depth=None, random_state=42)),
    ("XGBoost", XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)),
])

# Score every candidate on the reduced feature set and report one line each.
for model_name, model in models.items():
    scores = evaluate_model(model, X_scaled_df_reduced, Y)
    mse_score, r2_score_value = scores
    print(f"{model_name} - Mean Squared Error: {mse_score:.4f}, R² Score: {r2_score_value:.4f}")


Random Forest - Mean Squared Error: 14.1848, R² Score: 0.8141
Decision Tree - Mean Squared Error: 14.8392, R² Score: 0.8055
XGBoost - Mean Squared Error: 17.2318, R² Score: 0.7741


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Assuming X_scaled_df_reduced and y are your feature matrix and target variable

# Function to perform train-test split and model evaluation
def evaluate_model(model, X, y):
    """Fit `model` on an 80/20 split of (X, y) and score it on the held-out 20%.

    The split uses random_state=42, so every call sees the same partition.
    This repeats the definition given in the previous model-comparison cell.

    Returns:
    - (mse, r2): Mean Squared Error and R² score on the held-out rows.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    model.fit(X_train, y_train)
    held_out_predictions = model.predict(X_test)

    return (
        mean_squared_error(y_test, held_out_predictions),
        r2_score(y_test, held_out_predictions),
    )

# Candidate hyperparameter values sampled by the randomized search.
param_grid_xgb = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.2, 0.3],
    min_child_weight=[1, 5, 10],
)

# Hold out 20% of the reduced feature set; the search only sees the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_df_reduced, Y, test_size=0.2, random_state=42
)

# Base estimator and the search: 100 sampled settings, 5-fold CV, ranked by R².
xgb_model = XGBRegressor(random_state=42)
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_grid_xgb,
    n_iter=100,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best configuration found by the search.
best_xgb_model = random_search.best_estimator_

# evaluate_model re-creates the same seeded 80/20 split, refits the best
# estimator on its training part and scores it on the held-out part.
mse_score, r2_score_value = evaluate_model(best_xgb_model, X_scaled_df_reduced, Y)

print(f"XGBoost - Best Model - Mean Squared Error: {mse_score:.4f}, R² Score: {r2_score_value:.4f}")
print("Best Hyperparameters:", random_search.best_params_)


XGBoost - Best Model - Mean Squared Error: 12.9703, R² Score: 0.8300
Best Hyperparameters: {'subsample': 0.6, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 1.0}


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm
import xgboost as xgb

def cross_validate_model(model, X, y, n_splits=5, random_state=42):
    """
    Score a regression model with shuffled K-Fold cross-validation.

    The model is refitted on the training part of every fold and scored on the
    held-out part. Per-fold and averaged metrics are printed as they are computed.
    This repeats the definition given earlier in the notebook.

    Parameters:
    - model: Regressor exposing fit(X, y) and predict(X).
    - X: Feature DataFrame (rows are selected with .iloc).
    - y: Target Series (rows are selected with .iloc).
    - n_splits: Number of folds.
    - random_state: Seed for the fold shuffling.

    Returns:
    - (avg_mse, avg_r2): mean of the per-fold Mean Squared Errors and R² scores.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_metrics = []  # one (mse, r2) pair per fold

    folds = tqdm(splitter.split(X), total=splitter.get_n_splits(), desc="Cross-Validation Progress")
    for train_idx, test_idx in folds:
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        fold_predictions = model.predict(X.iloc[test_idx])
        fold_truth = y.iloc[test_idx]

        fold_mse = mean_squared_error(fold_truth, fold_predictions)
        fold_r2 = r2_score(fold_truth, fold_predictions)
        fold_metrics.append((fold_mse, fold_r2))

        print(f"Fold Mean Squared Error: {fold_mse:.4f}")
        print(f"Fold R² Score: {fold_r2:.4f}")

    avg_mse = np.mean([mse for mse, _ in fold_metrics])
    avg_r2 = np.mean([r2 for _, r2 in fold_metrics])

    print(f"Average Mean Squared Error: {avg_mse:.4f}")
    print(f"Average R² Score: {avg_r2:.4f}")

    return avg_mse, avg_r2

# Exhaustive grid of XGBoost settings (3 values for each of 6 parameters).
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

# Base estimator and a 5-fold grid search ranked by negative MSE.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# The search runs on the unscaled feature table X and the full target Y.
# NOTE(review): the winning settings are then cross-validated on the same rows
# that selected them, so the reported scores are presumably optimistic -
# consider a held-out set or nested cross-validation.
grid_search.fit(X, Y)

# Best configuration found by the search, re-scored with the shuffled K-Fold helper.
best_model = grid_search.best_estimator_
avg_mse, avg_r2 = cross_validate_model(best_model, X, Y)

print(f"Best Model Parameters: {grid_search.best_params_}")
print(f"Average Mean Squared Error: {avg_mse:.4f}")
print(f"Average R² Score: {avg_r2:.4f}")


Fitting 5 folds for each of 729 candidates, totalling 3645 fits

Cross-Validation Progress:  20%|██        | 1/5 [00:00<00:02,  1.83it/s]

Fold Mean Squared Error: 12.1309

Fold R² Score: 0.8410

Cross-Validation Progress:  40%|████      | 2/5 [00:02<00:04,  1.40s/it]

Fold Mean Squared Error: 7.3648

Fold R² Score: 0.8994

Cross-Validation Progress:  80%|████████  | 4/5 [00:02<00:00,  1.68it/s]

Fold Mean Squared Error: 17.9437

Fold R² Score: 0.7764

Fold Mean Squared Error: 15.8538

Fold R² Score: 0.7864

Cross-Validation Progress: 100%|██████████| 5/5 [00:03<00:00,  1.61it/s]

Fold Mean Squared Error: 12.2929

Fold R² Score: 0.8117

Average Mean Squared Error: 13.1172

Average R² Score: 0.8230

Best Model Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2,
'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}

Average Mean Squared Error: 13.1172

Average R² Score: 0.8230
