## Imports Packages

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

## Load data

In [5]:
cancer_df = pd.read_csv("normalized_data.csv")
cancer_df.head()

Unnamed: 0,country_name,year,population,cancer_name,new_cases/deaths,total_cases,cumulative_risk,measure,air_pollution,alcohol_use,gdp_per_capita,uhc_index,obesity_rate,tobacco_use,rate
0,Australia,2000,19017963,Head and neck,2109,18001064.0,1.144604,Incidence,7.457433,9.8,21870.415967,81.0,20.2,24.5,0.000111
1,Australia,2001,22695275,Head and neck,2057,18208623.0,1.101593,Incidence,7.388953,9.8,19695.729738,84.833333,20.9,18.476923,9.1e-05
2,Australia,2002,22695275,Head and neck,2193,18398749.0,1.137999,Incidence,7.266338,9.8,20301.843171,84.833333,21.6,18.476923,9.7e-05
3,Australia,2003,22695275,Head and neck,2206,18597239.0,1.136633,Incidence,7.131202,9.9,23718.13385,84.833333,22.3,18.476923,9.7e-05
4,Australia,2004,22695275,Head and neck,2089,18784618.0,1.043716,Incidence,7.025164,10.0,30836.730682,84.833333,22.9,18.476923,9.2e-05


In [6]:
summary_df = pd.read_csv("dominant_cancer_types.csv")
summary_df.head()

Unnamed: 0,country_name,highest_incidence_cancer,highest_mortality_cancer
0,Argentina,Breast,Lung
1,Armenia,,Lung
2,Australia,Prostate,Lung
3,Austria,Breast,Lung
4,Bahrain,Breast,


## Preprocessing

In [10]:
cancer_df = cancer_df[cancer_df["measure"].isin(["Incidence", "Mortality"])]

pivot_table = cancer_df.pivot_table(
    index="country_name",
    columns=["cancer_name", "measure"],
    values="new_cases/deaths",
    aggfunc="mean"
)

pivot_table.head()

cancer_name,Bladder,Bladder,Brain and central nervous system,Brain and central nervous system,Breast,Breast,Cervix uteri,Cervix uteri,Colon,Colon,...,Prostate,Rectum and anus,Rectum and anus,Stomach,Stomach,Testis,Testis,Thyroid,Thyroid,Uterus
measure,Incidence,Mortality,Incidence,Mortality,Incidence,Mortality,Incidence,Mortality,Incidence,Mortality,...,Mortality,Incidence,Mortality,Incidence,Mortality,Incidence,Mortality,Incidence,Mortality,Mortality
country_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Argentina,97.4,625.826087,86.533333,926.869565,629.0,3567.956522,118.866667,955.26087,237.866667,3149.608696,...,1335.0,100.466667,599.304348,123.133333,1813.695652,63.266667,169.26087,69.466667,120.217391,2021.695652
Armenia,,98.125,,162.9375,,347.25,,56.375,,189.0625,...,97.625,,69.8125,,279.0625,,6.0625,,19.25,165.8125
Australia,1196.0,292.166667,1316.833333,970.375,11711.277778,1720.625,693.555556,168.75,5469.888889,958.75,...,829.416667,3553.111111,1141.416667,1219.833333,582.958333,721.777778,20.0,1917.777778,61.291667,400.291667
Austria,943.055556,165.625,556.888889,380.333333,3980.666667,788.75,369.111111,99.25,1711.555556,599.875,...,321.5,1185.444444,354.291667,734.777778,405.166667,356.277778,14.5,746.111111,34.75,226.708333
Bahrain,17.5,,13.166667,,118.555556,,7.444444,,31.277778,,...,,18.5,,11.833333,,3.277778,,20.777778,,


In [8]:
risk_factors = cancer_df.groupby("country_name")[["alcohol_use", "tobacco_use", "obesity_rate", "air_pollution", "uhc_index"]].mean()

df_features = pd.concat([pivot_table, risk_factors], axis=1)
df_features.columns = df_features.columns.map(str)

df_features.head()

Unnamed: 0_level_0,"('Bladder', 'Incidence')","('Bladder', 'Mortality')","('Brain and central nervous system', 'Incidence')","('Brain and central nervous system', 'Mortality')","('Breast', 'Incidence')","('Breast', 'Mortality')","('Cervix uteri', 'Incidence')","('Cervix uteri', 'Mortality')","('Colon', 'Incidence')","('Colon', 'Mortality')",...,"('Testis', 'Incidence')","('Testis', 'Mortality')","('Thyroid', 'Incidence')","('Thyroid', 'Mortality')","('Uterus', 'Mortality')",alcohol_use,tobacco_use,obesity_rate,air_pollution,uhc_index
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Argentina,97.4,625.826087,86.533333,926.869565,629.0,3567.956522,118.866667,955.26087,237.866667,3149.608696,...,63.266667,169.26087,69.466667,120.217391,2021.695652,7.075,28.508333,25.739474,14.93603,76.636364
Armenia,,98.125,,162.9375,,347.25,,56.375,,189.0625,...,,6.0625,,19.25,165.8125,3.978571,27.0,22.94375,35.465764,66.8
Australia,1196.0,292.166667,1316.833333,970.375,11711.277778,1720.625,693.555556,168.75,5469.888889,958.75,...,721.777778,20.0,1917.777778,61.291667,400.291667,10.04359,18.476923,25.734146,7.373216,84.833333
Austria,943.055556,165.625,556.888889,380.333333,3980.666667,788.75,369.111111,99.25,1711.555556,599.875,...,356.277778,14.5,746.111111,34.75,226.708333,12.061538,37.769231,15.095122,15.919421,80.083333
Bahrain,17.5,,13.166667,,118.555556,,7.444444,,31.277778,,...,3.277778,,20.777778,,,2.083333,19.3,28.983333,68.82405,67.8


## Training

In [13]:

performance_results = []
df_rf_filled = df_features.copy()
numeric_cols = df_rf_filled.select_dtypes(include=[np.number]).columns

for target_col in numeric_cols:
    if df_rf_filled[target_col].isnull().sum() == 0:
        continue

    print(f"\n Predicted Column: {target_col}")
    
    y = df_rf_filled[target_col]
    X_all = df_rf_filled.drop(columns=[target_col])
    X_all = X_all.select_dtypes(include=[np.number])

    mask = y.notnull() & ~X_all.isnull().any(axis=1)
    X_full = X_all[mask]
    y_full = y[mask]

    if X_full.shape[0] < 10:
        print(f"⚠️ Not enough data: {target_col}")
        continue

    X_train_eval, X_test_eval, y_train_eval, y_test_eval = train_test_split(X_full, y_full, test_size=0.2, random_state=42)
    model_eval = RandomForestRegressor(n_estimators=100, random_state=42)
    model_eval.fit(X_train_eval, y_train_eval)
    y_pred_eval = model_eval.predict(X_test_eval)

    mae = mean_absolute_error(y_test_eval, y_pred_eval)
    r2 = r2_score(y_test_eval, y_pred_eval)

    performance_results.append({
        "column": target_col,
        "MAE": mae,
        "R2": r2
    })

    train_mask = y.notnull() & ~X_all.isnull().any(axis=1)
    X_train = X_all[train_mask]
    y_train = y[train_mask]
    X_predict_all = X_all[y.isnull()]

    common_cols = X_train.columns[~X_train.isnull().any() & ~X_predict_all.isnull().any()]
    X_train_common = X_train[common_cols]
    X_predict_common = X_predict_all[common_cols]

    if X_train_common.empty or X_predict_common.empty:
        print(f"No common column: {target_col}")
        continue

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_common, y_train)
    y_pred = model.predict(X_predict_common)
    df_rf_filled.loc[X_predict_common.index, target_col] = y_pred



 Predicted Column: ('Bladder', 'Incidence')

 Predicted Column: ('Bladder', 'Mortality')

 Predicted Column: ('Brain and central nervous system', 'Incidence')

 Predicted Column: ('Brain and central nervous system', 'Mortality')

 Predicted Column: ('Breast', 'Incidence')

 Predicted Column: ('Breast', 'Mortality')

 Predicted Column: ('Cervix uteri', 'Incidence')

 Predicted Column: ('Cervix uteri', 'Mortality')

 Predicted Column: ('Colon', 'Incidence')

 Predicted Column: ('Colon', 'Mortality')

 Predicted Column: ('Colorectum', 'Incidence')

 Predicted Column: ('Colorectum', 'Mortality')

 Predicted Column: ('Corpus uteri', 'Incidence')

 Predicted Column: ('Corpus uteri', 'Mortality')

 Predicted Column: ('Gallbladder', 'Incidence')

 Predicted Column: ('Gallbladder', 'Mortality')

 Predicted Column: ('Head and neck', 'Incidence')

 Predicted Column: ('Head and neck', 'Mortality')

 Predicted Column: ('Hodgkin lymphoma', 'Incidence')

 Predicted Column: ('Hodgkin lymphoma', 'Mort

In [14]:

performance_summary = pd.DataFrame(performance_results).sort_values(by="R2", ascending=False)
performance_summary.reset_index(drop=True, inplace=True)

performance_summary.head(10)

Unnamed: 0,column,MAE,R2
0,"('Colorectum', 'Mortality')",401.670476,0.976641
1,"('Lung', 'Incidence')",90.831592,0.968898
2,"('Brain and central nervous system', 'Mortality')",136.09992,0.962299
3,"('Stomach', 'Mortality')",149.867359,0.960301
4,"('Rectum and anus', 'Incidence')",44.081675,0.951876
5,"('Kidney', 'Mortality')",75.413888,0.950615
6,"('Multiple myeloma', 'Mortality')",41.670705,0.942095
7,"('Breast', 'Mortality')",468.334567,0.9321
8,"('Ovary', 'Mortality')",94.108876,0.931061
9,"('Bladder', 'Mortality')",166.339746,0.930875


In [16]:
df_rf_filled.isnull().sum()

('Bladder', 'Incidence')                             0
('Bladder', 'Mortality')                             0
('Brain and central nervous system', 'Incidence')    0
('Brain and central nervous system', 'Mortality')    0
('Breast', 'Incidence')                              0
('Breast', 'Mortality')                              0
('Cervix uteri', 'Incidence')                        0
('Cervix uteri', 'Mortality')                        0
('Colon', 'Incidence')                               0
('Colon', 'Mortality')                               0
('Colorectum', 'Incidence')                          0
('Colorectum', 'Mortality')                          0
('Corpus uteri', 'Incidence')                        0
('Corpus uteri', 'Mortality')                        0
('Gallbladder', 'Incidence')                         0
('Gallbladder', 'Mortality')                         0
('Head and neck', 'Incidence')                       0
('Head and neck', 'Mortality')                       0
('Hodgkin 

## Cleanup And Output

In [21]:

incidence_cols = [col for col in df_rf_filled.columns if "Incidence" in col]
mortality_cols = [col for col in df_rf_filled.columns if "Mortality" in col]

df_rf_filled["highest_incidence_cancer"] = df_rf_filled[incidence_cols].idxmax(axis=1)
df_rf_filled["highest_mortality_cancer"] = df_rf_filled[mortality_cols].idxmax(axis=1)

df_rf_filled["highest_incidence_cancer"] = df_rf_filled["highest_incidence_cancer"].apply(
    lambda x: x.split(",")[0].strip() if isinstance(x, str) else x
)
df_rf_filled["highest_mortality_cancer"] = df_rf_filled["highest_mortality_cancer"].apply(
    lambda x: x.split(",")[0].strip() if isinstance(x, str) else x
)

df_rf_filled["highest_incidence_cancer"] = df_rf_filled["highest_incidence_cancer"].str.replace(" Incidence", "", regex=False).str.strip(" ('")
df_rf_filled["highest_mortality_cancer"] = df_rf_filled["highest_mortality_cancer"].str.replace(" Mortality", "", regex=False).str.strip(" ('")

df_rf_filled[["highest_incidence_cancer", "highest_mortality_cancer"]].head()

Unnamed: 0_level_0,highest_incidence_cancer,highest_mortality_cancer
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Argentina,Breast,Lung
Armenia,Breast,Lung
Australia,Prostate,Lung
Austria,Breast,Lung
Bahrain,Breast,Lung


In [23]:
df_rf_filled[["highest_incidence_cancer", "highest_mortality_cancer"]].to_csv("predicted_cancer_data.xlsx", index=True)