In [2]:
### script for performing some preliminary preprocessing for the data ###

In [3]:
# import required packages

import pandas as pd

In [15]:
# Load the incidence table from the Excel export.
#
# Source: https://cancerregistry.fi/statistics/cancer-statistics/ (retrieved 11 Aug 2025).
# Cancer sites were selected individually; values are categorized by
# Age group (10y.) and Year (1953-2023); values shown: Rate per 100,000.

incidence_df = pd.read_excel(io='cancer_registry_incidence_data.xlsx')

# preview the first five rows
incidence_df.head(5)

Unnamed: 0,Sex,Cancer site,ICD-10,Age group (10y.),Year,Diagnosed cancer cases,"Rate per 100,000"
0,Male,Lip,C00,0-9,1953,0.0,0.0
1,Male,Lip,C00,0-9,1954,0.0,0.0
2,Male,Lip,C00,0-9,1955,0.0,0.0
3,Male,Lip,C00,0-9,1956,0.0,0.0
4,Male,Lip,C00,0-9,1957,0.0,0.0


In [16]:
# Remove the rows for the years 1953-1962, which are unrelated to the analysis.

excluded_years = list(range(1953, 1963))  # 1953..1962 inclusive
is_excluded_year = incidence_df['Year'].isin(excluded_years)
incidence_df = incidence_df.loc[~is_excluded_year]

incidence_df.head()

Unnamed: 0,Sex,Cancer site,ICD-10,Age group (10y.),Year,Diagnosed cancer cases,"Rate per 100,000"
10,Male,Lip,C00,0-9,1963,0.0,0.0
11,Male,Lip,C00,0-9,1964,,0.24
12,Male,Lip,C00,0-9,1965,0.0,0.0
13,Male,Lip,C00,0-9,1966,0.0,0.0
14,Male,Lip,C00,0-9,1967,0.0,0.0


In [17]:
# Drop cancer types that are not meaningful for the analysis, e.g.:
#   - non-invasive neoplasms (prestages of cancer)
#   - in situ carcinoma (~stage 0 cancer)
#   - borderline tumour (low malignant potential tumour)
#   - basal cell carcinoma (spreads very slowly, very rarely metastasizes)

excluded_cancer_sites = [
    'Basal cell carcinoma of the skin',
    'Basal cell carcinoma of the genitals',
    'Cervix uteri, non-invasive neoplasms',
    'Vagina and vulva, non-invasive neoplasms',
    'Carcinoma in situ of the breast',
    'Borderline tumour of the ovary',
]

# .copy() makes the result an independent frame, so that any subsequent
# in-place edit of incidence_df_adjusted cannot raise SettingWithCopyWarning
# (pandas < 3.0) and never touches incidence_df.
incidence_df_adjusted = incidence_df.loc[
    ~incidence_df['Cancer site'].isin(excluded_cancer_sites)
].copy()

In [18]:
# list the distinct cancer sites that remain in the filtered frame

incidence_df_adjusted.loc[:, 'Cancer site'].unique()

array(['Lip', 'Tongue', 'Salivary glands', 'Mouth, other or unspecified',
       'Pharynx', 'Oesophagus', 'Stomach', 'Small intestine', 'Colon',
       'Rectum, rectosigmoid', 'Liver', 'Gallbladder, bile ducts',
       'Pancreas', 'Digestive organs, other and unspecified',
       'Nose, sinuses', 'Larynx, epiglottis', 'Lung, trachea',
       'Other or unspecified respiratory or intrathoracic organs',
       'Breast', 'Prostate', 'Testis',
       'Male genital, other and unspecified', 'Kidney',
       'Bladder and urinary tract', 'Melanoma of the skin', 'Eye',
       'Thyroid gland', 'Other endocrine glands', 'Bone', 'Soft tissues',
       'Ill-defined or unknown', 'Hodgkin lymphoma',
       'Myeloma and other plasma cell tumors', 'Mesothelioma',
       'Acute lymphoblastic leukaemia/lymphoma',
       'Chronic lymphatic leukaemia', 'Leukaemia, other or unspecified',
       'Acute myeloid leukaemia',
       'Peripheral nerves, autonomic nervous system', 'Anus',
       'Malignant immunopr

In [19]:
# To improve readability of the results, some cancer sites are renamed.
# All renames live in one mapping (name in the source data -> display name)
# so they can be reviewed and extended in a single place.

cancer_site_renames = {
    'Mouth, other or unspecified': 'Other mouth',
    'Rectum, rectosigmoid': 'Rectum & rectosigmoid',
    'Gallbladder, bile ducts': 'Gallbladder & bile ducts',
    'Digestive organs, other and unspecified': 'Other digestive organs',
    'Nose, sinuses': 'Nose & sinuses',
    'Larynx, epiglottis': 'Larynx & epiglottis',
    'Lung, trachea': 'Lung & trachea',
    'Male genital, other and unspecified': 'Other male genital',
    'Bladder and urinary tract': 'Bladder & urinary tract',
    'Myeloma and other plasma cell tumors': 'Myeloma & other plasma cell tumors',
    'Leukaemia, other or unspecified': 'Other leukaemia',
    'Peripheral nerves, autonomic nervous system': 'Peripheral nerves & autonomic nervous system',
    # NOTE(review): "unspeficied" looks like a typo. It is kept in case the
    # source data really contains it, and the correctly spelled variant is
    # mapped as well so the rename cannot silently match nothing.
    # TODO confirm which spelling the registry export uses.
    'Non-Hodgkin lymphoma, other or unspeficied': 'Other non-Hodgkin lymphoma',
    'Non-Hodgkin lymphoma, other or unspecified': 'Other non-Hodgkin lymphoma',
    'Other, unspecified or mixed hematological disease': 'Other or unspecified or mixed hematological disease',
    'Skin, squamous cell carcinoma': 'Skin squamous cell carcinoma',
    'Skin, other': 'Other skin',
    'CNS, nerve sheath tumor': 'CNS & nerve sheath tumor',
    'Other and unspecified tumor of brain, meninges and central nervous system': 'Other brain & meninges & CNS',
    'Myelodysplastic syndromes and myelodysplastic/myeloproliferative neoplasms': 'Myelodysplastic syndromes & myelodysplastic/myeloproliferative neoplasms',
    'Ovary etc.': 'Ovary',
    'Female genital, other and unspecified': 'Other female genital',
}

# The nested dict restricts the (exact-match) replacement to the 'Cancer site'
# column. replace() returns a new frame instead of assigning into a possible
# copy, and names that are not in the mapping are left unchanged, so the cell
# can be re-run safely.
incidence_df_adjusted = incidence_df_adjusted.replace({'Cancer site': cancer_site_renames})


In [20]:
# verify the renaming: list the distinct cancer site names now present

incidence_df_adjusted.loc[:, 'Cancer site'].unique()

array(['Lip', 'Tongue', 'Salivary glands', 'Other mouth', 'Pharynx',
       'Oesophagus', 'Stomach', 'Small intestine', 'Colon',
       'Rectum & rectosigmoid', 'Liver', 'Gallbladder & bile ducts',
       'Pancreas', 'Other digestive organs', 'Nose & sinuses',
       'Larynx & epiglottis', 'Lung & trachea',
       'Other or unspecified respiratory or intrathoracic organs',
       'Breast', 'Prostate', 'Testis', 'Other male genital', 'Kidney',
       'Bladder & urinary tract', 'Melanoma of the skin', 'Eye',
       'Thyroid gland', 'Other endocrine glands', 'Bone', 'Soft tissues',
       'Ill-defined or unknown', 'Hodgkin lymphoma',
       'Myeloma & other plasma cell tumors', 'Mesothelioma',
       'Acute lymphoblastic leukaemia/lymphoma',
       'Chronic lymphatic leukaemia', 'Other leukaemia',
       'Acute myeloid leukaemia',
       'Peripheral nerves & autonomic nervous system', 'Anus',
       'Malignant immunoproliferative diseases', 'Mantle cell lymphoma',
       'Follicular B lym

In [21]:
# Write the processed table to a CSV file; the analysis continues in the attached R-files.

# columns written to the file, in output order
export_columns = [
    'Age group (10y.)',
    'Sex',
    'Cancer site',
    'ICD-10',
    'Year',
    'Diagnosed cancer cases',
    'Rate per 100,000',
]

# index=False keeps the row index out of the file
incidence_df_adjusted.to_csv('incidence.csv', sep=',', columns=export_columns, index=False)