In [30]:
import glob
import os
from os import listdir

import matplotlib
import numpy as np
import pandas as pd
import plotly.express as px
# Shared configuration used throughout the notebook: the column names given to
# the header-less CSV files, one data directory per continent, and one
# (initially empty) list per continent.

colnames = [
    "Sex",
    "Type of Cancer",
    "Age group",
    "Number of cases",
    "person-years at risk",
]

# Per-continent data directories (each path keeps its trailing slash).
path_africa = "data/CI5_treated_data/AFRICA/"
path_europe = "data/CI5_treated_data/EUROPE/"
path_north_america = "data/CI5_treated_data/NORTH-AMERICA/"
path_south_america = "data/CI5_treated_data/SOUTH-AMERICA/"
path_asia = "data/CI5_treated_data/ASIA/"
path_oceania = "data/CI5_treated_data/OCEANIA/"

# One independent empty list per continent.
(
    Africa_list,
    Europe_list,
    North_america_list,
    South_america_list,
    Asia_list,
    Oceania_list,
) = ([] for _ in range(6))

In [31]:
def modify_cancer_value(df):
    """Replace the numeric codes of ``df`` with readable labels, in place.

    The columns ``'Type of Cancer'``, ``'Age group'`` and ``'Sex'`` are
    rewritten. A value that has no entry in the corresponding table is left
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the three columns named above; it is modified in place.

    Returns
    -------
    None
    """
    # Codes grouped by label. A tuple key lists several codes that share one
    # label; it is expanded below because ``Series.replace`` does not treat a
    # tuple key as "any of these values" (such keys would simply never match,
    # leaving the sub-site codes as raw integers).
    # NOTE(review): 'Cervix otary (total)' and 'Other endoctrine' look like
    # typos (uteri / endocrine). They are kept verbatim because previously
    # saved data or other cells may depend on these exact labels - confirm
    # before renaming.
    grouped_cancer_labels = {
        1: 'All sites (total)',
        2: 'All sites but skin (total)',
        3: 'Oral cavity and pharynx (total)',
        4: 'Lip',
        5: 'Tongue (total)',
        (6, 7): 'Tongue',
        8: 'Mouth (total)',
        (9, 10, 11, 12): 'Mouth',
        13: 'Salivary glands (total)',
        (14, 15): 'Salivary glands',
        16: 'Tonsil',
        17: 'Oropharynx',
        18: 'Nasopharynx',
        19: 'Pyriform sinus',
        20: 'Hypopharynx',
        21: 'Pharynx unspecified',
        22: 'Digestive organs (total)',
        23: 'Oesophagus (total)',
        (24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34): 'Oesophagus',
        35: 'Stomach (total)',
        (36, 37, 38, 39, 40): 'Stomach',
        41: 'Small intestine',
        42: 'Colon (total)',
        (43, 44, 45, 46, 47): 'Colon',
        48: 'Rectosigmoid junction',
        49: 'Rectum',
        50: 'Anus (total)',
        (51, 52, 53, 54, 55, 56, 57): 'Anus',
        58: 'Liver (total)',
        (59, 60, 61, 62, 63, 64, 65, 66, 67): 'Liver',
        68: 'Gallbladder',
        69: 'Unspecified parts of biliary tract',
        70: 'Pancreas',
        71: 'Ill-defined digestive organs',
        72: 'Respiratory organs (total)',
        73: 'Nasal cavity and middle ear',
        74: 'Accessory sinuses',
        75: 'Larynx',
        76: 'Trachea',
        77: 'Lung (total)',
        (78, 79, 80, 81, 82, 83, 84, 85): 'Lung',
        86: 'Thymus',
        87: 'Heart, mediastinum and pleura',
        88: 'Unspecified respiratory and intrathoracic organs',
        89: 'Bone (total)',
        (90, 91, 92, 93, 94, 95, 96, 97, 98): 'Bone',
        99: 'Skin (total)',
        (100, 101, 102, 103, 104, 105, 106): 'Skin',
        107: 'Mesothelial and soft tissues (total)',
        108: 'Mesothelioma',
        109: 'Kaposi sarcoma',
        110: 'Peripheral nerves',
        111: 'Peritoneum and retroperitoneum',
        112: 'Connective and soft tissue',
        113: 'Breast',
        114: 'Female genital organs (total)',
        115: 'Vulva',
        116: 'Vagina',
        117: 'Cervix otary (total)',
        (118, 119, 120, 121, 122, 123, 124): 'Cervix uteri',
        125: 'Corpus uteri (total)',
        (126, 127, 128, 129, 130, 131): 'Corpus uteri',
        132: 'Uterus unspecified',
        133: 'Ovary (total)',
        (134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144): 'Ovary',
        145: 'Other female genital organs (total)',
        (146, 147): 'Other female genital organs',
        148: 'Placenta',
        149: 'Male genital organs (total)',
        150: 'Penis',
        151: 'Prostate',
        152: 'Testis (total)',
        (153, 154, 155, 156, 157): 'Testis',
        158: 'Other male genital organs',
        159: 'Urinary tract (total)',
        160: 'Kidney',
        161: 'Renal pelvis',
        162: 'Ureter',
        163: 'Bladder (total)',
        (164, 165, 166, 167, 168, 169, 170, 171): 'Bladder',
        172: 'Other urinary organs',
        173: 'Eye, brain and central nervous system (total)',
        174: 'Eye (total)',
        (175, 176, 177, 178, 179, 180, 181, 182): 'Eye',
        183: 'Meninges',
        184: 'Central nervous system (total)',
        (185, 186, 187, 188, 189, 190, 191, 192, 193): 'Central nervous system',
        194: 'Brain',
        195: 'Other parts of central nervous system (total)',
        (196, 197, 198): 'Other parts of central nervous system',
        199: 'Thyroid and other endocrine glands (total)',
        200: 'Thyroid (total)',
        (201, 202, 203, 204, 205, 206, 207, 208, 209): 'Thyroid',
        210: 'Adrenal gland',
        211: 'Other endoctrine',
        212: 'Lymphoid tissues (total)',
        213: 'Non-Hodgkin lymphoma (total)',
        (214, 215, 216): 'Non-Hodgkin lymphoma',
        217: 'Hodgkin disease (total)',
        (218, 219, 220, 221, 222, 223): 'Hodgkin disease',
        224: 'Immunoproliferative diseases',
        225: 'Multiple myeloma',
        226: 'Leukaemia (total)',
        227: 'Lymphoid leukaemia (total)',
        (228, 229, 230, 231): 'Lymphoid leukaemia',
        232: 'Myeloid leukaemia (total)',
        (233, 234, 235, 236): 'Myeloid leukaemia',
        237: 'Leukaemia and unspecified (total)',
        (238, 239, 240, 241): 'Leukaemia and unspecified',
        242: 'Myeloproliferative disorders',
        243: 'Myelodysplastic syndromes',
        244: 'Unspecified cancers',
    }

    # Expand every tuple key into one entry per individual code.
    cancer_labels = {}
    for codes, label in grouped_cancer_labels.items():
        if not isinstance(codes, tuple):
            codes = (codes,)
        for code in codes:
            cancer_labels[code] = label

    age_group_labels = {
        1: '0-4',
        2: '5-9',
        3: '10-14',
        4: '15-19',
        5: '20-24',
        6: '25-29',
        7: '30-34',
        8: '35-39',
        9: '40-44',
        10: '45-49',
        11: '50-54',
        12: '55-59',
        13: '60-64',
        14: '65-69',
        15: '70-74',
        16: '75-79',
        17: '80-84',
        18: '85+',
        19: 'Unknown',
    }

    sex_labels = {
        1: 'Male',
        2: 'Female',
    }

    df['Type of Cancer'] = df['Type of Cancer'].replace(cancer_labels)
    df['Age group'] = df['Age group'].replace(age_group_labels)
    df['Sex'] = df['Sex'].replace(sex_labels)
    

In [35]:
def create_df_country(df_path, df_name):
    """Load every CSV file of one country directory into a single labelled frame.

    Parameters
    ----------
    df_path : str
        Directory of the continent that contains the country directory.
    df_name : str
        Name of the country directory inside ``df_path``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all ``*.csv`` files of the country (read without a
        header row, using the module-level ``colnames``), with codes replaced
        by labels through ``modify_cancer_value``. Rows whose 'Type of Cancer'
        label contains "total" or "Unspecified", or is not a string, are
        removed. The result is an independent copy.

    Raises
    ------
    ValueError
        If the country directory contains no CSV file.
    """
    # os.path.join works whether or not df_path ends with a separator.
    pattern = os.path.join(df_path, df_name, "*.csv")
    # glob returns matches in arbitrary order; sort so the row order of the
    # result is reproducible from one run to the next.
    csv_files = sorted(glob.glob(pattern))
    if not csv_files:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError("No CSV files found for pattern {!r}".format(pattern))

    frames = [pd.read_csv(csv_file, header=None, names=colnames) for csv_file in csv_files]
    Country = pd.concat(frames)
    modify_cancer_value(Country)

    # na=True counts non-string entries (codes that were left unmapped) as a
    # match, so they are dropped - the same rows the former `== False`
    # comparison removed.
    is_excluded = Country['Type of Cancer'].str.contains("total|Unspecified", na=True)
    # .copy() hands the caller an independent frame, so adding columns to it
    # does not raise SettingWithCopyWarning.
    return Country[~is_excluded].copy()

def create_df_continent(df_path, name):
    """Build one frame holding every country found under a continent directory.

    Parameters
    ----------
    df_path : str
        Continent directory; each visible sub-directory is loaded as one
        country through ``create_df_country``.
    name : str
        Value written to the ``'Continent'`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all country frames with two extra columns:
        ``'Country'`` (directory name with everything after its first
        character lower-cased) and ``'Continent'``.

    Raises
    ------
    ValueError
        If ``df_path`` contains no country directory.
    """
    # listdir returns entries in arbitrary order, and may include stray files
    # or hidden folders (e.g. .DS_Store, .ipynb_checkpoints) that are not
    # countries: keep visible directories only, in sorted order.
    country_dirs = sorted(
        entry for entry in listdir(df_path)
        if not entry.startswith(".") and os.path.isdir(os.path.join(df_path, entry))
    )
    if not country_dirs:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError("No country directories found in {!r}".format(df_path))

    frames = []
    for country_dir in country_dirs:
        # First character kept as is, remainder lower-cased.
        country_name = country_dir[:1] + country_dir[1:].lower()
        # assign() returns a new frame, so no column is ever set on a slice.
        frames.append(create_df_country(df_path, country_dir).assign(Country=country_name))

    return pd.concat(frames).assign(Continent=name)

In [40]:
def _build_and_pickle(continent_path, continent_label, pickle_path):
    """Create one continent frame, save it as a gzip-compressed pickle, return it."""
    frame = create_df_continent(continent_path, continent_label)
    # The files are gzip-compressed although they carry a plain ".pkl" name.
    frame.to_pickle(
        pickle_path,
        compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1},
    )
    return frame


# Build and save each continent, in the same order as before.
Africa = _build_and_pickle(path_africa, 'Africa', "data/Africa.pkl")
Asia = _build_and_pickle(path_asia, 'Asia', "data/Asia.pkl")
Oceania = _build_and_pickle(path_oceania, 'Oceania', "data/Oceania.pkl")
North_america = _build_and_pickle(path_north_america, 'North-america', "data/North_america.pkl")
South_america = _build_and_pickle(path_south_america, 'South-america', "data/South_america.pkl")
Europe = _build_and_pickle(path_europe, 'Europe', "data/Europe.pkl")

# Stack all continents into one world-wide frame.
World_list = [
    Africa,
    Oceania,
    Asia,
    North_america,
    South_america,
    Europe,
]
World = pd.concat(World_list)

In [41]:
# Drop aggregate ("total") and unspecified-site rows, keep the filtered frame
# bound to `World`, then save it. DataFrame.to_pickle returns None, so its
# result must not be assigned back to `World` (doing so left `World` as None
# and made this cell fail when run a second time).
# na=True counts non-string labels as a match, so those rows are dropped just
# as the former `== False` comparison dropped them.
World = World[~World['Type of Cancer'].str.contains("total|Unspecified|unspecified", na=True)]
World.to_pickle("data/World.pkl",compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1})