# Data Pre-Processing Notebook

Author: Sophie Mowe

This notebook pre-processes SEER cancer incidence data and GTEx gene expression data.

In [1]:
import PreProcess
import numpy as np
import pandas as pd
from collections import defaultdict

## SEER Cancer Incidence Data

In [7]:
# SEER cancer sites to pre-process.
# Each entry is used verbatim as a file stem by the loop in this cell, which reads
# '../data/raw_data/SEER/<name>.csv' and writes '../data/SEER/<name>.csv'. Spelling,
# capitalisation and punctuation therefore have to match the raw file names exactly
# (note the lowercase "allsites", "lip" and "tongue").
seer_cancer_list = ["Acute Lymphocytic Leukemia",
    "Acute Monocytic Leukemia","Acute Myeloid Leukemia","Adenocarcinoma of the Lung and Bronchus",
    "Aleukemic Subleukemic and NOS","allsites","Anus Anal Canal and Anorectum",
    "Appendix","Ascending Colon","Bones and Joints","Brain and Other Nervous System",
    "Brain","Breast","Cecum", "Cervix Uteri","Chronic Lymphocytic Leukemia","Chronic Myeloid Leukemia",
    "Colon and Rectum","Colon excluding Rectum",
    "Corpus and Uterus NOS","Corpus Uteri","Cranial Nerves Other Nervous System",
    "Descending Colon","Digestive System","Endocrine System","Esophagus",
    "Eye and Orbit","Female Genital System","Floor of Mouth","Gallbladder",
    "Gum and Other Mouth","Hepatic Flexure","Hodgkin - Extranodal","Hodgkin - Nodal",
    "Hodgkin Lymphoma","Hypopharynx","Intrahepatic Bile Duct",
    "Kaposi Sarcoma (9140)","Kidney and Renal Pelvis","Large Intestine NOS","Larynx",
    "Leukemia","lip","Liver and Intrahepatic Bile Duct","Liver","Lung and Bronchus",
    "Lymphocytic Leukemia","Lymphoma","Male Genital System","Melanoma of the Skin",
    "Mesothelioma (9050-9055)","Miscellaneous","Myeloid and Monocytic Leukemia",
    "Myeloma","Nasopharynx","Neuroblastoma (9490-9509)","NHL - Extranodal",
    "NHL - Nodal","Non-Hodgkin Lymphoma","Non-Small Cell Lung and Bronchus",
    "Nose Nasal Cavity and Middle Ear","Oral Cavity and Pharynx","Oropharynx",
    "Other Acute Leukemia","Other Biliary","Other Digestive Organs","Other Endocrine including Thymus",
    "Other Female Genital Organs","Other Leukemia","Other Lymphocytic Leukemia","Other Male Genital Organs",
    "Other Myeloid Monocytic Leukemia","Other Non-Epithelial Skin","Other Oral Cavity and Pharynx",
    "Other Urinary Organs","Ovary","Pancreas","Penis","Peritoneum Omentum and Mesentery",
    "Pleura","Prostate","Rectosigmoid Junction","Rectum and Rectosigmoid Junction",
    "Rectum","Respiratory System","Retroperitoneum","Salivary Gland","Sigmoid Colon",
    "Skin excluding Basal and Squamous","Small Cell Lung and Bronchus (8041-8045)","Small Intestine",
    "Soft Tissue including Heart","Splenic Flexure","Squamous cell Lung and Bronchus (8051-8130)",
    "Stomach","Testis","Thyroid","tongue","Tonsil","Trachea Mediastinum and Other Respiratory Organs",
    "Transverse Colon","Ureter","Urinary Bladder","Urinary System","Uterus NOS","Vagina","Vulva","Wilms Tumor (8960)"]

# Input and output folders, relative to the notebook's working directory.
# They do not change between iterations, so they are set once up front.
raw_seer_folder = '../data/raw_data/SEER/'
results_folder = '../data/SEER/'

# Raw column header -> cleaned column name. The age-group column's header in the raw
# files is a run of blank spaces, so that key must be kept exactly as written.
# The same mapping drives both `usecols` and `rename`, keeping the two in sync.
seer_columns = {'        ': 'Ages', 'All races.1': 'Male', 'All races.2': 'Female'}

for seer_cancer in seer_cancer_list:
    # Every entry of seer_cancer_list is stored as '<name>.csv' in the raw folder.
    raw_seer_path = raw_seer_folder + seer_cancer + '.csv'

    seer = (
        # header=1: column names are taken from row 1 (0-based) of the file
        pd.read_csv(raw_seer_path, header=1, usecols=list(seer_columns))
        .rename(columns=seer_columns)
        # dropping the '0-19' and '80+' categories as the GTEx data does not contain these age groups
        # (rows are removed by index label; a KeyError here means the raw layout changed)
        .drop(index=[0, 7])
    )

    # saving cleaned data (the remaining index labels are written as the first column)
    seer.to_csv(results_folder + seer_cancer + '.csv')

# Show the last site processed as a quick sanity check of the cleaned layout.
seer

## Scaling and Averaging GTEx Data

In [10]:
# GTEx tissue identifiers: lowercase, words joined by underscores.
# Two entries were malformed relative to the rest of the list and are corrected here:
# 'brain_nucleus_accumbens_basal_ganglia' previously contained a space, and
# 'brain_spinal_cord_cervical_c-1' previously ended with a stray ')'.
# NOTE(review): no cell visible in this notebook reads this list - confirm where it is
# consumed and that the corrected names match the tissue file names on disk.
gtex_samples = [
    'brain_cerebellum',
    'brain_cortex',
    'brain_frontal_cortex_ba9',
    'brain_hippocampus',
    'brain_hypothalamus',
    'brain_nucleus_accumbens_basal_ganglia',
    'brain_putamen_basal_ganglia',
    'brain_spinal_cord_cervical_c-1',
    'brain_substantia_nigra',
    'breast_mammary_tissue',
    'cells_cultured_fibroblasts',
    'cells_ebv-transformed_lymphocytes',
    'cervix_ectocervix',
    'cervix_endocervix',
    'colon_sigmoid',
    'colon_transverse',
    'esophagus_gastroesophageal_junction',
    'esophagus_mucosa',
    'esophagus_muscularis',
    'fallopian_tube',
    'heart_atrial_appendage',
    'heart_left_ventricle',
    'kidney_cortex',
    'kidney_medulla',
    'liver',
    'lung',
    'minor_salivary_gland',
    'muscle_skeletal',
    'nerve_tibial',
    'ovary',
    'pancreas',
    'pituitary',
    'prostate',
    'skin_not_sun_exposed_suprapubic',
    'skin_sun_exposed_lower_leg',
    'small_intestine_terminal_ileum',
    'spleen',
    'stomach',
    'testis',
    'thyroid',
    'uterus',
    'vagina',
    'whole_blood',
    'adipose_subcutaneous',
    'adipose_visceral_omentum',
    'adrenal_gland',
    'artery_aorta',
    'artery_coronary',
    'artery_tibial',
    'bladder',
    'brain_amygdala',
    'brain_anterior_cingulate_cortex_ba24',
    'brain_caudate_basal_ganglia',
    'brain_cerebellar_hemisphere'
]

In [None]:
# Run the GTEx step implemented in the project-local PreProcess module.
# NOTE(review): the call takes no arguments, so the `gtex_samples` list defined in this
# notebook is not passed in - confirm where get_gtex() gets its tissue names and paths.
# Presumably this performs the scaling/averaging named in the section heading; verify
# against PreProcess.py.
PreProcess.get_gtex()

### Getting and saving gene descriptions

In [None]:
def get_gene_descriptions(infile, outfile):
    """Extract the gene name/description columns from a parquet file into a CSV.

    Parameters
    ----------
    infile : path of the parquet file to read; only its 'Name' and 'Description'
        columns are loaded.
    outfile : path of the CSV file to write (the DataFrame index is written too).

    Returns
    -------
    None
    """
    wanted_columns = ['Name', 'Description']
    pd.read_parquet(infile, columns=wanted_columns).to_csv(outfile)

# Forward slashes are used for both paths: they resolve on every OS, whereas a
# backslash-separated literal only works on Windows and "\p" is an invalid escape sequence.
# NOTE(review): these paths start at 'data/' while the SEER cell uses '../data/' -
# confirm which working directory this cell is meant to run from.
get_gene_descriptions(infile="data/pantissue/pantissue_gtex.parquet", outfile='data/gtex_gene_names.csv')