In [None]:
# Notebook to add, change, correct annotations
# conda env: curate_data

In [1]:
from functions import get_data
from normalize_data import get_unique_index
from run_checks import import_data
import pandas as pd
import os

# Import & Normalize Data

In [2]:
normalized_data = "../data/normalized/"
metadata = get_data(normalized_data)

In [3]:
def normalize_metadata(df, unique_index):
    """
    Function that uses union of all metadata fields to
    add missing fields to each cohort
    """
    df_index = df.iloc[:, 0]
    additional_items = [field for field in list(set(unique_index) - set(df_index)) if not field.startswith('field')]
    
    # get index of fields that should be replaced
    df.reset_index(inplace=True, drop=True)
    startpoint = min(df.loc[df['fieldname'].str.startswith('field', na=False)].index.tolist())
    endpoint = startpoint + len(additional_items) - 1
    df.loc[startpoint:endpoint, 'fieldname'] = additional_items
    df.iloc[startpoint:endpoint, 1:] = 'nd'
    
def add_ajcc_staging(df):
    """
    Function to add ajcc labels to the rows that miss a label,
    but have information about tnm.
    :param data: specific dataset
    :param index: index of rows that are to be modified
    """
    df_T = df.T

    tnm_to_ajcc = {
        'ptis':"0",
        ','.join(["t1n0m0", "t2n0m0"]):'i',
        ','.join(["t3n0m0", "t4n0m0"]):'ii',
        ','.join(['t1n1m0','t2n1m0', 't1n2m0', 't3n1m0', 't4n1m0', 't2n2m0', 't3n2m0']):'iii',
        ','.join([f"t{x}n{y}m1" for  x in range(1,5) for y in range(0,5)]):'iv'
    }

    tnm_index = df_T.loc[(df_T['ajcc'] == "nd") &
                         (df_T['tnm'] != "nd")].index.tolist()
    
    if tnm_index:
        for i in tnm_index:
            tnm_stage = df_T.loc[i, 'tnm']
            for key, ajcc_stage in tnm_to_ajcc.items():
                if tnm_stage in key:
                    df_T.loc[i, 'ajcc'] = ajcc_stage

        print(f"AJCC labels added for {str(len(tnm_index))} samples")

    else:
        print("No labels were added")

    # TODO extract values that are not allowed and curate them manually (e.g. tmn instead of tnm)
    tnm_values = [label for label in df_T['tnm'].tolist() if not label == 'nd']
    tnm_values = list(set([''.join(i for i in label if not i.isdigit()) for label in tnm_values]))
    tnm_values = [label for label in tnm_values if not label == 'tnm']

    if tnm_values:
        print(f"Some tnm labels have an unexpected format. Please Check!\n{tnm_values}")

In [4]:
# normalize the fields of the metadata headers to the union of all cohorts  in the dataset
metadata_union = get_unique_index(metadata)
for study, data in metadata.items():
    normalize_metadata(data, metadata_union)
    data.set_index("fieldname", inplace=True, drop=False)
    data.index.names = ['index']
    data.drop(columns=['fieldname'], inplace=True)

# Curate

In [5]:
# Add AJCC stage for samples that only have tnm annotations
for study, data in metadata.items():
    print(f"\n{study}")
    add_ajcc_staging(data)


YuJ_2015
No labels were added

ThomasAM_2018a
No labels were added

VogtmannE_2016
No labels were added

YachidaS_2019
No labels were added

WirbelJ_2018
No labels were added
Some tnm labels have an unexpected format. Please Check!
['tisnm']

FengQ_2015
AJCC labels added for 45 samples
Some tnm labels have an unexpected format. Please Check!
['ptis']

ThomasAM_2019_c
No labels were added

ThomasAM_2018b
No labels were added

GuptaA_2019
AJCC labels added for 30 samples
Some tnm labels have an unexpected format. Please Check!
['tmn']

ZellerG_2014
No labels were added
Some tnm labels have an unexpected format. Please Check!
['tnxm']

HanniganGD_2017
No labels were added


In [6]:
# modify potential false tnm labels and run ajcc annotating function again
dataset = metadata['GuptaA_2019'].T
dataset.loc[dataset['tnm'] == 't4n2m0', 'ajcc'] = "iii"
dataset.loc[dataset['tnm'] == 't2m0n0', 'ajcc'] = "i"

In [7]:
# dataset = metadata['FengQ_2015'].T
# dataset.loc[dataset['tnm'].str.startswith('p'), 'ajcc'] # = '0'

In [8]:
# dataset = metadata['ZellerG_2014'].T
# dataset.loc[dataset['tnm'].str.contains('x'), 'ajcc'] #= "iv"

In [9]:
dataset = metadata['YachidaS_2019'].T
dataset.loc[:, 'DNA_extraction_kit'] = 'Gnome'

In [10]:
dataset = metadata['ThomasAM_2019_c'].T
dataset.loc[:, 'DNA_extraction_kit'] = 'Gnome'

In [11]:
# for study in metadata:
#    print(f"\n{study}\n")
#    print(*[field for field in metadata[study].index if not "field" in field], sep="\n")

In [12]:
# # check ajcc stage information
# for study in metadata:
#     print(f"\n{study}\n")
#     dataset = metadata[study]
#     if "ajcc" in dataset.index:
#         print(dataset.loc['ajcc'].value_counts())
#         if 'tnm' in dataset.index:
#             dataset_T = dataset.T
#             print(dataset_T.loc[dataset_T['ajcc'] == 'iv', 'tnm'].unique())
#     else:
#         if 'tnm' in dataset.index:
#             print(dataset.loc['tnm'].unique())
#     # elif 'tnm' in dataset.index:
#     #     print(dataset.loc['tnm'].value_counts())

In [13]:
# # check for abx intake information
# for study in metadata:
#     dataset = metadata[study]
#     if 'antibiotics_current_use' in dataset.index:
#         print(study)
#         print(dataset.loc['antibiotics_current_use'].unique())
#     else:
#         print(study)


In [14]:
# # check disease labels
# for study in metadata:
#     dataset = metadata[study]
#     print(f"\n{study}\n")
#     print(dataset.loc['disease'].unique())

In [15]:
# # westernized labels
# for study in metadata:
#     dataset = metadata[study]
#     print(f"\n{study}\n")
#     print(dataset.loc['non_westernized'].unique())

In [16]:
# # check other annotations
# for study in metadata:
#     dataset = metadata[study]
#     print(f"\n{study}\n")
#     print(dataset.loc['disease_location'].unique())

## Exploratory Stuff

In [17]:
# look at all annotations in general again, per study
for study, dataset in metadata.items():
    print(study)
    dataset = dataset.T
    columns = [col for col in dataset.columns if not col.startswith("field")]
    for column in columns:
        print(dataset[column].value_counts())
        print("\n")

YuJ_2015
study_name
YuJ_2015    128
Name: count, dtype: int64


subject_id
SZAXPI017580-90     1
SZAXPI017581-93     1
SZAXPI017582-94     1
SZAXPI017583-102    1
SZAXPI017584-108    1
                   ..
SZAXPI003412-1      1
SZAXPI003413-4      1
SZAXPI003414-3      1
SZAXPI003415-12     1
SZAXPI003416-4      1
Name: count, Length: 128, dtype: int64


body_site
stool    128
Name: count, dtype: int64


antibiotics_current_use
no    128
Name: count, dtype: int64


study_condition
CRC        74
control    54
Name: count, dtype: int64


disease
CRC        45
healthy    38
CRC;T2D    29
T2D        16
Name: count, dtype: int64


age
64    16
62     8
65     7
58     6
73     6
61     5
68     5
77     5
59     5
54     4
69     4
71     4
55     4
60     4
70     4
74     4
67     3
56     3
63     3
50     3
51     3
83     3
53     2
79     2
52     2
75     1
72     1
80     1
89     1
45     1
66     1
44     1
57     1
78     1
49     1
81     1
76     1
34     1
Name: count, dtype:

# Write curated metadata

In [18]:
# for study, header in metadata.items():
#     print(header.T['disease'].value_counts())

In [19]:
# import the normalized datasets
# remove the metadata header
# add the curated metadata header
# write to a new file (which is then transfered to the container)

def get_paths(study, path):
    paths = [os.path.join(path, fname) for fname in os.listdir(path) if study in fname]    
    return  paths

def add_header(df1, df2):
    final_df = pd.concat([df1, df2], ignore_index=True)
    return final_df

def write_csv(final_df, fname):
    final_df.to_csv(fname, index=False)

In [20]:
for study, header in metadata.items():
    header.reset_index(inplace=True)
    header.rename(columns={"index":"Unnamed: 0"}, inplace=True)
    data_files = get_paths(study, normalized_data)
    for fname in data_files:
        output_name = fname.replace("/normalized/", "/normalized/final/")
        output_name = output_name.replace("normalized_CRC", "CRC")
        data = import_data(fname, skiprows=209)
        final_data = add_header(header, data)
        write_csv(final_data, output_name)