In [6]:
import pandas as pd

In [7]:
# Load the raw manifests and normalise their column names while reading.
awsmani = pd.read_csv('raw_manifests/aws.tsv', sep='\t')

# Rename 'guid' to 'object_id', the column name transform_data pivots on.
pdcmani = pd.read_csv('raw_manifests/pdc.tsv', sep='\t').rename(
    columns={'guid': 'object_id'}
)

# Replace the verbose headers of the full manifest with short names.
fullmani = pd.read_csv('raw_manifests/full.csv').rename(
    columns={
        'donor_id/donor_count': 'icgc_donor_id',
        'Case (Tumor) or Control (Normal)': 'type',
    }
)



In [8]:
# Quick sanity check: (rows, columns) of the PDC manifest after loading.
pdcmani.shape

(1602, 5)

In [10]:

# Checking for unpaired donor_ids, i.e. donors that occur on exactly one row
# of the full manifest.
# NOTE: despite its name, `unpaired_donors` is True for donors that occur MORE
# than once (keep=False marks every repeated row); the unpaired rows are
# obtained by negating it.
unpaired_donors = fullmani['icgc_donor_id'].duplicated(keep=False)
unpaireds = fullmani.loc[~unpaired_donors]

# Print how many unpaired rows there are per histology and per sample type.
for summary_column in ('histology_abbreviation', 'type'):
    print(unpaireds.groupby(summary_column).size())

# Keep a copy of the unpaired rows for inspection.
unpaireds.to_csv('data/unpaireds.csv', index=False)


histology_abbreviation
Biliary-AdenoCA    1
Bladder-TCC        1
Bone-Leiomyo       1
Breast-AdenoCA     1
Liver-HCC          1
Lymph-NOS          1
Panc-AdenoCA       2
Panc-Endocrine     1
Prost-AdenoCA      1
Skin-Melanoma      1
Stomach-AdenoCA    2
dtype: int64
type
case        3
control    10
dtype: int64


In [11]:

# Checking for non-unique donor/type pairs: rows whose (icgc_donor_id, type)
# combination appears more than once in the full manifest. keep=False flags
# every row of such a group, not only the repeats.
duplicates = fullmani.duplicated(subset=['icgc_donor_id', 'type'], keep=False)
dup_rows = fullmani.loc[duplicates]

# Print how many such rows there are per histology and per sample type.
for summary_column in ('histology_abbreviation', 'type'):
    print(dup_rows.groupby(summary_column).size())

# Keep a copy of the offending rows for inspection.
dup_rows.to_csv('data/dup_rows.csv', index=False)


histology_abbreviation
Liver-HCC        20
Myeloid-AML      14
Myeloid-MDS       2
Myeloid-MPN      45
Panc-AdenoCA      4
Prost-AdenoCA    97
dtype: int64
type
case    182
dtype: int64


In [33]:

# Combining the case-control pairs and adding the histology info
def transform_data(data, full_data):
    """Reshape a long manifest into one row per donor and attach histology.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format manifest with the columns ``icgc_donor_id``, ``type``,
        ``object_id``, ``file_name`` and ``sex``. Each (donor, type) pair must
        be unique (``DataFrame.pivot`` raises ``ValueError`` otherwise) and
        ``type`` must contain both ``'case'`` and ``'control'`` values, since
        the ``case_sex`` / ``control_sex`` columns are referenced by name.
    full_data : pandas.DataFrame
        Table with at least ``icgc_donor_id`` and ``histology_abbreviation``.
        It may hold several rows per donor; only the first row of each donor
        is used for the histology lookup.

    Returns
    -------
    pandas.DataFrame
        One row per donor with ``icgc_donor_id``, the ``case_*`` / ``control_*``
        variants of ``object_id`` and ``file_name``, a single ``sex`` column
        (taken from the case sample) and ``histology_abbreviation``.
    """
    # Pivot to wide form: one column per (field, sample type) combination.
    wide = data.pivot(index='icgc_donor_id', columns='type', values=['object_id', 'file_name', 'sex'])
    # Flatten the (field, sample_type) MultiIndex into '<sample_type>_<field>'.
    wide.columns = [f'{sample_type}_{field}' for field, sample_type in wide.columns]
    # Keep a single sex column (from the case sample) instead of one per type.
    wide = (
        wide
        .assign(sex=wide['case_sex'])
        .drop(columns=['case_sex', 'control_sex'])
        .reset_index()
    )
    # Reduce the lookup to one row per donor before merging; otherwise a
    # full_data with several rows per donor would duplicate the output rows.
    histology = (
        full_data[['icgc_donor_id', 'histology_abbreviation']]
        .drop_duplicates(subset='icgc_donor_id')
    )
    return pd.merge(wide, histology, on='icgc_donor_id', how='left')
    


In [34]:
# Removing non-unique donor/type pairs and unpaireds: collect every donor id
# flagged by either check, then filter each manifest once against that list.
excluded_donor_ids = pd.concat([dup_rows['icgc_donor_id'], unpaireds['icgc_donor_id']])

awsmani_no_dupes = awsmani.loc[~awsmani['icgc_donor_id'].isin(excluded_donor_ids)]
pdcmani_no_dupes = pdcmani.loc[~pdcmani['icgc_donor_id'].isin(excluded_donor_ids)]


In [35]:
# Collapse the full manifest to one row per donor (the first occurrence is
# kept) so that the histology merge in transform_data matches one row per donor.
# NOTE: this rebinds `fullmani`. Re-running the unpaired / duplicate checks
# above after this cell would see every donor exactly once and therefore
# report every donor as unpaired.
fullmani = fullmani.drop_duplicates(subset='icgc_donor_id')

In [36]:

# Build the per-donor (wide) manifest for each storage location.
awsmani_final = transform_data(data=awsmani_no_dupes, full_data=fullmani)
pdcmani_final = transform_data(data=pdcmani_no_dupes, full_data=fullmani)


In [42]:
# Saving modified manifests as tab-separated files without the index column.
for manifest, destination in (
    (awsmani_final, 'manifests/aws.tsv'),
    (pdcmani_final, 'manifests/pdc.tsv'),
):
    manifest.to_csv(destination, sep='\t', index=False)