In [29]:
import json
import os

import boto3
import botocore
import numpy as np
import pandas as pd

In [36]:
from glob import glob

In [30]:
# S3 client shared by the notebook; the download loop further down calls
# s3.download_file on it.
s3 = boto3.client('s3')

In [None]:
# Read the three raw manifests, normalising column names as each one is loaded
# so that all frames share `icgc_donor_id`, `type` and `object_id`.
awsmani = pd.read_csv('manifests/aws.tsv', sep='\t')
pdcmani = (
    pd.read_csv('manifests/pdc.tsv', sep='\t')
    .rename(columns={'guid': 'object_id'})
)
fullmani = (
    pd.read_csv('manifests/full.csv')
    .rename(columns={
        'donor_id/donor_count': 'icgc_donor_id',
        'Case (Tumor) or Control (Normal)': 'type',
    })
)


In [None]:
# Flag every row of the full manifest whose (donor, sample type) pair occurs
# more than once; keep=False marks all occurrences, not just the repeats.
duplicates = fullmani.duplicated(subset=['icgc_donor_id', 'type'], keep=False)
dup_rows = fullmani.loc[duplicates]

# Summarise the flagged rows per histology and per sample type, then the total.
for grouping_column in ('histology_abbreviation', 'type'):
    print(dup_rows.groupby(grouping_column).size())
print(duplicates.sum())

# Exclude every donor that has at least one flagged row from both manifests.
awsmani_no_dupes = awsmani.loc[~awsmani['icgc_donor_id'].isin(dup_rows['icgc_donor_id'])]
pdcmani_no_dupes = pdcmani.loc[~pdcmani['icgc_donor_id'].isin(dup_rows['icgc_donor_id'])]


In [None]:

def _pivot_manifest(manifest):
    """Turn a long manifest (one row per donor and sample type) into one row per donor.

    The result has `<type>_object_id` and `<type>_file_name` columns for each
    sample type, a single `sex` column taken from the `case` sample, and
    `icgc_donor_id` as a regular column.
    """
    wide = manifest.pivot(index='icgc_donor_id', columns='type', values=['object_id', 'file_name', 'sex'])
    # Flatten the (field, sample type) column MultiIndex into `<type>_<field>` names.
    wide.columns = [f'{sample_type}_{field}' for field, sample_type in wide.columns]
    # Sex is a donor-level attribute: keep the case sample's value, drop the per-type copies.
    wide['sex'] = wide['case_sex']
    return wide.drop(columns=['case_sex', 'control_sex']).reset_index()


# pivot the dataframes
aws_df = _pivot_manifest(awsmani_no_dupes)
pdc_df = _pivot_manifest(pdcmani_no_dupes)

# fullmani holds one row per donor *and* sample type, so merging on the donor id
# alone would repeat every pivoted row once per sample type. Collapse the lookup
# to distinct (donor, histology) pairs before merging.
donor_histology = fullmani[['icgc_donor_id', 'histology_abbreviation']].drop_duplicates()

pdc_final_mani = pd.merge(pdc_df, donor_histology, on='icgc_donor_id', how='left')
aws_final_mani = pd.merge(aws_df, donor_histology, on='icgc_donor_id', how='left')


In [None]:
def transform_data(data, full_data, name):
    """Reshape a long per-sample manifest into one row per donor and attach histology.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format manifest with the columns ``icgc_donor_id``, ``type``,
        ``object_id``, ``file_name`` and ``sex``. The ``type`` column is
        expected to hold ``case`` and ``control``, because the ``case_sex`` and
        ``control_sex`` columns produced by the pivot are read and dropped.
        Each (donor, type) pair must be unique or ``DataFrame.pivot`` raises.
    full_data : pandas.DataFrame
        Frame providing ``icgc_donor_id`` and ``histology_abbreviation``. It may
        contain several rows per donor; exact repeats of a
        (donor, histology) pair are collapsed before merging.
    name : str
        Label of the manifest being processed. Currently unused; kept so that
        existing callers do not break.

    Returns
    -------
    pandas.DataFrame
        One row per donor with ``icgc_donor_id``, ``<type>_object_id``,
        ``<type>_file_name``, ``sex`` (taken from the case sample) and
        ``histology_abbreviation``.
    """
    # pivot the dataframe
    wide = data.pivot(index='icgc_donor_id', columns='type', values=['object_id', 'file_name', 'sex'])
    # Flatten the (field, sample type) column MultiIndex into `<type>_<field>` names.
    wide.columns = [f'{sample_type}_{field}' for field, sample_type in wide.columns]
    wide['sex'] = wide['case_sex']
    wide = wide.drop(columns=['case_sex', 'control_sex']).reset_index()

    # merge with full_data
    # full_data can list a donor once per sample type; without de-duplicating
    # the lookup, the left merge would repeat every donor row.
    donor_histology = full_data[['icgc_donor_id', 'histology_abbreviation']].drop_duplicates()
    final_df = pd.merge(wide, donor_histology, on='icgc_donor_id', how='left')
    return final_df

# Function-based variant of the manifest pipeline: load, flag duplicated
# (donor, sample type) rows, filter them out, then pivot and merge.

# load data
# NOTE(review): load_data is not defined in the visible cells — confirm it is
# defined before this cell runs. The code below expects full_data to already
# have an 'icgc_donor_id' column while the sample-type column still carries its
# raw name; verify that load_data renames exactly that.
aws_data, pdc_data, full_data = load_data('manifests/aws.tsv', 'manifests/pdc.tsv', 'manifests/full.csv')

# check for duplicates
# keep=False marks every occurrence of a repeated (donor, sample type) pair.
duplicates = full_data.duplicated(subset=['icgc_donor_id', 'Case (Tumor) or Control (Normal)'], keep=False)

# print info about duplicates
# Counts of flagged rows per histology and per sample type, then the total.
dup_rows = full_data[duplicates]
print(dup_rows.groupby('histology_abbreviation').size())
print(dup_rows.groupby('Case (Tumor) or Control (Normal)').size())
print(duplicates.sum())

# remove duplicates
# NOTE(review): remove_duplicates is not defined in the visible cells;
# presumably it drops the donors present in dup_rows — confirm.
aws_data_no_dupes = remove_duplicates(aws_data, dup_rows)
pdc_data_no_dupes = remove_duplicates(pdc_data, dup_rows)

# rename columns
# The PDC manifest names its identifier 'guid'; transform_data pivots on 'object_id'.
# NOTE(review): this renames in place on whatever remove_duplicates returned —
# if that can be pdc_data itself, pdc_data is modified too; verify.
pdc_data_no_dupes.rename(columns={'guid': 'object_id'}, inplace=True)

# transform and merge data
aws_final_data = transform_data(aws_data_no_dupes, full_data, 'aws')
pdc_final_data = transform_data(pdc_data_no_dupes, full_data, 'pdc')


In [100]:
# Preview the first rows of the final manifest.
# NOTE(review): df_final is not assigned in any visible cell — confirm where it
# is created so this cell survives Restart & Run All.
df_final.head()

Unnamed: 0,icgc_donor_id,case_object_id,control_object_id,case_file_name,control_file_name,sex,histology_abbreviation
0,DO10172,ec5c919a-188e-428e-836d-58b2564eae28,9d6494f8-6967-4543-82b5-9fb0931816de,PCAWG.402285b1-01fb-4ae6-8cdc-aab1d479f31b.bam,PCAWG.54ce9151-3bee-4305-a954-d233c7ddff9a.bam,male,ColoRect-AdenoCA
1,DO10172,ec5c919a-188e-428e-836d-58b2564eae28,9d6494f8-6967-4543-82b5-9fb0931816de,PCAWG.402285b1-01fb-4ae6-8cdc-aab1d479f31b.bam,PCAWG.54ce9151-3bee-4305-a954-d233c7ddff9a.bam,male,ColoRect-AdenoCA
2,DO10486,017d09fe-0e6b-4136-afeb-02c1d396ccff,298be5c5-0e30-430b-a221-8a0c8ff4695e,PCAWG.b687ab8e-8210-469c-a914-2299c00443ef.bam,PCAWG.9fc66d8c-133d-430f-8a2e-2d1a11e7ef8d.bam,female,ColoRect-AdenoCA
3,DO10486,017d09fe-0e6b-4136-afeb-02c1d396ccff,298be5c5-0e30-430b-a221-8a0c8ff4695e,PCAWG.b687ab8e-8210-469c-a914-2299c00443ef.bam,PCAWG.9fc66d8c-133d-430f-8a2e-2d1a11e7ef8d.bam,female,ColoRect-AdenoCA
4,DO10631,993c9300-c693-485e-a8fb-0181cd8c3bfc,ac2471f9-9d0f-44be-a53e-f01144e69a0a,PCAWG.b2f57308-2c3c-41a0-b866-c3cb60a82b1e.bam,PCAWG.c95c1fe9-c156-4c41-ac14-f07214e5e613.bam,female,ColoRect-AdenoCA


In [19]:
# S3 bucket passed to s3.download_file by the download loop.
bucket_name = 'icgc-eh-bucket'
# Key template for a result file; the download loop fills {object_id} via
# str.format with each manifest object id.
object_key = "results/aws/{object_id}.json"


In [28]:
# Template for the local destination of each downloaded result file.
# Kept under its own name so that `local_file_path` always holds a concrete path.
local_path_template = 'exams/{object_id}.json'

# The target directory must exist before downloading; create it on a fresh
# checkout so the download does not fail on a missing folder.
os.makedirs(os.path.dirname(local_path_template), exist_ok=True)

# Download the result JSON for the first five object ids of the AWS manifest.
for obj_id in awsmani['object_id'].head(5):
    local_file_path = local_path_template.format(object_id=obj_id)
    print(local_file_path)
    try:
        s3.download_file(bucket_name, object_key.format(object_id=obj_id), local_file_path)
    except botocore.exceptions.ClientError as e:
        # A missing object is reported and skipped; any other client error is re-raised.
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise
    else:
        print("File downloaded successfully!")

exams/ccf30f0f-0ec1-5aa2-8eb3-14cb1f4f58cd.json
File downloaded successfully!
exams/0b894f0a-0f97-5b6b-8585-9b15e49013b6.json
File downloaded successfully!
exams/124c8c1f-70a9-5335-a096-75f32cd76d75.json
File downloaded successfully!
exams/fdd88637-7046-5dd0-829e-42f3b3f4ddf9.json
File downloaded successfully!
exams/566b1bcc-97fd-59ca-9c8f-e98f5dea4152.json
File downloaded successfully!


AFF2       {'AlleleCount': 2, 'Coverage': 47.473684210526...
AR         {'AlleleCount': 2, 'Coverage': 48.684210526315...
ATN1       {'AlleleCount': 2, 'Coverage': 66.0, 'Fragment...
ATXN1      {'AlleleCount': 2, 'Coverage': 57.052631578947...
ATXN10     {'AlleleCount': 2, 'Coverage': 61.315789473684...
ATXN2      {'AlleleCount': 2, 'Coverage': 55.684210526315...
ATXN3      {'AlleleCount': 2, 'Coverage': 65.894736842105...
ATXN7      {'AlleleCount': 2, 'Coverage': 54.368421052631...
ATXN8OS    {'AlleleCount': 2, 'Coverage': 59.105263157894...
C9ORF72    {'AlleleCount': 2, 'Coverage': 41.684210526315...
Name: LocusResults, dtype: object