In [5]:
import pandas as pd
import numpy as np


In [3]:
# Load the manifest CSV into `mani` and display its (rows, columns) shape
mani = pd.read_csv('manifests/manifest.csv')
mani.shape



(2506, 7)

In [4]:
# Show the row at position 5 to inspect the manifest's columns and example values
mani.iloc[5]

icgc_donor_id                                           DO1005
case_object_id            0b4573e7-6c69-5996-86de-463036d6589b
control_object_id         2b1b2587-1f4e-5918-872f-b87bac0c2429
case_file_name            b6ee98f353017df7d1354ae22fc9e3dc.bam
control_file_name         228a53b73d2c63eb66530b172cb65b41.bam
sex                                                     female
histology_abbreviation                          Breast-AdenoCA
Name: 5, dtype: object

In [10]:
# Load the data and split the genotype column into two numeric columns
def process_dataframe(path):
    """Read a line-delimited JSON file and expose its genotype as two numeric columns.

    Parameters
    ----------
    path : str
        Location of the newline-delimited JSON file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with 'genotype' normalised through ``add_slash`` and
        two extra columns, 'genotype1' and 'genotype2', holding the numeric
        values found on either side of the '/'. Values that cannot be parsed
        as numbers become NaN.
    """
    allele_cols = ['genotype1', 'genotype2']

    frame = pd.read_json(path, lines=True)

    # Normalise every genotype to the 'a/b' form before splitting
    normalised = frame['genotype'].astype('string').apply(add_slash)
    frame['genotype'] = normalised

    # One column per side of the slash
    halves = frame['genotype'].str.split('/', expand=True)
    frame[allele_cols] = halves

    # errors='coerce' turns anything non-numeric into NaN instead of raising
    frame[allele_cols] = frame[allele_cols].apply(pd.to_numeric, errors='coerce')

    return frame



# If there is no slash in the string, make it 'original/original'
def add_slash(string):
    """Return `string` in 'a/b' form.

    Missing values (anything ``pd.isna`` reports as NA) and strings that
    already contain a '/' are returned unchanged; any other value `x` is
    duplicated into 'x/x'.
    """
    if pd.isna(string) or '/' in string:
        return string
    return f'{string}/{string}'
    
    
# Load the data and subtract the genotype columns
def load_and_diff(sample, path_format='data/ndjson_examples/examples/{id}.ndjson'):
    """Build a two-row frame of control-minus-case genotype differences for one sample.

    Parameters
    ----------
    sample : mapping (e.g. a manifest row)
        Must provide 'control_object_id', 'case_object_id' and 'icgc_donor_id'.
    path_format : str, optional
        Template with an ``{id}`` placeholder used to locate the ndjson file
        of each object id. Defaults to the location the notebook has always
        used, so existing single-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Two rows labelled '0<donor id>' and '1<donor id>' holding
        control - case for 'genotype1' and 'genotype2' respectively. The
        first column is 'sample_id' (the donor id); the remaining columns are
        labelled with the control file's 'region' values.

    Notes
    -----
    The subtraction aligns the two files by row index, not by region.
    NOTE(review): this assumes the control and case files list the same
    regions in the same order -- confirm against the data.
    """
    # Use the helper function to process the dataframes
    dfn = process_dataframe(path_format.format(id=sample['control_object_id']))
    dft = process_dataframe(path_format.format(id=sample['case_object_id']))

    # Subtract genotype1 and genotype2 columns in both dataframes
    # (the leftover debug print of the whole control column was removed)
    diff1 = dfn['genotype1'].subtract(dft['genotype1'])
    diff2 = dfn['genotype2'].subtract(dft['genotype2'])

    # Create a new DataFrame from these Series: one row per genotype column
    donor_id = sample['icgc_donor_id']
    row_names = ['0' + donor_id, '1' + donor_id]
    diff_df = pd.DataFrame([diff1, diff2], index=row_names)
    diff_df.columns = dfn['region']
    diff_df.insert(0, 'sample_id', donor_id)

    return diff_df



In [37]:
# Load the manifest again as `manifest`; this is the copy that gets filtered and written back below
manifest = pd.read_csv('manifests/manifest.csv')

In [38]:
# (rows, columns) before any filtering
manifest.shape

(2506, 7)

In [39]:
# Read the list of object ids to exclude; the file has no header row, so the ids land in column 0
# NOTE(review): presumably these are objects whose data files were empty -- confirm
empties = pd.read_csv('data/other/empties.txt', header=None)

In [51]:
# Build boolean masks flagging manifest rows whose case / control object id
# appears in the empties list (nothing is removed in this cell)
case = manifest['case_object_id'].isin(empties[0])
control = manifest['control_object_id'].isin(empties[0])

In [53]:
# Keep only the rows where neither the case nor the control is in the empties list
manifest = manifest[~case & ~control]

In [54]:
# (rows, columns) after dropping the flagged rows
manifest.shape

(2487, 7)

In [55]:
# Write the filtered manifest back to disk.
# NOTE: this overwrites the same file the notebook loads above, so on a later
# run the load cells will read the already-filtered manifest.
manifest.to_csv('manifests/manifest.csv', index=False)

In [None]:

# Group the manifest by cancer type. The original 'column_name' was a
# placeholder that does not exist in the manifest and raised a KeyError;
# 'histology_abbreviation' is the manifest's cancer-type column.
# NOTE(review): this groups `mani` (loaded at the top of the notebook), not the
# filtered `manifest` -- confirm which one the per-cancer files should use.
groups = mani.groupby('histology_abbreviation')

# Iterate over the groups and save each one as its own CSV, named after the
# group key. The 'manifests/by_cancer/' directory must already exist.
for name, group in groups:
    group.to_csv(f'manifests/by_cancer/{name}.csv', index=False)
