In [1]:
import pandas as pd
import numpy as np
import dask as dd

In [2]:
# Load the manifest
mani = pd.read_csv('manifests/sample_manifest.csv')
mani.shape



(20, 7)

In [3]:
examdf = pd.read_json('data/ndjson_examples/exmaples/0fac0f32-9377-5739-901d-94baa2d17f0b.ndjson', lines=True)
examdf.head()

Unnamed: 0,sample,genotype,motif,region
0,0fac0f32-9377-5739-901d-94baa2d17f0b,10,GCC,X:147582151-147582211
1,0fac0f32-9377-5739-901d-94baa2d17f0b,21,GCA,X:66765158-66765227
2,0fac0f32-9377-5739-901d-94baa2d17f0b,19/20,CAG,12:7045879-7045936
3,0fac0f32-9377-5739-901d-94baa2d17f0b,29/31,TGC,6:16327864-16327954
4,0fac0f32-9377-5739-901d-94baa2d17f0b,14/14,ATTCT,22:46191234-46191304


In [12]:
# Load the data and split the genotype column into two numeric columns
def process_dataframe(path):
    # Load the data
    df = pd.read_json(path, lines=True)

    # Apply add_slash to the 'genotype' column
    df['genotype'] = df['genotype'].astype('string').apply(add_slash)

    # Split the 'genotype' column into two
    df[['genotype1', 'genotype2']] = df['genotype'].str.split('/', expand=True)

    # Convert genotypes to numeric, errors='coerce' will convert invalid parsing to NaN
    df[['genotype1', 'genotype2']] = df[['genotype1', 'genotype2']].apply(pd.to_numeric, errors='coerce')

    return df



# if there is no slash in the string, make it 'orginal/original'
def add_slash(string):
    if pd.isna(string):
        return string
    elif '/' not in string:
        return '{}/{}'.format(string, string)
    else:
        return string
    
    
# Load the data and subtract the genotype columns
def load_and_diff(sample):
    path_format = 'data/ndjson_examples/exmaples/{id}.ndjson'
    # Use the helper function to process the dataframes
    dfn = process_dataframe(path_format.format(id=sample['control_object_id']))
    dft = process_dataframe(path_format.format(id=sample['case_object_id']))

    # Subtract genotype1 and genotype2 columns in both dataframes
    diff1 = dfn['genotype1'].subtract(dft['genotype1'], fill_value=0)
    diff2 = dfn['genotype2'].subtract(dft['genotype2'], fill_value=0)

    # Create a new DataFrame from these Series
    diff_df = pd.DataFrame([diff1, diff2])

    return diff_df



In [13]:
test = mani.iloc[1]
print(test)
dfn = pd.read_json('data/ndjson_examples/exmaples/{id}.ndjson'.format(id = test['control_object_id']), lines=True)
dft = pd.read_json('data/ndjson_examples/exmaples/{id}.ndjson'.format(id = test['case_object_id']), lines=True)


icgc_donor_id                                          DO46827
case_object_id            d1dbd41b-c82d-5c9d-b404-07f97352e6f1
control_object_id         b4f8d104-4de4-5ceb-aee0-08f599351f00
case_file_name            72fa6dda655cffe29e8ae26357ec3069.bam
control_file_name         6e7e2e09c1b59afdda8266a5d2706de2.bam
sex                                                       male
histology_abbreviation                              Kidney-RCC
Name: 1, dtype: object


In [14]:
dfn.isnull().sum()

sample      0
genotype    0
motif       0
region      0
dtype: int64

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,170856,170857,170858,170859,170860,170861,170862,170863,170864,170865
genotype1,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
genotype2,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
difftest

Unnamed: 0,sample,genotype,motif,region
0,b4f8d104-4de4-5ceb-aee0-08f599351f00,11/11,GCC,X:147582151-147582211
1,b4f8d104-4de4-5ceb-aee0-08f599351f00,19/19,GCA,X:66765158-66765227
2,b4f8d104-4de4-5ceb-aee0-08f599351f00,20/22,CAG,12:7045879-7045936
3,b4f8d104-4de4-5ceb-aee0-08f599351f00,30/32,TGC,6:16327864-16327954
4,b4f8d104-4de4-5ceb-aee0-08f599351f00,13/14,ATTCT,22:46191234-46191304
...,...,...,...,...
170861,b4f8d104-4de4-5ceb-aee0-08f599351f00,12/12,AC,X:99177272-99177300
170862,b4f8d104-4de4-5ceb-aee0-08f599351f00,12/12,AC,X:99179509-99179533
170863,b4f8d104-4de4-5ceb-aee0-08f599351f00,8/8,GATA,X:99183991-99184019
170864,b4f8d104-4de4-5ceb-aee0-08f599351f00,16/16,TG,X:99224659-99224703
