In [196]:
import pandas as pd
import numpy as np


In [197]:
# Load the manifest
mani = pd.read_csv('manifests/sample_manifest.csv')
mani.shape



(20, 7)

In [223]:
# Load the data and split the genotype column into two numeric columns
def process_dataframe(path):
    # Load the data
    df = pd.read_json(path, lines=True)

    # Apply add_slash to the 'genotype' column
    df['genotype'] = df['genotype'].astype('string').apply(add_slash)

    # Split the 'genotype' column into two
    df[['genotype1', 'genotype2']] = df['genotype'].str.split('/', expand=True)

    # Convert genotypes to numeric, errors='coerce' will convert invalid parsing to NaN
    df[['genotype1', 'genotype2']] = df[['genotype1', 'genotype2']].apply(pd.to_numeric, errors='coerce')

    return df



# if there is no slash in the string, make it 'orginal/original'
def add_slash(string):
    if pd.isna(string):
        return string
    elif '/' not in string:
        return '{}/{}'.format(string, string)
    else:
        return string
    
    
# Load the data and subtract the genotype columns
def load_and_diff(sample):
    path_format = 'data/ndjson_examples/examples/{id}.ndjson'
    # Use the helper function to process the dataframes
    dfn = process_dataframe(path_format.format(id=sample['control_object_id']))
    dft = process_dataframe(path_format.format(id=sample['case_object_id']))

    # Subtract genotype1 and genotype2 columns in both dataframes
    print(dfn['genotype1'])
    diff1 = dfn['genotype1'].subtract(dft['genotype1'])
    diff2 = dfn['genotype2'].subtract(dft['genotype2'])

    # Create a new DataFrame from these Series
    row_names = ['0' + sample['icgc_donor_id'], '1' + sample['icgc_donor_id']]
    diff_df = pd.DataFrame([diff1, diff2], index = row_names)
    diff_df.columns = dfn['region']
    diff_df.insert(0, 'sample_id', sample['icgc_donor_id'])

    return diff_df



In [224]:
test = mani.iloc[0]
test_diff = load_and_diff(test)

0          7.0
1         24.0
2         12.0
3         30.0
4         11.0
          ... 
170861    14.0
170862    14.0
170863     6.0
170864    27.0
170865     3.0
Name: genotype1, Length: 170866, dtype: float64
0         2.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
170861    0.0
170862    0.0
170863    0.0
170864    0.0
170865    0.0
Name: genotype1, Length: 170866, dtype: float64
0DO46826   -2396.0
1DO46826   -6152.0
dtype: float64


In [225]:
test_diff.head()

region,sample_id,X:147582151-147582211,X:66765158-66765227,12:7045879-7045936,6:16327864-16327954,22:46191234-46191304,12:112036753-112036822,14:92537353-92537386,3:63898360-63898390,3:63898390-63898402,...,X:99103539-99103567,X:9954635-9954655,X:99142826-99142859,X:99147915-99147955,X:99176749-99176771,X:99177272-99177300,X:99179509-99179533,X:99183991-99184019,X:99224659-99224703,X:99234955-99234975
0DO46826,DO46826,2.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1DO46826,DO46826,2.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [230]:
out = pd.read_csv('EHPoutput.csv', index_col=0)
out.head(6)

Unnamed: 0,sample_id,X:147582151-147582211,X:66765158-66765227,12:7045879-7045936,6:16327864-16327954,22:46191234-46191304,12:112036753-112036822,14:92537353-92537386,3:63898360-63898390,3:63898390-63898402,...,X:99103539-99103567,X:9954635-9954655,X:99142826-99142859,X:99147915-99147955,X:99176749-99176771,X:99177272-99177300,X:99179509-99179533,X:99183991-99184019,X:99224659-99224703,X:99234955-99234975
0DO46826,DO46826,2.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1DO46826,DO46826,2.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0DO46827,DO46827,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
1DO46827,DO46827,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
0DO46830,DO46830,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1DO46830,DO46830,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
