In [27]:
%cd ~/github/liz.9.11.19_GMVLE/

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


# Data directories, relative to the project root. Each value ends with "/"
# because later cells build file paths by plain string concatenation.
_DATA_ROOT = 'data/'
PATH_RAW = _DATA_ROOT + 'raw/'
PATH_INTERIM = _DATA_ROOT + 'interim/'
PATH_PROCESSED = _DATA_ROOT + 'processed/'

/Users/maartenbiolizard/GitHub/liz.9.11.19_GMVLE


In [7]:
def read_vcf(file: str) -> pd.DataFrame:
    """Load a VCF file into a DataFrame.

    The leading lines that start with ``##`` are counted and skipped. The
    rest of the file is parsed by ``pd.read_csv`` as tab-separated text, so
    the first remaining line becomes the column header. The ``#CHROM``
    column is renamed to ``CHROM``.

    Args:
        file: Path to the VCF file.

    Returns:
        DataFrame with one row per non-header line of the file.
    """
    num_header = 0
    with open(file) as f:
        # Iterate the handle lazily so that only the header lines are read
        # here; readlines() would pull the whole file into memory first.
        for line in f:
            if not line.startswith("##"):
                break
            num_header += 1

    vcf = pd.read_csv(file, sep="\t", skiprows=num_header)
    return vcf.rename(columns={"#CHROM": "CHROM"})

In [24]:
# Convert each caller's raw VCF into an interim CSV. Callers whose CSV is
# already on disk are skipped, so re-running this cell is cheap.
data_name = "HG003_NA24149_Ashkenazim_father.trim."
callers = ["dv", "fb", "oc", "st"]

for caller in callers:
    interim_csv = PATH_INTERIM + caller + ".csv"

    if os.path.exists(interim_csv):
        print(f"{caller}.csv already exists")
        continue

    file_name = PATH_RAW + data_name + caller + ".vcf"
    data = read_vcf(file_name)
    data.to_csv(interim_csv, index=False)
    print(f"Saved {caller}.csv")

dv.csv already exists
fb.csv already exists
oc.csv already exists
st.csv already exists


In [29]:
def data_expl(data: pd.DataFrame, caller: str):
    """Show a quick exploratory summary of one caller's variant table.

    Displays the number of rows per ``CHROM`` value, then plots a 100-bin
    histogram of ``QUAL`` in which missing values are counted as 0.

    Args:
        data: Table with at least ``CHROM`` and ``QUAL`` columns.
        caller: Label used in the plot title.
    """
    chrom_counts = data["CHROM"].value_counts()
    display(chrom_counts)

    # Missing QUAL values are plotted as 0 rather than being dropped.
    qual_filled = data["QUAL"].fillna(0)
    plt.hist(qual_filled, bins=100)
    plt.title(f"QUAL {caller}")
    plt.show()

def get_values_from_format(data: pd.DataFrame, format_column:str = "FORMAT", value_column:str = "HG003_NA24149_Ashkenazim_father.trim") -> pd.DataFrame:
    """Expand a ":"-separated sample column into one column per FORMAT key.

    ``format_column`` holds the ":"-separated key names for each row and
    ``value_column`` holds the matching ":"-separated values. Both columns
    are removed from the result and replaced by one column per key. All
    expanded values are kept as strings.

    Two strategies are used:

    * If ``format_column`` has exactly one distinct value, the value column is
      split once for the whole frame and the pieces are named after that
      single format.
    * Otherwise each row is paired with its own format. Keys that do not
      occur in a row's format are left missing for that row.

    Args:
        data: Input table containing ``format_column`` and ``value_column``.
        format_column: Name of the column holding the ":"-separated keys.
        value_column: Name of the column holding the ":"-separated values.

    Returns:
        A new DataFrame with the two source columns replaced by the expanded
        per-key columns.

    Raises:
        ValueError: In the per-row strategy, if a row has a different number
            of values than format keys.
    """
    if data[format_column].nunique() == 1:
        print("FORMAT column is unique")

        value_names = data[format_column].iloc[0].split(":")

        # One vectorised split for the whole column; pieces become columns.
        values = data[value_column].str.split(":", expand=True)
        values.columns = value_names

        remaining = data.drop(columns=[value_column, format_column])
        return pd.concat([remaining, values], axis=1).copy()

    print("FORMAT column is not unique")

    # Map each row label to a {key: value} dict built from that row's own
    # format. Use the caller-supplied format_column here, not a hard-coded
    # "FORMAT", so custom column names work in this branch too.
    format_dict = {}
    for index, row_format, row_values in zip(data.index, data[format_column], data[value_column]):
        cols = row_format.split(":")
        values = row_values.split(":")

        if len(values) != len(cols):
            # Fail loudly: stopping early here would make the index merge
            # below silently drop this row and every row after it.
            raise ValueError(
                f"Row {index!r}: {len(values)} value(s) in {value_column!r} "
                f"but {len(cols)} key(s) in {format_column!r}"
            )

        format_dict[index] = dict(zip(cols, values))

    # Transpose so that rows are the original labels and columns are the keys,
    # then join the expanded columns back on the index.
    remaining = data.drop(columns=[value_column, format_column])
    expanded = pd.DataFrame(format_dict).T
    return remaining.merge(expanded, left_index=True, right_index=True).copy()

In [28]:
# Column types used when reading the interim CSVs: everything is text except
# the numeric POS and QUAL columns.
dtype = {
    column: str
    for column in (
        "CHROM",
        "POS",
        "ID",
        "REF",
        "ALT",
        "QUAL",
        "FILTER",
        "INFO",
        "FORMAT",
        "HG003_NA24149_Ashkenazim_father.trim",
    )
}
dtype["POS"] = int
dtype["QUAL"] = float

cols_to_keep = ["CHROM", "POS", "REF", "ALT", "QUAL", "FILTER", "FORMAT", "HG003_NA24149_Ashkenazim_father.trim"]

# Expand the FORMAT/sample columns for each caller and save the result.
# Callers that already have a processed CSV are skipped.
for caller in callers:
    processed_csv = PATH_PROCESSED + caller + ".csv"

    if os.path.exists(processed_csv):
        print(f"{caller}.csv already processed")
        continue

    print(f"Processing {caller}.csv")
    # "." is read as missing.
    data = pd.read_csv(PATH_INTERIM + caller + ".csv", dtype=dtype, na_values=".")
    data = data[cols_to_keep]
    data_new = get_values_from_format(data)
    data_new.to_csv(processed_csv, index=False)
    display(data_new.head())



Processing dv.csv
FORMAT column is unique


Unnamed: 0,CHROM,POS,REF,ALT,QUAL,FILTER,FORMAT,GT,GQ,DP,AD,VAF,PL
0,1,10120,T,C,0.0,RefCall,GT:GQ:DP:AD:VAF:PL,0/0,28,127,10316,0.125984,3330
1,1,10126,T,C,0.0,RefCall,GT:GQ:DP:AD:VAF:PL,0/0,29,116,9215,0.12931,3131
2,1,10132,T,C,0.0,RefCall,GT:GQ:DP:AD:VAF:PL,0/0,25,102,8315,0.147059,3026
3,1,10138,T,C,0.0,RefCall,GT:GQ:DP:AD:VAF:PL,0/0,26,92,6918,0.195652,3127
4,1,10146,AC,A,0.6,RefCall,GT:GQ:DP:AD:VAF:PL,./.,9,77,6211,0.142857,1210


Processing fb.csv
FORMAT column is unique


Unnamed: 0,CHROM,POS,REF,ALT,QUAL,FILTER,FORMAT,GT,DP,RO,QR,AO,QA,GL
0,1,10230,ACC,AC,5.88242e-14,,GT:DP:RO:QR:AO:QA:GL,0/1,187,109,3276,51,1621,"-7.08476,0,-84.5075"
1,1,10247,T,C,0.0,,GT:DP:RO:QR:AO:QA:GL,0/0,150,114,2909,33,695,"0,-4.1921,-79.028"
2,1,10327,T,C,3.0919e-14,,GT:DP:RO:QR:AO:QA:GL,0/0,198,135,3103,53,1151,"0,-5.78885,-88.8844"
3,1,10352,TAC,TAAC,1.1171e-07,,GT:DP:RO:QR:AO:QA:GL,0/1,119,44,1260,39,526,"-10.2012,0,-30.3596"
4,1,10611,C,G,406.172,,GT:DP:RO:QR:AO:QA:GL,1/1,23,3,91,20,699,"-50.0305,-1.8951,0"


Processing oc.csv
FORMAT column is unique


Unnamed: 0,CHROM,POS,REF,ALT,QUAL,FILTER,FORMAT,GT,GQ,DP,MQ,PS,PQ,FT
0,1,10611,C,G,310.27,PASS,GT:GQ:DP:MQ:PS:PQ:FT,1|1,113,37,40,10611,100,PASS
1,1,10623,T,C,363.58,PASS,GT:GQ:DP:MQ:PS:PQ:FT,1|1,120,39,40,10611,100,PASS
2,1,10629,G,A,92.38,AFB,GT:GQ:DP:MQ:PS:PQ:FT,1|0,92,39,40,10611,100,AFB
3,1,10815,T,TC,629.45,LBQ,GT:GQ:DP:MQ:PS:PQ:FT,0|1,227,49,46,10611,100,LBQ
4,1,10816,C,CCA,227.6,PASS,GT:GQ:DP:MQ:PS:PQ:FT,1|0,227,49,46,10611,100,PASS


Processing st.csv
FORMAT column is not unique


Unnamed: 0,CHROM,POS,REF,ALT,QUAL,FILTER,FORMAT,GT,GQ,GQX,DP,DPF,AD,ADF,ADR,SB,FT,PL,DPI,PS
0,1,10103,T,A,0.0,LowGQX;NoPassedVariantGTs,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,0/1,14,0,12.0,68.0,102,62,40,0.0,LowGQX,160149,,
1,1,10105,A,C,0.0,LowGQX;NoPassedVariantGTs,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,0/1,21,0,16.0,65.0,142,121,21,0.0,LowGQX,220164,,
2,1,10174,C,T,0.0,LowGQX;NoPassedVariantGTs,GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL,0/1,7,0,7.0,93.0,61,31,30,0.0,LowGQX,80106,,
3,1,10177,A,AC,472.0,PASS,GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL,0/1,144,6,,,1326,911,415,,PASS,5140141,102.0,
4,1,10230,AC,A,140.0,PASS,GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL,0/1,179,3,,,3816,189,207,,PASS,1760554,142.0,
