In [6]:
%cd ~/github/liz.9.11.19_GMVLE/

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from utils.utils import read_vcf


# Data directories, relative to the working directory set by the %cd at the top of this cell.
# Each path keeps its trailing slash because later cells build file names by plain
# string concatenation (e.g. PATH_INTERIM + caller + ".csv").
PATH_RAW = 'data/raw/'  # input .vcf files, one per variant caller
PATH_INTERIM = 'data/interim/'  # CSV dumps of the parsed VCFs
PATH_PROCESSED = 'data/processed/'  # CSVs with the FORMAT values expanded and columns selected

/Users/maartenbiolizard/GitHub/liz.9.11.19_GMVLE


In [3]:
data_name = "HG003_NA24149_Ashkenazim_father.trim."
callers = ["dv", "fb", "oc", "st"]

# Parse each caller's raw VCF once and cache it as an interim CSV;
# callers whose CSV is already on disk are skipped.
for caller in callers:
    interim_csv = PATH_INTERIM + caller + ".csv"

    if os.path.exists(interim_csv):
        print(f"{caller}.csv already exists")
        continue

    file_name = PATH_RAW + data_name + caller + ".vcf"
    data = read_vcf(file_name)
    data.to_csv(interim_csv, index=False)
    print(f"Saved {caller}.csv")

dv.csv already exists
fb.csv already exists
oc.csv already exists
st.csv already exists


In [4]:
def data_expl(data: pd.DataFrame, caller: str):
    """Show a quick overview of one caller's variant table.

    Displays the number of records per ``CHROM`` value, then plots a
    100-bin histogram of the ``QUAL`` column in which missing values are
    counted as 0.

    Args:
        data: Variant table containing at least the "CHROM" and "QUAL" columns.
        caller: Label of the variant caller; only used in the plot title.
    """
    chrom_counts = data["CHROM"].value_counts()
    display(chrom_counts)

    # Missing QUAL values are plotted as 0 rather than left out of the histogram.
    qual_filled = data["QUAL"].fillna(0)
    plt.hist(qual_filled, bins=100)
    plt.title(f"QUAL {caller}")
    plt.show()

def get_values_from_format(data: pd.DataFrame, format_column:str = "FORMAT", value_column:str = "HG003_NA24149_Ashkenazim_father.trim") -> pd.DataFrame:
    """Expand a colon-separated sample column into one column per FORMAT key.

    ``format_column`` holds the keys (e.g. ``"GT:DP"``) and ``value_column``
    holds the matching values (e.g. ``"0/1:12"``). Both columns are removed
    from the result and replaced by one string column per key.

    Two code paths are used:

    * every row has the same FORMAT string: the value column is split in one
      vectorised call and the pieces are named after the keys;
    * the FORMAT string differs between rows: each row is paired up
      individually, and keys a row does not carry are left as NaN.

    A row may carry fewer values than keys; the keys without a value are then
    left empty (None/NaN) for that row.

    Args:
        data: Table containing ``format_column`` and ``value_column``. It is
            not modified.
        format_column: Name of the column holding the colon-separated keys.
        value_column: Name of the column holding the colon-separated values.

    Returns:
        A new DataFrame with the remaining original columns followed by the
        expanded per-key columns.

    Raises:
        ValueError: If a row has more values than FORMAT keys, so the values
            cannot be named.
    """
    # Both source columns are replaced by the expanded per-key columns.
    remaining = data.drop(columns=[value_column, format_column])

    if data[format_column].nunique() == 1:
        print("FORMAT column is unique")

        value_names = data[format_column].iloc[0].split(":")

        # Split once for the whole column; shorter rows are padded with None.
        values = data[value_column].str.split(":", expand=True)
        # Raises ValueError when the number of pieces does not match the keys.
        values.columns = value_names

        data_new = pd.concat([remaining, values], axis=1).copy()

    else:
        print("FORMAT column is not unique")

        # Map each row index to its own {key: value} dict.
        format_dict = {}

        for index, row in data.iterrows():
            # Use the caller-supplied column name, not a hard-coded "FORMAT".
            cols = row[format_column].split(":")
            values = row[value_column].split(":")

            # More values than keys cannot be named. Fail loudly: stopping the
            # loop here would silently drop this row and every later one from
            # the merged result.
            if len(values) > len(cols):
                raise ValueError(
                    f"Row {index!r}: {len(values)} values in {value_column!r} "
                    f"but only {len(cols)} keys in {format_column!r}"
                )

            # zip() stops at the shorter sequence, so keys without a value are
            # simply absent here and become NaN in the expanded frame.
            format_dict[index] = dict(zip(cols, values))

        # Columns of the expanded frame are the union of all keys seen.
        expanded = pd.DataFrame(format_dict).T
        data_new = remaining.merge(expanded, left_index=True, right_index=True).copy()

    return data_new

In [5]:
# Column types used when reading the interim CSVs: everything is text
# except the two numeric columns overridden below.
dtype = dict.fromkeys(
    [
        "CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT",
        "HG003_NA24149_Ashkenazim_father.trim",
    ],
    str,
)
dtype.update({"POS": int, "QUAL": float})

# Original VCF columns, followed by the columns taken from FORMAT.
cols_to_keep = ["CHROM", "POS", "REF", "ALT", "QUAL", "FILTER"] + ["DP"]

# Expand the FORMAT values of each interim CSV and store the selected columns;
# callers that already have a processed CSV are skipped.
for caller in callers:
    processed_csv = PATH_PROCESSED + caller + ".csv"

    if os.path.exists(processed_csv):
        print(f"{caller}.csv already processed")
        continue

    print(f"Processing {caller}.csv")
    # "." entries are read as missing values.
    data = pd.read_csv(PATH_INTERIM + caller + ".csv", dtype=dtype, na_values=".")
    data_new = get_values_from_format(data)[cols_to_keep]
    data_new.to_csv(processed_csv, index=False)
    display(data_new.head())



Processing dv.csv
FORMAT column is unique


Unnamed: 0,CHROM,POS,REF,ALT,QUAL,FILTER,DP
0,1,10120,T,C,0.0,RefCall,127
1,1,10126,T,C,0.0,RefCall,116
2,1,10132,T,C,0.0,RefCall,102
3,1,10138,T,C,0.0,RefCall,92
4,1,10146,AC,A,0.6,RefCall,77


Processing fb.csv
FORMAT column is unique


Unnamed: 0,CHROM,POS,REF,ALT,QUAL,FILTER,DP
0,1,10230,ACC,AC,5.88242e-14,,187
1,1,10247,T,C,0.0,,150
2,1,10327,T,C,3.0919e-14,,198
3,1,10352,TAC,TAAC,1.1171e-07,,119
4,1,10611,C,G,406.172,,23


Processing oc.csv
FORMAT column is unique


Unnamed: 0,CHROM,POS,REF,ALT,QUAL,FILTER,DP
0,1,10611,C,G,310.27,PASS,37
1,1,10623,T,C,363.58,PASS,39
2,1,10629,G,A,92.38,AFB,39
3,1,10815,T,TC,629.45,LBQ,49
4,1,10816,C,CCA,227.6,PASS,49


Processing st.csv
FORMAT column is not unique


Unnamed: 0,CHROM,POS,REF,ALT,QUAL,FILTER,DP
0,1,10103,T,A,0.0,LowGQX;NoPassedVariantGTs,12.0
1,1,10105,A,C,0.0,LowGQX;NoPassedVariantGTs,16.0
2,1,10174,C,T,0.0,LowGQX;NoPassedVariantGTs,7.0
3,1,10177,A,AC,472.0,PASS,
4,1,10230,AC,A,140.0,PASS,
