In [1]:
%cd ~/github/liz.9.11.19_GMVLE/

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from utils.utils import read_vcf


PATH_RAW = 'data/raw/'
PATH_INTERIM = 'data/interim/'
PATH_PROCESSED = 'data/processed/'

/Users/maartenbiolizard/GitHub/liz.9.11.19_GMVLE


In [2]:
data_name = "HG003_NA24149_Ashkenazim_father.trim."
callers = ["dv", "fb", "oc", "st"]

for caller in callers:
    if not os.path.exists(PATH_INTERIM + caller + ".csv"):
        file_name = PATH_RAW + data_name + caller + ".vcf"
        data = read_vcf(file_name)
        data.to_csv(PATH_INTERIM + caller + ".csv", index=False)
        print(f"Saved {caller}.csv")
    else:
        print(f"{caller}.csv already exists")

benchmark = "HG003_GRCh38_1_22_v4.2.1_benchmark"
if not os.path.exists(PATH_INTERIM + benchmark + ".csv"):
    file_name = PATH_RAW + benchmark + ".vcf"
    benchmark = read_vcf(file_name)
    benchmark.to_csv(PATH_INTERIM + benchmark + ".csv", index=False)
    print(f"Saved {benchmark}")
else:
    print(f"{benchmark} already exists")

dv.csv already exists
fb.csv already exists
oc.csv already exists
st.csv already exists
HG003_GRCh38_1_22_v4.2.1_benchmark already exists


In [3]:
def data_expl(data: pd.DataFrame, caller: str):
    display(data["CHROM"].value_counts())
#    print(data["FILTER"].value_counts())
#    print(data["INFO"].value_counts())
    plt.hist(data["QUAL"].fillna(0), bins=100)
    plt.title(f"QUAL {caller}")
    plt.show()

def get_values_from_format(data: pd.DataFrame, format_column:str = "FORMAT", value_column:str = "HG003_NA24149_Ashkenazim_father.trim") -> pd.DataFrame:

    dict_error = {}

    # Assume that the FORMAT column is a string with the values separated by ":", and is the same for all rows
    # check if FORMAT column is unique
    if data[format_column].nunique() == 1:
        print("FORMAT column is unique")

        value_names = data[format_column].iloc[0].split(":")

        # split the value column by ":", and convert it to separate columns
        values = data[value_column].str.split(":", expand=True)
        values.columns = value_names

        # drop the original value column and concatenate the new columns
        data = data.drop(value_column, axis=1).drop(format_column, axis=1)
        data_new = pd.concat([data, values], axis=1).copy()

    else:
        print("FORMAT column is not unique")

        # convert the format_column and value_column to a dict per row
        format_dict = {}

        for row in data.iterrows():
            
            # get row index and values
            index = row[0]

            values = row[1][value_column].split(":")
            cols = row[1]["FORMAT"].split(":")

            if len(values) != len(cols):
                # Different lengths at this row
                if row[1]["CHROM"] not in dict_error:
                    dict_error[row[1]["CHROM"]] = [row[1]["POS"]]
                else:
                    dict_error[row[1]["CHROM"]].append(row[1]["POS"])

                # add empty values to match the length of cols
                values += [np.nan] * (len(cols) - len(values))

            format_dict[index] = dict(zip(cols, values))

        # drop the original value column and concatenate the new columns
        data = data.drop(value_column, axis=1).drop(format_column, axis=1)
        data_new = data.merge(pd.DataFrame(format_dict).T, left_index=True, right_index=True).copy()

        print("Different lenght at CHROM: positions\n", dict_error)

    return data_new

In [4]:
def process_vcf_data(callers = [], benchmark = None):

    dtype = {
        "CHROM": str,
        "POS": int,
        "ID": str,
        "REF": str,
        "ALT": str,
        "QUAL": float,
        "FILTER": str,
        "INFO": str,
        "FORMAT": str,
        "HG003_NA24149_Ashkenazim_father.trim": str,
        "HG003": str,
    }

    cols_to_keep = ["CHROM", "POS", "REF", "ALT", "QUAL", "FILTER", # original columns
                    "DP", # "GT", "GQ" columns from FORMAT
                    ]

    for caller in callers:

        if not os.path.exists(PATH_PROCESSED + caller + ".csv"):
            print(f"Processing {caller}.csv")
            data = pd.read_csv(PATH_INTERIM + caller + ".csv", dtype=dtype, na_values=".")
            data_new = get_values_from_format(data)[cols_to_keep]
            data_new.to_csv(PATH_PROCESSED + caller + ".csv", index=False)
            display(data_new.head())

        else:
            print(f"{caller}.csv already processed")

    if benchmark is not None:
        if not os.path.exists(PATH_PROCESSED + benchmark + ".csv"):
            print(f"Processing {benchmark}.csv")
            data = pd.read_csv(PATH_INTERIM + benchmark + ".csv", dtype=dtype, na_values=".")
            data_new = get_values_from_format(data, value_column="HG003")[cols_to_keep]

            # remove "chr" from CHROM column to match the other data
            data_new["CHROM"] = data_new["CHROM"].str.replace("chr", "")

            data_new.to_csv(PATH_PROCESSED + benchmark + ".csv", index=False)
            display(data_new.head())

        else:
            print(f"{benchmark}.csv already processed")

process_vcf_data(callers, benchmark)

dv.csv already processed
fb.csv already processed
oc.csv already processed
st.csv already processed
Processing HG003_GRCh38_1_22_v4.2.1_benchmark.csv
FORMAT column is not unique
Different lenght at CHROM: positions
 {'chr6': [28513056, 28513154, 28513641, 28515705, 28517502, 28517903, 28518897, 28522210, 28526845, 28528813, 28529562, 28530782, 28531387, 28534773, 28535580, 28538066, 28540653, 28543387, 28546559, 28550539, 28553793, 28554918, 28555531, 28557488, 28557524, 28557848, 28558642, 28558878, 28561284, 28561528, 28563470, 28565488, 28566169, 28567194, 28567908, 28574327, 28574647, 28574743, 28576915, 28579175, 28579645, 28583399, 28586002, 28588421, 28589356, 28589856, 28590859, 28591113, 28592294, 28593010, 28593035, 28593379, 28595180, 28595330, 28595722, 28596143, 28596393, 28597691, 28598050, 28598683, 28598739, 28598745, 28598876, 28600751, 28600899, 28601561, 28602172, 28602956, 28603337, 28603636, 28604282, 28604615, 28605229, 28605678, 28606638, 28607395, 28607519, 2860

Unnamed: 0,CHROM,POS,REF,ALT,QUAL,FILTER,DP
0,1,602439,C,T,50.0,PASS,31
1,1,602493,C,T,50.0,PASS,23
2,1,602494,A,G,50.0,PASS,21
3,1,631859,CG,C,50.0,PASS,312
4,1,779047,G,A,50.0,PASS,800


In [25]:
from functools import reduce

dtype = {
    "CHROM": str,
    "POS": int,
    "REF": str,
    "ALT": str,
    "QUAL": float,
    "FILTER": str,
    "DP": pd.Int64Dtype(),
}



# Merge the dataframes on CHROM and POS
# Columns that are required to be the same in all dataframes: REF, DP
# Columns that are specific to the caller: ALT, QUAL, FILTER
def merge_dataframes(caller_names: list, on: list, same_col: list) -> pd.DataFrame:
    
    file_name = PATH_PROCESSED + "merged.csv"
    if not os.path.exists(file_name):
        print("Merging dataframes")

        dataframes = [pd.read_csv(PATH_PROCESSED + caller + ".csv", dtype=dtype) for caller in caller_names]

        # Create the merged dataframe
        for caller, df in zip(caller_names, dataframes):
            df.columns = [col + f"_{caller}" if col not in on else col for col in df.columns]

        df_merged = reduce(lambda left, right: pd.merge(left, right, on=on, how="outer"), dataframes)

        # Check if the data in the same columns is the same when it's not NaN

        for col in same_col:
            df_merged[f"{col}_same"] = df_merged[[f"{col}_{caller}" for caller in caller_names]].apply(lambda x: len(set(x.dropna())) == 1, axis=1)

        # Save the dataframe
        df_merged.to_csv(file_name, index=False)

    else:
        print("Merged dataframe already exists")
        df_merged = pd.read_csv(file_name, dtype=dtype)

    return df_merged

df_merged = merge_dataframes(callers, on=["CHROM", "POS"], same_col=["REF", "DP"])
df_merged.head()

Merged dataframe already exists


Unnamed: 0,CHROM,POS,REF_dv,ALT_dv,QUAL_dv,FILTER_dv,DP_dv,REF_fb,ALT_fb,QUAL_fb,...,QUAL_oc,FILTER_oc,DP_oc,REF_st,ALT_st,QUAL_st,FILTER_st,DP_st,REF_same,DP_same
0,1,10103,,,,,,,,,...,,,,T,A,0.0,LowGQX;NoPassedVariantGTs,12.0,True,True
1,1,10105,,,,,,,,,...,,,,A,C,0.0,LowGQX;NoPassedVariantGTs,16.0,True,True
2,1,10120,T,C,0.0,RefCall,127.0,,,,...,,,,,,,,,True,True
3,1,10126,T,C,0.0,RefCall,116.0,,,,...,,,,,,,,,True,True
4,1,10132,T,C,0.0,RefCall,102.0,,,,...,,,,,,,,,True,True


In [26]:
dtype = {
    "CHROM": str,
    "POS": int,
    "REF": str,
    "ALT": str,
    "QUAL": float,
    "FILTER": str,
    "DP": pd.Int64Dtype(),
}

df_benchmark = pd.read_csv(PATH_PROCESSED + benchmark + ".csv", dtype=dtype)
def merge_benchmark(df_benchmark, df_merged, on=["CHROM", "POS"]) -> pd.DataFrame:
    file_name = PATH_PROCESSED + "merged_benchmark.csv"
    if not os.path.exists(file_name):
        print("Merging benchmark")
        df_benchmark.columns = [col + "_BM" if col not in ["CHROM", "POS"] else col for col in df_benchmark.columns]

        # Select only the chromosomes that are in the benchmark
        chromosomes = df_benchmark["CHROM"].unique()
        df_merged = df_merged[df_merged["CHROM"].isin(chromosomes)]

        df_merged = pd.merge(df_merged, df_benchmark, on=on, how="outer")
        df_merged.to_csv(file_name, index=False)

    else:
        print("Merged benchmark already exists")
        df_merged = pd.read_csv(file_name, dtype=dtype)

    # print the number of rows per chromosome
    print("Number of rows per chromosome")
    display(df_merged["CHROM"].value_counts())

    return df_merged

df_merged_BM = merge_benchmark(df_benchmark, df_merged)
df_merged_BM.head()

Merging benchmark
Number of rows per chromosome


CHROM
1     981778
2     971246
3     782302
4     756881
7     719057
6     707193
5     704690
10    628570
12    601145
8     591362
11    585413
9     561585
17    456086
16    427742
19    417570
13    396878
14    385585
15    378263
18    340268
20    328338
22    237992
21    192540
Name: count, dtype: int64

Unnamed: 0,CHROM,POS,REF_dv,ALT_dv,QUAL_dv,FILTER_dv,DP_dv,REF_fb,ALT_fb,QUAL_fb,...,QUAL_st,FILTER_st,DP_st,REF_same,DP_same,REF_BM,ALT_BM,QUAL_BM,FILTER_BM,DP_BM
0,1,10103,,,,,,,,,...,0.0,LowGQX;NoPassedVariantGTs,12.0,True,True,,,,,
1,1,10105,,,,,,,,,...,0.0,LowGQX;NoPassedVariantGTs,16.0,True,True,,,,,
2,1,10120,T,C,0.0,RefCall,127.0,,,,...,,,,True,True,,,,,
3,1,10126,T,C,0.0,RefCall,116.0,,,,...,,,,True,True,,,,,
4,1,10132,T,C,0.0,RefCall,102.0,,,,...,,,,True,True,,,,,


In [32]:
def calculate_performance(df_merged_BM, caller, benchmark:str = "BM", keys = ["CHROM", "POS"], y = "ALT"):

    print(f"Calculating performance metrics for {caller}")

    df = df_merged_BM.copy()[keys + [f"{y}_{caller}", f"{y}_{benchmark}"]].set_index(keys)
    
    # drop all rows with nan values for the caller -> caller has no prediction
    df.dropna(subset=[f"{y}_{caller}"], inplace=True)
    display(df.head())

    caller_bool = df[f"{y}_{caller}"].notna()
    benchmark_bool = df[f"{y}_{benchmark}"].notna()

    print(len(caller_bool), len(benchmark_bool))

    # True Positives (TP)
    TP = (caller_bool & benchmark_bool)
    
    # False Positives (FP)
    FP = (caller_bool & ~benchmark_bool)
    
    # False Negatives (FN)
    FN = (~caller_bool & benchmark_bool)

    # Performance metrics
    TP_count = TP.sum()
    FP_count = FP.sum()
    FN_count = FN.sum()

    precision = TP_count / (TP_count + FP_count)
    recall = TP_count / (FN_count + TP_count)
    F1 = 2 * precision * recall / (precision + recall)

    TP_baseline = benchmark_bool.sum()

    print(f"Performance metrics for {caller}")

    print(f"TP_baseline: {TP_baseline}")

    print(f"TP: {TP_count}")
    print(f"FP: {FP_count}")
    print(f"FN: {FN_count}")

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1: {F1:.4f}")

    return


df_merged_BM_path = PATH_PROCESSED + "merged_benchmark.csv"
df_merged_BM = pd.read_csv(df_merged_BM_path)
for caller in callers:
    
    calculate_performance(df_merged_BM, caller)


  df_merged_BM = pd.read_csv(df_merged_BM_path)


Calculating performance metrics for dv


Unnamed: 0_level_0,Unnamed: 1_level_0,ALT_dv,ALT_BM
CHROM,POS,Unnamed: 2_level_1,Unnamed: 3_level_1
1,10120,C,
1,10126,C,
1,10132,C,
1,10138,C,
1,10146,A,


11850252 11850252
Performance metrics for dv
TP_baseline: 4009580
TP: 4009580
FP: 7840672
FN: 0
Precision: 0.3384
Recall: 1.0000
F1: 0.5056
Calculating performance metrics for fb


Unnamed: 0_level_0,Unnamed: 1_level_0,ALT_fb,ALT_BM
CHROM,POS,Unnamed: 2_level_1,Unnamed: 3_level_1
1,10230,AC,
1,10247,C,
1,10327,C,
1,10352,TAAC,
1,10611,G,


5036824 5036824
Performance metrics for fb
TP_baseline: 3895752
TP: 3895752
FP: 1141072
FN: 0
Precision: 0.7735
Recall: 1.0000
F1: 0.8723
Calculating performance metrics for oc


Unnamed: 0_level_0,Unnamed: 1_level_0,ALT_oc,ALT_BM
CHROM,POS,Unnamed: 2_level_1,Unnamed: 3_level_1
1,10611,G,
1,10623,C,
1,10629,A,
1,10815,TC,
1,10816,CCA,


4877237 4877237
Performance metrics for oc
TP_baseline: 3989796
TP: 3989796
FP: 887441
FN: 0
Precision: 0.8180
Recall: 1.0000
F1: 0.8999
Calculating performance metrics for st


Unnamed: 0_level_0,Unnamed: 1_level_0,ALT_st,ALT_BM
CHROM,POS,Unnamed: 2_level_1,Unnamed: 3_level_1
1,10103,A,
1,10105,C,
1,10174,T,
1,10177,AC,
1,10230,A,


5104123 5104123
Performance metrics for st
TP_baseline: 3996524
TP: 3996524
FP: 1107599
FN: 0
Precision: 0.7830
Recall: 1.0000
F1: 0.8783


------------------------------------------------------------------------------------------------

# Random Forest Classifier

## Load the data

In [None]:
# load the merged benchmark data
def load_merged_benchmark(file_name: str = "merged_benchmark",
                          keys = ["CHROM", "POS"],
                          truth_col = "ALT_BM",
                          callers = ["dv", "fb", "oc", "st"],
                          features = ["DP", "QUAL", "ALT"],
                          ):

    df = pd.read_csv(PATH_PROCESSED + file_name + ".csv", dtype=dtype).set_index(keys)

    # Truth nodes: convert the truth column to boolean
    y = df[truth_col].notna()

    # Feature nodes: select the features for each caller
    X = df[[f"{feature}_{caller}" for caller in callers for feature in features]].copy()

    # Convert ALT of the callers to boolean, keep all other columns as they are
    for caller in callers:
        X[f"ALT_{caller}"] = X[f"ALT_{caller}"].notna()

    # Fill nan QUAL with the minimum value per caller
    for caller in callers:
        X[f"QUAL_{caller}"] = X[f"QUAL_{caller}"].fillna(X[f"QUAL_{caller}"].min())

    # If a caller has a nan value for DP, fill it with the mean value for all callers
    DP_cols = [f"DP_{caller}" for caller in callers]
    X[DP_cols] = X[DP_cols].fillna(X[DP_cols].mean())                

    return X, y

X, y = load_merged_benchmark()

display(X.head())
y.head()

## Split the data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Try on chromosome 1
X_1 = X[X.index.get_level_values("CHROM") == "1"]
y_1 = y[X_1.index]

X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, random_state=42)

## Preprocess the data

## Train the model

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

