### Développement d’un pipeline de calcul du TMB

#### Projet 4BiM, 2021

#### Auteurs : Marie Casimir, Loup Petitjean et Nicolas Mendiboure
#### Encadrantes Innate-Pharma : Sabrina Carpentier et Luciana Bastista
#### Encadrante INSA : Maïwenn Pineau

*Pour installer les dépendances : `pip install cyvcf2` (lecture avec cyvcf2) et `pip install PyVCF` (module `vcf`, utilisé plus bas).*

In [1]:
import cyvcf2
import pandas as pd

In [2]:
## Importation d'un VCF avec la librairie cyvcf2 :
def read_cyvcf(vcf):
    """Load a VCF file into a pandas DataFrame using cyvcf2.

    Each VCF record becomes one row. The first six columns hold the general
    record fields (CHROM, POS, REF, ALT, QUAL, FILTER); the remaining
    columns hold a fixed selection of INFO entries, listed below.

    Parameters
    ----------
    vcf : str
        Path handed as-is to ``cyvcf2.VCF``.

    Returns
    -------
    pandas.DataFrame
        One row per record and 23 columns, in the order
        CHROM, POS, REF, ALT, QUAL, FILTER, then the INFO keys below.
        A file without any record yields an empty frame that still carries
        all 23 columns (the previous implementation raised in that case).
    """
    # General (fixed) VCF fields.
    core_columns = ["CHROM", "POS", "REF", "ALT", "QUAL", "FILTER"]

    # Entries extracted from the INFO section of each record.
    info_keys = [
        "AN",       # Total number of alleles in called genotypes
        "AC",       # Allele count in genotypes, for each ALT allele, in the same order as listed
        "AF",       # Allele frequency in primary data, for each ALT allele, in the same order as listed
        "BQ",       # RMS base quality
        "SB",       # Strand bias
        "FA",       # Overall fraction of reads supporting ALT
        "MC",       # Modification base changes at this position
        "MT",       # Modification types at this position
        "NS",       # Number of samples with data
        "DP",       # Total depth across samples
        "VT",       # Variant type, can be SNP, INS or DEL
        "SS",       # Variant status relative to non-adjacent Normal: 0=wildtype, 1=germline,
                    # 2=somatic, 3=LOH, 4=post-transcriptional modification, 5=unknown
        "ORIGIN",   # Where the call originated from: the tumor DNA, RNA, or both
        "SOMATIC",  # Indicates if record is a somatic mutation
        "INDEL",    # Number of indels for all samples
        "START",    # Number of reads starting at this position across all samples
        "STOP",     # Number of reads stopping at this position across all samples
    ]

    rows = []
    for record in cyvcf2.VCF(vcf):
        core_values = (
            record.CHROM,
            record.POS,
            record.REF,
            record.ALT,
            record.QUAL,
            record.FILTER,
        )
        # record.INFO is a cyvcf2 object, not a plain dict: values are read
        # through .get(), one call per requested key.
        info_values = tuple(record.INFO.get(key) for key in info_keys)
        rows.append(core_values + info_values)

    # Building the frame in one step with explicit column names keeps the
    # schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=core_columns + info_keys)

In [3]:
# Parse ./test_radia.vcf with the cyvcf2-based reader and keep the ALT column
# as an array (one entry per record, each entry being a list of alleles).
radia = read_cyvcf("./test_radia.vcf")
radia_alt = radia["ALT"].to_numpy()
# Preview the first six records only.
print(radia_alt[:6])

[list(['T']) list(['A']) list(['T']) list(['G']) list(['C'])
 list(['A', 'C', 'T'])]


In [4]:
# Raw count of records read from the file: one per row of the ALT array.
# NOTE(review): no filtering or normalisation is applied at this stage.
basic_radia_TMB = radia_alt.shape[0]
print(basic_radia_TMB)

87


In [5]:
import vcf
# Avec la librairie pyvcf :

def read_pyvcf(file):
    """Load a VCF file into a pandas DataFrame using PyVCF (module ``vcf``).

    Every attribute of each record object becomes a column, and the
    per-record INFO mapping is additionally expanded into one column per
    INFO key (joined back on the row index).

    Parameters
    ----------
    file : str
        Path of the VCF file, opened in text mode.

    Returns
    -------
    pandas.DataFrame
        One row per record. A file without any record yields an empty
        DataFrame (the previous implementation raised AttributeError
        because the INFO column did not exist).
    """
    # The reader is fully consumed while the handle is still open; the `with`
    # block then closes the file, which the previous version never did.
    with open(file) as handle:
        records = [vars(record) for record in vcf.Reader(handle)]

    if not records:
        # No record means no INFO column to expand.
        return pd.DataFrame()

    df = pd.DataFrame(records)
    info = pd.DataFrame(df["INFO"].tolist())
    return df.merge(info, left_index=True, right_index=True)

In [7]:
# Parse ./test_mutect.vcf with the PyVCF-based reader and keep the ALT column
# as an array (one entry per record).
mutect = read_pyvcf("./test_mutect.vcf")
mutect_alt = mutect["ALT"].to_numpy()
# Preview the first six records only.
print(mutect_alt[0:6])

[list([G]) list([A]) list([G]) list([T]) list([C]) list([C])]


In [8]:
# Raw count of records read from the file: one per row of the ALT array.
# NOTE(review): no filtering or normalisation is applied at this stage.
basic_mutect_TMB = mutect_alt.shape[0]
print(basic_mutect_TMB)

500
