### 29 October 2019
# Import and parse ClinVar VCF file to a bed-like data frame
### by Pavlos Bousounis
***Last updated 11/07/2019***

## Import modules

In [1]:
from datetime import datetime
import gzip
import os
import numpy as np
import pandas as pd
import shutil

In [2]:
# get today's date
today = datetime.today().strftime('%Y-%m-%d')

# print date and current directory
print('Current working directory: {}\n'.format(os.getcwd()))
print('Today is: {}'.format(today))

Current working directory: /Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-07_ClinVarVCF-GRCh37_parse2bed

Today is: 2019-11-14


### Set working directory

In [4]:
basedir = '/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-07_ClinVarVCF-GRCh37_parse2bed/'
os.chdir(basedir)

### Decompress the VCF archive

In [10]:
# decompress the gunzipped file
gff_gz = 'data/clinvar.vcf.gz'
gff_out = 'data/clinvar.vcf'

with gzip.open(gff_gz, 'rb') as f_in:
    with open(gff_out, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

### Import the ClinVar and parse

In [7]:
def clinvar2bed_plp(clinvar_vcf_file):
    
    # define column names
    colns = ['chr', 'pos', 'vid', 'ref', 'alt', 'qual', 'filter', 'info']
    cv_vcf = pd.read_csv(clinvar_vcf_file, sep = '\t', comment='#', low_memory=False, header=None,
                         index_col=False, names=colns)

    # extract clinical significance column
    cv_vcf.loc[:, 'clnsig'] = cv_vcf['info'].str.extract(r'(CLNSIG=)(\w+);')[1]
    # extract gene info column
    cv_vcf.loc[:, 'gene'] = cv_vcf['info'].str.extract(r'(GENEINFO=)(\w+)')[1]

    cv_vcf_plp = cv_vcf[(cv_vcf.clnsig == 'Pathogenic') | (cv_vcf.clnsig == 'Likely_pathogenic')]
    cv_vcf_plp.head(20)

    # NOTICE:
    # specify start and end coordinates and convert to 0-based
    cv_vcf_plp.loc[:, 'start'] = cv_vcf_plp['pos'] #- 1
    cv_vcf_plp.loc[:, 'end'] = cv_vcf_plp['pos']
    cv_vcf_plp.loc[:, 'name'] = cv_vcf_plp['vid'].map(str) + '_' + cv_vcf_plp['gene']

    cv_bed_plp = cv_vcf_plp[['chr', 'start', 'end', 'name']]
    cv_bed_plp.head()

    return(cv_bed_plp)

### Convert the ClinVar VCF to BED file

In [8]:
clinvar_vcf_file = 'data/clinvar.vcf.gz'
cv_bed = clinvar2bed_plp(clinvar_vcf_file)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


### Drop rows with NAs

In [26]:
cv_bed = cv_bed.dropna()

### Save the parsed ClinVar pathogenic/likely-pathogenic BED file

In [27]:
cv_bed_plp_name = 'output/' + today + '_ClinVar-GRCh37_path-likely_path.bed'
cv_bed.to_csv(cv_bed_plp_name, sep='\t', index=False)

if os.path.isfile(cv_bed_plp_name):
    print('Success! File saved to {}.'.format(cv_bed_plp_name))

Success! File saved to output/2019-11-07_ClinVar-GRCh37_path-likely_path.bed.


# Results:

1. CinVar GRCh37 latest clinical variants VCF archive was decompressed
2. Decompressed VCF was parsed to a bed file containing only pathogenic and likely-pathogenic regions
3. Rows containing NAs were removed for compatibility with pybedtools 
4. BED formatted file saved to ***./output/2019-11-07_ClinVar-GRCh37_path-likely_path.bed***