In [4]:
# UAMS group: Dr. Donald Johann Jr, Dr. Erich Peterson, Jason Liem

# Note: These functions access compressed VCFs that have been processed with SnpEff

# INITIALIZATION

# 'bpa_analysis_functions_v2.py', 'sqlite_funcs.py' and 'bpa_analysis_functions_uams.py' must be in the same directory

import bpa_analysis_functions_uams as ar
import sqlite_funcs as sql

# Shared settings used by the example cells below; run this cell first, since the
# later cells pass these names straight to the helper functions.

project = 'bpa-UAMS_P0001_T1' # project name assigned by BPA; replace with your own
profile = 'jwliem' # profile name; replace with your own
path    = 'files/' # working directory where files will be created, downloaded to, etc.

In [5]:
# EXAMPLE 1: list cases in a project and print to file

# description: function 'print_cases_by_project' takes the project name, the working
# directory and an output file name; it lists all cases in the project and prints
# them to file (the case IDs are also echoed in the cell output)

# name of the output file (presumably created under 'path'; confirm in the helper module)
file_name = "patients.txt"

ar.print_cases_by_project(project, path, file_name)

C-P-PT-005-3070
C-P-PT-002-3067
C-P-PT-009-3073
C-P-PT-007-3072
C-P-FB-001-3065
C-P-PT-001-3066
C-P-PT-003-3068
C-P-PT-004-3069


In [6]:
# EXAMPLE 2: list all VCFs for a case

# description: function 'print_VCF_files_by_case' returns a list of all VCFs and their
# experimental strategy for a specified case, and prints the results to file

# copy a case ID from the Example 1 results and paste it into the variable 'case' below

case = "C-P-FB-001-3065"
# the output file name is derived from the case ID, e.g. "C-P-FB-001-3065_VCF_list.txt"
file_name = case + "_VCF_list.txt"

ar.print_VCF_files_by_case(project, case, path, file_name)

RG-MPS-20347-BID-30603-strelka.snvs.final.vcf.gz	WGS
RG-MPS-20370-BID-30605-1603d54b.final.vcf.gz	Total RNA
RG-MPS-20347-BID-30603-strelka.indels.final.vcf.gz	WGS
RG-MPS-20390-BID-30639-30639.merged.final.vcf.gz	Panel


In [7]:
# EXAMPLE 3: find gene/mutation combination in a VCF

# description: for a specified VCF, function 'somatic_mutation_by_gene_and_mutation' returns
# all records with the specified gene + mutation and returns the total number found

# copy a file name from the Example 2 results and paste it into the variable 'vcf_file' below

vcf_file = "RG-MPS-20390-BID-30639-30639.merged.final.vcf.gz"
gene = "EGFR"
# amino acid change to search for, written as in the example value below
mutation = "p.Glu746_Ala750del"

ar.somatic_mutation_by_gene_and_mutation(project, profile, path, vcf_file, gene, mutation)


Local extracted file copy found: RG-MPS-20390-BID-30639-30639.merged.final.vcf
--------------------------------------------------------------------------------
Processing file: RG-MPS-20390-BID-30639-30639.merged.final.vcf

(u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:2EB3_A:746_787|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4G5J_A:725_746|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4I1Z_A:725_746|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4I1Z_A:746_785|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4I22_A:725_746|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||

1

In [5]:
# EXAMPLE 3.1: find gene/mutation combination in a VCF (will also display specific fields for readability)

# description: for a specified VCF, function 'somatic_mutation_by_gene_and_mutation' returns
# all records with the specified gene + mutation and returns the total number found

# the 'details = True' parameter extracts and prints notable fields below each record for
# better readability; these fields include gene, chromosome, mutation offset, nucleotide
# change and amino acid change

# copy a file name from the Example 2 results and paste it into the variable 'vcf_file' below

vcf_file = "RG-MPS-20390-BID-30639-30639.merged.final.vcf.gz"
gene = "EGFR"
mutation = "p.Glu746_Ala750del"

ar.somatic_mutation_by_gene_and_mutation(project, profile, path, vcf_file, gene, mutation, details = True)

Local extracted file copy found: RG-MPS-20390-BID-30639-30639.merged.final.vcf
--------------------------------------------------------------------------------
Processing file: RG-MPS-20390-BID-30639-30639.merged.final.vcf

(u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:2EB3_A:746_787|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4G5J_A:725_746|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4I1Z_A:725_746|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4I1Z_A:746_785|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4I22_A:725_746|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||

1

In [12]:
# EXAMPLE 4: find gene/mutation combination in a series of VCFs

# description: for a comma-separated list of VCFs (not limited to one case), function
# 'somatic_mutation_by_gene_and_mutation_list' finds all records with the specified
# gene + mutation and returns the aggregate total found

# using the results of Examples 1 & 2 (for any cases of your choice),
# copy and paste the VCF file names into the comma-separated string 'vcf_files_csv' below

vcf_files_csv = "RG-MPS-20390-BID-30639-30639.merged.final.vcf.gz, RG-MPS-20370-BID-30605-1603d54b.final.vcf.gz"
gene = "EGFR"
mutation = "p.Glu746_Ala750del"

ar.somatic_mutation_by_gene_and_mutation_list(project, profile, path, vcf_files_csv, gene, mutation, details = True)

# 'details = True' (passed above) is optional here, as in Example 3.1

Local extracted file copy found: RG-MPS-20370-BID-30605-1603d54b.final.vcf
--------------------------------------------------------------------------------
Processing file: RG-MPS-20370-BID-30605-1603d54b.final.vcf

(u'A|structural_interaction_variant|HIGH|EGFR|ENSG00000146648|interaction|1XKK:A_745-A_788:ENST00000275493|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|structural_interaction_variant|HIGH|EGFR|ENSG00000146648|interaction|2EB2:A_745-A_788:ENST00000275493|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|structural_interaction_variant|HIGH|EGFR|ENSG00000146648|interaction|2EB3:A_745-A_788:ENST00000275493|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|structural_interaction_variant|HIGH|EGFR|ENSG00000146648|interaction|2ITN:A_745-A_788:ENST00000275493|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|structural_interaction_variant|HIGH|EGFR|ENSG00000146648|interaction|2ITP:A_745-A_788:ENST00000275493|protein_coding|19

--------------------------------------------------------------------------------
Records found: 1
Finished processing file: RG-MPS-20370-BID-30605-1603d54b.final.vcf
--------------------------------------------------------------------------------
Local extracted file copy found: RG-MPS-20390-BID-30639-30639.merged.final.vcf
--------------------------------------------------------------------------------
Processing file: RG-MPS-20390-BID-30639-30639.merged.final.vcf

(u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:2EB3_A:746_787|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4G5J_A:725_746|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4I1Z_A:725_746|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|intera

2

In [13]:
# EXAMPLE 5: find gene/mutation combination in all VCFs for a single case

# description: for a single case, function 'all_somatic_mutation_by_gene_and_mutation_by_case'
# processes all of the case's VCFs, finds all records with the specified gene + mutation
# and returns the aggregate total found

# case ID taken from the Example 1 results
case = "C-P-FB-001-3065"
gene = "EGFR"
mutation = "p.Glu746_Ala750del"

ar.all_somatic_mutation_by_gene_and_mutation_by_case(project, profile, path, case, gene, mutation, details = True)

# 'details = True' (passed above) is optional here, as in Example 3.1

VCF files found for case C-P-FB-001-3065: 

RG-MPS-20347-BID-30603-strelka.snvs.final.vcf.gz	WGS
RG-MPS-20370-BID-30605-1603d54b.final.vcf.gz	Total RNA
RG-MPS-20347-BID-30603-strelka.indels.final.vcf.gz	WGS
RG-MPS-20390-BID-30639-30639.merged.final.vcf.gz	Panel
--------------------------------------------------------------------------------
Local extracted file copy found: RG-MPS-20347-BID-30603-strelka.indels.final.vcf
--------------------------------------------------------------------------------
Processing file: RG-MPS-20347-BID-30603-strelka.indels.final.vcf

(u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:2EB3_A:746_787|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4G5J_A:725_746|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4I1Z_A:725_746|protein_coding|19/2

--------------------------------------------------------------------------------
Records found: 1
Finished processing file: RG-MPS-20347-BID-30603-strelka.indels.final.vcf
--------------------------------------------------------------------------------
Local extracted file copy found: RG-MPS-20347-BID-30603-strelka.snvs.final.vcf
--------------------------------------------------------------------------------
Processing file: RG-MPS-20347-BID-30603-strelka.snvs.final.vcf

--------------------------------------------------------------------------------
Records found: 0
Finished processing file: RG-MPS-20347-BID-30603-strelka.snvs.final.vcf
--------------------------------------------------------------------------------
Local extracted file copy found: RG-MPS-20370-BID-30605-1603d54b.final.vcf
--------------------------------------------------------------------------------
Processing file: RG-MPS-20370-BID-30605-1603d54b.final.vcf

(u'A|structural_interaction_variant|HIGH|EGFR|ENSG000001

--------------------------------------------------------------------------------
Records found: 1
Finished processing file: RG-MPS-20370-BID-30605-1603d54b.final.vcf
--------------------------------------------------------------------------------
Local extracted file copy found: RG-MPS-20390-BID-30639-30639.merged.final.vcf
--------------------------------------------------------------------------------
Processing file: RG-MPS-20390-BID-30639-30639.merged.final.vcf

(u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:2EB3_A:746_787|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4G5J_A:725_746|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|interaction|ENST00000275493:4I1Z_A:725_746|protein_coding|19/28|c.2235_2249delGGAATTAAGAGAAGC||||||', u'A|protein_protein_contact|HIGH|EGFR|ENSG00000146648|intera

3

In [8]:
# EXAMPLE 6 part 1: import all DNA Panel files for a case into a sqlite db

# description: initialization steps to set up the db with tables and assign index keys to cases

# create the DB and its tables; the returned value is kept so it can be passed to the
# import function in part 2
# NOTE(review): the recorded output of this cell prints "Dropping tables if they exist.",
# so re-running it appears to reset previously imported data -- confirm before re-running
db_full_location = sql.create_db_tables(path)

# initialize integer index IDs for each case for DB use; this will be passed to the
# actual db import function
case_dict = ar.assign_case_id_dict(project)



Dropping tables if they exist.
Creating tables.


In [11]:
# EXAMPLE 6 part 2: import Panel files for a case into a sqlite db

# description: function 'import_case_sqlite' will import all DNA Panel VCFs for a given case to the sqlite db

# requires 'db_full_location' and 'case_dict' from the "EXAMPLE 6 part 1" cell, so run that cell first;
# the last argument is the case ID to import (one of the cases listed by Example 1)
sql.import_case_sqlite(db_full_location, project, profile, path, case_dict, "C-P-PT-001-3066")

Row inserted into metadata table.
Downloading file: RG-MPS-20385-BID-30640-30640.merged.final.vcf.gz
Getting files...
Finished
Extracting file: RG-MPS-20385-BID-30640-30640.merged.final.vcf.gz
files/RG-MPS-20385-BID-30640-30640.merged.final.vcf.gz extracted to files/RG-MPS-20385-BID-30640-30640.merged.final.vcf
files/RG-MPS-20385-BID-30640-30640.merged.final.vcf.gz removed
--------------------------------------------------------------------------------

Opening: files/RG-MPS-20385-BID-30640-30640.merged.final.vcf
Importing: RG-MPS-20385-BID-30640-30640.merged.final.vcf
Imported: RG-MPS-20385-BID-30640-30640.merged.final.vcf | Total File Records in File : 35 | Total Rows Inserted: 35
