In [92]:
from IPython.display import IFrame

import os
import pandas as pd
import numpy as np
import gzip
import re
import json


import matplotlib.pyplot as plt
import matplotlib.cm as cm


# Lift pandas' display limits so wide frames and long cell values render in full.
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [7]:
# Directory (relative path) that is walked further down to list the available VCF files.
folder = '../data'

In [150]:
def _parse_vcf_record(info, format_keys, sample_values):
    """Return a ``{field: raw string}`` dict for one VCF data line.

    INFO is split on ';' and every ``key=value`` entry is stored under its own
    key, so keys and values can never drift out of alignment; flag entries
    (no '=') are skipped.  FORMAT keys are paired positionally with the
    ':'-separated sample values and override an INFO key of the same name.
    """
    record = {}
    if isinstance(info, str):
        for entry in info.split(';'):
            key, has_value, value = entry.partition('=')
            if has_value and key:
                record[key] = value
    if isinstance(format_keys, str) and isinstance(sample_values, str):
        record.update(zip(format_keys.split(':'), sample_values.split(':')))
    return record


def import_VCF41_to_pandas(vcf_file, sep='\t'):
    """
    Read a single-sample VCF v4.1 file (plain or ``.gz``) into a DataFrame.

    The last column of the file (the sample column) is renamed to ``'sample'``.
    Every ``key=value`` pair found in INFO and every FORMAT/sample pair is
    expanded into its own column.  The '%' sign is removed from ``FREQ`` and
    the known numeric columns (ADP, WT, HET, HOM, NC, GQ, SDP, DP, RD, AD,
    FREQ, PVAL, RBQ, ABQ, RDF, RDR, ADF, ADR) are converted to float when
    present; any other expanded column keeps its raw string value.

    Parameters
    ----------
    vcf_file : str
        Path to the VCF file; a name ending in ``.gz`` is read as gzip.
    sep : str, default '\\t'
        Column separator of the data section.

    Returns
    -------
    pandas.DataFrame
        One row per data line of the file.

    Raises
    ------
    ValueError
        If the first line of the file does not end with ``VCFv4.1``.
    """
    is_gzip = vcf_file.endswith(".gz")
    opener = gzip.open if is_gzip else open

    # Count the '##' meta lines that follow the first (fileformat) line; the
    # column header ('#CHROM ...') is the line right after them.
    header_lines = 0
    with opener(vcf_file, 'rt', encoding='utf-8') as handle:
        first_line = handle.readline().strip()
        for line in handle:
            if not line.strip().startswith("##"):
                break
            header_lines += 1

    if not first_line.endswith('VCFv4.1'):
        raise ValueError(
            "This vcf file is not v4.1 (first line: {!r})".format(first_line)
        )

    # Skip the fileformat line plus the counted meta lines; the next line is
    # used as the column header.
    dataframe = pd.read_csv(
        vcf_file,
        compression='gzip' if is_gzip else None,
        sep=sep,
        skiprows=header_lines + 1,
    )

    sample = dataframe.columns[-1]
    dataframe = dataframe.rename(columns={sample: 'sample'})

    # Parse all rows first and build the extra columns in one go instead of
    # writing cell by cell with .loc inside iterrows().
    records = [
        _parse_vcf_record(info, format_keys, sample_values)
        for info, format_keys, sample_values in zip(
            dataframe['INFO'], dataframe['FORMAT'], dataframe['sample']
        )
    ]
    parsed = pd.DataFrame(records, index=dataframe.index)

    for column in parsed.columns:
        if column in dataframe.columns:
            # A parsed key that collides with an existing column only
            # overrides the rows where it actually occurred.
            dataframe[column] = parsed[column].combine_first(dataframe[column])
        else:
            dataframe[column] = parsed[column]

    # A file without data rows has no FREQ column to clean.
    if 'FREQ' in dataframe.columns:
        dataframe['FREQ'] = dataframe['FREQ'].str.replace('%', '', regex=False)

    to_float = ('ADP', 'WT', 'HET', 'HOM', 'NC', 'GQ', 'SDP', 'DP',
                'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR')

    for column in to_float:
        if column in dataframe.columns:
            dataframe[column] = dataframe[column].astype(float)

    return dataframe

In [151]:
def import_cov_to_pandas(cov_file):
    """Read a headerless, tab-separated coverage file into a DataFrame.

    The three columns are labelled 'REF', 'POS' and 'DP', in file order.
    """
    column_labels = ['REF', 'POS', 'DP']
    coverage = pd.read_csv(cov_file, names=column_labels, sep='\t')
    return coverage

In [33]:
# Walk `folder` recursively and print the path of every file whose name contains 'vcf'.
for root, _, files in os.walk(folder):
    for name in files:
        if 'vcf' not in name:
            continue
        filename = os.path.join(root, name)
        print(filename)

../data/201334.lowfreq.vcf.gz
../data/201277.lowfreq.vcf.gz


In [34]:
# Parse one of the VCF files printed by the listing cell into a DataFrame.
df = import_VCF41_to_pandas('../data/201334.lowfreq.vcf.gz')

In [152]:
# Load the coverage table (columns REF, POS, DP) that shares the 201334 file prefix.
dfcov = import_cov_to_pandas('../data/201334.cov')

In [35]:
df.columns

Index(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'sample', 'ADP', 'WT', 'HET', 'HOM', 'NC', 'GT', 'GQ', 'SDP', 'DP',
       'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR'],
      dtype='object')

In [153]:
df.head()

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,sample,ADP,WT,HET,HOM,NC,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR
0,NC_045512.2,49,.,T,G,.,PASS,ADP=129;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:151:129:123:5:3.88%:9.8E-1:36:19:0:123:0:5,129.0,0.0,1.0,0.0,0.0,0/1,0.0,151.0,129.0,123.0,5.0,3.88,0.98,36.0,19.0,0.0,123.0,0.0,5.0
1,NC_045512.2,394,.,T,G,.,PASS,ADP=47;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:48:47:44:2:4.26%:9.8E-1:37:38:34:10:0:2,47.0,0.0,1.0,0.0,0.0,0/1,0.0,48.0,47.0,44.0,2.0,4.26,0.98,37.0,38.0,34.0,10.0,0.0,2.0
2,NC_045512.2,411,.,G,A,.,PASS,ADP=48;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:51:48:46:2:4.17%:9.8E-1:37:24:35:11:2:0,48.0,0.0,1.0,0.0,0.0,0/1,0.0,51.0,48.0,46.0,2.0,4.17,0.98,37.0,24.0,35.0,11.0,2.0,0.0
3,NC_045512.2,424,.,A,G,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:53:52:49:3:5.77%:9.8E-1:36:36:35:14:2:1,52.0,0.0,1.0,0.0,0.0,0/1,0.0,53.0,52.0,49.0,3.0,5.77,0.98,36.0,36.0,35.0,14.0,2.0,1.0
4,NC_045512.2,456,.,T,C,.,PASS,ADP=194;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:262:194:190:4:2.06%:9.8E-1:48:47:157:33:3:1,194.0,0.0,1.0,0.0,0.0,0/1,0.0,262.0,194.0,190.0,4.0,2.06,0.98,48.0,47.0,157.0,33.0,3.0,1.0


In [154]:
dfcov.head()

Unnamed: 0,REF,POS,DP
0,NC_045512.2,1,4
1,NC_045512.2,2,4
2,NC_045512.2,3,4
3,NC_045512.2,4,4
4,NC_045512.2,5,4


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 29 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   #CHROM  110 non-null    object 
 1   POS     110 non-null    int64  
 2   ID      110 non-null    object 
 3   REF     110 non-null    object 
 4   ALT     110 non-null    object 
 5   QUAL    110 non-null    object 
 6   FILTER  110 non-null    object 
 7   INFO    110 non-null    object 
 8   FORMAT  110 non-null    object 
 9   sample  110 non-null    object 
 10  ADP     110 non-null    float64
 11  WT      110 non-null    float64
 12  HET     110 non-null    float64
 13  HOM     110 non-null    float64
 14  NC      110 non-null    float64
 15  GT      110 non-null    object 
 16  GQ      110 non-null    float64
 17  SDP     110 non-null    float64
 18  DP      110 non-null    float64
 19  RD      110 non-null    float64
 20  AD      110 non-null    float64
 21  FREQ    110 non-null    float64
 22  PV

In [155]:
dfcov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29903 entries, 0 to 29902
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   REF     29903 non-null  object
 1   POS     29903 non-null  int64 
 2   DP      29903 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 701.0+ KB


In [240]:
%%writefile d3.template
<!DOCTYPE html>
<html>
    <head>
        <title>Car Scatter plot</title>
        <meta http-equiv="Access-Control-Allow-Origin" content="*"/>
        <style>
        body {
            margin: 0;
            overflow: hidden;
        }

        text {
            font-family: sans-serif;
        }

        .tick text {
            font-size: 1em;
            fill: #635F5D;
        }

        .tick line{
            stroke: #C0C0BB;
        }

        .xAxisLabel{
            font-size: 4em;
            fill: #8E8883;
        }

        .yAxisLabel{
            font-size: 4em;
            fill: #8E8883;
        }

        .title{
            font-size: 3.2em;
            fill: #635F5D;
        }
        
        .circle:hover {
            border: 2px solid red;
        }
        
        div.tooltip {
            position: absolute;
            text-align: center;
            width: auto;
            height: 28px;
            padding: 2px;
            font: 12px sans-serif;
            background: lightsteelblue;
            border: 0px;
            border-radius: 8px;
            pointer-events: none;
        }
        </style>
        <script src="https://d3js.org/d3.v5.min.js"></script>
    </head>
    <body>
        <svg width="960" height="500"></svg>
        <script>

        const svg = d3.select('svg');
        const width = svg.attr('width');
        const height = svg.attr('height');

        //data.forEach(element => console.log(element.population));
        //data.forEach(d => d.population = d.population * 1000);
        //console.log(data)
        // Draw a scatter plot of FREQ (y) against POS (x) into the page's <svg>,
        // one circle per record, with a hover tooltip showing both values.
        // `data` is an array of objects exposing POS and FREQ properties.
        const render = data => {
            const title = 'Allele frequency'
            const xValue = d => d.POS;
            const xAxisLabel = 'Position';
            const yValue = d => d.FREQ;
            const yAxisLabel = 'Frequency';
            const circleRadius = 8;
            
            // width/height are read from the svg attributes as strings; the
            // subtractions below coerce them to numbers.
            const margin = {top: 60, right: 40, bottom: 88, left: 150};
            const innerWidth = width - margin.left - margin.right;
            const innerHeight = height - margin.top - margin.bottom;

            // Linear scales spanning the observed min/max of each variable,
            // rounded outwards by nice().
            const xScale = d3.scaleLinear()
                            .domain(d3.extent(data, xValue))
                            .range([0, innerWidth])
                            .nice();
            
            // The y range is inverted because SVG y grows downwards.
            const yScale = d3.scaleLinear()
                            .domain(d3.extent(data, yValue))
                            .range([innerHeight, 0])
                            .nice();

            // Negative tick sizes stretch the tick lines across the plot area,
            // which turns them into grid lines.
            const yAxis = d3.axisLeft(yScale)
                        .tickSize(-innerWidth)
                        .tickPadding(15);

            const xAxis = d3.axisBottom(xScale)
                        .tickSize(-innerHeight)
                        .tickPadding(15);

            // Group that holds the whole plot, shifted by the left/top margins
            const g = svg.append('g')
                        .attr('transform', `translate(${margin.left}, ${margin.top})`);
            
            // Draw both axes and remove their '.domain' paths (the axis base lines)
            const yAxisG = g.append('g')
                        .call(yAxis)

            yAxisG.selectAll('.domain')
                    .remove();

            const xAxisG = g.append('g')
                        .call(xAxis)
                        .attr('transform', `translate(0, ${innerHeight})`);
                    
            xAxisG.select('.domain')
                .remove();
            
            // Define the div for the tooltip (created fully transparent)
            var div = d3.select("body").append("div")
                .attr("class", "tooltip")
                .style("opacity", 0);

            // Add x label
            xAxisG.append('text')
                .attr('class', 'xAxisLabel')
                .attr('y',75)
                .attr('x', innerWidth / 2)
                .attr('fill', 'black')
                .text(xAxisLabel);

            // Add y label (rotated, so x/y are expressed in the rotated frame)
            yAxisG.append('text')
                .attr('class', 'yAxisLabel')
                .attr('y', -93)
                .attr('x', - innerHeight / 2)
                .attr('fill', 'black')
                .attr('transform', `rotate(-90)`)
                .attr('text-anchor', 'middle')
                .text(yAxisLabel);

            // Add title
            g.append('text')
                .attr('class', 'title')
                .attr('y', -10)
                .text(title);
            
            // Create one circle per record: blue when FREQ > 50, salmon otherwise.
            // The stroke starts at width 0 and is only made visible on hover.
            g.selectAll('circle')
            .data(data)
            .enter()
            .append('circle')
            .attr('fill-opacity', 0.5)
            .attr('stroke-opacity', 1)
            .attr('stroke', function(d) {return (yValue(d) > 50 ? "blue" : "salmon"); })
            .attr('stroke-width',0)
            .attr('fill', function(d) {return (yValue(d) > 50 ? "blue" : "salmon"); })
            .attr('cy', d => yScale(yValue(d)))
            .attr('cx', d => xScale(xValue(d)))
            .attr('r', circleRadius)
            // Hover in: outline the circle and show the tooltip next to the pointer.
            // Relies on the d3.event global of the D3 v5 build loaded by this page.
            .on("mouseover", function(d) {
                d3.select(this)
                  .transition()
                  .duration(50)
                  .attr('stroke-width',2)

                div.transition()
                    .duration(100)
                    .style("opacity", .9);
                div.html('POS: ' + xValue(d) + "<br/>"  + 'FREQ: ' + yValue(d))
                    .style("left", (d3.event.pageX + 3) + "px")
                    .style("top", (d3.event.pageY - 30) + "px");
                })
            // Hover out: fade the outline and the tooltip away again.
            .on("mouseout", function(d) {
                 d3.select(this)
                  .transition()
                  .duration(1000)
                  .attr('stroke-width',0)
                
                div.transition()
                    .duration(500)
                    .style("opacity", 0)
            });


        };
        
    // The upper-case token on the next line is a placeholder: display_d3 replaces it
    // (plain string replace over the whole template) with the records to plot.
    // Do not repeat that token anywhere else in this template, comments included.
    const data = DATA;
    render(data)
    </script>
                
    </body>
</html>

Overwriting d3.template


In [241]:
# NOTE(review): in this file the cell sits above the cells that define `data` and
# `display_d3`; on a fresh kernel those cells must be run first (or this cell moved below them).
display_d3(data)

In [193]:
# One {'POS': ..., 'FREQ': ...} dict per row of df; this is the list handed to display_d3.
data = df[['POS','FREQ']].to_dict(orient='records')

In [210]:
# Same record layout for the coverage table (keys 'POS' and 'DP').
# NOTE(review): not passed to display_d3 in the cells visible here.
datacov = dfcov[['POS','DP']].to_dict(orient='records')

In [194]:
def _d3_json_default(value):
    """Convert objects ``json`` cannot encode (e.g. NumPy scalars/arrays) to plain Python."""
    if hasattr(value, 'tolist'):
        return value.tolist()
    return str(value)


def display_d3(data, w=1000, h=500, template='d3.template'):
    """Fill the D3 template with ``data`` and show the resulting page in an IFrame.

    Every occurrence of the literal token ``DATA`` in the template file is
    replaced by ``data`` serialised as a JavaScript literal, the result is
    written to ``<template>.html`` and that file is embedded.

    Parameters
    ----------
    data : list, dict or str
        Records to plot.  Non-string values are serialised with ``json.dumps``
        so that ``None``/``True``/``False``/NaN become valid JavaScript
        (``null``/``true``/``false``/``NaN``) rather than a Python repr.
        A string is inserted verbatim and must already be valid JavaScript.
    w, h : int
        Width and height of the IFrame.
    template : str, default 'd3.template'
        Path of the template file containing the ``DATA`` placeholder.

    Returns
    -------
    IPython.display.IFrame
        IFrame pointing at the generated ``<template>.html`` file.
    """
    if isinstance(data, str):
        payload = data
    else:
        payload = json.dumps(data, default=_d3_json_default)
        # Keep a '</script>' inside a string value from closing the script tag;
        # '<\/' is read back as '</' by the JavaScript parser.
        payload = payload.replace('</', '<\\/')

    with open(template, 'r') as fin:
        page = fin.read().replace('DATA', payload)

    final_html = template + '.html'
    with open(final_html, 'w') as fout:
        fout.write(page)

    return IFrame(final_html, width=w, height=h)

In [96]:
IFrame('makingABarchart_selfContained_.html', width=1000, height=500)

In [99]:
IFrame('CarsScatterPlot_selfContained.html', width=1000, height=500)

In [97]:
IFrame('basicNetwork_SNP_selfContained_K10339_blocks.html', width=1000, height=500)