# Examine the effects of geography and divergence on proportion of variants shared among samples 

June 24, 2020 

We would like to determine whether samples collected in the same geographic area share more variants than expected by chance alone. In addition to the permutation test, I would also like to perform some sort of metric for whether being close together on the tree also predicts having shared variation. There are probably a few different ways to do this: 

1. Compare some sort of raw measure of sequence divergence, like hamming distance (number of differences/length of sequence)
2. Compare the branch length of the path between the 2 sequences. 
3. Compare the tmrca, where more divergent sequences will have older tmrcas.

All 3 of these could be proxies for how close together sequences are on the tree. It would be good to test this out using the Wisconsin-only build as well as the Wisconsin-focused build with other sequences in there for context.

In [371]:
import imp
import importlib, json
import glob
import re,copy,json
import Bio.Phylo
import requests
import pandas as pd 
import numpy as np

import copy
from scipy.special import binom
import datetime as dt
    
import rpy2
%load_ext rpy2.ipython

# for this to work, you will need to download the most recent version of baltic, available here 
bt = imp.load_source('baltic', '/Users/lmoncla/src/baltic/baltic/baltic.py')

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


# Read in metadata

In [372]:
def return_metadata_dict(metadata_file):
    x = {}
    with open(metadata_file, "r") as infile: 
        for line in infile:
            if "Barcode" not in line:   # skip first line
                samplename = line.split("\t")[0]
                strain_name = samplename.replace("hCoV-19/","")
                geo = line.split("\t")[7].title()
                Ct1 = line.split("\t")[21]
                Ct2 = line.split("\t")[22]
                household = line.split("\t")[29]
                
                if geo == "Oregon":
                    geo = "Oregon WI"
                if geo == "Columbus":
                    geo = "Columbus WI"
                if geo == "Verona":
                    geo = "Verona WI"

                x[strain_name] = {"location":geo, "Ct1":Ct1, "Ct2": Ct2, "household":household}
    
    return(x)

## Read in VCF data and output SNVs to query into a dataframe

In [373]:
def return_list_of_vcfs(vcf_directory):
    vcf_list = []
    for f in glob.glob(vcf_directory + "*intersection.csv"):
        vcf_list.append(f)
    return(vcf_list)

In [374]:
def read_in_intersection_snvs(vcf_list, vcf_directory):
    within_host_df = pd.DataFrame()
    
    for v in vcf_list:
        # pull out sampleid
        sampleid = v.replace(vcf_directory,"").replace("intersection.csv","")
        
        d = pd.read_csv(v, sep="\t")
        d['sampleid'] = sampleid
        within_host_df = within_host_df.append(d)
        
    return(within_host_df)

In [375]:
def separate_snvs_from_frameshift(within_host_df):
    snvs_df = within_host_df[(within_host_df['annotation'] == "missense") | (within_host_df['annotation'] == "synonymous") | (within_host_df['annotation'] == "stop")]
    snvs_df['aa_site'] = snvs_df['aa_chage'].str[3:-3]
    snvs_df['wt_aa'] = snvs_df['aa_chage'].str[0:3]
    snvs_df['mut_aa'] = snvs_df['aa_chage'].str[-3:]
    
    # add in columns for nucleotide changes 
    snvs_df['nt_ref'] = snvs_df['nt_change'].str.split(">",expand=True)[0].str[-1:]
    snvs_df['nt_mut'] = snvs_df['nt_change'].str.split(">",expand=True)[1]
    
    indels_df = within_host_df[(within_host_df['annotation'] == "frameshift") | (within_host_df['annotation'] == "frameshift&stop")]
    return(snvs_df, indels_df)

In [376]:
def read_strain_names_from_csv(strain_names_file):
    strain_names_dict = {}
    
    with open(strain_names_file, "r") as infile: 
        for line in infile:
            if "Sample identifier" not in line:
                tube_number = line.split("\t")[1]
                strain_name = line.split("\t")[0].replace("hCoV-19/","")
                
                # there are 2 sets of tube numbers, some with leading 0s and others without. I am pretty sure we
                # want the ones with leading 0s; there are also some we don't want that have non-numeric tube #s
                if tube_number.isdigit() and tube_number.startswith("0"):
                    strain_names_dict[str(int(tube_number))] = strain_name
                    
    return(strain_names_dict)

In [377]:
def convert_number_to_strain(sampleid, strain_names_dict):
    if sampleid in strain_names_dict:
        strain_name = strain_names_dict[sampleid]
    else:
        strain_name = "unknown"
        print(sampleid)
    return(strain_name)

In [378]:
def add_in_strain_column(df, strain_names_dict):
    temp_df = pd.DataFrame(df)
    strain_name = temp_df['sampleid'].apply(convert_number_to_strain, args=[strain_names_dict])
    #strain_name = temp_df['sampleid'].apply(lambda x: "USA/" + tube_number_conversion[x] + "/2020")
    temp_df["strain_name"] = strain_name
    return(temp_df)

In [379]:
def format_indels(row):
    
    if "dup" in row['nt_change']:
        split_char = "dup"
        variant = row['nt_change'].split(split_char)[1]
        new_value = "-" + str(int(row["POS"])) + variant
    
    elif "del" in row['nt_change']:
        split_char = "del"
        variant = row['nt_change'].split(split_char)[1]
        new_value = str(int(row['POS'])) +  variant + "-"
    
    return(new_value)

In [380]:
def return_indel_type(row):
    
    if "dup" in row['nt_change']:
        type_change = "insertion"
     
    elif "del" in row['nt_change']:
        type_change = "deletion"
    
    return(type_change)

In [381]:
def compute_shared_variant_proportion(sample1,sample2,df):
    shared_variants = 0
    
    s1_df = df[df['strain_name'] == sample1]
    variants_in_s1 = set(s1_df['nuc_muts'].tolist())
    
    s2_df = df[df['strain_name'] == sample2]
    variants_in_s2 = set(s2_df['nuc_muts'].tolist())
    
    total_variants = len(variants_in_s1) + len(variants_in_s2)
    
    for v in variants_in_s1:
        if v in variants_in_s2:
            shared_variants += 2
            
    proportion_shared = float(shared_variants/total_variants)
            
    return(proportion_shared)

## Read in vcfs and convert to dataframes 

I will only read in the intersection SNVs, meaning the ones that were detected in both technical sequencing replicates. This code will separate this into 2 dataframes, 1 for SNVs and 1 for indels, and will also look up and add in the strain names (necessary for converting from tube numbers, which is how the csvs are labelled).

In [484]:
strain_names_file = "/Users/lmoncla/src/ncov-WI-within-host/data/spreadsheet-with-strain-names.tsv"
strain_names_dict = read_strain_names_from_csv(strain_names_file)

In [485]:
vcf_directory = "/Users/lmoncla/src/ncov-WI-within-host/data/VCFs/"
vcfs = return_list_of_vcfs(vcf_directory)
all_intersection_variants = read_in_intersection_snvs(vcfs, vcf_directory)
all_intersection_variants = add_in_strain_column(all_intersection_variants, strain_names_dict)
snvs_only, indels_only = separate_snvs_from_frameshift(all_intersection_variants)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [486]:
# add in a column for the nucleotide mutation so that it is in the same format as the annotation on nextstrain
snvs_only['nuc_muts'] = snvs_only['nt_ref'] + snvs_only["POS"].astype(int).astype(str) + snvs_only['nt_mut']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [487]:
# add a column with formatted indel that matches nextstrain
indels_only['nuc_muts'] = indels_only.apply(format_indels, axis=1)
indels_only['type_mut'] = indels_only.apply(return_indel_type, axis=1)
indels_only.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0.1,Unnamed: 0,SNP,POS,REF,annotation,gene_x,nt_change,aa_chage,rep1_percent,rep2_percent,%,sampleid,strain_name,nuc_muts,type_mut
0,1,ORF1ab_875dupA_Leu293fs_frameshift,1135.0,G,frameshift,ORF1ab,875dupA,Leu293fs,2.23,1.96,2.095,117,USA/WI-UW-110/2020,-1135A,insertion
3,4,ORF1ab_6435dupT_Leu2146fs_frameshift,6696.0,C,frameshift,ORF1ab,6435dupT,Leu2146fs,6.63,7.57,7.1,117,USA/WI-UW-110/2020,-6696T,insertion
5,8,ORF1ab_10817dupT_Leu3606fs_frameshift,11074.0,C,frameshift,ORF1ab,10817dupT,Leu3606fs,2.67,3.02,2.845,117,USA/WI-UW-110/2020,-11074T,insertion
12,17,ORF1ab_15705dupT_Val5236fs_frameshift,15965.0,G,frameshift,ORF1ab,15705dupT,Val5236fs,6.31,5.98,6.145,117,USA/WI-UW-110/2020,-15965T,insertion
13,18,ORF1ab_18109dupT_Ser6037fs_frameshift,18368.0,G,frameshift,ORF1ab,18109dupT,Ser6037fs,5.92,4.22,5.07,117,USA/WI-UW-110/2020,-18368T,insertion


In [488]:
# Split into only low frequency, here defined as <50%; I don't really want to query the fixed variants here 
low_freq_snvs_only = snvs_only[snvs_only["%"] < 50]
low_freq_indels_only = indels_only[indels_only["%"] < 50]

In [489]:
snvs_to_query = set(low_freq_snvs_only['nuc_muts'])
indels_to_query = set(low_freq_indels_only['nuc_muts'])
all_variants_to_query = snvs_to_query.copy()
all_variants_to_query.update(indels_to_query)
print(len(snvs_to_query))
print(len(indels_to_query))
print(len(all_variants_to_query))

59
31
90


## Code for parsing through tree

In [388]:
"""This is a small, recursive function to return the TMRCA for 2 tips. Starting with the parental node of tip1,
go recursively backwards in the tree until you find an internal node whose children contains both tip1 and 
tip2. Return that node."""

def return_TMRCA_node(input_node,tip1,tip2):
    
    # for a given internal node, generate a list of all its children, i.e., tips descending from that node
    node = input_node
    children = list(node.children)   # .children will output all of the direct descendants as baltic objects
    leaves = list(node.leaves)       # .leaves will output the names of all tips descending from the node
    
    if tip2 in leaves and tip1 in leaves: 
        node_to_return = node
    else:
        node_to_return = return_TMRCA_node(node.parent,tip1,tip2)
            
    return(node_to_return)

In [389]:
"""given 2 tips and a tree, iterate through the tree. when we reach tip 1, run return_TMRCA_node, to find the 
internal node that is the TMRCA for tips 1 and 2. Extract its date and return the node object and date"""

def return_TMRCA(tip1,tip2,tree):
    for k in tree.Objects: 
        if k.branchType == "leaf" and k.name == tip1:
            tmrca_node = return_TMRCA_node(k.parent,tip1,tip2)
            date = tmrca_node.traits['node_attrs']['num_date']['value']  # output the mean inferred date
            
    return(tmrca_node, date)

In [390]:
"""Given a starting internal node, and a tip you would like to end at, traverse the full path from that node to
tip. Along the way, gather nucleotide mutations that occur along that path. Once you have reached the ending 
tip, return the list of mutations that fell along that path"""

def return_divergence_on_path_to_tip(starting_node, ending_tip):
    
    children = starting_node.children
    
    for child in children:
        
        """if the child is a leaf: if leaf is the target end tip, collect its divergence and return; 
        if leaf is not the target end tip, move on"""
        """if the child is an internal node: first, test whether that child node contains the target tips in its 
        children. child.leaves will output a list of the names of all tips descending from that node. If not, pass. 
        if the node does contain the target end tip in its leaves, keep traversing down that node recursively"""

        if child.branchType == "leaf":
            if child.name != ending_tip:
                pass
            elif child.name == ending_tip:
                child_divergence = child.traits['node_attrs']['div']
                return(child_divergence)
         
        elif child.branchType == "node":
            if ending_tip not in child.leaves:
                pass
            else:
                child_divergence = return_divergence_on_path_to_tip(child, ending_tip)
    
    return(child_divergence)

In [391]:
def return_clade(tipname, tree):
    for k in tree.Objects:
        if k.branchType == "leaf" and k.name == tipname:
            clade = k.traits['node_attrs']['clade_membership']['value']
    return(clade)

In [392]:
def return_all_Wisconsin_tips(tree):
    Wisconsin_leaves = []
    
    for k in tree.Objects: 
        if k.branchType == "leaf":
            division = k.traits['node_attrs']['division']['value']
            if division == "Wisconsin":
                Wisconsin_leaves.append(k.name)
                
    return(Wisconsin_leaves)

In [393]:
def compare_Cts(tip1,tip2,metadata):
        
    mean1 = (float(metadata[tip1]['Ct1']))
    mean2 = (float(metadata[tip2]['Ct1']))
    difference = abs(mean1-mean2)
    return(difference)

In [394]:
def compare_location(tip1,tip2,metadata):
    geo1 = metadata[tip1]["location"]
    geo2 = metadata[tip2]["location"]
    
    if geo1 == geo2: 
        location = 1
    else: 
        location = 0
        
    return(location,geo1,geo2)

In [395]:
def return_lat_longs_dictionary(lat_longs_file):
    
    output_dict = {}
    
    with open(lat_longs_file, "r") as infile: 
        for line in infile:
            if len(line.split("\t")) == 4:
                location = line.split("\t")[1]
                longitude = line.split("\t")[2]
                latitude = line.split("\t")[3].strip()
                output_dict[location] = {"latitude":latitude, "longitude":longitude}
    return(output_dict)

In [396]:
"""I took this from here, https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
A decent overview of this formula can be found here: https://www.movable-type.co.uk/scripts/latlong.html"""
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees). 
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [481]:
def return_distance_between_locations(tip1,tip2,metadata,lat_longs):
    geo1 = metadata[tip1]["location"]
    geo2 = metadata[tip2]["location"]
    
    lat1 = lat_longs_dict[geo1]['latitude']
    lat2 = lat_longs_dict[geo2]['latitude']
    
    long1 = lat_longs_dict[geo1]['longitude']
    long2 = lat_longs_dict[geo2]['longitude']
    
    distance_km = haversine(float(long1), float(lat1), float(long2), float(lat2))
    return(distance_km)

# run!

In [344]:
# test this out first on the Wisconsin-only build json
#WI_tree_path = "/Users/lmoncla/src/ncov-WI-within-host/data/Wisconsin.json"
WI_tree_path = "/Volumes/gradschool-and-postdoc-backups/post-doc/stored_files_too_big_for_laptop/ncov-build-forced-WI/ncov/auspice/ncov_usa_wisconsin.json"

analysis_level = "division"

# with open(WI_tree_path) as json_file:
#     WI_tree_json = json.load(json_file)
# WI_tree_object=WI_tree_json['tree']
# WI_meta=WI_tree_json['meta']
# json_translation={'absoluteTime':lambda k: k.traits['node_attrs']['num_date']['value'],'name':'name'} ## allows baltic to find correct attributes in JSON, height and name are required at a minimum
# json_meta={'file':WI_meta,'traitName':analysis_level} ## if you want auspice stylings you can import the meta file used on nextstrain.org

# WI_tree=bt.loadJSON(WI_tree_object,json_translation,json_meta)

In [490]:
transmission_pairs = [["USA/WI-UW-65/2020","USA/WI-UW-32/2020"],["USA/WI-UW-41/2020","USA/WI-UW-48/2020"],
                      ["USA/WI-UW-74/2020","USA/WI-UW-29/2020"],["USA/WI-UW-120/2020","USA/WI-UW-119/2020"]]

In [491]:
Wisconsin_tips_in_tree = return_all_Wisconsin_tips(WI_tree)
print(len(Wisconsin_tips_in_tree))

369


In [492]:
tips_to_query = set(all_intersection_variants['strain_name'].tolist())

for t in tips_to_query:
    if t not in Wisconsin_tips_in_tree:
        print(t)
        
print(len(tips_to_query))

USA/WI-UW-119/2020
19


In [493]:
tips_to_query.remove("USA/WI-UW-119/2020")

In [494]:
# read in metadata and latitude and longitude files
metadata_input_file = "/Users/lmoncla/src/ncov-WI-within-host/data/spreadsheet-with-strain-names.tsv"
metadata_dict = return_metadata_dict(metadata_input_file)
lat_longs_dict = return_lat_longs_dictionary("/Users/lmoncla/src/ncov/config/lat_longs.tsv")
wh_df_to_use = low_freq_snvs_only

In [495]:
df = pd.DataFrame()

combos = []
for t in tips_to_query: 
    tip1 = t
    
    for a in tips_to_query:
        tip2 = a
        combo = set([tip1,tip2])
        
        if tip1 != tip2 and combo not in combos:   # to prevent doing the pairwise comparisons twice
            
            # output Cts
            Ct_diff = compare_Cts(tip1,tip2,metadata_dict)
            
            # are the locations the same? 0 means no, 1 means yes
            location,loc1,loc2 = compare_location(tip1,tip2,metadata_dict)
            
            # output great circle distance between locations
            great_circle_distance = return_distance_between_locations(tip1,tip2,metadata_dict,lat_longs_dict)

            # output their clades
            tip1_clade = return_clade(tip1, tree)
            tip2_clade = return_clade(tip2, tree)
            if tip1_clade == tip2_clade:
                clades_same = 0
            else:
                clades_same = 1

            # output the tmrca and divergence
            parental_node,tmrca_date = return_TMRCA(tip1,tip2,tree)
            parent_divergence = parental_node.traits['node_attrs']['div']

            tip1_divergence = return_divergence_on_path_to_tip(parental_node, tip1)
            tip2_divergence = return_divergence_on_path_to_tip(parental_node, tip2)

            node_to_tip1 = tip1_divergence - parent_divergence
            node_to_tip2 = tip2_divergence - parent_divergence
            total_divergence = node_to_tip1 + node_to_tip2

            # calculate the proportion of variants shared
            shared_proportion_snvs = compute_shared_variant_proportion(tip1,tip2,wh_df_to_use)

            d = pd.DataFrame.from_dict({"tip1":[tip1],"tip2":[tip2],"tmrca":[tmrca_date],"clades_same":[clades_same],
                                        "divergence":[total_divergence],"prop_snvs_shared":[shared_proportion_snvs],
                                       "Ct_diff":[Ct_diff], "location_same":[location],"location1":[loc1],"location2":[loc2],
                                       "great_circle_distance_km":[great_circle_distance]})

            df = df.append(d)
            combos.append(combo)

In [496]:
df.head()
len(df)

153

In [471]:
# write to csv so I can use it in R 
df.to_csv("WI-variants-vs-geo-2020-06-29.csv")

## First, evaluate divergence 

I will first model shared diversity as a function of divergence. I will then model it as a combination of divergence, Ct differences and having the same clade

In [503]:
# evaluate the proportion of variants shared as a function of divergence
%R -i df
%R model.div = glm(prop_snvs_shared~divergence,data=df,family = gaussian(link="identity"),na.action(na.omit))
%R print(summary(model.div))


Call:
glm(formula = prop_snvs_shared ~ divergence, family = gaussian(link = "identity"), 
    data = df, weights = na.action(na.omit))

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.18871  -0.08327  -0.02562   0.08551   0.27621  

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.588810   0.016950  34.738   <2e-16 ***
divergence  -0.003827   0.001697  -2.256   0.0255 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.01230708)

    Null deviance: 1.9210  on 152  degrees of freedom
Residual deviance: 1.8584  on 151  degrees of freedom
AIC: -234.65

Number of Fisher Scoring iterations: 2



In [504]:
# evaluate the proportion of variants shared as a function of great circle distance
%R -i df
%R model.div2 = glm(prop_snvs_shared~great_circle_distance_km,data=df,family = gaussian(link="identity"),na.action(na.omit))
%R print(summary(model.div2))
%R print(anova(model.div2, test="Chisq"))


Call:
glm(formula = prop_snvs_shared ~ great_circle_distance_km, family = gaussian(link = "identity"), 
    data = df, weights = na.action(na.omit))

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.18069  -0.08377  -0.01599   0.07603   0.28099  

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)               0.5814392  0.0144260   40.30   <2e-16 ***
great_circle_distance_km -0.0011886  0.0005355   -2.22   0.0279 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.01231979)

    Null deviance: 1.9210  on 152  degrees of freedom
Residual deviance: 1.8603  on 151  degrees of freedom
AIC: -234.49

Number of Fisher Scoring iterations: 2



Analysis of Deviance Table

Model: gaussian, link: identity

Response: prop_snvs_shared

Terms added sequentially (first to last)


                         Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
NULL                                       152     1.9210           
great_circle_distance_km  1 0.060703       151     1.8603  0.02644 *
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


In [505]:
# evaluate the proportion of variants shared as a function of Ct difference
%R -i df
%R model.div2 = glm(prop_snvs_shared~Ct_diff,data=df,family = gaussian(link="identity"),na.action(na.omit))
%R print(summary(model.div2))
%R print(anova(model.div2, test="Chisq"))


Call:
glm(formula = prop_snvs_shared ~ Ct_diff, family = gaussian(link = "identity"), 
    data = df, weights = na.action(na.omit))

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.18003  -0.08808  -0.01884   0.07686   0.29479  

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.536954   0.016301  32.939   <2e-16 ***
Ct_diff     0.004464   0.003117   1.432    0.154    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.01255129)

    Null deviance: 1.9210  on 152  degrees of freedom
Residual deviance: 1.8952  on 151  degrees of freedom
AIC: -231.64

Number of Fisher Scoring iterations: 2



Analysis of Deviance Table

Model: gaussian, link: identity

Response: prop_snvs_shared

Terms added sequentially (first to last)


        Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL                      152     1.9210         
Ct_diff  1 0.025747       151     1.8952   0.1521


In [506]:
# lastly, try with clade same
%R -i df
%R model.div2 = glm(prop_snvs_shared~clades_same,data=df,family = gaussian(link="identity"),na.action(na.omit))
%R print(summary(model.div2))
%R print(anova(model.div2, test="Chisq"))


Call:
glm(formula = prop_snvs_shared ~ clades_same, family = gaussian(link = "identity"), 
    data = df, weights = na.action(na.omit))

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.18762  -0.09554  -0.01675   0.07492   0.28325  

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.56857    0.01559  36.462   <2e-16 ***
clades_same -0.01849    0.01919  -0.963    0.337    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.01264407)

    Null deviance: 1.9210  on 152  degrees of freedom
Residual deviance: 1.9093  on 151  degrees of freedom
AIC: -230.51

Number of Fisher Scoring iterations: 2



Analysis of Deviance Table

Model: gaussian, link: identity

Response: prop_snvs_shared

Terms added sequentially (first to last)


            Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL                          152     1.9210         
clades_same  1 0.011737       151     1.9092   0.3353


## now try all together

In [507]:
# evaluate the proportion of variants shared as a linear combination of divergence and whether the clade is the same
%R -i df
%R model.marsh = glm(prop_snvs_shared~divergence+clades_same+Ct_diff+great_circle_distance_km,data=df,family = gaussian(link="identity"),na.action(na.omit))
%R print(summary(model.marsh))
%R print(coef(model.marsh))
%R print(anova(model.marsh, test="Chisq"))


Call:
glm(formula = prop_snvs_shared ~ divergence + clades_same + Ct_diff + 
    great_circle_distance_km, family = gaussian(link = "identity"), 
    data = df, weights = na.action(na.omit))

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.20479  -0.07855  -0.01655   0.08104   0.28272  

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)               0.5762230  0.0243430  23.671   <2e-16 ***
divergence               -0.0028635  0.0021833  -1.312    0.192    
clades_same               0.0093013  0.0225840   0.412    0.681    
Ct_diff                   0.0033767  0.0031198   1.082    0.281    
great_circle_distance_km -0.0007778  0.0006103  -1.274    0.204    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.01230299)

    Null deviance: 1.9210  on 152  degrees of freedom
Residual deviance: 1.8208  on 148  degrees of freedom
AIC: -231.77

Numb

             (Intercept)               divergence              clades_same 
            0.5762230400            -0.0028634616             0.0093012867 
                 Ct_diff great_circle_distance_km 
            0.0033767155            -0.0007777859 


Analysis of Deviance Table

Model: gaussian, link: identity

Response: prop_snvs_shared

Terms added sequentially (first to last)


                         Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
NULL                                       152     1.9210           
divergence                1 0.062623       151     1.8584  0.02406 *
clades_same               1 0.001026       150     1.8573  0.77276  
Ct_diff                   1 0.016516       149     1.8408  0.24660  
great_circle_distance_km  1 0.019983       148     1.8208  0.20250  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


In [508]:
# evaluate the proportion of variants shared as a linear combination of divergence and whether the clade is the same
%R -i df
%R model.marsh = glm(prop_snvs_shared~divergence+clades_same+Ct_diff,data=df,family = gaussian(link="identity"),na.action(na.omit))
%R print(summary(model.marsh))
%R print(coef(model.marsh))
%R print(anova(model.marsh, test="Chisq"))


Call:
glm(formula = prop_snvs_shared ~ divergence + clades_same + Ct_diff, 
    family = gaussian(link = "identity"), data = df, weights = na.action(na.omit))

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.19530  -0.08062  -0.02077   0.08819   0.28793  

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.569131   0.023748  23.965   <2e-16 ***
divergence  -0.003896   0.002032  -1.917   0.0571 .  
clades_same  0.006918   0.022554   0.307   0.7595    
Ct_diff      0.003609   0.003121   1.156   0.2494    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.01235454)

    Null deviance: 1.9210  on 152  degrees of freedom
Residual deviance: 1.8408  on 149  degrees of freedom
AIC: -232.1

Number of Fisher Scoring iterations: 2



 (Intercept)   divergence  clades_same      Ct_diff 
 0.569130801 -0.003895600  0.006918280  0.003608546 


Analysis of Deviance Table

Model: gaussian, link: identity

Response: prop_snvs_shared

Terms added sequentially (first to last)


            Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
NULL                          152     1.9210           
divergence   1 0.062623       151     1.8584  0.02436 *
clades_same  1 0.001026       150     1.8573  0.77322  
Ct_diff      1 0.016516       149     1.8408  0.24759  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


In [509]:
# evaluate the proportion of variants shared as a linear combination of divergence and whether the clade is the same
%R -i df
%R model.marsh = glm(prop_snvs_shared~great_circle_distance_km+clades_same+Ct_diff,data=df,family = gaussian(link="identity"),na.action(na.omit))
%R print(summary(model.marsh))
%R print(coef(model.marsh))
%R print(anova(model.marsh, test="Chisq"))


Call:
glm(formula = prop_snvs_shared ~ great_circle_distance_km + clades_same + 
    Ct_diff, family = gaussian(link = "identity"), data = df, 
    weights = na.action(na.omit))

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.19447  -0.08212  -0.01615   0.07871   0.27799  

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)               0.5659632  0.0231075  24.493   <2e-16 ***
great_circle_distance_km -0.0010747  0.0005681  -1.892   0.0605 .  
clades_same              -0.0044782  0.0200395  -0.223   0.8235    
Ct_diff                   0.0036859  0.0031183   1.182   0.2391    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.01236245)

    Null deviance: 1.921  on 152  degrees of freedom
Residual deviance: 1.842  on 149  degrees of freedom
AIC: -232

Number of Fisher Scoring iterations: 2



             (Intercept) great_circle_distance_km              clades_same 
             0.565963186             -0.001074677             -0.004478218 
                 Ct_diff 
             0.003685932 


Analysis of Deviance Table

Model: gaussian, link: identity

Response: prop_snvs_shared

Terms added sequentially (first to last)


                         Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
NULL                                       152     1.9210           
great_circle_distance_km  1 0.060703       151     1.8603   0.0267 *
clades_same               1 0.001011       150     1.8593   0.7749  
Ct_diff                   1 0.017272       149     1.8420   0.2372  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
