In [28]:
import pandas as pd
import os
from haversine import haversine, Unit
from skbio import stats

# Mantel Test

```results/mantel/locations.tsv``` has all of the exome samples (post filtration) and their locations, in tsv format, as:

```sample\tlat\tlong```

## get genetic distance

in bash run:
    
```
#https://github.com/BGI-shenzhen/VCF2Dis
```
which generates the output pairwise genetic distance matrix

```
smv7_ex_autosomes_exomes_renamed_genetic_distance_matrix.tsv
```

The names of the columns and header are integers; the key to convert back to sample id is here:
```
sample_name_key.list
```

## get geographic distance (in Km)

In [23]:
# Work from the mantel results directory: every file in this cell is opened by
# a path relative to it.
os.chdir("/master/nplatt/sch_man_nwinvasion/results/mantel")

# Read the pairwise genetic distance matrix. The file is parsed without a
# header row, and its first column is used as the (integer) row labels.
gen_df = pd.read_table(
    "smv7_ex_autosomes_exomes_renamed_genetic_distance_matrix.tsv",
    sep="\t",
    header=None,
    index_col=0,
)

# Build the lookup {integer label -> sample id} from the key file
# (two whitespace-separated fields per line).
header = {}
with open("sample_name_key.list") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line); they would otherwise
        # fail to unpack into two fields and abort the cell.
        if not line.strip():
            continue
        (re_id, sample_id) = line.split()
        header[int(re_id)] = sample_id

# Translate each integer row label to its sample id. The same names are then
# applied to the columns, which assumes the columns follow the row order.
index_names = [header[re_id] for re_id in gen_df.index]

gen_df.index = index_names
gen_df.columns = index_names

# Re-save the relabelled matrix.
gen_df.to_csv("gen_dist.csv", sep=",")

In [48]:
# Read the sample locations: one sample per line with three
# whitespace-separated fields (sample id, latitude, longitude).
locations = {}
with open("locations.tsv") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line); they would otherwise
        # fail to unpack into three fields and abort the cell.
        if not line.strip():
            continue
        (sample_id, lat, lon) = line.split()
        locations[sample_id] = [float(lat), float(lon)]

# Pairwise haversine distance between every pair of samples, in kilometres,
# iterating in gen_df's sample order so both matrices share the same labels.
km_rows = [
    [
        haversine(locations[x], locations[y], unit=Unit.KILOMETERS)
        for y in gen_df.index
    ]
    for x in gen_df.index
]

km_df = pd.DataFrame(km_rows, columns=gen_df.index, index=gen_df.index)

# Save to csv.
km_df.to_csv("km_dist.csv", sep=",")

In [50]:
# Reference only (nothing is executed in this cell): call signature of the
# Mantel test function that the cells below invoke via `stats.distance.mantel`.
# skbio.stats.distance.mantel(x, y, 
#                             method='pearson', 
#                             permutations=999, 
#                             alternative='two-sided', 
#                             strict=True, 
#                             lookup=None)


## Mantel Tests

In [56]:
# Mantel test on all exome samples: genetic distance matrix against the
# geographic (km) distance matrix, using 1000 permutations. The call is the
# cell's last expression, so its result is shown as the cell output.
stats.distance.mantel(
    x=gen_df,
    y=km_df,
    permutations=1000,
)

(0.6432888597268949, 0.000999000999000999, 135)

In [68]:
# Mantel test within Africa: repeat the test after removing the Brazilian
# samples from both distance matrices.

# Column labels of gen_df that match the Brazilian-sample pattern.
# NOTE(review): the '.' in the regex matches any character, not only a literal
# dot, and the pattern is unanchored -- confirm this selects only the intended
# samples.
brazil_samples = gen_df.filter(regex='Sm.BR').columns.tolist()

# Drop those samples from the rows and the columns of each matrix.
gen_no_brazil_df = gen_df.drop(brazil_samples, axis=0).drop(brazil_samples, axis=1)
km_no_brazil_df = km_df.drop(brazil_samples, axis=0).drop(brazil_samples, axis=1)

# Last expression of the cell: the result is shown as the cell output.
stats.distance.mantel(
    x=gen_no_brazil_df,
    y=km_no_brazil_df,
    permutations=1000,
)


(0.7664780051722738, 0.000999000999000999, 90)