In [58]:
import malariagen_data
import pandas as pd
import numpy as np
import allel

In [15]:
# Handle on the Ag3 release bucket. The "simplecache::" URL prefix together
# with the `simplecache` options presumably keeps a local copy of fetched data
# in the "gcs_cache" directory -- TODO confirm against the fsspec docs.
ag3 = malariagen_data.Ag3(
    "simplecache::gs://vo_agam_release/",
    simplecache={"cache_storage": "gcs_cache"},
)

In [206]:
# sample metadata table for the release opened above; later cells add one
# column per resistance marker to this frame
meta_df = ag3.sample_metadata()

In [207]:
# Normalise the country label: every cell equal to 'Gambia, The' (in any
# column) becomes 'The Gambia'. Done in place so meta_df stays the same object.
meta_df.replace(to_replace='Gambia, The', value='The Gambia', inplace=True)

In [208]:
# samples per species call, keeping missing values as their own row
meta_df['species'].value_counts(dropna=False)

gambiae                            1571
coluzzii                            675
arabiensis                          368
intermediate_gambiae_coluzzii       169
intermediate_arabiensis_gambiae       1
Name: species, dtype: int64

## kdr
Add new columns to the metadata that are True for an individual carrying 995F/S (either heterozygous or homozygous) and False for reference.

In [209]:
# kdr 995S: flag samples from their genotype call at 2L:2422651
chrom, pos = '2L', 2422651
posk, refk, altk = ag3.snp_sites(chrom)
gtk = ag3.snp_genotypes(chrom)
# wrap the computed positions in a SortedIndex so the coordinate can be looked up
posk = allel.SortedIndex(posk.compute())
loc = posk.locate_key(pos)
# ref/alt at this site are loaded for inspection only; nothing in this cell reads them
ref, alt = refk[loc].compute(), altk[loc].compute()
# presumably (n_samples, ploidy) allele codes for this single site -- TODO confirm
gt = gtk[loc].compute()
# True where a sample's allele codes sum to more than zero.
# NOTE(review): any positive allele code counts, so this does not check which
# ALT allele is carried -- confirm against `alt` that this is the intended call.
# NOTE(review): values are assigned positionally; assumes meta_df rows are in
# the same sample order as the genotype array -- TODO confirm.
meta_df['kdr_S'] = gt.sum(axis=1) > 0

In [210]:
# kdr 995F: flag samples from their genotype call at 2L:2422652
# (the result is written to the kdr_F column)
chrom, pos = '2L', 2422652
posk, refk, altk = ag3.snp_sites(chrom)
gtk = ag3.snp_genotypes(chrom)
# wrap the computed positions in a SortedIndex so the coordinate can be looked up
posk = allel.SortedIndex(posk.compute())
loc = posk.locate_key(pos)
# ref/alt at this site are loaded for inspection only; nothing in this cell reads them
ref, alt = refk[loc].compute(), altk[loc].compute()
# presumably (n_samples, ploidy) allele codes for this single site -- TODO confirm
gt = gtk[loc].compute()
# True where a sample's allele codes sum to more than zero.
# NOTE(review): any positive allele code counts, so this does not check which
# ALT allele is carried -- confirm against `alt` that this is the intended call.
# NOTE(review): values are assigned positionally; assumes meta_df rows are in
# the same sample order as the genotype array -- TODO confirm.
meta_df['kdr_F'] = gt.sum(axis=1) > 0

In [212]:
# any kdr: True when the sample is flagged in kdr_S or in kdr_F.
# Both inputs are already boolean columns, so the element-wise OR is the final
# answer. The former trailing `> 0` was applied to the OR result (`|` binds
# tighter than `>`), so it changed nothing and only obscured the intent.
meta_df['any_kdr'] = meta_df['kdr_S'] | meta_df['kdr_F']

In [214]:
# kdr 1570: flag samples from their genotype call at 2L:2429745
chrom, pos = '2L', 2429745
posk, refk, altk = ag3.snp_sites(chrom)
gtk = ag3.snp_genotypes(chrom)
# wrap the computed positions in a SortedIndex so the coordinate can be looked up
posk = allel.SortedIndex(posk.compute())
loc = posk.locate_key(pos)
# ref/alt at this site are loaded for inspection only; nothing in this cell reads them
ref, alt = refk[loc].compute(), altk[loc].compute()
# presumably (n_samples, ploidy) allele codes for this single site -- TODO confirm
gt = gtk[loc].compute()
# True where a sample's allele codes sum to more than zero.
# NOTE(review): any positive allele code counts, so this does not check which
# ALT allele is carried -- confirm against `alt` that this is the intended call.
# NOTE(review): values are assigned positionally; assumes meta_df rows are in
# the same sample order as the genotype array -- TODO confirm.
meta_df['kdr_1570'] = gt.sum(axis=1) > 0

In [215]:
# ace1 G119S at 2R:3492074
# (earlier form of this lookup: ace1_G119S = count_alleles('2R', 3492074, 1))
chrom, pos = '2R', 3492074
posk, refk, altk = ag3.snp_sites(chrom)
gtk = ag3.snp_genotypes(chrom)
# wrap the computed positions in a SortedIndex so the coordinate can be looked up
posk = allel.SortedIndex(posk.compute())
loc = posk.locate_key(pos)
# ref/alt at this site are loaded for inspection only; nothing in this cell reads them
ref, alt = refk[loc].compute(), altk[loc].compute()
# presumably (n_samples, ploidy) allele codes for this single site -- TODO confirm
gt = gtk[loc].compute()
# True where a sample's allele codes sum to more than zero.
# NOTE(review): any positive allele code counts, so this does not check which
# ALT allele is carried -- confirm against `alt` that this is the intended call.
# NOTE(review): values are assigned positionally; assumes meta_df rows are in
# the same sample order as the genotype array -- TODO confirm.
meta_df['ace_119s'] = gt.sum(axis=1) > 0


In [216]:
# gste2 I114T at 3R:28598166
# (earlier form of this lookup: gste2_I114T = count_alleles('3R', 28598166, 1))
chrom, pos = '3R', 28598166
posk, refk, altk = ag3.snp_sites(chrom)
gtk = ag3.snp_genotypes(chrom)
# wrap the computed positions in a SortedIndex so the coordinate can be looked up
posk = allel.SortedIndex(posk.compute())
loc = posk.locate_key(pos)
# ref/alt at this site are loaded for inspection only; nothing in this cell reads them
ref, alt = refk[loc].compute(), altk[loc].compute()
# presumably (n_samples, ploidy) allele codes for this single site -- TODO confirm
gt = gtk[loc].compute()
# True where a sample's allele codes sum to more than zero.
# NOTE(review): any positive allele code counts, so this does not check which
# ALT allele is carried -- confirm against `alt` that this is the intended call.
# NOTE(review): values are assigned positionally; assumes meta_df rows are in
# the same sample order as the genotype array -- TODO confirm.
meta_df['gst_114t'] = gt.sum(axis=1) > 0

## cnvs
- add columns to the metadata with the modal CNV per cyp450, plus a True/False column for presence/absence of any CNV
- cyp6aa/p (2R), cyp6m (3R), cyp6z (3R), cyp9k1 (X)
- it might also make sense to add some other columns here for the gste2 cluster and ace1 genes (for the OP map)

In [247]:
# CNV calls are taken from Eric's precomputed HMM gene copy-number table for
# convenience; the first CSV column is used as the row index
cnv_df = pd.read_csv("HMM_gene_copy_number.csv", index_col=0)

In [248]:
# preview the CNV table (one row per sample_id, one column per gene)
cnv_df

Unnamed: 0_level_0,sex,high.var,Ace1,Cyp6aa1,Cyp6aa2,Cyp6p1,Cyp6p2,Cyp6p3,Cyp6p4,Cyp6p5,...,Gste1,Gste2,Gste3,Gste4,Gste5,Gste6,Gste7,Gstu4,Max_Gstue,Cyp9k1
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AB0333-C,F,False,0,0,0,0,0,0,0,0,...,2,2,0,0,0,0,0,0,2,0
AB0457-C,M,False,0,0,0,0,0,0,0,0,...,2,2,0,0,0,0,0,0,2,0
AB0502-C,M,False,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC0007-C,F,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC0008-C,F,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VBS02119-4431STDY6772831,F,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
VBS02121-4431STDY6772832,F,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
VBS02127-4431STDY6772833,F,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
VBS02129-4431STDY6772835,F,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [249]:
# boolean column: True when the values in the three cyp columns listed below
# sum to more than zero for that sample
cyp_list = ['Max_Cyp6aap', 'Max_Cyp6mz', 'Cyp9k1']
cnv_df['any_cyp'] = cnv_df.loc[:, cyp_list].sum(axis=1).gt(0)

In [250]:
# first five rows, to check the new any_cyp column
cnv_df.head(n=5)

Unnamed: 0_level_0,sex,high.var,Ace1,Cyp6aa1,Cyp6aa2,Cyp6p1,Cyp6p2,Cyp6p3,Cyp6p4,Cyp6p5,...,Gste2,Gste3,Gste4,Gste5,Gste6,Gste7,Gstu4,Max_Gstue,Cyp9k1,any_cyp
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AB0333-C,F,False,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,2,0,False
AB0457-C,M,False,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,2,0,False
AB0502-C,M,False,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
AC0007-C,F,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
AC0008-C,F,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False


## merge

In [227]:
# Index the metadata by sample_id so rows can be addressed by sample id.
# This prepares for dropping 'AC0198-C' from meta_df in the next cell, as
# there is no cnv data for that hybrid sample.
# NOTE(review): in place and not re-runnable -- a second run fails because the
# 'sample_id' column has already been moved into the index.
meta_df.set_index(keys='sample_id', inplace=True)

In [229]:
# remove the row labelled 'AC0198-C' (no cnv data for that hybrid sample);
# relies on meta_df being indexed by sample_id
meta_df.drop(index='AC0198-C', inplace=True)

In [251]:
# Join metadata and CNV calls on sample_id. With the default how='inner',
# only sample ids present in both tables end up in merged_df.
merged_df = meta_df.merge(cnv_df, on='sample_id')

In [252]:
# preview the merged metadata + CNV table
merged_df

Unnamed: 0_level_0,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call,sample_set,...,Gste2,Gste3,Gste4,Gste5,Gste6,Gste7,Gstu4,Max_Gstue,Cyp9k1,any_cyp
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AR0047-C,LUA047,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,...,0,0,0,0,0,0,0,0,0,False
AR0049-C,LUA049,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,...,1,1,1,1,1,1,1,1,1,True
AR0051-C,LUA051,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,...,0,0,0,0,0,0,0,0,0,False
AR0061-C,LUA061,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,...,0,0,0,0,0,0,0,0,0,False
AR0078-C,LUA078,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,...,0,0,0,0,0,0,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC0295-C,K92,Martin Donnelly,Uganda,Kihihi,2012,11,-0.751,29.701,F,AG1000G-UG,...,0,0,0,0,0,0,0,0,1,True
AC0296-C,K93,Martin Donnelly,Uganda,Kihihi,2012,11,-0.751,29.701,F,AG1000G-UG,...,0,0,0,0,0,0,0,0,2,True
AC0297-C,K94,Martin Donnelly,Uganda,Kihihi,2012,11,-0.751,29.701,F,AG1000G-UG,...,0,0,0,0,0,0,0,0,2,True
AC0298-C,K95,Martin Donnelly,Uganda,Kihihi,2012,11,-0.751,29.701,F,AG1000G-UG,...,0,0,0,0,0,0,0,0,1,True


## plot