In [1]:
!pip install -q malariagen_data
!pip install -q scikit-allel
!pip install -q petl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.5/148.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.5/302.5 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

In [2]:
import allel
import malariagen_data
import dask
import dask.array as da
# Silence some dask warnings by explicitly setting the
# `array.slicing.split_large_chunks` option for the whole session.
dask.config.set(**{'array.slicing.split_large_chunks': True})
from dask.diagnostics.progress import ProgressBar


In [3]:
# Wrap frequencies into a DataFrame ###
import pandas as pd
#pd.set_option("display.max_rows", None, "display.max_columns", None)
#pd.reset_option('^display.', silent=True)
import numpy as np
import petl
# Print style
#from colorama import Fore, Back, Style

In [4]:
# plotting setup
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from matplotlib.gridspec import GridSpec
import matplotlib_venn as venn
import seaborn as sns
import plotly.express as px
%config InlineBackend.figure_format = "retina"
%matplotlib inline

In [5]:
# Start from pristine matplotlib / seaborn settings so re-running this cell
# always produces the same style.
plt.rcdefaults()
sns.reset_defaults()

rcParams = plt.rcParams
base_font_size = 12

# One font size for every text element of a figure.
_font_size_keys = ('font.size', 'axes.titlesize', 'axes.labelsize',
                   'xtick.labelsize', 'ytick.labelsize', 'legend.fontsize')
rcParams.update({name: base_font_size for name in _font_size_keys})

# Hairline strokes, outward ticks, white background.
rcParams.update({
    'axes.linewidth': .5,
    'lines.linewidth': .5,
    'patch.linewidth': .5,
    'ytick.direction': 'out',
    'xtick.direction': 'out',
    'lines.markeredgewidth': .5,
    'figure.max_open_warning': 1000,
    'figure.dpi': 120,
    'figure.facecolor': 'w',
})

In [6]:
# Mount Google Drive at ./drive (requires the Google Colab runtime).
import os
from google.colab import drive
drive.mount("drive")

# Make sure the base results directory exists; `exist_ok=True` makes this a
# no-op when the directory is already there.
results_dir = "drive/MyDrive"
os.makedirs(results_dir, exist_ok=True)

Mounted at drive


In [7]:
## Open the MalariaGEN Ag3 data resource from its Google Cloud Storage bucket.
# NOTE(review): `pre=True` presumably also exposes pre-release sample sets — confirm in the malariagen_data docs.
ag3 = malariagen_data.Ag3("gs://vo_agam_release/", pre=True)

## Load the sample metadata for the sample sets analysed in this notebook
sets = ["1191-VO-MULTI-OLOUGHLIN-VMF00106", "1191-VO-MULTI-OLOUGHLIN-VMF00140", "AG1000G-BF-A", "AG1000G-BF-B", "AG1000G-BF-C"]
df_samples = ag3.sample_metadata(sample_sets=sets)#.set_index("sample_id")




In [8]:
## Keep only the samples collected in Burkina Faso
bf_samples = df_samples.query('country == "Burkina Faso"')

## Species x year cohorts.
# `cohorts` maps a short key ("<species>_<year>") to the pandas query selecting
# that cohort; it is the dict handed to `ag3.snp_allele_frequencies`.
# `cohort_` holds the same mapping and `pop_list1` the matching display names.
cohort_, cohorts, pop_list1 = {}, {}, []
for species in bf_samples.aim_species.unique():
  # ignore hybrids / intermediate calls: only the three named taxa are analysed
  if species in ['coluzzii', 'gambiae', 'arabiensis']:
    sp_sample = bf_samples.query(f"aim_species == '{species}'")
    for year in sp_sample.year.unique():
      key1 = f'An. {species} ({year})'
      key = f'{species}_{year}'
      cohort_[key] = f"country == 'Burkina Faso' and aim_species == '{species}' and year == {year}"
      cohorts[key] = cohort_[key]
      pop_list1.append(key1)
#cohorts

# Year-only x tick labels used by the species x year heatmaps (hard-coded, one per plotted column).
n_xlab = ['2014', '2015', '2016', '2012','2014', '2015', '2016', '2017','2012','2014', '2015', '2016', '2017']

## Location x species x year cohorts.
# Keys follow "<site>_<species>_<year>"; values are pandas queries on the sample metadata.
# FIX: 'Sour_gam_2017' used to query the misspelt location 'Souroukoudingan', which
# matches no sample, so that cohort was always empty. It now uses 'Souroukoudinga'
# like every other Sour_* cohort.
cohorts_y = {'Bana_col_2012': "location == ['Bana Market', 'Bana Village'] and aim_species == 'coluzzii' and year==2012",
             'Bana_col_2014': "location == ['Bana Market', 'Bana Village'] and aim_species == 'coluzzii' and year==2014",
             'Bana_col_2015': "location == ['Bana Market', 'Bana Village'] and aim_species == 'coluzzii' and year==2015",
             'Bana_col_2016': "location == ['Bana Market', 'Bana Village'] and aim_species == 'coluzzii' and year==2016",
             'Bana_col_2017': "location == ['Bana Market', 'Bana Village'] and aim_species == 'coluzzii' and year==2017",
             'Bana_gam_2012': "location == ['Bana Market', 'Bana Village'] and aim_species == 'gambiae' and year==2012",
             'Bana_gam_2014': "location == ['Bana Market', 'Bana Village'] and aim_species == 'gambiae' and year==2014",
             'Bana_gam_2015': "location == ['Bana Market', 'Bana Village'] and aim_species == 'gambiae' and year==2015",
             'Bana_gam_2016': "location == ['Bana Market', 'Bana Village'] and aim_species == 'gambiae' and year==2016",
             'Pala_ara_2014': "location == 'Pala' and aim_species == 'arabiensis' and year==2014",
             'Pala_ara_2015': "location == 'Pala' and aim_species == 'arabiensis' and year==2015",
             'Pala_ara_2016': "location == 'Pala' and aim_species == 'arabiensis' and year==2016",
             'Pala_col_2012': "location == 'Pala' and aim_species == 'coluzzii' and year==2012",
             'Pala_gam_2012': "location == 'Pala' and aim_species == 'gambiae' and year==2012",
             'Pala_gam_2014': "location == 'Pala' and aim_species == 'gambiae' and year==2014",
             'Pala_gam_2015': "location == 'Pala' and aim_species == 'gambiae' and year==2015",
             'Pala_gam_2016': "location == 'Pala' and aim_species == 'gambiae' and year==2016",
             'Pala_gam_2017': "location == 'Pala' and aim_species == 'gambiae' and year==2017",
             'Sour_col_2012': "location == 'Souroukoudinga' and aim_species == 'coluzzii' and year==2012",
             'Sour_col_2014': "location == 'Souroukoudinga' and aim_species == 'coluzzii' and year==2014",
             'Sour_col_2015': "location == 'Souroukoudinga' and aim_species == 'coluzzii' and year==2015",
             'Sour_col_2016': "location == 'Souroukoudinga' and aim_species == 'coluzzii' and year==2016",
             'Sour_col_2017': "location == 'Souroukoudinga' and aim_species == 'coluzzii' and year==2017",
             'Sour_gam_2012': "location == 'Souroukoudinga' and aim_species == 'gambiae' and year==2012",
             'Sour_gam_2014': "location == 'Souroukoudinga' and aim_species == 'gambiae' and year==2014",
             'Sour_gam_2015': "location == 'Souroukoudinga' and aim_species == 'gambiae' and year==2015",
             'Sour_gam_2016': "location == 'Souroukoudinga' and aim_species == 'gambiae' and year==2016",
             'Sour_gam_2017': "location == 'Souroukoudinga' and aim_species == 'gambiae' and year==2017",
             'Mono_gam_2004': "location == 'Monomtenga' and aim_species == 'gambiae' and year==2004",
              }
cohorts_y.keys()

dict_keys(['Bana_col_2012', 'Bana_col_2014', 'Bana_col_2015', 'Bana_col_2016', 'Bana_col_2017', 'Bana_gam_2012', 'Bana_gam_2014', 'Bana_gam_2015', 'Bana_gam_2016', 'Pala_ara_2014', 'Pala_ara_2015', 'Pala_ara_2016', 'Pala_col_2012', 'Pala_gam_2012', 'Pala_gam_2014', 'Pala_gam_2015', 'Pala_gam_2016', 'Pala_gam_2017', 'Sour_col_2012', 'Sour_col_2014', 'Sour_col_2015', 'Sour_col_2016', 'Sour_col_2017', 'Sour_gam_2012', 'Sour_gam_2014', 'Sour_gam_2015', 'Sour_gam_2016', 'Sour_gam_2017', 'Mono_gam_2004'])

In [9]:
# Number of Burkina Faso samples per location / year / species combination
bf_samples.groupby(['location', 'year', 'aim_species']).size()

location        year  aim_species                  
Bana Market     2014  coluzzii                         49
                2015  coluzzii                         54
                      gambiae                           8
                2016  coluzzii                         28
                2017  coluzzii                         50
Bana Village    2012  coluzzii                         42
                      gambiae                          22
                      intermediate_gambiae_coluzzii     1
                2014  arabiensis                        1
                      coluzzii                         47
                      gambiae                          32
                2015  coluzzii                         42
                      gambiae                          17
                2016  coluzzii                         92
                      gambiae                          13
                2017  coluzzii                         85
Monomtenga      2004

In [10]:
## SNP allele frequencies for one transcript, computed under two cohort schemes
#help(ag3.snp_allele_frequencies)
rd = 'AGAP004707-RD'

# Arguments shared by both calls; only the cohort definitions differ.
freq_kwargs = dict(
    transcript=rd,
    site_mask='gamb_colu_arab',
    sample_sets=sets,
    drop_invariant=True,
    effects=True,
)

# species x year cohorts
snps_rd = ag3.snp_allele_frequencies(cohorts=cohorts, **freq_kwargs)

# location x species x year cohorts
snps_rd_y = ag3.snp_allele_frequencies(cohorts=cohorts_y, **freq_kwargs)
print('done !')



Load SNP genotypes:   0%|          | 0/214 [00:00<?, ?it/s]



Compute allele frequencies:   0%|          | 0/14 [00:00<?, ?it/s]



Compute SNP effects:   0%|          | 0/6783 [00:00<?, ?it/s]



Load SNP genotypes:   0%|          | 0/214 [00:00<?, ?it/s]



Compute allele frequencies:   0%|          | 0/28 [00:00<?, ?it/s]

Compute SNP effects:   0%|          | 0/6783 [00:00<?, ?it/s]

done !


In [11]:
# Folder receiving every table written by this cell. It is created on demand so
# that the exports do not fail when the folder is missing from Drive.
out_dir = 'drive/MyDrive/Insecticide_resistance/vgsc_results'
os.makedirs(out_dir, exist_ok=True)

# Subset allelic frequencies: non-synonymous SNPs with max_af > 0.05 that pass
# the gamb_colu site filter (these frames feed the heatmaps).
frq_vgsc = snps_rd.reset_index().query("effect == 'NON_SYNONYMOUS_CODING' and max_af > 0.05 and pass_gamb_colu == True")
# same table without the 2004 gambiae cohort column
df_frq_vgsc = frq_vgsc.drop('frq_gambiae_2004', axis=1)
frq_vgsc_y = snps_rd_y.reset_index().query("effect == 'NON_SYNONYMOUS_CODING' and max_af > 0.05 and pass_gamb_colu == True")

# Subset allelic frequencies - save tab
# NS snps (no frequency threshold), species x year cohorts
frq_vgsc1 = snps_rd.reset_index().query("effect == 'NON_SYNONYMOUS_CODING' and pass_gamb_colu == True")
frq_vgsc1.to_csv(f'{out_dir}/Ns_frq_vgsc.csv')
# NS snps, location x species x year cohorts
frq_vgsc1_y = snps_rd_y.reset_index().query("effect == 'NON_SYNONYMOUS_CODING' and pass_gamb_colu == True")
frq_vgsc1_y.to_csv(f'{out_dir}/Ns_frq_vgsc_year.csv')

# all snp, species x year cohorts
frq_vgsc2 = snps_rd.reset_index().query("pass_gamb_colu == True")
frq_vgsc2.to_csv(f'{out_dir}/frq_vgsc_all.csv')
# all snp, location x species x year cohorts.
# FIX: this table used to be written to 'frq_vgsc_all.csv' as well, silently
# overwriting the file saved just above; it now gets its own '_year' file,
# mirroring the naming of the NS tables.
frq_vgsc2_y = snps_rd_y.reset_index().query("pass_gamb_colu == True")
frq_vgsc2_y.to_csv(f'{out_dir}/frq_vgsc_all_year.csv')

In [12]:
# Colour palettes.
# color_1: 100-step light palette built from "darkslategray".
color_1 = sns.light_palette("darkslategray",n_colors=100)
# color_2: palette returned as a matplotlib colormap (as_cmap=True); this is the
# `cmap` passed to the frequency heatmaps in this notebook.
color_2 = sns.color_palette("ch:s=16,rot=0.05,dark=0.18,light=0.99", as_cmap=True)

# Show the columns available in the location x species x year frequency table
frq_vgsc_y.columns

Index(['contig', 'position', 'ref_allele', 'alt_allele', 'aa_change',
       'pass_gamb_colu_arab', 'pass_gamb_colu', 'pass_arab',
       'frq_Bana_col_2012', 'frq_Bana_col_2014', 'frq_Bana_col_2015',
       'frq_Bana_col_2016', 'frq_Bana_col_2017', 'frq_Bana_gam_2012',
       'frq_Bana_gam_2014', 'frq_Bana_gam_2015', 'frq_Bana_gam_2016',
       'frq_Pala_ara_2014', 'frq_Pala_ara_2015', 'frq_Pala_ara_2016',
       'frq_Pala_col_2012', 'frq_Pala_gam_2012', 'frq_Pala_gam_2014',
       'frq_Pala_gam_2015', 'frq_Pala_gam_2016', 'frq_Pala_gam_2017',
       'frq_Sour_col_2012', 'frq_Sour_col_2014', 'frq_Sour_col_2015',
       'frq_Sour_col_2016', 'frq_Sour_col_2017', 'frq_Sour_gam_2012',
       'frq_Sour_gam_2014', 'frq_Sour_gam_2015', 'frq_Sour_gam_2016',
       'frq_Mono_gam_2004', 'max_af', 'transcript', 'effect', 'impact',
       'ref_codon', 'alt_codon', 'aa_pos', 'ref_aa', 'alt_aa', 'label'],
      dtype='object')

In [13]:
#frq_vgsc.columns, df_frq_vgsc.columns,len(frq_vgsc.columns),len(df_frq_vgsc.columns)

In [14]:
# Heatmap of SNP frequencies, one column per species/year cohort (English labels)
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

fig = plt.figure(figsize=(16,7))
ptl_df = petl.fromdataframe(df_frq_vgsc)

# frequency columns that actually exist in the table, in cohort order
pl_list = [f'frq_{idx}' for idx in cohorts if f'frq_{idx}' in df_frq_vgsc.columns]

# x tick labels, listed in the order of sorted(pl_list)
col_lab = [f'An. {taxon} ({year})'
           for taxon, years in (('arabiensis', (2014, 2015, 2016)),
                                ('coluzzii', (2012, 2014, 2015, 2016, 2017)),
                                ('gambiae ss', (2012, 2014, 2015, 2016, 2017)))
           for year in years]

# y tick labels: position, nucleotide change and amino-acid change of each SNP
labels = []
for rec in ptl_df.records():
  labels.append('2L:%s:%s>%s-(%s)' % (rec.position, rec.ref_allele, rec.alt_allele, rec.aa_change))

# plot
heatmap_style = dict(vmin=0, vmax=1, cmap=color_2, linewidths=0.5,
                     linecolor="whitesmoke", annot=True, cbar=False)
ax = sns.heatmap(df_frq_vgsc[sorted(pl_list)], xticklabels=col_lab, yticklabels=labels,
                 **heatmap_style)
#ax.set_title("Alternative alleles frequencies per population within the AGAP004050 gene")
plt.xticks(rotation=30, ha='right')

ax.set_xlabel("$An.$ $gambiae$ s.l. populations")
ax.set_ylabel('Non-synonymous SNPs positions in the $vgsc$ gene')

# horizontal colour bar placed just above the top-left corner of the heatmap
cax = inset_axes(ax, width="30%", height="3%", loc='lower left',
                 bbox_to_anchor=(0, 1.03, 1, 1), bbox_transform=ax.transAxes, borderpad=0)
mpl.colorbar.ColorbarBase(cax, orientation='horizontal', ticklocation='top', cmap=color_2,
                          label='$Allelic$ $frequencies$')

fig.tight_layout()
fig.savefig('drive/MyDrive/Insecticide_resistance/vgsc_results/plot_vgsc_snp_fq.png', dpi=300, bbox_inches='tight')

  fig.tight_layout()


In [15]:
# Heatmap of SNP frequencies with year-only tick labels and species captions (English)
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

fig, ax = plt.subplots(figsize=(16,7))
ptl_df = petl.fromdataframe(df_frq_vgsc)

# frequency columns that actually exist in the table, in cohort order
pl_list = [f'frq_{idx}' for idx in cohorts if f'frq_{idx}' in df_frq_vgsc.columns]

# full cohort names (the plot itself uses the shorter `n_xlab`)
col_lab = [f'An. {taxon} ({year})'
           for taxon, years in (('arabiensis', (2014, 2015, 2016)),
                                ('coluzzii', (2012, 2014, 2015, 2016, 2017)),
                                ('gambiae ss', (2012, 2014, 2015, 2016, 2017)))
           for year in years]

# y tick labels: position, nucleotide change and amino-acid change of each SNP
labels = []
for rec in ptl_df.records():
  labels.append('2L:%s:%s>%s-(%s)' % (rec.position, rec.ref_allele, rec.alt_allele, rec.aa_change))

# plot (drawn on the current axes, i.e. the one created by plt.subplots above)
heatmap_style = dict(vmin=0, vmax=1, cmap=color_2, linewidths=0.5,
                     linecolor="whitesmoke", annot=True, cbar=False)
ax = sns.heatmap(df_frq_vgsc[sorted(pl_list)], xticklabels=n_xlab, yticklabels=labels,
                 **heatmap_style)
#ax.set_title("Alternative alleles frequencies per population within the AGAP004050 gene")
plt.xticks(rotation=30, ha='right')

## dash-dotted vertical lines separating the species blocks
for boundary in (0, 3, 8, 13):
  plt.axvline(x=boundary, ymin=-0.02, ymax=1.04, ls='dashdot', color='k', clip_on=False, lw=1)

## species captions written above each block
for x_pos, caption in ((1.02, '$An. arabiensis$'),
                       (4.5, '$An. coluzzii$'),
                       (10, '$An. gambiae$'+' s.s.')):
  plt.text(x_pos, -0.3, caption, size=12, color='purple')

## axis titles
ax.set(xlabel="Sampling periods", ylabel='Non-synonymous SNPs positions in the $vgsc$ gene')

# horizontal colour bar placed above the top-left corner of the heatmap
cax = inset_axes(ax, width="35%", height="2%", loc='lower left',
                 bbox_to_anchor=(0, 1.06, 1, 1), bbox_transform=ax.transAxes, borderpad=0)
mpl.colorbar.ColorbarBase(cax, orientation='horizontal', ticklocation='top', cmap=color_2,
                          label='$Allelic$ $frequencies$')

#fig.tight_layout()
# the same figure is exported at two resolutions
fig.savefig('drive/MyDrive/Insecticide_resistance/vgsc_results/plt_vgsc_fq_300x.png', dpi=300, bbox_inches='tight')
fig.savefig('drive/MyDrive/Insecticide_resistance/vgsc_results/plt_vgsc_fq_150x.png', dpi=150, bbox_inches='tight')

In [16]:
# Heatmap of SNP frequencies with year-only tick labels and species captions (French axis titles)
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

fig, ax = plt.subplots(figsize=(16,7))
ptl_df = petl.fromdataframe(df_frq_vgsc)

# frequency columns that actually exist in the table, in cohort order
pl_list = [f'frq_{idx}' for idx in cohorts if f'frq_{idx}' in df_frq_vgsc.columns]

# full cohort names (the plot itself uses the shorter `n_xlab`)
col_lab = [f'An. {taxon} ({year})'
           for taxon, years in (('arabiensis', (2014, 2015, 2016)),
                                ('coluzzii', (2012, 2014, 2015, 2016, 2017)),
                                ('gambiae ss', (2012, 2014, 2015, 2016, 2017)))
           for year in years]

# y tick labels: position, nucleotide change and amino-acid change of each SNP
labels = []
for rec in ptl_df.records():
  labels.append('2L:%s:%s>%s-(%s)' % (rec.position, rec.ref_allele, rec.alt_allele, rec.aa_change))

# plot (drawn on the current axes, i.e. the one created by plt.subplots above)
heatmap_style = dict(vmin=0, vmax=1, cmap=color_2, linewidths=0.5,
                     linecolor="whitesmoke", annot=True, cbar=False)
ax = sns.heatmap(df_frq_vgsc[sorted(pl_list)], xticklabels=n_xlab, yticklabels=labels,
                 **heatmap_style)
#ax.set_title("Alternative alleles frequencies per population within the AGAP004050 gene")
plt.xticks(rotation=30, ha='right')

## dash-dotted vertical lines separating the species blocks
for boundary in (0, 3, 8, 13):
  plt.axvline(x=boundary, ymin=-0.02, ymax=1.04, ls='dashdot', color='k', clip_on=False, lw=1)

## species captions written above each block
for x_pos, caption in ((1.02, '$An. arabiensis$'),
                       (4.5, '$An. coluzzii$'),
                       (10, '$An. gambiae$'+' s.s.')):
  plt.text(x_pos, -0.3, caption, size=12, color='purple')

## axis titles
ax.set(xlabel="Périodes de collecte des échantillons",
       ylabel='Positions des SNP non-synonymes du gène $vgsc$')

# horizontal colour bar placed above the top-left corner of the heatmap
cax = inset_axes(ax, width="35%", height="2%", loc='lower left',
                 bbox_to_anchor=(0, 1.06, 1, 1), bbox_transform=ax.transAxes, borderpad=0)
mpl.colorbar.ColorbarBase(cax, orientation='horizontal', ticklocation='top', cmap=color_2,
                          label='$Fréquences$ $alléliques$')

#fig.tight_layout()
# the same figure is exported at two resolutions
fig.savefig('drive/MyDrive/Insecticide_resistance/vgsc_results/plt_vgsc_fq_fr300x.png', dpi=300, bbox_inches='tight')
fig.savefig('drive/MyDrive/Insecticide_resistance/vgsc_results/plt_vgsc_fq_fr150x.png', dpi=150, bbox_inches='tight')

In [17]:
# Heatmap of SNP frequencies for a hand-picked subset of rows (English labels)
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

fig = plt.figure(figsize=(16,5))
# rows removed by index label before plotting
frq_vgsc_sel = frq_vgsc.drop([4516, 6578, 6581, 6603, 6694, 6720])
ptl_df = petl.fromdataframe(frq_vgsc_sel)

# frequency columns that actually exist in the table, in cohort order
pl_list = [f'frq_{idx}' for idx in cohorts if f'frq_{idx}' in frq_vgsc.columns]

# x tick labels, listed in the order of sorted(pl_list); includes the 2004 gambiae cohort
col_lab = [f'An. {taxon} ({year})'
           for taxon, years in (('arabiensis', (2014, 2015, 2016)),
                                ('coluzzii', (2012, 2014, 2015, 2016, 2017)),
                                ('gambiae ss', (2004, 2012, 2014, 2015, 2016, 2017)))
           for year in years]

# y tick labels: position, nucleotide change and amino-acid change of each SNP
labels = []
for rec in ptl_df.records():
  labels.append('2L:%s:%s>%s-(%s)' % (rec.position, rec.ref_allele, rec.alt_allele, rec.aa_change))

# plot
heatmap_style = dict(vmin=0, vmax=1, cmap=color_2, linewidths=0.5,
                     linecolor="whitesmoke", annot=True, cbar=False)
ax = sns.heatmap(frq_vgsc_sel[sorted(pl_list)], xticklabels=col_lab, yticklabels=labels,
                 **heatmap_style)
#ax.set_title("Alternative alleles frequencies per population within the AGAP004050 gene")
plt.xticks(rotation=30, ha='right')

ax.set_xlabel("$An.$ $gambiae$ s.l. populations")
ax.set_ylabel('Non-synonymous SNPs positions in the $vgsc$ gene')

# horizontal colour bar placed just above the top-left corner of the heatmap
cax = inset_axes(ax, width="30%", height="3%", loc='lower left',
                 bbox_to_anchor=(0, 1.03, 1, 1), bbox_transform=ax.transAxes, borderpad=0)
mpl.colorbar.ColorbarBase(cax, orientation='horizontal', ticklocation='top', cmap=color_2,
                          label='$Allelic$ $frequencies$')

fig.tight_layout()
fig.savefig('drive/MyDrive/Insecticide_resistance/vgsc_results/plot_vgsc_snp_fq1.png', dpi=300, bbox_inches='tight')

  fig.tight_layout()


In [18]:
# Heatmap of SNP frequencies, one column per species/year cohort (French axis titles)
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
fig = plt.figure(figsize=(16,7))
ptl_df = petl.fromdataframe(frq_vgsc)

# frequency columns that actually exist in the table, in cohort order
pl_list = []
for idx in cohorts:
  key = f'frq_{idx}'
  if key in frq_vgsc.columns:
    pl_list.append(key)
# x tick labels, listed in the order of sorted(pl_list); includes the 2004 gambiae cohort
col_lab = ['An. arabiensis (2014)','An. arabiensis (2015)', 'An. arabiensis (2016)', 'An. coluzzii (2012)','An. coluzzii (2014)',
           'An. coluzzii (2015)','An. coluzzii (2016)','An. coluzzii (2017)','An. gambiae ss (2004)', 'An. gambiae ss (2012)',
           'An. gambiae ss (2014)','An. gambiae ss (2015)','An. gambiae ss (2016)','An. gambiae ss (2017)']
# y tick labels: position, nucleotide change and amino-acid change of each SNP
labels = ['2L:%s:%s>%s-(%s)' % (rec.position, rec.ref_allele, rec.alt_allele, rec.aa_change) for rec in ptl_df.records()]
# plot
ax=sns.heatmap(frq_vgsc[sorted(pl_list)], vmin=0, vmax=1, cmap=color_2, yticklabels=labels, linewidths=0.5,
               xticklabels=col_lab, linecolor="whitesmoke",annot=True, cbar=False)
#ax.set_title("Alternative alleles frequencies per population within the AGAP004050 gene")
plt.xticks(rotation=30, ha='right')

ax.set(xlabel="Populations d'$An.$ $gambiae$ s.l.", ylabel = 'Positions des SNP non-synonymes du gène $vgsc$')

# horizontal colour bar placed just above the top-left corner of the heatmap
cax = inset_axes(ax,width="30%", height="3%", loc='lower left', bbox_to_anchor=(0, 1.03, 1, 1),
                 bbox_transform=ax.transAxes,borderpad=0)
# FIX: the colour-bar title was misspelt '$Freéquences$'; it now reads '$Fréquences$'.
mpl.colorbar.ColorbarBase(cax,orientation='horizontal', ticklocation='top',cmap=color_2,
                         label='$Fréquences$ $alléliques$')

fig.tight_layout()
fig.savefig('drive/MyDrive/Insecticide_resistance/vgsc_results/plot_vgsc_snp_fr.png', dpi=300, bbox_inches='tight')

  fig.tight_layout()


In [19]:
## Column names and tick labels for the location x species x year heatmaps.
# Both are generated from (site, species, years) groups so that a column and its
# label can never get out of step.
_site_code = {'Bana': 'BN', 'Pala': 'PL', 'Sour': 'SK', 'Mono': 'MM'}
_taxon_name = {'col': 'An. coluzzii', 'gam': 'An. gambiae s.s.', 'ara': 'An. arabiensis'}


def _columns_and_labels(groups):
  """Expand (site, taxon, years) groups into parallel lists: column names, tick labels."""
  columns, names = [], []
  for site, taxon, years in groups:
    for year in years:
      columns.append(f'frq_{site}_{taxon}_{year}')
      names.append(f'{_taxon_name[taxon]} ({_site_code[site]}, {year})')
  return columns, names


_years_full = (2012, 2014, 2015, 2016, 2017)
_years_to_2016 = (2012, 2014, 2015, 2016)

# ordering 1: grouped by site, then species
columns_year, labs = _columns_and_labels([
    ('Bana', 'col', _years_full),
    ('Bana', 'gam', _years_to_2016),
    ('Pala', 'ara', (2014, 2015, 2016)),
    ('Pala', 'col', (2012,)),
    ('Pala', 'gam', _years_full),
    ('Sour', 'col', _years_full),
    ('Sour', 'gam', _years_to_2016),
    ('Mono', 'gam', (2004,)),
])

# ordering 2: grouped by species, then site
columns_year1, labs1 = _columns_and_labels([
    ('Bana', 'col', _years_full),
    ('Sour', 'col', _years_full),
    ('Pala', 'col', (2012,)),
    ('Bana', 'gam', _years_to_2016),
    ('Sour', 'gam', _years_to_2016),
    ('Pala', 'gam', _years_full),
    ('Mono', 'gam', (2004,)),
    ('Pala', 'ara', (2014, 2015, 2016)),
])

In [20]:
# Switch seaborn to its 'paper' context; the second set_style call ('ticks')
# is applied after 'white'.
sns.set_context('paper')
for style_name in ('white', 'ticks'):
  sns.set_style(style_name)

rcParams = plt.rcParams
base_font_size = 10

# One font size for every text element of a figure.
_font_size_keys = ('font.size', 'axes.titlesize', 'axes.labelsize',
                   'xtick.labelsize', 'ytick.labelsize', 'legend.fontsize')
rcParams.update({name: base_font_size for name in _font_size_keys})

# Hairline strokes, outward ticks, white background.
rcParams.update({
    'axes.linewidth': .5,
    'lines.linewidth': .5,
    'patch.linewidth': .5,
    'ytick.direction': 'out',
    'xtick.direction': 'out',
    'lines.markeredgewidth': .5,
    'figure.max_open_warning': 1000,
    'figure.dpi': 120,
    'figure.facecolor': 'w',
})

In [21]:
# Heatmap of SNP frequencies per location/species/year cohort (columns grouped by site)
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

fig = plt.figure(figsize=(19,8))
ptl_df = petl.fromdataframe(frq_vgsc_y)

# y tick labels: position, nucleotide change and amino-acid change of each SNP
labels = []
for rec in ptl_df.records():
  labels.append('2L:%s:%s>%s-(%s)' % (rec.position, rec.ref_allele, rec.alt_allele, rec.aa_change))

# plot
heatmap_style = dict(vmin=0, vmax=1, cmap=color_2, linewidths=0.5,
                     linecolor="whitesmoke", annot=True, cbar=False)
ax = sns.heatmap(frq_vgsc_y[columns_year], xticklabels=labs, yticklabels=labels,
                 **heatmap_style)
#ax.set_title("Alternative alleles frequencies per population within the AGAP004050 gene")
plt.xticks(rotation=30, ha='right')

ax.set_xlabel("$An.$ $gambiae$ s.l. populations")
ax.set_ylabel('Non-synonymous SNPs positions in the $vgsc$ gene')

# horizontal colour bar placed just above the top-left corner of the heatmap
cax = inset_axes(ax, width="30%", height="3%", loc='lower left',
                 bbox_to_anchor=(0, 1.03, 1, 1), bbox_transform=ax.transAxes, borderpad=0)
mpl.colorbar.ColorbarBase(cax, orientation='horizontal', ticklocation='top', cmap=color_2,
                          label='$Allelic$ $frequencies$')

fig.tight_layout()
#fig.savefig('drive/MyDrive/Insecticide_resistance/vgsc_results/plot_vgsc_snp_fq_year.png', dpi=300, bbox_inches='tight')

  fig.tight_layout()


In [22]:
# Heatmap of SNP frequencies per location/species/year cohort (columns grouped by species); saved to Drive
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

fig = plt.figure(figsize=(19,8))
ptl_df = petl.fromdataframe(frq_vgsc_y)

# y tick labels: position, nucleotide change and amino-acid change of each SNP
labels = []
for rec in ptl_df.records():
  labels.append('2L:%s:%s>%s-(%s)' % (rec.position, rec.ref_allele, rec.alt_allele, rec.aa_change))

# plot
heatmap_style = dict(vmin=0, vmax=1, cmap=color_2, linewidths=0.5,
                     linecolor="whitesmoke", annot=True, cbar=False)
ax = sns.heatmap(frq_vgsc_y[columns_year1], xticklabels=labs1, yticklabels=labels,
                 **heatmap_style)
#ax.set_title("Alternative alleles frequencies per population within the AGAP004050 gene")
plt.xticks(rotation=30, ha='right')

ax.set_xlabel("$An.$ $gambiae$ s.l. populations")
ax.set_ylabel('Non-synonymous SNPs positions in the $vgsc$ gene')

# horizontal colour bar placed just above the top-left corner of the heatmap
cax = inset_axes(ax, width="30%", height="3%", loc='lower left',
                 bbox_to_anchor=(0, 1.03, 1, 1), bbox_transform=ax.transAxes, borderpad=0)
mpl.colorbar.ColorbarBase(cax, orientation='horizontal', ticklocation='top', cmap=color_2,
                          label='$Allelic$ $frequencies$')

fig.tight_layout()
fig.savefig('drive/MyDrive/Insecticide_resistance/vgsc_results/plot_vgsc_snp_fq_year.png', dpi=300, bbox_inches='tight')

  fig.tight_layout()


In [23]:
# Heatmap of SNP frequencies per location/species/year cohort, after dropping a few rows
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

fig = plt.figure(figsize=(19,7))
# rows removed by index label before plotting
frq_vgsc_y_sel = frq_vgsc_y.drop([2420, 4516, 6578])
ptl_df = petl.fromdataframe(frq_vgsc_y_sel)

# y tick labels: position, nucleotide change and amino-acid change of each SNP
labels = []
for rec in ptl_df.records():
  labels.append('2L:%s:%s>%s-(%s)' % (rec.position, rec.ref_allele, rec.alt_allele, rec.aa_change))

# plot
heatmap_style = dict(vmin=0, vmax=1, cmap=color_2, linewidths=0.5,
                     linecolor="whitesmoke", annot=True, cbar=False)
ax = sns.heatmap(frq_vgsc_y_sel[columns_year], xticklabels=labs, yticklabels=labels,
                 **heatmap_style)
#ax.set_title("Alternative alleles frequencies per population within the AGAP004050 gene")
plt.xticks(rotation=30, ha='right')

ax.set_xlabel("$An.$ $gambiae$ s.l. populations")
ax.set_ylabel('Non-synonymous SNPs positions in the $vgsc$ gene')

# horizontal colour bar placed just above the top-left corner of the heatmap
cax = inset_axes(ax, width="30%", height="3%", loc='lower left',
                 bbox_to_anchor=(0, 1.03, 1, 1), bbox_transform=ax.transAxes, borderpad=0)
mpl.colorbar.ColorbarBase(cax, orientation='horizontal', ticklocation='top', cmap=color_2,
                          label='$Allelic$ $frequencies$')

fig.tight_layout()
#fig.savefig('drive/MyDrive/Insecticide_resistance/vgsc_results/plot_vgsc_snp_fq_year.png', dpi=300, bbox_inches='tight')

  fig.tight_layout()


In [24]:
# Interactive heatmap of the species x year frequency table, drawn with the
# malariagen_data helper; the columns are passed in alphabetical order.
fig1 = ag3.plot_frequencies_heatmap(frq_vgsc[sorted(frq_vgsc.columns)], height=600, width=1400)
#fig1.update_xaxes(tickangle= -35)

In [25]:
def ds_freq_tab(ds):
  """Flatten a per-cohort frequency dataset into one long table.

  `ds` must provide the variables prefixed with "cohort_" (taxon, area,
  period_start, period and size are read), "variant_label", and the
  "event_*" variables (count, nobs, frequency, frequency_ci_low,
  frequency_ci_upp), and must be selectable along a `cohorts` dimension
  with `isel`.

  Returns a DataFrame with one row per (cohort, variant) pair having
  nobs > 0, plus two extra columns giving the distance from the frequency
  to the upper (`frequency_error`) and lower (`frequency_error_minus`)
  confidence bounds.
  """
  prefix = "cohort_"

  # cohort metadata, with the "cohort_" prefix stripped from the column names
  meta_names = [name for name in ds if name.startswith(prefix)]
  cohort_meta = ds[meta_names].to_dataframe()
  cohort_meta.columns = [col.split(prefix)[1] for col in cohort_meta.columns]

  labels = ds["variant_label"].values

  def _one_cohort(position, row):
    # table of every variant for the cohort found at `position`
    sub = ds.isel(cohorts=position)
    return pd.DataFrame({
        "taxon": row.taxon,
        "area": row.area,
        "date": row.period_start,
        "period": str(row.period),
        "sample_size": row.size,
        "variant": labels,
        "count": sub["event_count"].values,
        "nobs": sub["event_nobs"].values,
        "frequency": sub["event_frequency"].values,
        "frequency_ci_low": sub["event_frequency_ci_low"].values,
        "frequency_ci_upp": sub["event_frequency_ci_upp"].values,
    })

  frames = [_one_cohort(i, row) for i, row in enumerate(cohort_meta.itertuples())]

  # stack the cohorts, then discard the rows without any observation
  events = pd.concat(frames, axis=0).reset_index(drop=True)
  events = events.query("nobs > 0")

  # asymmetric error-bar lengths derived from the confidence interval
  events = events.assign(
      frequency_error=events["frequency_ci_upp"] - events["frequency"],
      frequency_error_minus=events["frequency"] - events["frequency_ci_low"],
  )
  return events

In [35]:
# Small self-contained example used to try out plotly's `color_discrete_map`.
df = pd.DataFrame({
    "Date": [1,2,3],
    "Male": [1,2,3],
    "Female": [2,3,1],
    "Others": [7,5,2],
})

# "Female" is deliberately left out of the plotted series to test the colour mapping
series_to_plot = ["Male", "Others"]
series_colors = {
    "Male": "#456987",
    "Female": "#147852",
    "Others": "#00D",
}

fig = px.line(df, x="Date", y=series_to_plot, color_discrete_map=series_colors)

fig.show()

In [36]:
def plot_freq_time_series(data, height=450, width=900, template='plotly_white', xlab='Years', var_title='Genetic variants',
                          ylab= 'Allelic frequencies', color=None, title='AGAP004707-RD (Vgsc/para) SNP frequencies'):
  """Line plot of variant frequencies over time, one line per variant, with error bars.

  Parameters
  ----------
  data : table with the columns "date", "frequency", "frequency_error",
      "frequency_error_minus" and "variant" (as produced by `ds_freq_tab`).
  height, width : figure size in pixels.
  template : plotly template name.
  xlab, ylab, var_title : titles of the x axis, the y axis and the legend.
  color : optional colours for the variant lines. A dict maps variant names to
      colours; a list/tuple gives the colours in trace order; a single string
      colours every line the same. `None` (default) keeps plotly's defaults.
      Previously this parameter was accepted but had no effect.
  title : figure title.

  Returns
  -------
  The plotly figure.
  """
  # translate `color` into the matching plotly express argument
  color_kwargs = {}
  if isinstance(color, dict):
    color_kwargs['color_discrete_map'] = color
  elif isinstance(color, str):
    color_kwargs['color_discrete_sequence'] = [color]
  elif color is not None:
    color_kwargs['color_discrete_sequence'] = list(color)

  #plot time series frequencies
  fig = px.line(data,x="date", y="frequency", error_y="frequency_error",
                error_y_minus="frequency_error_minus", color="variant", markers=True,
                height=height,width=width,template=template, title=title,
                labels={"date": f'{xlab}', "frequency": f'{ylab}', "variant":f'{var_title}'},
                **color_kwargs)

  # figure layout: no grid, visible axis lines, frequency axis fixed to [0, 1]
  fig.update_layout(xaxis=dict(showgrid=False, showline=True, linewidth=1, linecolor='black'),
                   yaxis=dict(showgrid=False, showline=True, linewidth=1, linecolor='gray'))
  fig.update_yaxes(range=[0, 1.0], ticks="outside", col=1)
  fig.update_xaxes(ticks="outside", col=1)

  return fig

In [27]:
# Amino-acid change frequencies for the transcript in Burkina Faso samples,
# grouped by area ("admin1_iso") and period ("year"); only variants matching
# "max_af > 0.05" are kept.
ds_aafreq = ag3.aa_allele_frequencies_advanced(transcript=rd, area_by="admin1_iso", period_by="year",
                                               sample_sets=sets, sample_query="country == 'Burkina Faso'",
                                               variant_query="max_af > 0.05")
# flatten the dataset into one row per (cohort, variant)
df_events = ds_freq_tab(ds_aafreq)
#df_events



Load SNP genotypes:   0%|          | 0/124 [00:00<?, ?it/s]

Compute SNP allele frequencies:   0%|          | 0/14 [00:00<?, ?it/s]

Compute SNP effects:   0%|          | 0/35318 [00:00<?, ?it/s]



In [28]:
# Overview grid: one panel column per taxon, one panel row per area,
# one line per variant with asymmetric error bars.
fig = px.line(
    df_events,
    x="date",
    y="frequency",
    color="variant",
    error_y="frequency_error",
    error_y_minus="frequency_error_minus",
    facet_row="area",
    facet_col="taxon",
    markers=True,
)

fig

# Evolution of allelic frequencies in An. gambiae

In [None]:
# Display the rows with taxon 'gambiae' whose period compares greater than '2004'.
df_events.query("taxon=='gambiae' and period > '2004'")

In [29]:
# Time-series figure for the 'gambiae' rows with period > '2004', using the
# plotting function's default size, labels and title.
plot_freq_time_series(df_events.query("taxon=='gambiae' and period > '2004'"))

In [30]:
# Variants kept for the 'gambiae' figure.
# NOTE(review): the name `vars` shadows the Python builtin of the same name for
# the rest of the session; consider renaming once all uses are known.
vars = [
    'L995F (2L:2,422,652 A>T)',
    'N1570Y (2L:2,429,745 A>T)',
    'A1746S (2L:2,430,424 G>T)',
    'P1874L (2L:2,430,881 C>T)',
]
# The list is interpolated into the query string, so `variant==[...]`
# keeps only the rows whose variant is one of the entries above.
fig1 = plot_freq_time_series(
    df_events.query(f"taxon=='gambiae' and period > '2004' and variant=={vars}")
)
# Replace the default title with a single space and draw all text in black.
fig1.update_layout(title=dict(text=' '), font_color='black')

# Evolution of allelic frequencies in An. coluzzii

In [31]:
# Time-series figure for every variant in the 'coluzzii' rows (no period filter).
plot_freq_time_series(df_events.query("taxon=='coluzzii'"))

In [39]:
# Variants kept for the 'coluzzii' figure.
vars2 = [
    'L995F (2L:2,422,652 A>T)',
    'N1570Y (2L:2,429,745 A>T)',
    'I1527T (2L:2,429,617 T>C)',
    'V402L (2L:2,391,228 G>{C,T})',
]
# Larger canvas than the function default (500 x 1000 instead of 450 x 900).
fig2 = plot_freq_time_series(
    df_events.query(f"taxon=='coluzzii' and variant=={vars2}"),
    height=500,
    width=1000,
)
# Replace the default title with blanks, black text, font size 15.
fig2.update_layout(title=dict(text='  '), font_color='black', font=dict(size=15))
#fig2.update_layout()

In [47]:
from itertools import cycle

# Colours are handed out to the figure's traces in order, wrapping around if
# there are more traces than colours.
# NOTE(review): 'turquoise' is listed twice, so the first and third traces get
# the same colour; confirm this is intended.
color = ['turquoise', 'red', 'turquoise', 'mediumpurple']
color_style = cycle(color)
vars2 = [
    'L995F (2L:2,422,652 A>T)',
    'N1570Y (2L:2,429,745 A>T)',
    'I1527T (2L:2,429,617 T>C)',
    'V402L (2L:2,391,228 G>{C,T})',
]
fig2 = plot_freq_time_series(
    df_events.query(f"taxon=='coluzzii' and variant=={vars2}"),
    xlab='Sampling periods',
    height=500,
    width=1000,
)
fig2.update_layout(title=dict(text='  '), font_color='black', font=dict(size=15))
# Override each trace's line colour, following the trace order of the figure.
for trace, trace_color in zip(fig2.data, color_style):
  trace.line.color = trace_color

fig2.show()
#fig2.update_layout()

In [33]:
# Same 'coluzzii' figure with French axis and legend labels.
vars2 = [
    'L995F (2L:2,422,652 A>T)',
    'N1570Y (2L:2,429,745 A>T)',
    'I1527T (2L:2,429,617 T>C)',
    'V402L (2L:2,391,228 G>{C,T})',
]
fig2 = plot_freq_time_series(
    df_events.query(f"taxon=='coluzzii' and variant=={vars2}"),
    height=500,
    width=1000,
    xlab='Années',
    ylab='Fréquences alléliques',
    var_title='Variantes génétiques',
)
# Replace the default title with blanks, black text, font size 15.
fig2.update_layout(title=dict(text='  '), font_color='black', font=dict(size=15))
#fig2.update_layout()

# Evolution of allelic frequencies in An. arabiensis

In [None]:
# Time-series figure for every variant in the 'arabiensis' rows (no period filter).
plot_freq_time_series(df_events.query("taxon=='arabiensis'"))

In [None]:
# Variants kept for the 'arabiensis' figure.
vars3 = [
    'L995S (2L:2,422,651 T>C)',
    'L995F (2L:2,422,652 A>T)',
    'A1553T (2L:2,429,694 G>A)',
]
fig3 = plot_freq_time_series(
    df_events.query(f"taxon=='arabiensis' and variant=={vars3}")
)
# Replace the default title with a single space and draw all text in black.
fig3.update_layout(title=dict(text=' '), font_color='black')

In [None]:
#df_samples

In [None]:
# Same frequency computation as for the admin1-level analysis, but grouped by
# sampling "location" instead of "admin1_iso".
freq_df = ag3.aa_allele_frequencies_advanced(
    transcript=rd,
    area_by="location",
    period_by="year",
    sample_sets=sets,
    sample_query="country == 'Burkina Faso'",
    variant_query="max_af > 0.05",
)



Load SNP genotypes:   0%|          | 0/124 [00:00<?, ?it/s]

Compute SNP allele frequencies:   0%|          | 0/32 [00:00<?, ?it/s]

Compute SNP effects:   0%|          | 0/35261 [00:00<?, ?it/s]



In [None]:
# Convert the per-location frequency result to the table used by the plotting
# cells (ds_freq_tab is defined elsewhere in this notebook).
freq = ds_freq_tab(freq_df)
#freq

In [None]:
# Overview grid for the per-location table: one panel column per area,
# one panel row per taxon, one line per variant with asymmetric error bars.
fig = px.line(
    freq,
    x="date",
    y="frequency",
    color="variant",
    error_y="frequency_error",
    error_y_minus="frequency_error_minus",
    facet_row="taxon",
    facet_col="area",
    markers=True,
)

fig

In [None]:
# Time-series figure for the 'arabiensis' rows of the per-location table (all areas together).
plot_freq_time_series(freq.query("taxon=='arabiensis'"))

In [None]:
# Time-series figure for the 'gambiae' rows whose area is 'Bana Village'.
plot_freq_time_series(freq.query("taxon=='gambiae' and area == 'Bana Village'"))

In [None]:
# Time-series figure for the 'gambiae' rows whose area is 'Souroukoudinga'.
plot_freq_time_series(freq.query("taxon=='gambiae' and area == 'Souroukoudinga'"))