In [7]:
import numpy as np
import csv
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import math
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import re
import collections
import string
from collections import OrderedDict
import random
from collections import Counter
import statistics
num_regex = re.compile(r'\d+')
import os
import operator
import pandas as pd

## This notebook is used to plot the genotype distributions for some given runs (the pink and blue plots)

In [4]:
#number of runs per simulation session; getGenoDistbDict uses runID//nRuns to pick the session date
nRuns = 100
#presumably the first runID of each session, in the same order as dates - TODO confirm (not used in the cells shown here)
runID_offsets = [0, 100]
duration = 4368 #duration of sims in hours - equivalent to 26 weeks (26*7*24 = 4368)
dates = ["24-Sep-2020", "09-Oct-2020"] #dates the simulations were performed on; they also appear in the data directory names
pc_res = [14, 15, 16] #percentages of the populations which are resistant to the applied biocide
phase2_str = "phase2" #phase label appended to "geno_distb_data_" to form the top-level data directory name

#parameters for the log normal distributions used, one pair per percentage resistant
#[scale, sigma] - only sigma (index 1) is read below, to build the event_counters filenames
log_norm_params_14pcRes = [2.703747953786337, 0.5690825284230452]
log_norm_params_15pcRes = [2.6133256846855746, 0.6260058161550592]
log_norm_params_16pcRes = [2.47772924764521, 0.7060073500033884]

In [5]:
def getFilepathToGenoRuns(date, pc_res, phase):
    '''
    builds the relative path of the directory holding the genotype distributions
    (all the run_ID directories) of one simulation session.

    date: date string of the session, as it appears in the directory name
    pc_res: percentage of the population which is resistant
    phase: phase label appended to "geno_distb_data_"

    returns: the directory path as a string, ending in "/"
    '''
    path_pieces = [
        "geno_distb_data_", phase, "/",          #top-level directory for this phase
        str(pc_res), "_resistant-", date, "/",   #session directory
    ]

    return "".join(path_pieces)

def getEventCountersDataframe(date, pc_res, phase, sigma, duration):
    '''
    reads the event counters csv file of one simulation session into a dataframe.

    The file sits in the same directory as the run_ID directories (see getFilepathToGenoRuns) and is named
    <pc_res>_resistant-<date>-event_counters-sigma=<sigma to 5 d.p.>-t=<duration>.0.csv

    date: date string of the session, as it appears in the directory and file names
    pc_res: percentage of the population which is resistant
    phase: phase label appended to "geno_distb_data_"
    sigma: sigma of the log normal distribution used; formatted to 5 decimal places in the filename
    duration: duration of the simulations; ".0" is appended to it in the filename

    returns: pandas dataframe with the contents of the csv file
    '''
    session_name = str(pc_res)+"_resistant-"+date
    counters_filename = session_name+"-event_counters-sigma="+"{:.5f}".format(sigma)+"-t="+str(duration)+".0.csv"

    #reuse the shared helper for the directory part so the two paths can't drift apart
    return pd.read_csv(getFilepathToGenoRuns(date, pc_res, phase)+counters_filename)


def getListOfMeasurementTimes(directory_name):
    '''
    for each runID directory, this gets the filenames and extracts a list of the times they were sampled at.
    directory_name is of form path_to_files/runID_<n>

    The time is taken to be the first integer or decimal number in each filename.
    Directory entries with no number in their name (e.g. hidden system files) are skipped.

    returns: sorted (numerically ascending) list of the time vals, in string form with 2 decimal places
    '''
    #first integer or decimal number in the filename, e.g. "2009.00" in "geno_distb-t=2009.00.csv"
    time_pattern = re.compile(r'(\d+(?:\.\d+)?)')

    time_list = []
    for filename in os.listdir(directory_name):
        time_match = time_pattern.search(filename)
        if time_match is None:
            #no number in the name, so this can't be a measurement file; calling .group on None would raise
            continue
        time_list.append(float(time_match.group(0)))

    #sort as floats (not strings) so that e.g. 10.00 comes after 2.50
    return ["{:.2f}".format(t) for t in sorted(time_list)]

In [8]:
#sigma (index 1 of each [scale, sigma] pair) is needed to locate the event_counters file
#24-Sep session: 14%, 15% and 16% resistant
(event_counters_14pc_24Sep,
 event_counters_15pc_24Sep,
 event_counters_16pc_24Sep) = [
    getEventCountersDataframe(dates[0], pc, phase2_str, params[1], duration)
    for pc, params in zip(pc_res, [log_norm_params_14pcRes, log_norm_params_15pcRes, log_norm_params_16pcRes])
]

#09-Oct session: only 14% and 15% resistant are loaded (zip stops after the two parameter pairs given)
(event_counters_14pc_09Oct,
 event_counters_15pc_09Oct) = [
    getEventCountersDataframe(dates[1], pc, phase2_str, params[1], duration)
    for pc, params in zip(pc_res, [log_norm_params_14pcRes, log_norm_params_15pcRes])
]

In [17]:
#pool the event counters of every loaded session for each percentage resistant, renumbering the rows from 0
#(only the 24-Sep session is loaded for 16%, so that frame is just re-indexed)
all_event_counters_14pc, all_event_counters_15pc, all_event_counters_16pc = [
    pd.concat(session_frames, ignore_index=True)
    for session_frames in (
        [event_counters_14pc_24Sep, event_counters_14pc_09Oct],
        [event_counters_15pc_24Sep, event_counters_15pc_09Oct],
        [event_counters_16pc_24Sep],
    )
]

#show the 14% resistant runs with a non-zero bf_thickness
all_event_counters_14pc.loc[all_event_counters_14pc["bf_thickness"] > 0]

Unnamed: 0,runID,bf_thickness,exit_time,final_pop,avg_pop,n_deaths,n_detachments,n_immigrations,n_migrations,n_replications
17,17,40,3595,16425,2328,182733,21925,71603,0,149475
38,38,2,4368,1057,456,77745,26698,87758,0,17737
48,48,40,2758,16526,2944,168982,16467,55122,0,146848
49,49,23,4368,9611,1065,122224,26357,87347,0,70840
66,66,40,3735,16501,2277,188461,22315,74735,0,152537
79,79,1,4368,660,392,73363,26369,87588,0,12799
103,103,2,4368,895,400,73919,26382,87558,0,13633
184,184,40,2304,16426,3447,167184,13664,46365,0,150904
193,193,3,4368,1171,405,73928,26247,87118,0,14223


In [15]:
def getGenoDistbDict(runID, pc_res, phase_val):
    '''
    gets a dict containing the geno distbs over time for a single run.
    each key is the time step (the sampling time truncated to an int).
    each entry is a dataframe containing the genotypes in each microhabitat (one column per microhabitat)

    runID: ID of the run; runID//nRuns selects the session date from the global dates list
    pc_res: percentage of the population which is resistant
    phase_val: phase label appended to "geno_distb_data_"

    returns: dict of {int time: dataframe of floats}
    '''

    runID_key = "runID_"+str(runID)
    date = dates[runID//nRuns] #gets the relevant date from the list of dates, according to the runID (100 runs per session, starting at 0)

    directoryPath = getFilepathToGenoRuns(date=date, pc_res=pc_res, phase=phase_val)
    #directoryPath already ends in "/", so join the parts rather than adding another separator
    filepath_runID = os.path.join(directoryPath, runID_key)

    geno_time_dict = {} #dictionary containing geno dataframes for each timestep
    time_list = getListOfMeasurementTimes(filepath_runID) #sorted list of the times that the genos were sampled at in this run

    for t in time_list:

        filepath_time = os.path.join(filepath_runID, "geno_distb-t="+t+".csv")

        #the file is split by hand rather than with pd.read_csv; rows of different lengths get padded
        #by the DataFrame constructor. The context manager makes sure the file is closed again.
        with open(filepath_time, 'r') as geno_file:
            geno_rows = [line.strip().split(',') for line in geno_file]

        #need to swap the rows and columns so that the microhabitat is the key in the dataframe
        geno_df = pd.DataFrame(geno_rows).T

        new_header = geno_df.iloc[0] #grab the first row for the header
        geno_df = geno_df[1:] #take the data less the header row
        geno_df.columns = new_header #set the header row as the df header

        #the padding added for the shorter rows becomes NaN here
        geno_df = geno_df.astype(float)

        #use the integer part of the time as the key to make reading it in easier
        #t always has 2 decimal places, so [:-3] removes the decimal point and decimal numbers
        #(this truncates rather than rounds)
        geno_time_dict[int(t[:-3])] = geno_df

    return geno_time_dict
    
    

In [26]:
#genotype distributions over time for run 184 of the 14% resistant data (one of the runs listed with bf_thickness 40 in the table above)
x = getGenoDistbDict(184, 14, phase2_str)

In [64]:
def plotGenoDistbsInSystemOverTime(geno_dict):
    '''
    This is the method that makes the big blue and pink plots, showing the distribution of genotypes over time throughout the system
    Input is the geno_dict, where each key is the timestep and each entry is a dataframe where each column is the genotypes in each microhabitat

    NOTE: no plotting is done yet - at the moment this only prints, for each timestep, how many
    bacteria share each (rounded) genotype. Nothing is returned.
    '''

    #make a new dict where each key is the timestep, each entry is the dataframe values condensed into a single array (2D dataframe -> 1D array)
    collated_geno_dict = {}
    max_geno_val = 0. #this will be updated to be the genotype of the most resistant bacteria in the system, used for scaling the colourmap

    for t, geno_df in geno_dict.items():
        geno_vals = geno_df.values

        #gets all the genotypes from the dataframe and removes NaNs (boolean masking already gives a 1D array)
        #round the genotypes to 0 decimal places so that similar genotypes are counted together
        collated_geno_dict[t] = np.around(geno_vals[~np.isnan(geno_vals)], decimals=0)

        #np.max raises a ValueError on an empty array, so skip timesteps where no genotypes are present
        if collated_geno_dict[t].size > 0:
            max_geno_val = max(np.max(collated_geno_dict[t]), max_geno_val)

        geno_counts = Counter(collated_geno_dict[t])
        print(geno_counts.values())
        
    
    

In [65]:
#prints one line of genotype counts per sampled timestep of the run loaded into x
plotGenoDistbsInSystemOverTime(x)

dict_values([1, 1, 2, 1])
dict_values([29, 59, 44, 16, 42, 3, 37, 1, 10, 2, 3, 2, 2, 10, 2])
dict_values([86, 49, 10, 61, 30, 26, 13, 3, 9, 30, 3, 11, 1, 1])
dict_values([50, 85, 37, 1, 10, 20, 20, 48, 6, 1, 37, 8, 1, 1, 14])
dict_values([86, 69, 1, 26, 11, 4, 26, 11, 5, 58, 2, 40, 1, 1, 9])
dict_values([54, 76, 14, 18, 24, 13, 3, 3, 4, 53, 46, 1, 1, 10])
dict_values([49, 97, 25, 14, 5, 36, 16, 13, 39, 7, 43, 1, 6])
dict_values([54, 88, 29, 17, 13, 38, 13, 1, 7, 3, 54, 1, 26, 1, 12])
dict_values([95, 26, 15, 25, 38, 46, 4, 9, 46, 4, 5, 4, 30, 12, 1, 3])
dict_values([91, 23, 26, 48, 33, 14, 14, 68, 4, 4, 3, 37, 1, 8])
dict_values([94, 13, 22, 42, 4, 16, 4, 19, 58, 38, 7])
dict_values([106, 10, 55, 36, 8, 22, 14, 2, 59, 9, 3, 35, 9])
dict_values([81, 56, 13, 53, 37, 2, 11, 12, 8, 7, 32, 1, 2, 14])
dict_values([60, 77, 32, 10, 12, 15, 4, 6, 49, 28, 1, 6, 34, 8, 1])
dict_values([59, 79, 25, 17, 14, 6, 10, 15, 10, 40, 3, 26, 3, 10, 1, 1])
dict_values([4, 16, 79, 22, 48, 2, 35, 47, 2, 13, 4,

In [42]:
#all the genotypes present at t=2009 as a single 1D array, with the NaN padding removed
y = x[2009].values[~np.isnan(x[2009].values)].flatten()

In [38]:
#display the genotype dataframe at t=2009: one column per microhabitat (mh_<n>), shorter columns padded with NaN
x[2009]

Unnamed: 0,mh_0,mh_1,mh_2,mh_3,mh_4,mh_5,mh_6,mh_7,mh_8,mh_9,...,mh_17,mh_18,mh_19,mh_20,mh_21,mh_22,mh_23,mh_24,mh_25,mh_26
1,18.4663,18.4663,18.4663,13.0048,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,...,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663
2,18.4663,18.4663,4.1396,12.6214,10.5755,18.4663,18.4663,18.4663,18.4663,18.4663,...,18.4663,18.4663,18.4663,4.7592,5.5410,18.4663,18.4663,3.9414,18.4663,18.4663
3,18.4663,18.4663,18.4663,18.4663,12.0618,7.1465,18.4663,18.4663,18.4663,18.4663,...,18.4663,18.4663,7.7978,3.7263,18.4663,18.4663,18.4663,18.4663,18.4663,3.5163
4,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,...,5.1287,18.4663,18.4663,18.4663,18.4663,18.4663,4.2778,18.4663,18.4663,18.4663
5,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,...,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,18.4663,4.2558,18.4663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,,,,,,,,,,,...,,,,,,,,,,
438,,,,,,,,,,,...,,,,,,,,,,
439,,,,,,,,,,,...,,,,,,,,,,
440,,,,,,,,,,,...,,,,,,,,,,


In [43]:
#display the flattened, NaN-free genotype array for t=2009
y

array([18.4663, 18.4663, 18.4663, ..., 11.3219, 18.4663, 16.6462])