### README:

This notebook computes the Shannon diversity measures (index H, equitability E and number of species S) for the 16% resistant, low migration rate (0.1) runs.

It is much the same as the original Shannon calculator notebook, but that one was getting too cluttered, so this is a tidier version.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.ticker as ticker
import matplotlib.pylab as pl
from itertools import cycle
import matplotlib.gridspec as gridspec
import glob
import collections
import math
import re
import os
from collections import defaultdict
from pathlib import Path

In [2]:
nRuns = 100 #presumably the no. of runs in each batch of simulations - TODO confirm
runID_offsets = [0, 100] #presumably the runID offsets of the batches - TODO confirm
duration = 4368 #duration of sims in hours - equivalent to 26 weeks (26*7*24); also appears in the event counters filename
dates = ["5-Jan-2022", "10-Jan-2022", "11-Jan-2022", "14-Jan-2022"] #dates the simulations were performed on; each date has its own data directory
pc_res = 16 #percentage of the population which is resistant to the applied biocide
phase2_str = "phase2" #phase label; forms part of the input and output directory names

#parameters for the log normal distribution used
#[scale, sigma] - sigma (index 1) is the value passed on to build the event counters filename
log_norm_params_16pcRes = [2.47772924764521, 0.7060073500033884]

In [3]:
# these path helpers are modified to include the migration rate string ("-migration_rate-0.1") in the directory and file names
def getFilepathToGenoRuns(date, pc_res, phase):
    '''
    builds the relative path of the directory holding the genotype distributions
    (i.e. all the runID_<n> sub-directories) for one batch of simulations.

    date: date string of the batch, e.g. "5-Jan-2022"
    pc_res: percentage of resistant bacteria, converted with str()
    phase: phase label, e.g. "phase2"

    returns: the path as a string, ending with a trailing "/"
    '''
    data_root = "geno_distb_data_" + phase
    batch_dirname = "-".join([str(pc_res) + "_resistant", date, "migration_rate-0.1"])

    #the empty last element provides the trailing "/"
    return "/".join([data_root, batch_dirname, ""])


def getEventCountersDataframe(date, pc_res, phase, sigma, duration):
    '''
    loads the event counters .csv file of one batch of simulations into a dataframe.

    date: date string of the batch, e.g. "5-Jan-2022"
    pc_res: percentage of resistant bacteria, converted with str()
    phase: phase label, e.g. "phase2"
    sigma: sigma of the log normal distribution; written to 5 decimal places in the filename
    duration: duration of the sims; the filename contains it as str(duration) followed by ".0"

    returns: dataframe read from the event counters file
    '''
    batch_name = str(pc_res) + "_resistant-" + date + "-migration_rate-0.1"
    sigma_str = "{:.5f}".format(sigma)
    counters_filename = batch_name + "-event_counters-sigma=" + sigma_str + "-t=" + str(duration) + ".0.csv"
    counters_path = "geno_distb_data_" + phase + "/" + batch_name + "/" + counters_filename

    return pd.read_csv(counters_path)


def getListOfMeasurementTimes(directory_name):
    '''
    for each runID directory, this gets the filenames and extracts a list of the times they were sampled at.
    directory_name is of form path_to_files/runID_<n>

    The time is taken to be the first number (integer or decimal) found in each filename.
    Entries whose name contains no number at all (e.g. .DS_Store) are skipped rather than
    crashing the whole run.

    returns: list of the time vals, sorted numerically (not alphabetically),
             in string form with 2 decimal places
    '''
    number_pattern = re.compile(r'(\d+(?:\.\d+)?)')

    time_list = []
    for filename in os.listdir(directory_name):
        number_match = number_pattern.search(filename)
        if number_match is None:
            #no number in the name, so this can't be one of the sampled time files
            continue
        time_list.append(float(number_match.group(0)))

    return ["{:.2f}".format(t) for t in sorted(time_list)]

In [4]:
def shannonIndexAndEquitabilitySolo(geno_dict):
    '''
    For a single run, this calculates the shannon variables H, E, S at every sampled time.
    Nothing is written to file here, the values are just returned.

    geno_dict: dictionary of {time: dataframe of genotype values}. NaN entries in the
               dataframes are ignored.

    returns: tuple of 5 lists, all in the iteration order of geno_dict:
             (times, no. of bacteria, shannon index H, shannon equitability E, no. of species S)

    If a timestep has no bacteria (S = 0) or only a single species (S = 1) then
    log(S) can't be used as the denominator, so E is set to 0 in both cases.
    '''
    t_list = []
    nBac_t_list = [] #no. of bacteria over time
    H_t_list = [] #shannon index over time
    E_t_list = [] #shannon equitability over time
    S_t_list = [] #no. of species over time

    for time_key, geno_df in geno_dict.items():

        #here we create an array with all the genotypes in it and remove any nans
        flat_vals = geno_df.values.flatten()
        geno_vals = flat_vals[~np.isnan(flat_vals)]
        nTot = geno_vals.size #total number of bacteria in the population
        genoCounts = collections.Counter(geno_vals) #number of members of each bacterial species in the system

        #shannon index of this run at time t (0.0 for an empty population)
        H = sum((-n/nTot*math.log(n/nTot) for n in genoCounts.values()), 0.0)
        S = len(genoCounts) #no. of different species in the system
        #shannon equitability; guard against log(0) (no bacteria) and division by log(1) = 0
        E = H/math.log(S) if S > 1 else 0.0

        t_list.append(time_key)
        nBac_t_list.append(int(nTot))
        H_t_list.append(H)
        E_t_list.append(E)
        S_t_list.append(S)

    return t_list, nBac_t_list, H_t_list, E_t_list, S_t_list

In [5]:
def writeShannonCalculationsToFile(t_data, nBac_data, H_data, E_data, S_data, pc_res, date, phase_val, growth_val, filename):
    '''
    loading in all the genotype data takes a very long time, so this method saves the
    calculated shannon values of a run to a .csv file so they can be reused later.

    t_data, nBac_data, H_data, E_data, S_data: equal length lists, written to the columns
                                               't', 'nBac', 'H', 'E', 'S' respectively
    pc_res, date, phase_val, growth_val: used to build the name of the output directory
    filename: name of the .csv file created inside the output directory
    '''
    top_dirname = "shannon_calculations_" + phase_val + "_" + growth_val
    batch_dirname = str(pc_res) + "_pc_res-" + date + "-migration_rate-0.1"

    #make sure the output directory (and any missing parent directories) exists
    output_dir = Path(top_dirname + "/" + batch_dirname)
    output_dir.mkdir(parents=True, exist_ok=True)

    #one column per calculated variable, one row per timestep
    shannon_df = pd.DataFrame(dict(t=t_data, nBac=nBac_data, H=H_data, E=E_data, S=S_data))
    shannon_df.to_csv(output_dir / filename)

In [6]:
def readGenoDistbAndProcessShannonData(directoryPath, pc_res, date, phase_val, growth_val, runID):
    '''
    This loads in all the genotype data for a single run, calculates the shannon variables
    for every sampled time and writes them to a .csv file.

    directoryPath: directory containing the runID_<n> sub-directories
    pc_res, date, phase_val: passed on to build the output directory and filename
    growth_val can either be "GROWTH" for runs which exhibit growth, "NOGROWTH" for runs which remain in the first microhabitat
    runID: number of the run to process; its files are read from directoryPath/runID_<runID>
    '''
    runID_key = "runID_"+str(runID)
    print(runID_key) #lets us see how far through the batch we are
    filepath_runID = directoryPath+"/"+runID_key
    geno_time_dict = {} #dictionary containing geno dataframes for each timestep

    time_list = getListOfMeasurementTimes(filepath_runID) #sorted list of the times that the genos were sampled at in this run

    for t in time_list:

        filepath_time = filepath_runID+"/geno_distb-t="+t+".csv"

        #the file is split manually on commas instead of using pd.read_csv
        #(presumably because the lines have differing lengths - TODO confirm).
        #the with block makes sure each file is closed as soon as it has been read
        with open(filepath_time, 'r') as geno_file:
            geno_rows = [line.strip().split(',') for line in geno_file]

        #need to swap the rows and columns so that the microhabitat is the key in the dataframe
        geno_df = pd.DataFrame(geno_rows).T
        new_header = geno_df.iloc[0] #grab the first row for the header
        geno_df = geno_df[1:] #take the data less the header row
        geno_df.columns = new_header #set the header row as the df header

        #missing entries of the shorter columns become nan here
        geno_df = geno_df.astype(float)

        #use the integer part of the time as the key to make reading it in easier
        #the [:-3] chops the decimal point and 2 decimal places off the string, so this
        #truncates rather than rounds - sample times less than 1 hour apart would overwrite each other
        geno_time_dict[int(t[:-3])] = geno_df

    #we now have the geno distb loaded, so can process it
    t_list, nBac_list, H_vs_t_list, E_vs_t_list, S_vs_t_list = shannonIndexAndEquitabilitySolo(geno_time_dict)
    #write the data to file
    writeShannonCalculationsToFile(t_list, nBac_list, H_vs_t_list, E_vs_t_list, S_vs_t_list, pc_res=pc_res, date=date, phase_val=phase_val, growth_val=growth_val,
                                   filename="shannon_calculations-"+str(pc_res)+"_pc_res-runID_"+str(runID)+"-migration_rate-0.1.csv")

In [7]:
def shannonProcessNRuns(shannon_function, runID_list, directoryPath, pc_res_val, date, phase_val_str, growth_val_str):
    '''
    Runs shannon_function once for every runID in runID_list, replacing the for loops
    that used to be used to process the data.

    shannon_function is called with the keyword arguments directoryPath, pc_res, date,
    phase_val, growth_val and runID. The first 5 are the same for every run.

    pc_res_val and growth_val_str are printed first and a blank line is printed at the end.
    '''
    #header lines for this batch
    print(pc_res_val, growth_val_str, sep="\n")

    shared_kwargs = dict(directoryPath=directoryPath, pc_res=pc_res_val, date=date,
                         phase_val=phase_val_str, growth_val=growth_val_str)
    for runID in runID_list:
        shannon_function(**shared_kwargs, runID=runID)

    print()

In [8]:
#directories holding the genotype distributions, one per simulation date (same order as the dates list)
(pc_res_16_05_Jan_filepath,
 pc_res_16_10_Jan_filepath,
 pc_res_16_11_Jan_filepath,
 pc_res_16_14_Jan_filepath) = [getFilepathToGenoRuns(date=sim_date, pc_res=pc_res, phase=phase2_str)
                               for sim_date in dates[:4]]

In [9]:
#event counters dataframes, one per simulation date (same order as the dates list)
#log_norm_params_16pcRes[1] is the sigma value that appears in the filenames
(event_counters_16pc_05Jan,
 event_counters_16pc_10Jan,
 event_counters_16pc_11Jan,
 event_counters_16pc_14Jan) = [getEventCountersDataframe(sim_date, pc_res, phase2_str, log_norm_params_16pcRes[1], duration)
                               for sim_date in dates[:4]]

In [10]:
# get the runs where growth occurred, i.e. the runIDs of the rows with bf_thickness > 0
(GROWTH_RUNS_16pc_05Jan,
 GROWTH_RUNS_16pc_10Jan,
 GROWTH_RUNS_16pc_11Jan,
 GROWTH_RUNS_16pc_14Jan) = [list(counters_df.loc[counters_df["bf_thickness"] > 0, "runID"])
                            for counters_df in (event_counters_16pc_05Jan,
                                                event_counters_16pc_10Jan,
                                                event_counters_16pc_11Jan,
                                                event_counters_16pc_14Jan)]

In [11]:
#process the runs with growth, one batch (simulation date) at a time
#each entry is (runIDs with growth, directory of the geno distbs, date of the batch)
#the 5-Jan batch is currently disabled (presumably already processed - TODO confirm)
for _batch_runIDs, _batch_filepath, _batch_date in [
    #(GROWTH_RUNS_16pc_05Jan, pc_res_16_05_Jan_filepath, dates[0]),
    (GROWTH_RUNS_16pc_10Jan, pc_res_16_10_Jan_filepath, dates[1]),
    (GROWTH_RUNS_16pc_11Jan, pc_res_16_11_Jan_filepath, dates[2]),
    (GROWTH_RUNS_16pc_14Jan, pc_res_16_14_Jan_filepath, dates[3]),
]:
    shannonProcessNRuns(shannon_function=readGenoDistbAndProcessShannonData, runID_list=_batch_runIDs,
                        directoryPath=_batch_filepath, pc_res_val=pc_res, date=_batch_date,
                        phase_val_str=phase2_str, growth_val_str="GROWTH")

16
GROWTH
runID_103
runID_107
runID_109
runID_117
runID_118
runID_126
runID_129
runID_141
runID_145
runID_151
runID_157
runID_161
runID_166
runID_193
runID_196
runID_198
runID_199

16
GROWTH
runID_200
runID_206
runID_211
runID_220
runID_221
runID_229
runID_233
runID_256
runID_257
runID_261
runID_262
runID_274
runID_276
runID_281
runID_283
runID_297

16
GROWTH
runID_308
runID_314
runID_321
runID_324
runID_326
runID_332
runID_333
runID_336
runID_344
runID_348
runID_350
runID_356
runID_357
runID_363
runID_364
runID_379
runID_388

