In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.ticker as ticker
import matplotlib.pylab as pl
from itertools import cycle
import matplotlib.gridspec as gridspec
import glob
import collections
import math
import re
import os
from collections import defaultdict

In [19]:
nRuns = 100 #reduced to 10 for now to speed things up (nRuns is the no. of files we load into the notebook)
duration = 100.
nSamples = 50

#these durations are used for the second batch of runs, where phase_4 was run for longer to get
#more comparative results for the plots of things vs N_bacteria
#not used now that there's the new way of getting the time values and the runs are analysed seperately
duration_phase2 = 100.
nSamples_phase2 = 50
duration_phase4 = 300.
nSamples_phase4 = 150

# phase2_filepath = "speciesComp-phase2-fixedImm/geno_distbs"
# phase4_filepath = "speciesComp-phase4-fixedImm/geno_distbs"

# phase2_bigK_filepath = "speciesComp-phase2-fixedImm-bigK-mostPrecise/geno_distbs"
# phase4_bigK_filepath = "speciesComp-phase4-fixedImm-bigK-mostPrecise/geno_distbs"

#these are for the varying immigration rate runs
phase2_bigK_filepath = "speciesComp-phase2-data/speciesComp-phase2-varyingImm-bigK-mostPrecise/geno_distbs"
phase4_bigK_filepath = "speciesComp-phase4-data/speciesComp-phase4-varyingImm-bigK-mostPrecise/geno_distbs"

In [20]:
def getListOfMeasurementTimes(directory_name):
    '''
    for each runID directory, this gets the filenames and extracts a list of the times they were sampled at.
    directory_name is of form path_to_files/runID_<n>
    
    returns: sorted list of the time vals, in string form with 2 decimal places
    '''
    time_list = []
    def get_numbers_from_filename(filename):
        return re.search(r'(\d+(?:\.\d+)?)', filename).group(0)
    
    for filename in os.listdir(directory_name):
        time_list.append(float(get_numbers_from_filename(filename)))

    return ["{:.2f}".format(float(t)) for t in sorted(time_list)]

In [21]:
def shannonIndexAndEquitabilitySolo(geno_dict):
    '''
    This does the same as the other shannon processing stuff, but just for a single run.
    We'll save all the individual calculations to .csv files, then combine them into a dataframe later
    '''
    
    times = []
    nBac_t = defaultdict(list) #no. of bacteria over time
    H_t = defaultdict(list) #shannon index over time
    E_t = defaultdict(list) #shannon equitability over time
    S_t = defaultdict(list) #no. of species over time
    
    times = geno_dict.keys()
    #print(times)
    for time_key in times:

        #here we create an array with all the genotypes in it and remove any nans
        geno_vals = geno_dict[time_key].values.flatten()[~np.isnan(geno_dict[time_key].values.flatten())]
        nTot = geno_vals.size #total number of bacteria in the population
        genoCounts = collections.Counter(geno_vals) #number of members of each bacterial species in the system

        H = sum([-n/nTot*math.log(n/nTot) for _, n in genoCounts.items()]) #shannon index of this run at time t
        S = len(genoCounts.keys()) #no. of different species in the system
        logS_adjusted = 1 if S == 1 else math.log(S)
        E = H/logS_adjusted #shannon equitability

        nBac_t[time_key].append(int(nTot))
        H_t[time_key].append(H)
        E_t[time_key].append(E)
        S_t[time_key].append(S)
        
    #this is a very poor way of doing things, but in a rush and just trying to 
    #get a good enough job done atm
    nBac_t_list = [b[0] for b in nBac_t.values()]
    H_t_list = [h[0] for h in H_t.values()]
    E_t_list = [e[0] for e in E_t.values()]
    S_t_list = [s[0] for s in S_t.values()]
    
    return list(H_t.keys()), nBac_t_list, H_t_list, E_t_list, S_t_list

In [22]:
def shannonIndexAndEquitabilitySolo_EDGE(geno_dict):
    '''
    This does the same as the other shannon processing stuff, but just for a single run.
    We'll save all the individual calculations to .csv files, then combine them into a dataframe later
    
    This is just for the edge microhabitats
    '''
    
    times = []
    nBac_t = defaultdict(list) #no. of bacteria over time
    H_t = defaultdict(list) #shannon index over time
    E_t = defaultdict(list) #shannon equitability over time
    S_t = defaultdict(list) #no. of species over time
    
    times = geno_dict.keys()
    #print(times)
    for time_key in times:
        
        #get the last key in this timestep, hopefully it's the edge one
        edge_mh_key = geno_dict[time_key].keys()[-1]
        #here we create an array with all the genotypes in it and remove any nans (this version should just be of the edge values)
        geno_vals = geno_dict[time_key][edge_mh_key].values.flatten()[~np.isnan(geno_dict[time_key][edge_mh_key].values.flatten())]
        nTot = geno_vals.size #total number of bacteria in the population
        genoCounts = collections.Counter(geno_vals) #number of members of each bacterial species in the system

        H = sum([-n/nTot*math.log(n/nTot) for _, n in genoCounts.items()]) #shannon index of this run at time t
        S = len(genoCounts.keys()) #no. of different species in the system
        logS_adjusted = 1 if S <= 1 else math.log(S)
        E = H/logS_adjusted #shannon equitability

        nBac_t[time_key].append(int(nTot))
        H_t[time_key].append(H)
        E_t[time_key].append(E)
        S_t[time_key].append(S)
        
    #this is a very poor way of doing things, but in a rush and just trying to 
    #get a good enough job done atm
    nBac_t_list = [b[0] for b in nBac_t.values()]
    H_t_list = [h[0] for h in H_t.values()]
    E_t_list = [e[0] for e in E_t.values()]
    S_t_list = [s[0] for s in S_t.values()]
    
    return list(H_t.keys()), nBac_t_list, H_t_list, E_t_list, S_t_list

In [23]:
def writeShannonCalculationsToFile(t_data, nBac_data, H_data, E_data, S_data, phase_val, filename):
    '''
    it takes an absolute age to load in all the genotype data, so this method will write the calculated values to a file
    '''
    #create a dataframe containing all the calculated values
    #first we need a dictionary with the data in it
    collated_data = {'t':t_data, 'nBac':nBac_data, 'H':H_data, 'E':E_data, 'S':S_data}
    df = pd.DataFrame(collated_data)
    df.to_csv("species_comp_calculations_varying_immigration/"+phase_val+"_data/"+filename)

In [24]:
def writeShannonCalculationsToFile_EDGE(t_data, nBac_data, H_data, E_data, S_data, phase_val, filename):
    '''
    it takes an absolute age to load in all the genotype data, so this method will write the calculated values to a file
    this method is just for the edge data
    '''
    #create a dataframe containing all the calculated values
    #first we need a dictionary with the data in it
    collated_data = {'t':t_data, 'nBac':nBac_data, 'H':H_data, 'E':E_data, 'S':S_data}
    df = pd.DataFrame(collated_data)
    df.to_csv("species_comp_calculations_varying_immigration/"+phase_val+"_data_EDGE/"+filename)

In [25]:
def readGenoDistbAndProcessShannonData(directoryPath, phase_val, runID):
    '''
    Loading all the dataframes into one master dictionary was causing serious memory issues.
    So here we'll just load in the geno data and process it for a single run at a time.
    '''
    runID_key = "runID_"+str(runID)
    print(runID_key)
    filepath_runID = directoryPath+"/"+runID_key
    geno_time_dict = {} #dictionary containing geno dataframes for each timestep

    time_list = getListOfMeasurementTimes(filepath_runID) #sorted list of the times that the genos were sampled at in this run
    
    for t in time_list:
            
        filepath_time = filepath_runID+"/geno_distb-t="+t+".csv"

        #need to swap the rows and columns so that the microhabitat is the key in the dataframe
        #geno_df = pd.read_csv(filename, header=None).T
        geno_df = pd.DataFrame([line.strip().split(',') for line in open(filepath_time, 'r')]).T
        #geno
        new_header = geno_df.iloc[0] #grab the first row for the header
        geno_df = geno_df[1:] #take the data less the header row
        geno_df.columns = new_header #set the header row as the df header

        geno_df = geno_df.astype(float)

        #round the time to the nearest integer value to make reading it in easier
        #the [-3] is so the decimal point and decimal numbers are removed when casting the string to an int
        geno_time_dict[int(t[:-3])] = geno_df

    #we now have the geno distb loaded, so can process it
    t_list, nBac_list, H_vs_t_list, E_vs_t_list, S_vs_t_list = shannonIndexAndEquitabilitySolo(geno_time_dict)
    #write the data to file 
    writeShannonCalculationsToFile(t_list, nBac_list, H_vs_t_list, E_vs_t_list, S_vs_t_list, phase_val, "shannon_calculations_precisest_bigK_"+phase_val+"_runID-"+str(runID)+".csv")
    del(geno_time_dict)

In [26]:
def readGenoDistbAndProcessShannonData_EDGE(directoryPath, phase_val, runID):
    '''
    Loading all the dataframes into one master dictionary was causing serious memory issues.
    So here we'll just load in the geno data and process it for a single run at a time.
    this method is just for the edge microhabitats
    '''
    runID_key = "runID_"+str(runID)
    print(runID_key)
    filepath_runID = directoryPath+"/"+runID_key
    geno_time_dict = {} #dictionary containing geno dataframes for each timestep

    time_list = getListOfMeasurementTimes(filepath_runID) #sorted list of the times that the genos were sampled at in this run
    
    for t in time_list:
            
        filepath_time = filepath_runID+"/geno_distb-t="+t+".csv"

        #need to swap the rows and columns so that the microhabitat is the key in the dataframe
        #geno_df = pd.read_csv(filename, header=None).T
        geno_df = pd.DataFrame([line.strip().split(',') for line in open(filepath_time, 'r')]).T
        #geno
        new_header = geno_df.iloc[0] #grab the first row for the header
        geno_df = geno_df[1:] #take the data less the header row
        geno_df.columns = new_header #set the header row as the df header

        geno_df = geno_df.astype(float)

        #round the time to the nearest integer value to make reading it in easier
        #the [-3] is so the decimal point and decimal numbers are removed when casting the string to an int
        geno_time_dict[int(t[:-3])] = geno_df

    #we now have the geno distb loaded, so can process it
    t_list, nBac_list, H_vs_t_list, E_vs_t_list, S_vs_t_list = shannonIndexAndEquitabilitySolo_EDGE(geno_time_dict)
    #write the data to file 
    writeShannonCalculationsToFile_EDGE(t_list, nBac_list, H_vs_t_list, E_vs_t_list, S_vs_t_list, phase_val, "shannon_calculations_precisest_bigK_"+phase_val+"_runID-"+str(runID)+".csv")
    del(geno_time_dict)

In [27]:
readGenoDistbAndProcessShannonData(phase4_bigK_filepath, "phase4", 14)

runID_14


In [29]:
for runID in range(nRuns):
    readGenoDistbAndProcessShannonData(phase2_bigK_filepath, "phase2", runID)

runID_0
runID_1
runID_2
runID_3
runID_4
runID_5
runID_6
runID_7
runID_8
runID_9
runID_10
runID_11
runID_12
runID_13
runID_14
runID_15
runID_16
runID_17
runID_18
runID_19
runID_20
runID_21
runID_22
runID_23
runID_24
runID_25
runID_26
runID_27
runID_28
runID_29
runID_30
runID_31
runID_32
runID_33
runID_34
runID_35
runID_36
runID_37
runID_38
runID_39
runID_40
runID_41
runID_42
runID_43
runID_44
runID_45
runID_46
runID_47
runID_48
runID_49
runID_50
runID_51
runID_52
runID_53
runID_54
runID_55
runID_56
runID_57
runID_58
runID_59
runID_60
runID_61
runID_62
runID_63
runID_64
runID_65
runID_66
runID_67
runID_68
runID_69
runID_70
runID_71
runID_72
runID_73
runID_74
runID_75
runID_76
runID_77
runID_78
runID_79
runID_80
runID_81
runID_82
runID_83
runID_84
runID_85
runID_86
runID_87
runID_88
runID_89
runID_90
runID_91
runID_92
runID_93
runID_94
runID_95
runID_96
runID_97
runID_98
runID_99


In [30]:
for runID in range(nRuns):
    readGenoDistbAndProcessShannonData_EDGE(phase2_bigK_filepath, "phase2", runID)

runID_0
runID_1
runID_2
runID_3
runID_4
runID_5
runID_6
runID_7
runID_8
runID_9
runID_10
runID_11
runID_12
runID_13
runID_14
runID_15
runID_16
runID_17
runID_18
runID_19
runID_20
runID_21
runID_22
runID_23
runID_24
runID_25
runID_26
runID_27
runID_28
runID_29
runID_30
runID_31
runID_32
runID_33
runID_34
runID_35
runID_36
runID_37
runID_38
runID_39
runID_40
runID_41
runID_42
runID_43
runID_44
runID_45
runID_46
runID_47
runID_48
runID_49
runID_50
runID_51
runID_52
runID_53
runID_54
runID_55
runID_56
runID_57
runID_58
runID_59
runID_60
runID_61
runID_62
runID_63
runID_64
runID_65
runID_66
runID_67
runID_68
runID_69
runID_70
runID_71
runID_72
runID_73
runID_74
runID_75
runID_76
runID_77
runID_78
runID_79
runID_80
runID_81
runID_82
runID_83
runID_84
runID_85
runID_86
runID_87
runID_88
runID_89
runID_90
runID_91
runID_92
runID_93
runID_94
runID_95
runID_96
runID_97
runID_98
runID_99


In [32]:
'''
run 74 had some issues, either ignore it or replace it with some other run
maybe just have a continue statement when we iterate over the runs and get to 74
'''
for runID in range(75, nRuns):
    readGenoDistbAndProcessShannonData(phase4_bigK_filepath, "phase4", runID)

runID_75
runID_76
runID_77
runID_78
runID_79
runID_80
runID_81
runID_82
runID_83
runID_84
runID_85
runID_86
runID_87
runID_88
runID_89
runID_90
runID_91
runID_92
runID_93
runID_94
runID_95
runID_96
runID_97
runID_98
runID_99


In [34]:
for runID in range(75, nRuns):
    readGenoDistbAndProcessShannonData_EDGE(phase4_bigK_filepath, "phase4", runID)

runID_75
runID_76
runID_77
runID_78
runID_79
runID_80
runID_81
runID_82
runID_83
runID_84
runID_85
runID_86
runID_87
runID_88
runID_89
runID_90
runID_91
runID_92
runID_93
runID_94
runID_95
runID_96
runID_97
runID_98
runID_99
