In [None]:
### Imports ###
%pylab inline
import pandas as pd
import seaborn as sns


from scipy.spatial.distance import euclidean
# from scipy.spatial.distance import jensenshannon
from scipy.spatial import distance_matrix
from scipy.stats import binom_test 
import scipy.stats

import collections
from itertools import combinations
import string

%load_ext autoreload
%autoreload 2

import Co_Evo_Func as cef

### Preprocessing ###

# Load the raw count table; the first spreadsheet column becomes the index.
counts = pd.read_excel("raw_data/counts_uni.xlsx", index_col=0)

# Translate transfers to generations (10.5 generations per transfer).
counts['Generation'] = 10.5 * counts['transfer']

# Species columns are the contiguous column range from 'Ea' through 'IN72'.
species = counts.loc[:, 'Ea':'IN72'].columns

# Total counts for each row; rows in which nothing was counted are discarded.
counts['total'] = counts[species].sum(axis=1)
counts = counts[counts['total'] > 0]

# For every row, list the species whose count is positive.
_detected = counts[species] > 0
counts['present'] = _detected.apply(lambda row: list(species[row.values]), axis=1)

# Leave only pairs and trios, then drop transfers 13 and 15.
_is_pair_or_trio = counts['sample_kind'].isin(['Pair', 'Trio'])
counts = counts[_is_pair_or_trio].reset_index(drop=True)
_skipped_transfer = counts['transfer'].isin([13, 15])
counts = counts[~_skipped_transfer].reset_index(drop=True)
# Samples that did not coexist for at least 70 generations are taken out of the analysis.

def coexistence(x):
    """Return True when every species named in x['sample'] is listed in x['present']."""
    return all(sp in x['present'] for sp in x['sample'].split('_'))


counts['coexistence'] = counts.apply(coexistence, axis=1)


def excluded(x):
    """Return the first species of x['sample'] missing from x['present'], or 'non'."""
    for sp in x['sample'].split('_'):
        if sp not in x['present']:
            return sp
    return 'non'


counts['excluded'] = counts.apply(excluded, axis=1)


def last_coex(x):
    """Return the largest transfer of the group at which 'coexistence' is True."""
    return max(x['transfer'][x['coexistence']])


# One row per (sample_kind, sample, ident) holding its last coexisting transfer.
lc = counts.groupby(['sample_kind', 'sample', 'ident'])[['transfer', 'coexistence']].apply(last_coex)
lc = pd.DataFrame(lc).reset_index().rename(columns={0: 'last_coex'})
# 'excluded' values recorded at transfer 38 for each ident (stored as an array per row).
lc['excluded'] = lc.apply(
    lambda x: counts['excluded'][counts['ident'] == x['ident']][counts['transfer'] == 38].values,
    axis=1,
)

# Drop every ident whose last coexisting transfer is below 7, then drop the Pa_Fj sample.
_short_lived = lc['ident'][lc['last_coex'] < 7]
counts = counts[~counts['ident'].isin(_short_lived)].reset_index(drop=True)
counts = counts[counts['sample'] != 'Pa_Fj'].reset_index(drop=True)

# Spot contaminations

def spot_contaminations(x):
    """Return True when x['present'] lists a species that is not part of x['sample']."""
    expected = x['sample'].split('_')
    return not all(sp in expected for sp in x['present'])


# When a species which shouldn't be present is present, set cont as True.
counts['cont'] = counts.apply(spot_contaminations, axis=1)


def first_cont(x):
    """Return the earliest contaminated transfer of the group, or 38 when there is none."""
    flagged = x['transfer'][x['cont']]
    return min(flagged.values) if len(flagged) != 0 else 38


# If cont == True, where did it appear first (38 is the "never" sentinel).
fcont = counts[counts['sample_kind'].isin(['Pair', 'Trio'])].groupby(['ident'])[['transfer', 'cont']].apply(first_cont)
fcont = pd.DataFrame(fcont).reset_index()
for t, ide in zip(fcont[fcont[0] != 38][0], fcont[fcont[0] != 38]['ident']):
    # All datapoints after contamination appeared are considered contaminated.
    # This must be one .loc assignment: chained indexing of the form
    # counts['cont'][mask_a][mask_b] = True writes to a temporary copy and
    # leaves `counts` unchanged.
    counts.loc[(counts['ident'] == ide) & (counts['transfer'] > t), 'cont'] = True

# Record which idents were flagged, then keep only the uncontaminated rows.
contaminated = counts['ident'][counts['cont']].unique()
with open("proccesed_data/contamination.txt", "w") as output:
    output.write(str(contaminated))

counts = counts[~counts['cont']].reset_index(drop=True)

 # New counts table only with fractions
cf = counts.copy()
# Turn the raw species counts into fractions of each row's total.
cf[species] = cf[species].div(cf['total'], axis=0)
#counts_frac['detection_limit'] = 1/counts_frac['total']

# Boolean mask of the rows in which exactly two species are present.
only_pairs = cf['present'].apply(len) == 2


def first_species(x):
    """Return the first species name of an underscore-separated sample name."""
    return x.split('_')[0]


def std_binom(x):
    """Return sqrt(p * (1 - p) / (total + 1)), p being the fraction of the sample's first species."""
    p = x[first_species(x['sample'])]
    return np.sqrt(p * (1 - p) / (x['total'] + 1))


# Binomial standard deviation, computed only for the two-species rows.
cf['std'] = cf[only_pairs].apply(std_binom, axis=1)

 # New counts table only with fractions and pseudocounts
    
# Start from a copy of the counts and let cef.add_pseudocounts process each row.
cf_psu = counts.copy().apply(cef.add_pseudocounts, axis=1)
# Recompute the row totals afterwards, then convert the species counts to fractions.
cf_psu['total'] = cf_psu[species].sum(axis=1)
cf_psu[species] = cf_psu[species].apply(lambda col: col / cf_psu['total'])

# Boolean mask of the rows in which exactly two species are present.
only_pairs = cf_psu['present'].apply(len) == 2


def first_species(x):
    """Return the first species name of an underscore-separated sample name."""
    return x.split('_')[0]


def std_binom(x):
    """Return sqrt(p * (1 - p) / (total + 1)), p being the fraction of the sample's first species."""
    p = x[first_species(x['sample'])]
    return np.sqrt(p * (1 - p) / (x['total'] + 1))


# Binomial standard deviation, computed only for the two-species rows.
cf_psu['std'] = cf_psu[only_pairs].apply(std_binom, axis=1)

# Per-'ident' change metric computed by cef.euc_change on the species fractions.
# NOTE(review): the helper lives outside this file; presumably it returns the
# Euclidean distance between consecutive timepoints -- confirm.
cf['change'] = cf.groupby('ident')[species].apply(cef.euc_change)# quantify the distance of each timepoint from the last one
# Normalize by sqrt(number of species named in the sample).
cf['change'] = cf.apply(lambda x:x['change']/sqrt(len(x['sample'].split('_'))), axis = 1)
#cf['Shannon'] = cf.loc[:,'Ea':'IN72'].apply(cef.Shannon, axis = 1) # calculate the alpha diversity for each community
# Euclidean distance of each row from the species fractions of the same 'ident'
# at transfer 7; NaN when that ident has no transfer-7 row.
cf['dist_from_ee'] = cf.apply(lambda x: euclidean(x[species], cf[species][cf['transfer']==7][cf['ident'] == x['ident']]) if len(cf[species][cf['transfer']==7][cf['ident'] == x['ident']])!= 0 else NaN, axis = 1)
# Normalize by sqrt(number of species named in the sample).
cf['dist_from_ee'] = cf.apply(lambda x:x['dist_from_ee']/sqrt(len(x['sample'].split('_'))), axis = 1)
# Euclidean distance of each row from the mean species fractions of all rows
# sharing its 'sample' and 'transfer' (presumably the replicates of that sample).
cf['dist_from_cent'] = cf.apply(lambda x:euclidean(x[species], cf[species][cf['sample']==x['sample']][cf['transfer']==x['transfer']].mean()), axis =1)


# OD table, without the 'Plate', 'Well', 'Z_score' and 'growth' columns.
OD = pd.read_excel("proccesed_data/OD_proccesed.xlsx")
OD = OD.drop(columns=['Plate', 'Well', 'Z_score', 'growth'])

# Attach the OD readings to the fraction table, matching on ident and transfer.
frac_od = cf.merge(OD[['ident', 'transfer', 'OD', 'smoothed']], on=['ident', 'transfer'])
# Scale every species fraction by the row's smoothed OD.
frac_od[species] = frac_od[species].mul(frac_od['smoothed'], axis=0)

# Jittered generation for some plots: normal noise (mean 0, sd 2) added to
# 'Generation'. `random` is the numpy.random module exposed by %pylab.
cf['Gen_jit'] = cf['Generation'] + random.normal(0, 2, len(cf))
# Don't jitter t0. Written through .loc so the assignment lands in cf itself;
# the chained form cf['Gen_jit'][mask] = 0 is not guaranteed to modify the
# frame (it is a no-op under pandas Copy-on-Write).
cf.loc[cf['transfer'] == 0, 'Gen_jit'] = 0

# cf.to_excel('proccesed_data/cf.xlsx', index=False)
# cf_psu.to_excel('proccesed_data/cf_psu.xlsx', index=False)
# frac_od.to_excel('proccesed_data/frac_od.xlsx', index=False)
#counts.to_excel('proccesed_data/counts_proccesed.xlsx', index = False)

# Reload the processed tables from disk; these replace the in-memory versions.
# NOTE(review): columns that held Python lists (e.g. 'present') presumably come
# back from the spreadsheet as text -- confirm before treating them as lists.
cf = pd.read_excel('proccesed_data/cf.xlsx') # fractionized count table
cf_psu = pd.read_excel('proccesed_data/cf_psu.xlsx') # fractionized count table with pseudocounts
frac_od = pd.read_excel('proccesed_data/frac_od.xlsx')

# NOTE(review): this reads the OD table from raw_data/, whereas the
# preprocessing step reads the same file name from proccesed_data/ -- confirm
# which location is intended.
OD = pd.read_excel("raw_data/OD_proccesed.xlsx") # OD table
OD = OD.drop(['Plate', 'Well', 'Z_score', 'growth'], axis=1)

pairs = cf['sample'][cf['sample_kind'] == 'Pair'].unique() # a list of pairs in the experiment
trios = cf['sample'][cf['sample_kind'] == 'Trio'].unique() # a list of trios in the experiment
species = cf.loc[:, 'Ea':'IN72'].columns # list of species in the experiment
transfers = cf['transfer'].unique()
gens = cf['Generation'].unique()
# Add 10 to the totals of the transfer-0 rows. Written through .loc so the
# update lands in cf itself; the chained form cf['total'][mask] = ... is not
# guaranteed to modify the frame (it is a no-op under pandas Copy-on-Write).
# NOTE(review): the reason for the +10 offset is not evident from this file.
_at_t0 = cf['transfer'] == 0
cf.loc[_at_t0, 'total'] = cf.loc[_at_t0, 'total'] + 10
idents = cf['ident'].unique()
