In [1]:
## Bring in needed mods
import pandas as pd, numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
## Load in gff path
gff_path = '../DATA/xl280genome.gff3.csv.gz'
gff = pd.read_csv(gff_path,index_col=0)

In [3]:
## Load in fine-mapped progeny genetic variant data
geno_path = '../DATA/GENOTYPE/Cdx_fine_mapped_SNP-INDEL-blocked.csv.gz'
geno = pd.read_csv(geno_path,index_col=0)
geno.head()

Unnamed: 0,1,9,17,25,33,41,49,57,65,73,...,Ref,Alt,Chrom,AF,DPmin,DPmax,DPmean,ADmin,Co,Blocks
233,0,0,0,1,1,1,0,1,0,0,...,G,[A],Chr01,0.5,8,93,39.28125,0.888889,0.0,0
234,0,0,0,1,1,1,0,1,0,0,...,G,[A],Chr01,0.5,7,111,37.703125,0.875,0.0,0
235,0,0,0,1,1,1,0,1,0,0,...,T,[C],Chr01,0.5,7,107,40.671875,0.875,0.0,0
236,0,0,0,1,1,1,0,1,0,0,...,A,[G],Chr01,0.5,9,88,42.572917,0.9,0.0,0
238,0,0,0,1,1,1,0,1,0,0,...,A,[G],Chr01,0.5,9,87,37.75,0.857143,0.0,0


In [4]:
## Load in fludioxinl results for the fine mapped progeny
fludio_path = '../DATA/PHENOTYPE/FLUDIOXONIL/FM plate map.xlsx'
fludio = pd.read_excel(fludio_path,header=None)
fludio.columns = ['Plate','Row','Column','Segregant','Growth']
fludio = fludio[(fludio.Plate!='deletion 3a')]
fludio.head()

Unnamed: 0,Plate,Row,Column,Segregant,Growth
0,Plate FM1a,a,1,1,0.0
1,Plate FM1a,a,2,2,
2,Plate FM1a,a,3,3,
3,Plate FM1a,a,4,4,0.0
4,Plate FM1a,a,5,5,1.0


In [5]:
## Gather segregants
segs_ix = geno.columns.tolist().index('Pos')
segs = geno.columns.tolist()[:segs_ix]
segs[:5]

['1', '9', '17', '25', '33']

In [7]:
## Check parental genotype sums
geno[['93','94','95','96']].sum(axis=0)

93    73787
94    73787
95    73787
96        0
dtype: int64

In [8]:
## Check the other set of parental genotype sums
geno[['189','190','191','192']].sum(axis=0)

189    73787
190    73787
191    73787
192        0
dtype: int64

In [9]:
## make a list of chromosomes and cumulative ums
chrlist = sorted(geno.Chrom.unique())
chrlens = np.array([geno[(geno.Chrom==c)].Pos.max() 
           for c in chrlist])
cumpos = np.array([0] + list(np.cumsum(chrlens[:-1])))
cummax = cumpos[-1] + geno[(geno.Chrom=='Chr14')].Pos.max()
chrmid = (chrlens/2) + cumpos

In [49]:
## Gather ric8 variants and check the unique combiations (via the sums of genotypes) of genotypes
ric8 = gff[(gff.gene=='CNN01270') & (gff.type=='gene')][['start','end']].values[0]
ric8_gv = geno[(geno.Chrom=='Chr14') & (geno.Pos.isin(np.arange(*ric8+1)))]
ric8_gv[segs].sum(axis=0).unique()

array([15, 14,  0])

In [54]:
## Gather SSK1 variants
ssk1 = gff[(gff.gene=='CNB03090') & (gff.type=='gene')][['start','end']].values[0]
ssk1_gv = geno[(geno.Chrom=='Chr02') & (geno.Pos.isin(np.arange(*ssk1+1)))]
ssk1_gv.head()

Unnamed: 0,1,9,17,25,33,41,49,57,65,73,...,Ref,Alt,Chrom,AF,DPmin,DPmax,DPmean,ADmin,Co,Blocks
20777,0,0,0,0,0,0,1,0,0,0,...,A,[G],Chr02,0.075581,12,78,38.677083,0.923077,0.0,988
20778,0,0,0,0,0,0,1,0,0,0,...,G,[A],Chr02,0.075581,11,74,38.583333,0.916667,0.0,988
20779,0,0,0,0,0,0,1,0,0,0,...,G,[A],Chr02,0.075581,11,86,39.703125,0.916667,0.0,988
20782,0,0,0,0,0,0,1,0,0,0,...,G,[A],Chr02,0.075581,12,89,42.880208,0.923077,0.0,988
20783,0,0,0,0,0,0,1,0,0,0,...,T,[C],Chr02,0.081395,9,86,40.161458,0.9,1.0,989


In [53]:
## Check
ssk1_gv.shape,ssk1_gv[segs].sum(axis=0).unique()

((19, 203), array([ 0, 19,  1, 15]))

In [18]:
ssk1_zero_segs = ssk1_gv[segs].T[(ssk1_gv[segs].sum(axis=0) == 0)].index.tolist()
assert ssk1_gv[segs].T[(ssk1_gv[segs].sum(axis=0) == 0)].sum(axis=1).sum() == 0
len(ssk1_zero_segs)

171

In [19]:
ssk1_one_segs = ssk1_gv[segs].T[(ssk1_gv[segs].sum(axis=0) == ssk1_gv.shape[0])].index.tolist()
assert len(ssk1_one_segs)*ssk1_gv.shape[0] == ssk1_gv[segs].T[
    (ssk1_gv[segs].sum(axis=0) == ssk1_gv.shape[0])].sum(axis=1).sum()
len(ssk1_one_segs)

19

In [20]:
ssk2 = gff[(gff.gene=='CNL05560') & (gff.type=='gene')][['start','end']].values[0]
ssk2_gv = geno[(geno.Chrom=='Chr12') & (geno.Pos.isin(np.arange(*ssk2+1)))]

In [21]:
ssk2_gv.shape

(25, 203)

In [22]:
ssk2_gv.head()

Unnamed: 0,1,9,17,25,33,41,49,57,65,73,...,Ref,Alt,Chrom,AF,DPmin,DPmax,DPmean,ADmin,Co,Blocks
122104,1,1,1,1,1,1,1,1,1,1,...,G,[C],Chr12,0.860465,7,61,26.223958,0.875,2.0,10683
122109,1,1,1,1,1,1,1,1,1,1,...,T,[C],Chr12,0.866279,11,80,39.135417,0.916667,1.0,10684
122110,1,1,1,1,1,1,1,1,1,1,...,G,[C],Chr12,0.866279,8,83,39.65625,0.882353,0.0,10684
122111,1,1,1,1,1,1,1,1,1,1,...,T,[C],Chr12,0.866279,9,97,41.286458,0.9,0.0,10684
122112,1,1,1,1,1,1,1,1,1,1,...,C,[G],Chr12,0.866279,9,86,38.75,0.9,0.0,10684


In [23]:
ssk2_gv[segs].sum(axis=0).unique()

array([25, 18,  0, 24,  9, 16])

In [24]:
ssk2_zero_segs = ssk2_gv[segs].T[(ssk2_gv[segs].sum(axis=0) == 0)].index.tolist()
assert ssk2_gv[segs].T[(ssk2_gv[segs].sum(axis=0) == 0)].sum(axis=1).sum() == 0
len(ssk2_zero_segs)

31

In [25]:
ssk2_one_segs = ssk2_gv[segs].T[(ssk2_gv[segs].sum(axis=0) == ssk2_gv.shape[0])].index.tolist()
assert len(ssk2_one_segs)*ssk2_gv.shape[0] == ssk2_gv[segs].T[
    (ssk2_gv[segs].sum(axis=0) == ssk2_gv.shape[0])].sum(axis=1).sum()
len(ssk2_one_segs)

156

In [26]:
ssk1_segs = ssk1_zero_segs+ssk1_one_segs
ssk2_segs = ssk2_zero_segs+ssk2_one_segs

In [27]:
len(ssk1_segs)

190

In [28]:
len(ssk2_segs)

187

In [29]:
ssk1_ssk2_segs = [ s for s in np.unique(ssk1_segs+ssk2_segs) 
                  if (s in ssk1_segs) and (s in ssk2_segs) and (s in fludio.dropna().Segregant.apply(str).tolist())]

In [30]:
len(ssk1_ssk2_segs)

173

In [31]:
fludio['Segregant'] = fludio.Segregant.apply(str)

In [32]:
fludiobyseg = fludio[(fludio.Segregant.isin(ssk1_ssk2_segs))][['Segregant','Growth']].copy()
fludiobyseg.drop_duplicates(inplace=True)
fludiobyseg.index = fludiobyseg.Segregant
fludiobyseg.drop('Segregant',axis=1,inplace=True)

In [33]:
duplicates = [s for s in ssk1_ssk2_segs if fludiobyseg.T[s].T.shape[0] > 1 ]

In [34]:
fludiobyseg.shape

(173, 1)

In [35]:
ssk1_gt = ssk1_gv[ssk1_ssk2_segs].drop_duplicates().T
ssk1_gt.columns = ['SSK1']

In [36]:
ssk1_gt.head()

Unnamed: 0,SSK1
1,0
10,0
100,0
101,0
102,0


In [37]:
ssk2_gt = ssk2_gv[ssk1_ssk2_segs].drop_duplicates().T
ssk2_gt.columns = ['SSK2']

In [38]:
ssk2_gt.head()

Unnamed: 0,SSK2
1,1
10,1
100,1
101,1
102,1


In [39]:
ric8_gt = ric8_gv.T[[133171]].T[ssk1_ssk2_segs].T
ric8_gt.columns = ['RIC8']

In [40]:
fludio_res = pd.concat([fludiobyseg,ssk1_gt,ssk2_gt,ric8_gt],sort=True,axis=1)

In [41]:
fludio_res.drop_duplicates()

Unnamed: 0,Growth,SSK1,SSK2,RIC8
1,0.0,0,1,1
101,1.0,0,1,1
102,0.0,0,1,0
108,1.0,0,1,0
112,1.0,0,0,1
117,1.0,1,0,0
133,1.0,1,1,1
152,1.0,0,0,0
16,1.0,1,0,1
169,1.0,1,1,0


In [42]:
fludio_res.reset_index(inplace=True)

In [43]:
fludio_res.groupby(['SSK1','SSK2','RIC8','Growth']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,index
SSK1,SSK2,RIC8,Growth,Unnamed: 4_level_1
0,0,0,1.0,6
0,0,1,1.0,14
0,1,0,0.0,48
0,1,0,1.0,7
0,1,1,0.0,82
0,1,1,1.0,3
1,0,0,1.0,4
1,0,1,1.0,3
1,1,0,1.0,2
1,1,1,1.0,4


In [44]:
baddies = fludio_res[(fludio_res.SSK1==0) & (fludio_res.SSK2==1) & (fludio_res.Growth==1)]
baddies

Unnamed: 0,index,Growth,SSK1,SSK2,RIC8
3,101,1.0,0,1,1
10,108,1.0,0,1,0
25,121,1.0,0,1,0
33,129,1.0,0,1,1
60,154,1.0,0,1,0
62,157,1.0,0,1,0
88,181,1.0,0,1,1
92,185,1.0,0,1,0
132,55,1.0,0,1,0
159,81,1.0,0,1,0


In [45]:
## Location of the MAT locus on chromosome 4
mats = 1529240 ## 5' 
matst = 1661861 ## 3'

In [46]:
geno[(geno.Chrom=='Chr04') & (geno.Pos>=mats) & (geno.Pos<=matst)][[str(a) for a in baddies.index]].median()

3      0.0
10     1.0
25     1.0
33     1.0
60     0.0
62     1.0
88     1.0
92     0.0
132    0.0
159    0.0
dtype: float64

In [47]:
survival_counts = fludio_res.groupby(['SSK1','SSK2','Growth']
                  ).count()['index'].reset_index()
survival_counts

Unnamed: 0,SSK1,SSK2,Growth,index
0,0,0,1.0,20
1,0,1,0.0,130
2,0,1,1.0,10
3,1,0,1.0,7
4,1,1,1.0,6


In [48]:
saveout_path = '../DATA/PHENOTYPE/FLUDIOXONIL/Cdx_Fine_map_fludio_res.csv'

survival_counts.to_csv(saveout_path,index=True)