In [1]:
## Bring in needed mods
import numpy as np, pandas as pd, scipy.stats as ss
import seaborn as sns, sys, os, glob
from matplotlib import pyplot as plt
sys.path.insert(0, '/home/croth/QTL-functions/SCRIPTS/')
import snpy, qtlfunctions as qtlfun

In [2]:
## Locations of the reference annotation (GFF) and the reference genome (FASTA)
gffpath = '/home/croth/Downloads/B3502/REF/FungiDB-48_CneoformansJEC21.gff.gz'
refpath = '/home/croth/Downloads/B3502/REF/FungiDB-48_CneoformansJEC21_Genome.fasta'

## Location of the parental variant table
geno_path = '../GENOTYPE/B3502_parent_variants.csv.gz'

## Load the variant table, using the first CSV column as the row index
genos = pd.read_csv(geno_path, index_col=0)

## Alleles is a '.'-delimited string; Ref keeps only the text before the first '.'
genos['Ref'] = [alleles.partition('.')[0] for alleles in genos['Alleles']]

## NOTE(review): Alt is a copy of the whole Alleles string (first allele included),
## not just the alternate alleles -- confirm this is what downstream code expects
genos['Alt'] = genos['Alleles']
genos.head()

Unnamed: 0,B3502_A1,B3502_B1,B3502_B7,CF830,JEC21,Seqid,Chrom,Oldix,Pos,Qual,...,Alleles,Type,Avedepth,Mindepth,Maxdepth,Averatio,Minratio,Maxratio,Ref,Alt
51,0.0,1.0,1.0,0.0,0.0,AE017341.1,1,51,3681,3860.63,...,G.T,snp,270.6,183.0,389.0,0.572585,0.483696,0.665254,G,G.T
1062,1.0,1.0,4.0,1.0,1.0,AE017341.1,1,1062,55108,90277.0,...,CGTTGGTTTTT.GGTTTTTTTTT.GGTTGTTTTT.GGTTGTTTTTT...,complex,62.6,31.0,115.0,0.240623,0.051724,0.9375,CGTTGGTTTTT,CGTTGGTTTTT.GGTTTTTTTTT.GGTTGTTTTT.GGTTGTTTTTT...
8887,0.0,1.0,0.0,0.0,0.0,AE017341.1,1,8887,461149,4093.03,...,G.C,snp,131.8,70.0,253.0,0.704192,0.575,0.874317,G,G.C
10062,0.0,0.0,0.0,1.0,1.0,AE017341.1,1,10062,530127,103609.0,...,C.G,snp,225.6,180.0,251.0,0.99553,0.994475,0.996032,C,C.G
11653,1.0,1.0,1.0,4.0,1.0,AE017341.1,1,11653,636710,88263.9,...,TGGGGGGGGGGGGGGGGGGGTTGTGCTGATG.GGGGGGGGGGGGGG...,snp,7.8,3.0,13.0,0.014286,0.0,0.071429,TGGGGGGGGGGGGGGGGGGGTTGTGCTGATG,TGGGGGGGGGGGGGGGGGGGTTGTGCTGATG.GGGGGGGGGGGGGG...


In [3]:
## The first five columns of genos are taken as the sample columns
samples = list(genos.columns[:5])

## Every other column is treated as variant-level information
info_cols = [col for col in genos.columns if col not in samples]

## Show the sample names and how many info columns there are
samples, len(info_cols)

(['B3502_A1', 'B3502_B1', 'B3502_B7', 'CF830', 'JEC21'], 16)

In [4]:
## Make a chromosome map from the genotype table via qtlfun.chrommap;
## the displayed result has the columns Chrom, Length, Cumlen and Midpts.
## Note: the name chrommap is bound again further down (unexecuted cell reading chrommap.csv.gz)
chrommap = qtlfun.chrommap(genos)
chrommap.head()

Unnamed: 0,Chrom,Length,Cumlen,Midpts
0,1,2300473,0,1150236.5
1,2,1495312,2300473,3048129.0
2,3,2082458,3795785,4837014.0
3,4,1783042,5878243,6769764.0
4,5,1505348,7661285,8413959.0


In [5]:
## For each sample, find the gene tables under ../GENES/<sample>/ whose file
## name carries the '-t26' transcript tag, then flatten them into one array
transcript_hits = [glob.glob('../GENES/%s/C*-t26*.csv.gz' % strain) for strain in samples]
samplespath = np.concatenate(transcript_hits)

## Show how many paths were found and the first few of them
len(samplespath), samplespath[:5]

(468,
 array(['../GENES/B3502_A1/CNC07110-t26_1.csv.gz',
        '../GENES/B3502_A1/CND05940-t26_1.csv.gz',
        '../GENES/B3502_A1/CNF04140-t26_1.csv.gz',
        '../GENES/B3502_A1/CNB04980-t26_1.csv.gz',
        '../GENES/B3502_A1/CNB00100-t26_1.csv.gz'], dtype='<U39'))

In [6]:
## Calculate differences for transcripts:
## snpy.snpresults is handed the array of transcript gene-table paths
res = snpy.snpresults(samplespath)
res.shape

(468, 14)

In [7]:
## For each sample, find the gene tables under ../GENES/<sample>/ whose file
## name contains 'mRNA', then flatten them into one array
mrna_hits = [glob.glob('../GENES/%s/C*mRNA*.csv.gz' % strain) for strain in samples]
samplespath_mrna = np.concatenate(mrna_hits)

## Show how many paths were found and the first few of them
len(samplespath_mrna), samplespath_mrna[:5]

(53,
 array(['../GENES/B3502_A1/CNE03000.mRNA.csv.gz',
        '../GENES/B3502_A1/CNM00530.mRNA.csv.gz',
        '../GENES/B3502_A1/CNB04970.mRNA.csv.gz',
        '../GENES/B3502_A1/CND00840.mRNA.csv.gz',
        '../GENES/B3502_A1/CNA03640.mRNA.csv.gz'], dtype='<U38'))

In [8]:
## Calculate differences for mRNA:
## same call as for the transcript tables, but with parent_end='.mRNA'
## because these file names end in '.mRNA' rather than '-t26_<n>'
res_m = snpy.snpresults(samplespath_mrna,parent_end='.mRNA')
res_m.shape

(53, 14)

In [9]:
## Preview the first rows of the mRNA results
res_m.head()

Unnamed: 0,Strain,Gene,Parent,Expected,Ref,Alt,Refstop,Altstop,Nonsyn,Nvars,Utr5,Utr3,Exon,Intron
0,B3502_A1,CNE03000,CNE03000.mRNA,2153.0,453,453,7,7,0,2,0,0,2,0
1,B3502_A1,CNM00530,CNM00530.mRNA,1577.0,29,29,7,66,0,2,0,0,2,0
2,B3502_A1,CNB04970,CNB04970.mRNA,1031.0,90,155,44,36,2,23,0,0,23,0
3,B3502_A1,CND00840,CND00840.mRNA,497.666667,37,37,15,15,0,2,0,0,2,0
4,B3502_A1,CNA03640,CNA03640.mRNA,2111.0,63,63,69,69,0,1,0,0,1,0


In [10]:
## Candidate repressor: rows of the transcript results for gene CNF04940
res.loc[res['Gene'] == 'CNF04940']

Unnamed: 0,Strain,Gene,Parent,Expected,Ref,Alt,Refstop,Altstop,Nonsyn,Nvars,Utr5,Utr3,Exon,Intron
151,B3502_B1,CNF04940,CNF04940-t26_1,526.0,526,526,1,1,1,14,0,1,13,0


In [11]:
## RIC8: rows of the transcript results for gene CNN01270
res.loc[res['Gene'] == 'CNN01270']

Unnamed: 0,Strain,Gene,Parent,Expected,Ref,Alt,Refstop,Altstop,Nonsyn,Nvars,Utr5,Utr3,Exon,Intron
119,B3502_A1,CNN01270,CNN01270-t26_1,691.0,691,456,1,2,0,1,0,0,1,0
247,B3502_B1,CNN01270,CNN01270-t26_1,691.0,691,456,1,2,0,1,0,0,1,0
371,B3502_B7,CNN01270,CNN01270-t26_1,691.0,691,456,1,2,0,1,0,0,1,0


In [12]:
## RHO: rows of the transcript results for gene CNC06490
res.loc[res['Gene'] == 'CNC06490']

Unnamed: 0,Strain,Gene,Parent,Expected,Ref,Alt,Refstop,Altstop,Nonsyn,Nvars,Utr5,Utr3,Exon,Intron
12,B3502_A1,CNC06490,CNC06490-t26_1,996.0,996,996,1,1,1,1,0,0,1,0
141,B3502_B1,CNC06490,CNC06490-t26_1,996.0,996,996,1,1,1,1,0,0,1,0
272,B3502_B7,CNC06490,CNC06490-t26_1,996.0,996,996,1,1,1,1,0,0,1,0


In [13]:
## Gene IDs grouped under the name `repressors`; used to filter the results table
repressors = [
    'CNA03680',
    'CNB00860',
    'CNC07030',
    'CNC07110',
    'CNE05380',
    'CNF04940',
    'CNL04090',
    'CNM00880',
    'CNM01500',
    'CNN02460',
]

In [14]:
## Transcript results restricted to the genes listed in `repressors`
res.loc[res['Gene'].isin(repressors)]

Unnamed: 0,Strain,Gene,Parent,Expected,Ref,Alt,Refstop,Altstop,Nonsyn,Nvars,Utr5,Utr3,Exon,Intron
0,B3502_A1,CNC07110,CNC07110-t26_1,256.0,256,256,1,1,0,1,1,0,0,0
37,B3502_A1,CNB00860,CNB00860-t26_1,361.0,361,361,1,1,0,1,0,1,0,0
85,B3502_A1,CNC07030,CNC07030-t26_1,1588.0,1588,1574,1,1,608,42,0,0,42,0
112,B3502_A1,CNM01500,CNM01500-t26_1,1120.0,1120,1120,1,1,0,3,2,0,0,1
145,B3502_B1,CNN02460,CNN02460-t26_1,621.0,621,621,1,1,3,5,0,0,5,0
151,B3502_B1,CNF04940,CNF04940-t26_1,526.0,526,526,1,1,1,14,0,1,13,0
167,B3502_B1,CNM00880,CNM00880-t26_1,343.0,343,343,1,1,0,3,0,3,0,0
209,B3502_B1,CNL04090,CNL04090-t26_1,1477.0,1477,1477,1,1,0,1,0,1,0,0
210,B3502_B1,CNC07030,CNC07030-t26_1,1588.0,1588,1587,1,1,586,3,0,0,3,0
237,B3502_B1,CNM01500,CNM01500-t26_1,1120.0,1120,1120,1,1,0,5,4,0,0,1


In [15]:
## Deliberate stop: this assertion always fails, so a top-to-bottom run halts here
## and the cells that follow (all unexecuted, In [None]) are not run
assert 1 == 0

AssertionError: 

In [None]:
## Bring in the saved chromosome map (this rebinds the chrommap built earlier in the notebook)
## and add two helper columns:
##   Seqid      - copy of the Contig column
##   Chromosome - 1-based number taken from the row index
chrommap = (
    pd.read_csv('/home/croth/Downloads/B3502/DATA/chrommap.csv.gz')
      .assign(Seqid=lambda table: table['Contig'],
              Chromosome=lambda table: table.index + 1)
)

In [None]:
## Bring in GFF file
gffpath ='/home/croth/Downloads/B3502/REF/FungiDB-48_CneoformansJEC21.gff.gz'

## Column names for the nine tab-separated GFF fields, and the dtype each is read with
names = ["Seqid", "Source", "Type",
         "Start", "End", "Score",
         "Strand", "Phase", "Attributes"]
dtype = ["str","str","str","int","int","str","str","str","str"]

## NOTE(review): not referenced anywhere else in the visible notebook; kept in case
## code outside this view relies on it
descriptions = ['hypothetical protein','unspecified product']

## Read the annotation table.
## NOTE(review): comment="#" makes pandas ignore the remainder of ANY line from the
## first '#' onward, not only header lines -- confirm no Attributes value contains '#'
gff = pd.read_csv(gffpath,
                  comment="#", header=None,
                  delimiter="\t", names=names,
                  dtype=dict(zip(names, dtype)))

## Feature types of interest
foi = ['gene','three_prime_UTR','five_prime_UTR','CDS']

## .copy() gives an independent frame, so the column assignments below do not
## write into a slice of the unfiltered table (avoids SettingWithCopyWarning)
gff = gff[gff.Type.isin(foi)].copy()

## Parent: value of 'Parent=' when present, otherwise the value after 'ID='
gff['Parent'] = [a.split('Parent=')[-1].split(';')[0].split('ID=')[-1]
                 for a in gff.Attributes]

## Gene: first attribute field, text after 'D=' (i.e. the end of 'ID=') up to the first '-'
gff['Gene'] = [a.split(';')[0].split('D=')[-1].split('-')[0]
               for a in gff.Attributes]

## Gene-level rows, with the description text up to the first encoded comma ('%2C')
genes = gff[gff.Type == 'gene'].copy()
genes['Description'] = [a.split('description=')[-1].split('%2C')[0]
                        for a in genes.Attributes]

## Attach chromosome-level columns; Seqid is the only column the two tables share,
## so naming it explicitly keeps the join key obvious
genes = genes.merge(chrommap[['Seqid','Length','Chromosome','Midpts','Cumlen']],
                    on='Seqid')
genes.head()

In [None]:
## Sorted gene-table paths per strain group: every B3502* folder, CF830, and JEC21
B3502 = sorted(glob.glob('../GENES/B3502*/*.csv.gz'))
CF830 = sorted(glob.glob('../GENES/CF830/*.csv.gz'))
JEC21 = sorted(glob.glob('../GENES/JEC21/*.csv.gz'))
#JEC20 = sorted(glob.glob('../GENES/JEC20/*.csv.gz'))

## One flat list in the order B3502, CF830, JEC21
## (this rebinds samplespath, which was a numpy array earlier in the notebook)
samplespath = [*B3502, *CF830, *JEC21]
samplespath[:5], len(samplespath)

In [None]:
## Summarise, for every per-gene table in samplespath, how its variants change the
## translated protein, and collect one row per table in `resdf`.
## NOTE(review): makeorf is neither defined nor imported in the visible cells; it has
## to be in scope before this cell can run -- confirm where it comes from.
todf = []
for s in samplespath:

    ## Paths look like ../GENES/<sample>/<parent>.csv.gz
    sample = s.split('/')[2]
    genep = s.split('/')[-1].split('.csv')[0]

    ## Bare gene ID: drop the '-t26...' transcript suffix, or the '.mRNA' suffix
    ## carried by the mRNA tables, so both file kinds give a plain gene name
    gene_name = genep.split('-t26')[0].split('.mRNA')[0]

    temp = pd.read_csv(s)
    gene = temp.Gene.min()

    ## The table's own labels must agree with the folder and file it was read from
    assert sample == temp.Sample.min()
    assert gene == genep

    ## Rows with Type 0 are the ones handed to makeorf
    cds = temp[(temp.Type==0)]

    ## Reference and alternate sequences; both are translated below
    ref,alt = makeorf(cds)

    ## Expected protein length: unique Type-0 positions / 3, minus one
    ## (presumably the stop codon -- confirm)
    el = cds.Pos.unique().shape[0]/3-1

    ## Translations cut at the first stop codon, and their lengths
    ra = ref.translate(to_stop=True)
    aa = alt.translate(to_stop=True)
    rl = len(ra)
    al = len(aa)

    ## Number of stop symbols in the full-length translations
    sr = ref.translate().count('*')
    sa = alt.translate().count('*')

    ## Nonsyn: stops beyond the first in the alternate translation, plus residue
    ## mismatches over the length shared by both truncated proteins.
    ## NOTE(review): starts at -1 when the alternate translation has no stop at all
    ns = sa - 1
    for i in range(min(rl, al)):
        if ra[i]!=aa[i]:
            ns = ns + 1

    ## Variant counts overall and per region; the Type codes follow the variable
    ## names used here: 3 -> utr3, 5 -> utr5, 0 -> exon, -1 -> intron
    nvars = temp[(temp.Isvar==1)].shape[0]
    utr3 = temp[(temp.Type==3) & (temp.Isvar==1)].shape[0]
    utr5 = temp[(temp.Type==5) & (temp.Isvar==1)].shape[0]
    inexon = temp[(temp.Type==0) & (temp.Isvar==1)].shape[0]
    inintron = temp[(temp.Type==-1) & (temp.Isvar==1)].shape[0]

    todf.append((sample,gene_name,gene,
                 el,rl,al,sr,sa,ns,
                 nvars,utr5,utr3,inexon,inintron
                ))

## Build the summary once, after every table has been processed.
## (The original built it inside the loop and ended with a top-level `return`,
## which is a SyntaxError outside a function.)
resdf = pd.DataFrame(todf,
    columns=['Strain','Gene','Parent','Expected',
             'Ref','Alt','Refstop','Altstop','Nonsyn',
             'Nvars','Utr5','Utr3','Exon','Intron'])

In [None]:

## Last rows of the per-table summary
resdf.tail()

In [None]:
## Number of distinct gene names in the summary
resdf.Gene.unique().shape

In [None]:
## Number of distinct genes with a positive Nonsyn count in at least one row
resdf[(resdf.Nonsyn>0)].Gene.unique().shape

In [None]:
## Rows with at least one variant counted in the 5' UTR
resdf[(resdf.Utr5>0)].shape

In [None]:
## Rows with at least one variant counted in the 3' UTR
resdf[(resdf.Utr3>0)].shape

In [None]:
## Size of the summary (one row is appended per path in samplespath)
resdf.shape

In [None]:
## Rows whose full alternate translation contains no stop symbol ('*')
resdf[(resdf.Altstop==0)].shape

In [None]:
## Rows whose full alternate translation contains more than one stop symbol
resdf[(resdf.Altstop>1)].shape

In [None]:
## Rows whose full reference translation contains no stop symbol ('*')
resdf[(resdf.Refstop==0)].shape

In [None]:
## Rows whose full reference translation contains more than one stop symbol
resdf[(resdf.Refstop>1)].shape

In [None]:
## Rows where the reference protein length (Ref) differs from the length
## expected from the number of CDS positions (Expected)
toredo = resdf[(resdf.Expected!=resdf.Ref)]
toredo.shape

In [None]:
## Number of distinct strains in the summary
resdf.Strain.unique().shape

In [None]:
## Genes with stop gains within them:
## sorted names of genes having more than one stop in the alternate translation
sorted(resdf.loc[resdf['Altstop'] > 1, 'Gene'].unique())

In [None]:
## Rows whose alternate translation does not contain exactly one stop symbol, ordered by gene
resdf[(resdf.Altstop!=1)].sort_values('Gene')

In [None]:
## CNF02550 hypothetical 

In [None]:
## Load the JEC21 table for CNN01270 (the gene labelled RIC8 earlier in the notebook),
## using the first CSV column as the row index
ric8_path = '../GENES/JEC21/CNN01270-t26_1.csv.gz'
ric8 = pd.read_csv(ric8_path,index_col=0)

In [None]:
## Summary rows for CNN01270 that have more than one stop in the alternate translation
ric8_name = 'CNN01270'
resdf[(resdf.Gene==ric8_name) &(resdf.Altstop>1)]

In [None]:
## CF830 rows that have more than one stop in the alternate translation
resdf[(resdf.Strain=='CF830') & (resdf.Altstop>1)]

In [None]:
## Attach genomic annotation to the summary rows.
## 'Gene' is the only column the two tables share, so it is named as the join key;
## the default inner join drops summary rows whose Gene has no match in `genes`.
gene_annotation = genes[['Gene','Chromosome','Seqid',
                         'Start','End','Strand','Description']]
finalres = resdf.merge(gene_annotation, on='Gene')

In [None]:
## Preview the annotated summary
finalres.head()

In [None]:
## Annotated rows with more than one stop in the alternate translation
altstoped = finalres[(finalres.Altstop>1)]
altstoped.shape

In [None]:
## Display every row of the stop-gain table (the whole frame is shown, not a preview)
altstoped

In [None]:
## Save the annotated summary; index=False (the documented boolean form) leaves
## the row index out of the file
finalres.to_csv('../GENES/gene_changes.csv.gz', index=False)
finalres.shape

In [None]:
## Annotated rows for CNM00880 (one of the genes in the `repressors` list)
finalres[(finalres.Gene=='CNM00880')]

In [None]:
## Annotated rows for CNJ01150
finalres[(finalres.Gene=='CNJ01150')]

In [None]:
## Annotated rows for CNF04940 (labelled "Candidate repressor" earlier in the notebook)
finalres[(finalres.Gene=='CNF04940')]

In [None]:
## Annotated rows for CNJ01150
## NOTE(review): identical to a cell two above -- one of the two can probably go
finalres[(finalres.Gene=='CNJ01150')]

In [None]:
#test = pd.read_csv('../DATA/OLD/gene_changes.csv.gz')
#test.shape

In [None]:
#test[(test.Strain=='Stock6') & (test.Altstop>1)]

In [None]:
#test[(test.Strain=='CF830') & (resdf.Altstop>1)]

In [None]:
#test[(test.Gene==ric8_name)]

In [None]:
#a1 = pd.read_csv('../GENES/B3502_A1_Stock1/CNN01270-t26_1.csv.gz',
#                 index_col=0)
#a1[(a1.Isvar==1)]

In [None]:
#d1 = pd.read_csv('../GENES/B3502_D1_Stock6/CNN01270-t26_1.csv.gz',
#                 index_col=0)
#d1[(d1.Isvar==1)]