In [9]:
import pandas as pd, numpy as np

In [10]:
def loadgff(path=None,sr=16, dropsource=True,zerobase=True,
            cols=['chrom','source','type','start','end',
                  'score','strand','phase','attribute']):
    """
    Load a GFF file as a pandas dataframe and assign column names.

    Parameters
    ----------
    path : str or None
        Location of the GFF file. When None, the hard-coded local copy of
        the FungiDB-41 C. neoformans JEC21 annotation is used.
    sr : int
        Number of leading rows of the file to skip before parsing.
    dropsource : bool
        If True, remove the 'source' and 'score' columns. Columns that are
        not present (e.g. because custom COLS were given) are ignored.
    zerobase : bool
        If True, subtract 1 from the 'start' column.
    cols : sequence of str
        Names assigned, in order, to the columns of the parsed file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed record. The index is shifted by SR so that it
        matches the row id in the original gff file.
    """
    if path is None:
        ## NOTE: machine-specific fallback; pass PATH explicitly on other systems
        path = '/Users/croth/Downloads/FungiDB-41_CneoformansJEC21.gff'
    gff = pd.read_table(path,header=None,skiprows=sr) ## Load the gff file via pandas
    gff.index = gff.index + sr ## reset index to match row id in original gff file
    gff.columns = list(cols) ## Rename columns (copy, so COLS itself is never shared)
    if dropsource:
        ## Only drop what is actually there, so custom COLS do not raise a KeyError
        unwanted = [c for c in ('source','score') if c in gff.columns]
        gff = gff.drop(unwanted,axis=1)
    if zerobase:
        gff['start'] = gff['start'] - 1
    return gff

In [11]:
def addgene(gff,col='attribute'):
    """
    Derive a gene name for every row of GFF from the text in column COL.

    Only the first ';'-separated field is used. Anything from the first '.'
    or '-' onwards is discarded, then everything up to the last '_' and up
    to the last 'ID=' is removed. Returns a list with one name per row.
    """
    gene_names = []
    for attribute in gff[col].tolist():
        token = attribute.partition(';')[0]   ## first attribute field only
        token = token.partition('.')[0]       ## cut at the first '.'
        token = token.partition('-')[0]       ## cut at the first '-'
        token = token.rpartition('_')[2]      ## keep text after the last '_'
        gene_names.append(token.rpartition('ID=')[2])  ## strip the 'ID=' tag
    return gene_names

def addtogff(gff,ret='description=',col='attribute'):
    """
    Pull one value per row out of the ';'-separated text in column COL.

    For each row the first field containing RET is taken and the text
    following the last occurrence of RET in that field is returned.
    Rows with no such field yield np.nan. Returns a list, one entry per row.
    """
    values = []
    for attribute in gff[col].tolist():
        ## first field mentioning RET, or None when the row has no such field
        hit = next((field for field in attribute.split(';') if ret in field), None)
        values.append(np.nan if hit is None else hit.split(ret)[-1])
    return values

In [12]:
## Load the JEC21 GFF annotation file (loadgff falls back to its default path
## when called without arguments) and show the table's dimensions
jgff = loadgff()
jgff.shape

  # Remove the CWD from sys.path while we load stuff.


(119444, 7)

In [13]:
## Add gene name, description and feature id columns, all parsed from the
## attribute column (rows without a description= field get NaN)
jgff['gene'] = addgene(jgff)
jgff['description'] = addtogff(jgff)
jgff['id'] = addtogff(jgff,ret='ID=')

In [17]:
## Gene-level records of the two genes whose coordinates are used below as window limits
bounds = jgff.loc[jgff['id'].isin(['CNB02850','CNB03330']) & jgff['type'].eq('gene')]

In [22]:
## Smallest and largest coordinate among all start/end values of the boundary genes
left = bounds[['start','end']].values.min()
right = bounds[['start','end']].values.max()

In [26]:
## Preview the first rows of the GFF table, including the added gene, description and id columns
jgff.head()

Unnamed: 0,chrom,type,start,end,strand,phase,attribute,gene,description,id
16,AE017352.1,gene,713523,714773,-,.,ID=CNL06190;description=aldo-keto reductase%2C...,CNL06190,aldo-keto reductase%2C putative,CNL06190
17,AE017352.1,mRNA,713523,714773,-,.,ID=CNL06190-t26_1;Parent=CNL06190;description=...,CNL06190,aldo-keto reductase%2C putative,CNL06190-t26_1
18,AE017352.1,exon,713523,713666,-,.,ID=exon_CNL06190-E5;Parent=CNL06190-t26_1,CNL06190,,exon_CNL06190-E5
19,AE017352.1,exon,713719,713815,-,.,ID=exon_CNL06190-E4;Parent=CNL06190-t26_1,CNL06190,,exon_CNL06190-E4
20,AE017352.1,exon,713865,714350,-,.,ID=exon_CNL06190-E3;Parent=CNL06190-t26_1,CNL06190,,exon_CNL06190-E3


In [34]:
## Load the per-gene analysis table (gzip-compressed csv) and preview its first rows
gene_analysis = pd.read_csv('../FILES/XL280_431_gene_analysis.csv.gz')
gene_analysis.head()

Unnamed: 0,chrom,gene,mod_three,length_check,nonsense,missense
0,Chr01,CNA04490,0,0,0,2
1,Chr01,CNA05630,0,0,0,2
2,Chr01,CNA04300,0,0,0,0
3,Chr01,CNA08210,0,0,0,0
4,Chr01,CNA00370,0,0,0,3


In [33]:
## Ids of genes on AE017342.1 whose start lies within [left, right], ordered by start.
## between() is inclusive at both ends, so no array of every position in the
## window has to be built just to test membership.
genes_in_qtl_maybe = jgff[(jgff.chrom=='AE017342.1') &
     (jgff.start.between(left,right)) &
     (jgff.type=='gene')].sort_values('start').id.tolist()

In [39]:
## Number of candidate genes that have no row in the gene analysis table.
## The analysed gene names are collected into a set once, instead of
## rebuilding a list from the column for every candidate.
analyzed_genes = set(gene_analysis.gene)
len([g for g in genes_in_qtl_maybe if g not in analyzed_genes])

15

In [43]:
## GFF records of the candidate genes that are absent from the gene analysis table.
## Two vectorised membership masks avoid rebuilding a list from the column per candidate.
jgff[jgff.id.isin(genes_in_qtl_maybe) & ~jgff.id.isin(gene_analysis.gene)].sort_values('id')

Unnamed: 0,chrom,type,start,end,strand,phase,attribute,gene,description,id
65849,AE017342.1,gene,850460,852258,-,.,ID=CNB02880;description=endoplasmic reticulum ...,CNB02880,endoplasmic reticulum protein%2C putative,CNB02880
112652,AE017342.1,gene,852305,853767,-,.,ID=CNB02890;description=purine-specific oxidiz...,CNB02890,purine-specific oxidized base lesion DNA N-gly...,CNB02890
82602,AE017342.1,gene,853851,855449,+,.,ID=CNB02900;description=serine/threonine-prote...,CNB02900,serine/threonine-protein kinase%2C putative,CNB02900
118138,AE017342.1,gene,856408,862548,+,.,ID=CNB02910;description=unspecified product,CNB02910,unspecified product,CNB02910
6061,AE017342.1,gene,865735,877066,-,.,ID=CNB02920;description=unspecified product,CNB02920,unspecified product,CNB02920
36341,AE017342.1,gene,878805,885495,-,.,ID=CNB02930;description=unspecified product,CNB02930,unspecified product,CNB02930
93260,AE017342.1,gene,885495,890169,+,.,ID=CNB02940;description=unspecified product,CNB02940,unspecified product,CNB02940
63194,AE017342.1,gene,890392,896167,-,.,ID=CNB02950;description=unspecified product,CNB02950,unspecified product,CNB02950
85380,AE017342.1,gene,896256,897579,+,.,ID=CNB02960;description=unspecified product,CNB02960,unspecified product,CNB02960
6803,AE017342.1,gene,903367,903655,+,.,ID=CNB02970;description=unspecified product,CNB02970,unspecified product,CNB02970


In [40]:
## Number of candidate genes that do have a row in the gene analysis table.
## The analysed gene names are collected into a set once, instead of
## rebuilding a list from the column for every candidate.
analyzed_genes = set(gene_analysis.gene)
len([g for g in genes_in_qtl_maybe if g in analyzed_genes])

38

In [42]:
## Look up the GFF record(s) whose id is CNB03220
jgff.loc[jgff['id'].eq('CNB03220')]

Unnamed: 0,chrom,type,start,end,strand,phase,attribute,gene,description,id
21476,AE017342.1,gene,972411,974648,+,.,ID=CNB03220;description=unspecified product,CNB03220,unspecified product,CNB03220


In [44]:
## Gene analysis rows for the candidate genes, ordered by gene name
gene_analysis.loc[gene_analysis['gene'].isin(genes_in_qtl_maybe)].sort_values(by='gene')

Unnamed: 0,chrom,gene,mod_three,length_check,nonsense,missense
828,Chr02,CNB02850,0,0,0,5
849,Chr02,CNB02860,0,0,0,3
988,Chr02,CNB02870,0,0,0,0
982,Chr02,CNB02995,0,0,0,1
878,Chr02,CNB03000,0,0,0,3
1110,Chr02,CNB03005,0,0,0,3
807,Chr02,CNB03010,0,0,0,0
997,Chr02,CNB03020,0,0,0,2
1122,Chr02,CNB03030,0,0,0,2
875,Chr02,CNB03040,0,0,0,0
