Read data from GEM results file and annotation file and report genes that are overlapped by peaks. Include peak score and gene annotation in output.

In [1]:
peak_file = 'out6/out6.GEM_events.txt'
!head {peak_file}

Position	IP	Control	Fold	Expectd	Q_-lg10	P_-lg10	P_poiss	IPvsEMP	Noise	KmerGroup	MotifId	KG_score	Strand
G:25513	12007.7	  107.1	  112.1	  201.4	 999.00	 999.00	 999.00	  -0.70	   0.05	TTCCACGGAA_7/0	0	1.85	+
K:155473	 4922.2	  163.7	   30.1	  350.7	 999.00	 999.00	 999.00	  -0.65	   0.24	--------	-1	0.00	*
I:968521	 4821.0	   21.5	  223.8	   66.7	 999.00	 999.00	 999.00	  -0.68	   0.00	--------	-1	0.00	*
K:155633	 3817.8	  378.1	   10.1	  352.4	 999.00	 999.00	 999.00	  -0.67	   0.24	--------	-1	0.00	*
G:25392	 3738.3	  139.9	   26.7	  195.9	 999.00	 999.00	 999.00	  -0.31	   0.05	--------	-1	0.00	*
A:162953	 3372.5	   29.2	  115.7	  106.6	 999.00	 999.00	 999.00	  -0.73	   0.00	--------	-1	0.00	*
D:562677	 3138.8	  312.9	   10.0	  285.8	 999.00	 999.00	 999.00	  -0.52	   0.37	ATTCCACGGAA_9/0	0	2.11	+
L:1094955	 3001.2	   46.5	   64.5	  132.0	 999.00	 999.00	 999.00	  -0.51	   0.00	TTCCACGGATC_3/0	0	1.35	+
F:260851	 2698.9	   65.1	   41.5	  227.5	 999.00	 999.00	 999.00	  -0.50	   0.2

In [2]:
# Part 1:
# fill a dictionary with peak positions as keys and peak scores as values

# Steps:

# open filehandle for reading
f=open(peak_file, 'r')

# initialise dictionary
peaks = dict()

#extract the header 
header = f.readline()

# cycle through input lines
for line in f:
    # split line at tab (\t)
    fields = line.split('\t')
    
    # extract elements representing position and score
    chrom = fields[0]
    score = fields[5]
    
    # prepend 'chr' to chromosome name
    chrom = 'chr' + chrom
    
    # insert position and score into dictionary
    peaks[chrom] = score

#close filehandle 
f.close()

In [3]:
peaks

{'chrG:25513': ' 999.00',
 'chrK:155473': ' 999.00',
 'chrI:968521': ' 999.00',
 'chrK:155633': ' 999.00',
 'chrG:25392': ' 999.00',
 'chrA:162953': ' 999.00',
 'chrD:562677': ' 999.00',
 'chrL:1094955': ' 999.00',
 'chrF:260851': ' 999.00',
 'chrM:202410': ' 999.00',
 'chrI:968589': ' 999.00',
 'chrM:202966': ' 999.00',
 'chrJ:719129': ' 999.00',
 'chrM:852415': ' 999.00',
 'chrM:202790': ' 999.00',
 'chrG:145393': ' 999.00',
 'chrK:311025': ' 999.00',
 'chrM:202658': ' 999.00',
 'chrA:162860': ' 289.28',
 'chrL:1094737': ' 289.17',
 'chrM:203240': ' 250.98',
 'chrI:700709': ' 246.07',
 'chrF:260798': ' 246.01',
 'chrK:529767': ' 244.73',
 'chrE:376953': ' 230.82',
 'chrG:145476': ' 225.53',
 'chrI:968394': ' 202.62',
 'chrC:328445': ' 200.76',
 'chrF:260983': ' 187.55',
 'chrK:79829': ' 184.21',
 'chrM:852513': ' 183.53',
 'chrA:163065': ' 171.67',
 'chrK:572626': ' 163.66',
 'chrE:377037': ' 159.42',
 'chrI:968741': ' 101.37',
 'chrM:1164101': '  96.23',
 'chrJ:719252': '  88.94',
 

In [4]:
# Part 2:
# read in annotation and print information for genes that overlap with peaks

# specify a value for the size of the promoter region
promoter = 1000

# annotation file downloaded from CandidaDB:
file = '/data/genomes/yeast/C.glabrata/C_glabrata_CBS138_version_s02-m02-r03_chromosomal_feature.tab'

# output file for storing results
outfile = 'genes_with_peaks3.csv'

# open filehandles for reading and writing to new file
fh = open(file, 'r')
fout = open(outfile, 'w')

# generate a header line for the output file
header = '\t'.join(['Feature', 'location', 'score', 'symbol', 'description'])
fout.write(f'{header}\n')

for line in fh :
    
    # skip all header lines
    if line.startswith('!') :
        continue
        
    # split file into parts (number of columns might differ)
    parts = line.strip().split('\t')
    
    # assign elements to variables:
    if len(parts) == 17 :
        (feature, locus, alias, feature_type, chrom, start, stop, strand, primary, secondary, description, date, coord, blank1, blank2, reserve, standard) = parts
    elif len(parts) == 18 :
        (feature, locus, alias, feature_type, chrom, start, stop, strand, primary, secondary, description, date, coord, blank1, blank2, reserve, standard, ortho) = parts
    else :
        print(f'Unusual number of fields: {len(parts)} in {line.strip()}')
        continue
        
    # simplify chromosome name
    chrom_parts = chrom.split('_')
    chrom = chrom_parts[0].replace('Chr', 'chr')
    
    # turn coordinate strings into integers
    stop = int(stop)
    start = int(start)
    
    # change C/W strands into -/+ notation
    if strand == 'C' :
        strand = '-'
        
        # reverse coordinates for genes on minus strand
        (start, stop) = (stop, start)
        
        # extend by promoter region
        stop += promoter
        
    else :
        strand = '+'
        
        # extend by promoter region
        start -= promoter

    
    # go through each position covered by the gene (from start to stop)
    # and check if it is overlapping with a peak (stored as chrom:position in dictionary)
    for position in range(start, stop+1) :
        
        # construct the location as used in the dictionary
        location = f'{chrom}:{position}'
            
        # check if location is found in dictionary
        if location in peaks.keys() :
            
            # print out relavant information from peak and annotation file
            fout.write(f'{feature}\t{chrom}:{start}-{stop}\t{peaks[location]}\t{locus}\t{description}\n')
            
            # move on to next gene
            break
        
fh.close()
fout.close()

In [5]:
# print the first 10 lines of the results
!head {outfile}

Feature	location	score	symbol	description
CAGL0G05329g	chrG:499204-500491	  20.38		Ortholog(s) have role in positive regulation of transcription from RNA polymerase II promoter and SWI/SNF complex localization
CAGL0C03289g	chrC:327799-333745	 116.03	YBT1	Putative ABC transporter involved in bile acid transport; gene is upregulated in azole-resistant strain
CAGL0E03960g	chrE:375571-377464	 133.69		Ortholog(s) have eukaryotic initiation factor 4G binding activity, role in negative regulation of translation in response to stress and cytoplasmic mRNA processing body, nucleolus localization
CAGL0G05313g	chrG:497360-499913	  20.38		Ortholog(s) have transferase activity, transferring phosphorus-containing groups activity
CAGL0I07249g	chrI:698635-700771	 249.78		Putative GTPase-activating protein involved in cell wall and cytoskeleton homeostasis; gene is upregulated in azole-resistant strain
CAGL0C02937g	chrC:289289-291692	  30.08		Ortholog(s) have microtubule plus-end binding activity, role 

In [6]:
# check how many entries we have in the output file
!wc -l {outfile}

36 genes_with_peaks2.csv
