In [3]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statistics
import math
import utils

In [4]:
# Helper functions
def openness_dict(file):
    """Load a per-gene openness table and return it as a ``{gene_id: score}`` dict.

    Parameters
    ----------
    file : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts. The table must have ``gene_id``
        and ``openness_score`` columns; no other column is read.

    Returns
    -------
    dict
        Maps each ``gene_id`` to its ``openness_score``. If a gene id occurs on
        several rows, the last row wins.

    Raises
    ------
    KeyError
        If either required column is missing from the table.
    """
    df = pd.read_csv(file)
    # Pair the two columns directly rather than walking ``iterrows()``: that
    # builds a Series per row (slow for large tables) and coerces every row to
    # a single dtype, which can turn integer ids into floats.
    return dict(zip(df['gene_id'], df['openness_score']))

def sort_pairs(pairs):
    """Split paired 48 h / 72 h openness scores into three groups.

    ``pairs`` maps a gene id to an ``(oscore_48, oscore_72)`` tuple. Every id is
    passed through ``utils.FBgn_to_genesymbol`` and the returned value is used
    as the key in the output dicts.

    Returns a ``(matched, only48, only72)`` tuple of dicts:

    * ``matched`` - both scores are > 0; the value is the absolute difference
      between the two scores.
    * ``only48``  - only the 48 h score is > 0; the value is that score.
    * ``only72``  - only the 72 h score is > 0; the value is that score.

    Genes for which neither score is > 0 appear in none of the dicts.
    """
    both_open, open_at_48, open_at_72 = {}, {}, {}

    for fbgn, (score_48, score_72) in pairs.items():
        symbol = utils.FBgn_to_genesymbol(fbgn)
        if score_48 > 0:
            if score_72 > 0:
                # Accessible at both time points: keep how far apart the scores are
                both_open[symbol] = abs(score_48 - score_72)
            else:
                open_at_48[symbol] = score_48
        elif score_72 > 0:
            open_at_72[symbol] = score_72

    return both_open, open_at_48, open_at_72

def normalize_oscores_sum(genes):
    """Return a new ``{gene: score / total}`` dict, where ``total`` is the sum of all scores.

    An empty ``genes`` dict yields an empty dict. A non-empty dict whose scores
    add up to zero raises ``ZeroDivisionError``.
    """
    denominator = 0
    # Plain left-to-right accumulation over the dict's values
    for value in genes.values():
        denominator = denominator + value
    return {name: value / denominator for name, value in genes.items()}

def normalize_oscores_max(genes):
    """Return a new ``{gene: score / max_score}`` dict.

    ``max_score`` is the largest score in ``genes``, but never less than 0, so
    the top-scoring gene maps to 1.0 when at least one score is positive. An
    empty ``genes`` dict yields an empty dict. A non-empty dict with no
    positive score raises ``ZeroDivisionError``.
    """
    # The leading 0 is the floor: non-positive scores never become the divisor
    ceiling = max([0, *genes.values()])
    return {name: value / ceiling for name, value in genes.items()}

def combine_reps(rep1, rep2, rep3, genes=None):
    """Combine three replicates into one ``{gene: median score}`` dict.

    For every gene the score is looked up in each replicate, counting a gene
    that is absent from a replicate as 0, and the median of the three values is
    taken. Only genes whose median is > 0 are kept.

    Parameters
    ----------
    rep1, rep2, rep3 : dict
        ``{gene: score}`` mappings, one per replicate.
    genes : iterable, optional
        Genes to look up. When omitted, the notebook-level ``gene_ids`` is
        used, so existing three-argument calls keep working unchanged.

    Returns
    -------
    dict
        ``{gene: median score}`` for the genes with a positive median.
    """
    if genes is None:
        # Resolved at call time, so the value of ``gene_ids`` at the moment of
        # the call is what gets iterated.
        genes = gene_ids

    combined_dict = {}
    for gene in genes:
        # ``.get`` limits the zero fallback to missing keys; any other failure
        # (e.g. a replicate that is not a dict) surfaces instead of being
        # silently scored as 0.
        med = statistics.median(
            [rep1.get(gene, 0), rep2.get(gene, 0), rep3.get(gene, 0)]
        )
        if med > 0:
            combined_dict[gene] = med
    return combined_dict

In [5]:
# Read every replicate's table into a {gene id: openness score} dict,
# 48 h replicates first, then 72 h
rep1_48, rep2_48, rep3_48 = (
    openness_dict(path)
    for path in (
        'Openness output/500promoter_48hr_rep1.csv',
        'Openness output/500promoter_48hr_rep2.csv',
        'Openness output/500promoter_48hr_rep3.csv',
    )
)
rep1_72, rep2_72, rep3_72 = (
    openness_dict(path)
    for path in (
        'Openness output/500promoter_72hr_rep1.csv',
        'Openness output/500promoter_72hr_rep2.csv',
        'Openness output/500promoter_72hr_rep3.csv',
    )
)
# The gene ids are taken from the first 48 h replicate only, in sorted order
gene_ids = sorted(rep1_48)
len(gene_ids)

17807

In [6]:
# For each replicate, build {gene id: (48 h score, 72 h score)}.
# Every id in gene_ids must be present in all six dicts, otherwise this raises KeyError.
rep1_pairs, rep2_pairs, rep3_pairs = (
    {fbgn: (at_48[fbgn], at_72[fbgn]) for fbgn in gene_ids}
    for at_48, at_72 in (
        (rep1_48, rep1_72),
        (rep2_48, rep2_72),
        (rep3_48, rep3_72),
    )
)

In [7]:
# Split each replicate's pairs into genes accessible at both time steps,
# only at 48 h, and only at 72 h.
# NOTE: this rebinds rep*_48 / rep*_72 (previously the loaded score dicts) and
# gene_ids (previously the raw ids), so re-running the pairing step after this
# one requires re-running the loading step first.
(
    (rep1_matched, rep1_48, rep1_72),
    (rep2_matched, rep2_48, rep2_72),
    (rep3_matched, rep3_48, rep3_72),
) = [sort_pairs(pairs) for pairs in (rep1_pairs, rep2_pairs, rep3_pairs)]
# Convert the ids the same way sort_pairs converts its keys
gene_ids = list(map(utils.FBgn_to_genesymbol, gene_ids))

In [8]:
# Merge each group (matched / only 48 h / only 72 h) across the three replicates
combined_matched, combined_48, combined_72 = (
    combine_reps(*replicates)
    for replicates in (
        (rep1_matched, rep2_matched, rep3_matched),
        (rep1_48, rep2_48, rep3_48),
        (rep1_72, rep2_72, rep3_72),
    )
)

In [9]:
# Rescale the scores within each group by that group's own maximum
combined_matched, combined_48, combined_72 = map(
    normalize_oscores_max,
    (combined_matched, combined_48, combined_72),
)

In [10]:
# Print the genes, highest normalized value first, as input for the gene ontology
# enrichment program (FlyEnrichr); the same newline-separated list is kept in `text`.
gene_list = combined_matched
text = ''
for gene in sorted(gene_list, key=gene_list.get, reverse=True):
    # To include the normalized openness score for weighting, print
    # gene + ',' + str(gene_list[gene]) instead.
    print(gene)
    text = text + gene + '\n'

pre-rRNA:CR45845
pre-rRNA:CR45847
tRNA:Leu-AAG-1-3
Fer1HCH
nero
tRNA:Asp-GTC-1-5
tRNA:Ala-CGC-1-2
Dpit47
28SrRNA-Psi:CR45859
tRNA:Cys-GCA-1-1
tRNA:Cys-GCA-1-2
tRNA:Glu-TTC-1-4
CG33229
lncRNA:CR43651
lncRNA:CR45463
mRpS16
asRNA:CR45969
tRNA:Ala-CGC-1-1
lncRNA:CR43958
CG2211
lncRNA:CR44867
Fsn
cbt
mir-4962
lncRNA:CR32218
tRNA:Val-AAC-2-1
tRNA:Ser-CGA-1-4
28SrRNA:CR45837
28SrRNA:CR45844
Snx3
lncRNA:Hsromega
Sap-r
Acyp2
asRNA:CR44987
lncRNA:CR46075
lncRNA:CR44092
lncRNA:CR45352
tRNA:Thr-CGT-1-3
lncRNA:CR45036
bel
CG17734
GstE12
Bacc
asRNA:CR45135
lncRNA:CR43626
lncRNA:CR43644
tRNA:iMet-CAT-1-2
kra
7SLRNA:CR32864
Fer2LCH
Rho1
Trl
CG8441
lncRNA:CR43857
lncRNA:CR44218
asRNA:CR43243
Tailor
asRNA:CR44390
Gdi
lncRNA:CR44417
HmgZ
tRNA:Ile-TAT-1-1
asRNA:CR46055
mor
CG4572
Vha55
lncRNA:CR31840
omd
asRNA:CR43889
lncRNA:CR42862
Sec61alpha
CG34228
alphaTub84B
by
Arf79F
lncRNA:CR44331
mRpL55
asRNA:CR45134
Lk6
CG45218
Vha16-1
Mapmodulin
lncRNA:CR44766
cg
pre-rRNA:CR45846
chic
lncRNA:CR45054
Sin3A
lncRNA

In [68]:
# Save the gene list built above (`text`) for the enrichment analysis.
# The context manager closes the file even if the write raises.
with open('Openness GO analysis/combined_matched.txt', 'w') as out:
    out.write(text)