In [1]:
#MIT License
#
#Copyright (c) 2023 Pierre Michel Joubert
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
import pandas as pd
from collections import Counter
import csv
import requests
import random
import pickle
import matplotlib.pyplot as plt
import numpy as np

In [2]:
## load the orthogroup table: tab-separated, first column used as the row index,
## every cell kept as a (nullable) string
og_w_b71 = '../pipeline_methods/Orthogroups.tsv'
og_w_b71_df = pd.read_csv(
    og_w_b71,
    sep='\t',
    index_col=0,
    dtype='string',
)

In [3]:
## lookup from each gene/protein identifier to the orthogroup (row label) it is listed under
df_lol = og_w_b71_df.values.tolist()
og_dict_w_b71 = {}

for og, cells in zip(og_w_b71_df.index, df_lol):
    for cell in cells:
        # missing cells carry no identifiers for this orthogroup
        if pd.isnull(cell):
            continue
        # a populated cell is a ', '-separated list of identifiers
        og_dict_w_b71.update(dict.fromkeys(cell.split(', '), og))

In [4]:
## invert the gene -> orthogroup lookup: collect every gene under its orthogroup
genes_per_og_w_b71 = {}

for gene, og in og_dict_w_b71.items():
    # setdefault creates the list the first time an orthogroup is seen
    genes_per_og_w_b71.setdefault(og, []).append(gene)

In [5]:
## each tuple below is one (input file, output file) pair; zip() regroups the
## pairs into the two parallel lists that are consumed position by position
input_files, output_files = (
    list(names)
    for names in zip(
        ('B71_b71_assembly_complete_medium_expression.txt',
         'b71_assembly_complete_medium_expression_per_og.txt'),
        ('B71_b71_assembly_complete_medium_in_planta_expression.txt',
         'b71_assembly_complete_medium_in_planta_expression_per_og.txt'),
    )
)

In [13]:
# Substring identifying genes of the reference genome; only genes whose
# identifier contains it contribute signal to their orthogroup.
REFERENCE_GENE_TAG = "GCA004785725.2"


def _read_signal_per_gene(path):
    """Read a tab-separated file into a {gene: signal} dict.

    The first column supplies the gene identifier and the second column is
    parsed as a float. Blank lines are skipped. If a gene occurs more than
    once, the last occurrence wins.
    """
    signal_per_gene = {}
    with open(path, newline='') as handle:
        for row in csv.reader(handle, delimiter='\t'):
            if not row:
                # csv.reader yields [] for a blank line (e.g. a trailing newline)
                continue
            if "ID=" in row[0]:
                # keep the first ';'-separated field minus its first three
                # characters (assumes the field starts with "ID=" - TODO confirm)
                gene = row[0].split(';')[0][3:]
            else:
                # drop the last two characters of the identifier
                # (presumably a two-character suffix - confirm against the input)
                gene = row[0][:-2]
            signal_per_gene[gene] = float(row[1])
    return signal_per_gene


def _collect_og_signals(genes_per_og, signal_per_gene):
    """Return {og: [signal, ...]} built from reference-genome genes only.

    Raises KeyError when a reference-genome gene has no entry in
    `signal_per_gene`, so a mismatch between the two inputs is not hidden.
    """
    return {
        og: [signal_per_gene[gene] for gene in genes if REFERENCE_GENE_TAG in gene]
        for og, genes in genes_per_og.items()
    }


def _average_og_signals(og_signals):
    """Return {og: mean signal}; orthogroups without any signal are left out."""
    averaged = {}
    for og, values in og_signals.items():
        if len(values) == 1:
            # a lone value is passed through untouched
            averaged[og] = values[0]
        elif values:
            averaged[og] = sum(values) / len(values)
    return averaged


# The two lists are consumed pairwise; fail loudly rather than silently
# ignoring surplus entries in either one.
if len(input_files) != len(output_files):
    raise ValueError(
        'input_files and output_files must have the same length '
        '(%d != %d)' % (len(input_files), len(output_files))
    )

for input_file, output_file in zip(input_files, output_files):
    print(input_file)
    ## get signal per gene
    signal_per_gene_dict = _read_signal_per_gene(input_file)
    # organize by orthogroup
    og_signal_w_b71 = _collect_og_signals(genes_per_og_w_b71, signal_per_gene_dict)
    # average signals together if there are more than 1
    og_signal_w_b71_averaged = _average_og_signals(og_signal_w_b71)
    ## if ogs aren't represented in reference genome, just add median values
    # the median is taken before imputing, so imputed entries cannot shift it
    median_value = np.median(list(og_signal_w_b71_averaged.values()))
    imputed_values = [
        og for og in genes_per_og_w_b71 if og not in og_signal_w_b71_averaged
    ]
    for og in imputed_values:
        og_signal_w_b71_averaged[og] = median_value
    print(len(imputed_values))
    # one "orthogroup<TAB>signal" row per orthogroup: measured ones first,
    # imputed ones after, each group in genes_per_og_w_b71 order
    with open(output_file, 'w', newline='') as output_csv:
        w = csv.writer(output_csv, delimiter='\t')
        w.writerows(og_signal_w_b71_averaged.items())

B71_b71_assembly_complete_medium_expression.txt
4289
B71_b71_assembly_complete_medium_in_planta_expression.txt
4289
