In [1]:
import pandas as pd
import CococoNet_reader
import numpy as np
import tqdm as tq
import Name_resolver

In [2]:
def get_fc_table(species_1,species_2):
    """Takes two species common names in any order and returns the N to M table for that pair with species_1 as column 1 and species_2 as column 2 

    Args:
        species_1 (str): species common name
        species_2 (str): species common name

    Returns:
        str: All N-M gene pairs
    """
    import pandas as pd
    import os 

    list_of_files = os.listdir('/data/passala/Generated_Tables/Comparing_all_orthogorups_across_plants/N_M_FC_tables')
    trimmed_to_species_1 = list(filter(lambda x:species_1 in x, list_of_files))
    final_file = list(filter(lambda x: species_2 in x, trimmed_to_species_1))[0]
    formatted_final_file = f"/data/passala/Generated_Tables/Comparing_all_orthogorups_across_plants/N_M_FC_tables/{final_file}"
    read_in_file = pd.read_csv(formatted_final_file, index_col= 0)

    return read_in_file

In [3]:
def get_ncbi_clean_og2gene_for_species(species_1, og2genes_only_cococonet, ncbi_mapping):


    species_1_name = Name_resolver.species_name_resolver(species_1,desired_type='common')

    first_species_ortho_groups = og2genes_only_cococonet.loc[og2genes_only_cococonet['Species'] == species_1]
    shared_orthogroups = first_species_ortho_groups['Orthogroup'].unique()

    list_of_orthogene_pds = []
    for orthogroup in tq.tqdm(shared_orthogroups,desc ='inner_loop',position= 0,leave = False):
        species_1_genes = first_species_ortho_groups['Gene'].loc[first_species_ortho_groups['Orthogroup']== orthogroup].to_list()
        all_gene_combos = species_1_genes
        current_orthogroup_pd = pd.DataFrame(columns = [f'{species_1_name} OrthoGene'], data = all_gene_combos)
        current_orthogroup_pd['Orthogroup'] = orthogroup
        list_of_orthogene_pds.append(current_orthogroup_pd)

    final_species_lineup = pd.concat(list_of_orthogene_pds)
    ncbi_added_once = final_species_lineup.merge(right = ncbi_mapping[['Orthodb Gene','Symbol']], right_on = 'Orthodb Gene',left_on=f'{species_1_name} OrthoGene')
    ncbi_added_once_clean= ncbi_added_once.drop(columns = 'Orthodb Gene')
    return ncbi_added_once_clean

In [4]:
og_groups = pd.read_csv('/data/passala/OrthoDB_data/NCBI_data/og_2_Genes_with_ncbi_symbol.csv')
og_groups

Unnamed: 0,Orthogroup,Gene,Species,Ortholevel,Orthodb Gene,Symbol
0,100007at3193,4558_0:003a4f,4558,3193,4558_0:003a4f,LOC110434333
1,100007at3193,4577_0:004254,4577,3193,4577_0:004254,LOC100276500
2,100007at3193,4577_0:004e43,4577,3193,4577_0:004e43,LOC100275058
3,100067at3193,4558_0:00439a,4558,3193,4558_0:00439a,LOC8071983
4,100085at3193,4558_0:002fd8,4558,3193,4558_0:002fd8,LOC8086335
...,...,...,...,...,...,...
502887,9969at3193,15368_0:000e9a,15368,3193,15368_0:000e9a,LOC100834561
502888,99917at3193,39947_0:006297,39947,3193,39947_0:006297,LOC107276177
502889,99917at3193,39947_0:00634f,39947,3193,39947_0:00634f,LOC9269479
502890,99917at3193,39947_0:006629,39947,3193,39947_0:006629,LOC9269626


In [5]:
for species in og_groups['Species'].unique():
        # Get the species-specific data
        species_mask = og_groups['Species'] == species
        
        # Get the current orthogroups and shuffle them in place
        one_species_ogs = og_groups.loc[species_mask]
        
        shuffled_orthogroups = one_species_ogs['Orthogroup'].sample(frac = 1).values
        
        # Assign the shuffled orthogroups back
        og_groups.loc[species_mask,'Orthogroup'] = shuffled_orthogroups

In [6]:
og_groups

Unnamed: 0,Orthogroup,Gene,Species,Ortholevel,Orthodb Gene,Symbol
0,54153at3193,4558_0:003a4f,4558,3193,4558_0:003a4f,LOC110434333
1,2918at3193,4577_0:004254,4577,3193,4577_0:004254,LOC100276500
2,877701at3193,4577_0:004e43,4577,3193,4577_0:004e43,LOC100275058
3,881236at3193,4558_0:00439a,4558,3193,4558_0:00439a,LOC8071983
4,80578at3193,4558_0:002fd8,4558,3193,4558_0:002fd8,LOC8086335
...,...,...,...,...,...,...
502887,885521at3193,15368_0:000e9a,15368,3193,15368_0:000e9a,LOC100834561
502888,886813at3193,39947_0:006297,39947,3193,39947_0:006297,LOC107276177
502889,36207at3193,39947_0:00634f,39947,3193,39947_0:00634f,LOC9269479
502890,16768at3193,39947_0:006629,39947,3193,39947_0:006629,LOC9269626


In [7]:
ncbi_mapping = pd.read_csv('/data/passala/OrthoDB_data/NCBI_data/merged_ncbi_to_orthodb_fixed_non_genesymbol.csv')

In [8]:
# cleaned_og = get_ncbi_clean_og2gene_for_species(4577,og_groups,ncbi_mapping=ncbi_mapping)

# current_coconet = CococoNet_reader.read_cococonet(species_1=4577)
# in_coconet = cleaned_og.loc[cleaned_og['Symbol'].isin(current_coconet.index)]

# dict_version = in_coconet[['Orthogroup','Symbol']].set_index('Symbol').to_dict()
# true_dict = dict_version['Orthogroup']
# current_coconet.groupby(by = true_dict).mean().groupby(by = true_dict,axis = 1).mean()

In [9]:
species_with_nets_to_run = pd.read_csv('/data/passala/Generated_Tables/Reference_tables/species_for_running_cross_ortho_analysis.csv')
species_with_nets_to_run

Unnamed: 0,Species,Taxa ID,Common Name
0,Oryza sativa japonica,39947,rice_jp
1,Malus domestica,3750,apple
2,Sorghum bicolor,4558,sorghum
3,Zea mays,4577,maize
4,Vitis vinifera,29760,grape
5,Brassica rapa,3711,mustard
6,Glycine max,3847,soybean
7,Medicago truncatula,3880,medicago
8,Nicotiana tabacum,4097,tobacco
9,Solanum tuberosum,4113,potato


In [10]:
taxa_list = species_with_nets_to_run['Taxa ID'].to_list()


In [11]:
import pickle


In [12]:
for species in taxa_list:

    cleaned_og = get_ncbi_clean_og2gene_for_species(species,og_groups,ncbi_mapping=ncbi_mapping)
    species_name = Name_resolver.species_name_resolver(species,'common')
    print(species_name)
    current_coconet = CococoNet_reader.read_cococonet(species_1=species)
    in_coconet = cleaned_og.loc[cleaned_og['Symbol'].isin(current_coconet.index)]

    dict_version = in_coconet[['Orthogroup','Symbol']].set_index('Symbol').to_dict()
    true_dict = dict_version['Orthogroup']
    print("Calculating Table")
    final_coconet = current_coconet.groupby(by = true_dict).mean().groupby(by = true_dict,axis = 1).mean()
    print('Saving')
    final_coconet.to_csv(f'/data/passala/Post_dump_files/Generated_Tables/Comparing_all_orthogorups_across_plants/Orthogroup_based_coconets_null/{species_name}_merged_cococonet_null.csv')
    pickle.dump( final_coconet, open( f'/data/passala/Post_dump_files/Generated_Tables/Comparing_all_orthogorups_across_plants/Orthogroup_based_coconets_null/{species_name}_merged_cococonet_pickle_null.p', "wb" ))

inner_loop:   0%|          | 0/15757 [00:00<?, ?it/s]

                                                                  

rice_jp
Calculating Table
Saving


                                                                  

apple
Calculating Table
Saving


                                                                  

sorghum
Calculating Table
Saving


                                                                  

maize
Calculating Table
Saving


                                                                  

grape
Calculating Table
Saving


                                                                  

mustard
Calculating Table
Saving


                                                                  

soybean
Calculating Table
Saving


                                                                  

medicago
Calculating Table
Saving


                                                                  

tobacco
Calculating Table
Saving


                                                                  

potato
Calculating Table
Saving


                                                                  

tomato
Calculating Table
Saving


                                                                  

arabidopsis
Calculating Table
Saving


                                                                  

peanut
Calculating Table
Saving


                                                                  

cucumber
Calculating Table
Saving


                                                                  

brome
Calculating Table
Saving
