In [1]:
import pandas as pd

import numpy as np
import tqdm as tq


In [12]:
def read_cococonet(species_1):
    """Load the CoCoCoNet network of one species as a gene x gene DataFrame.

    Args:
        species_1 (str | int): Species identifier in any form accepted by
            ``species_name_resolver`` (common name, scientific name or
            taxa ID).

    Returns:
        pandas.DataFrame: The HDF5 ``agg`` dataset, indexed by the decoded
        ``row`` gene identifiers with the decoded ``col`` gene identifiers
        as columns.

    Raises:
        ValueError: If the species does not match exactly one row of the
            CoCoCoNet path table (raised by ``Series.item()``).
    """
    # Local import: h5py is not part of the notebook's top-level import cell.
    import h5py

    common_name_of_species = species_name_resolver(species_1=species_1, desired_type='common')

    # The path table maps each species' common name to its network file.
    cococonet_map = pd.read_csv('/data/passala/Generated_Tables/Reference_tables/All_CoCoCoNet_Paths.csv')
    file_location = cococonet_map['Path'].loc[cococonet_map['Common Name'] == common_name_of_species].item()

    # The context manager closes the HDF5 handle even when reading or decoding
    # fails part-way through; the manual open/close pair leaked it on error.
    with h5py.File(file_location, 'r') as net:
        # Gene identifiers are stored as bytes and must be decoded to str.
        row_gene_list = [gene.decode('UTF-8') for gene in net['row']]
        col_gene_list = [gene.decode('UTF-8') for gene in net['col']]
        # [:, :] copies the whole matrix into memory before the file closes.
        net_df = pd.DataFrame(data=net['agg'][:, :], index=row_gene_list, columns=col_gene_list)

    return net_df

In [3]:
def species_name_resolver(species_1, desired_type='common'):
    """Convert a species identifier of any supported form into the requested form.

    Args:
        species_1 (str | int): Common name, scientific name (recognised by
            containing a space) or taxa ID. Any integer type is accepted
            as a taxa ID, including NumPy integers.
        desired_type (str, optional): One of 'common', 'scientific' or
            'taxa_id'. Defaults to 'common'.

    Returns:
        str | int: The common name, scientific name or taxa ID of the
        species, according to ``desired_type``.

    Raises:
        AssertionError: If ``desired_type`` is not one of the supported values.
        ValueError: If the identifier does not match exactly one row of the
            name table (raised by ``Series.item()``).
        TypeError: If ``species_1`` is neither an integer nor a container
            that supports ``' ' in species_1``.
    """
    import numbers

    assert (desired_type in ['common','scientific','taxa_id']), 'Desired type should be common, scientific, or taxa_id'

    # Lookup table with the columns 'Species', 'Taxa ID' and 'Common Name'.
    fc_mapper = pd.read_csv('/data/passala/Generated_Tables/Reference_tables/Species_name_resolver.csv')

    # numbers.Integral also covers NumPy integer scalars, which an exact
    # type(...) == int test rejects. bool is excluded: it is an int subclass
    # but never a meaningful taxa ID.
    is_taxa_id = isinstance(species_1, numbers.Integral) and not isinstance(species_1, bool)

    if is_taxa_id:
        # Taxa ID -> common name.
        species_1 = fc_mapper['Common Name'].loc[fc_mapper['Taxa ID'] == species_1].item()
    elif ' ' in species_1:
        # Scientific names are the only supported form containing a space.
        species_1 = fc_mapper['Common Name'].loc[fc_mapper['Species'] == species_1].item()

    # species_1 is now a common name. Both fields are always read so that an
    # unknown name raises regardless of the requested output form.
    matching_row = fc_mapper.loc[fc_mapper['Common Name'] == species_1]
    scientific_1 = matching_row['Species'].item()
    taxa_id_1 = matching_row['Taxa ID'].item()

    if desired_type == 'common':
        return species_1
    elif desired_type == 'scientific':
        return scientific_1
    elif desired_type == 'taxa_id':
        return taxa_id_1

In [4]:
def get_fc_table(species_1, species_2):
    """Load the stored N-to-M table for a pair of species.

    The table is found by searching the N_M_FC_based_on_gene directory for a
    file whose name contains both species names, so the order of the two
    arguments does not affect which file is read.

    NOTE(review): matching is by substring, so a name that is a prefix of
    another species' name can match that species' files too. When several
    files match, the alphabetically first one is used. The columns are
    returned as stored in the file; nothing here reorders them to follow the
    argument order.

    Args:
        species_1 (str): Species common name as used in the table file names.
        species_2 (str): Species common name as used in the table file names.

    Returns:
        pandas.DataFrame: Contents of the matching CSV, with its first column
        used as the index.

    Raises:
        IndexError: If no file name contains both species names.
    """
    import os

    table_dir = '/data/passala/Generated_Tables/Comparing_all_orthogorups_across_plants/Orthogroups_at_eukaryote_level/N_M_FC_based_on_gene'

    # os.listdir returns names in arbitrary order; sorting makes the choice
    # reproducible when more than one file name matches.
    matching_files = sorted(
        file_name
        for file_name in os.listdir(table_dir)
        if species_1 in file_name and species_2 in file_name
    )
    if not matching_files:
        raise IndexError(
            f'No N-M table in {table_dir} has a file name containing both '
            f'{species_1!r} and {species_2!r}'
        )

    return pd.read_csv(f"{table_dir}/{matching_files[0]}", index_col=0)

In [5]:
def get_ncbi_clean_og2gene_for_species(species_1, og2genes_only_cococonet, ncbi_mapping):
    """List one species' orthogroup genes together with their mapped symbols.

    Args:
        species_1 (str | int): Species identifier in any form accepted by
            ``species_name_resolver``.
        og2genes_only_cococonet (pandas.DataFrame): Orthogroup table with at
            least the columns 'Species', 'Orthogroup' and 'Gene'. Rows are
            selected where 'Species' equals either ``species_1`` as given or
            its resolved taxa ID.
        ncbi_mapping (pandas.DataFrame): Mapping table with at least the
            columns 'Orthodb Gene' and 'Symbol'.

    Returns:
        pandas.DataFrame: Columns '<common name> OrthoGene', 'Orthogroup' and
        'Symbol'. Only genes present in ``ncbi_mapping`` are kept (inner
        merge). Rows are grouped by orthogroup in order of first appearance.
        The frame is empty when the species has no rows in the table.
    """
    species_1_name = species_name_resolver(species_1, desired_type='common')
    species_1_taxa_id = species_name_resolver(species_1, desired_type='taxa_id')
    gene_column = f'{species_1_name} OrthoGene'

    # Match the raw argument as before, and also the resolved taxa ID, so that
    # a common or scientific name selects the same rows as the numeric ID.
    species_column = og2genes_only_cococonet['Species']
    belongs_to_species = (species_column == species_1) | (species_column == species_1_taxa_id)

    # One vectorised selection replaces re-filtering the frame per orthogroup.
    species_genes = og2genes_only_cococonet.loc[belongs_to_species, ['Gene', 'Orthogroup']]
    # Rows without an orthogroup never matched any group in the per-group
    # filtering, so they are dropped here as well.
    species_genes = species_genes.dropna(subset=['Orthogroup'])

    # A stable sort on first-appearance rank keeps rows grouped by orthogroup
    # while preserving the original order within each group.
    group_rank = pd.factorize(species_genes['Orthogroup'])[0]
    species_genes = species_genes.iloc[group_rank.argsort(kind='stable')]

    final_species_lineup = species_genes.rename(columns={'Gene': gene_column})
    ncbi_added_once = final_species_lineup.merge(
        right=ncbi_mapping[['Orthodb Gene', 'Symbol']],
        right_on='Orthodb Gene',
        left_on=gene_column,
    )
    # 'Orthodb Gene' duplicates the gene column after the merge.
    ncbi_added_once_clean = ncbi_added_once.drop(columns='Orthodb Gene')
    return ncbi_added_once_clean

In [6]:
# Orthogroup-to-gene table, passed to get_ncbi_clean_og2gene_for_species in the
# processing loop. The preview shows the columns Orthogroup, Gene, Species,
# Ortholevel, Orthodb Gene and Symbol, with Species holding numeric IDs.
og_groups = pd.read_csv('/data/passala/OrthoDB_data/NCBI_data/eukaryota_level_stuff/og_2_Genes_with_network_id.csv')
# Bare expression so the notebook renders a preview of the table.
og_groups

Unnamed: 0,Orthogroup,Gene,Species,Ortholevel,Orthodb Gene,Symbol
0,0at2759,15368_0:006377,15368,2759,15368_0:006377,psbD
1,0at2759,29760_0:006494,29760,2759,29760_0:006494,psbD
2,0at2759,3659_0:004e0f,3659,2759,3659_0:004e0f,psbD
3,0at2759,3702_0:006ad0,3702,2759,3702_0:006ad0,psbD
4,0at2759,3711_0:005e40,3711,2759,3711_0:005e40,LOC117126101
...,...,...,...,...,...,...
505254,9at2759,4097_0:00f00f,4097,2759,4097_0:00f00f,petD
505255,9at2759,4113_0:00079d,4113,2759,4113_0:00079d,LOC107063147
505256,9at2759,4113_0:006ebb,4113,2759,4113_0:006ebb,petD
505257,9at2759,4558_0:006df8,4558,2759,4558_0:006df8,petD


In [7]:
# NCBI-to-OrthoDB mapping table; get_ncbi_clean_og2gene_for_species reads its
# 'Orthodb Gene' and 'Symbol' columns.
ncbi_mapping = pd.read_csv('/data/passala/OrthoDB_data/NCBI_data/merged_ncbi_to_orthodb_fixed_non_genesymbol.csv')

In [7]:
# cleaned_og = get_ncbi_clean_og2gene_for_species(4577,og_groups,ncbi_mapping=ncbi_mapping)

# current_coconet = CococoNet_reader.read_cococonet(species_1=4577)
# in_coconet = cleaned_og.loc[cleaned_og['Symbol'].isin(current_coconet.index)]

# dict_version = in_coconet[['Orthogroup','Symbol']].set_index('Symbol').to_dict()
# true_dict = dict_version['Orthogroup']
# current_coconet.groupby(by = true_dict).mean().groupby(by = true_dict,axis = 1).mean()

inner_loop:   0%|          | 0/14596 [00:00<?, ?it/s]

                                                                  

KeyboardInterrupt: 

In [8]:
# Species to process. The preview shows the columns Species, Taxa ID and
# Common Name.
species_with_nets_to_run = pd.read_csv('/data/passala/Generated_Tables/Reference_tables/species_for_running_cross_ortho_analysis.csv')
# Bare expression so the notebook renders the table.
species_with_nets_to_run

Unnamed: 0,Species,Taxa ID,Common Name
0,Oryza sativa japonica,39947,rice_jp
1,Malus domestica,3750,apple
2,Sorghum bicolor,4558,sorghum
3,Zea mays,4577,maize
4,Vitis vinifera,29760,grape
5,Brassica rapa,3711,mustard
6,Glycine max,3847,soybean
7,Medicago truncatula,3880,medicago
8,Nicotiana tabacum,4097,tobacco
9,Solanum tuberosum,4113,potato


In [9]:
# Taxa IDs as a plain Python list; the processing loop iterates over these.
taxa_list = species_with_nets_to_run['Taxa ID'].to_list()


In [10]:
import pickle


In [13]:
# Directory that receives both the CSV and the pickle copy of every merged network.
merged_net_dir = '/data/passala/Generated_Tables/Comparing_all_orthogorups_across_plants/Orthogroups_at_eukaryote_level/Orthogroup_cococonet_eukaryote'

# For each species, collapse its gene x gene network to orthogroup x orthogroup
# by averaging over all genes that share an orthogroup, then save the result.
for species in taxa_list:

    cleaned_og = get_ncbi_clean_og2gene_for_species(species, og_groups, ncbi_mapping=ncbi_mapping)
    species_name = species_name_resolver(species, 'common')
    print(species_name)
    current_coconet = read_cococonet(species_1=species)

    # Keep only genes whose symbol labels a row of this species' network.
    in_coconet = cleaned_og.loc[cleaned_og['Symbol'].isin(current_coconet.index)]

    # Symbol -> Orthogroup lookup; with duplicate symbols the last row wins.
    true_dict = in_coconet.set_index('Symbol')['Orthogroup'].to_dict()

    print("Calculating Table")
    # Average rows within each orthogroup, then columns. Columns are grouped
    # through a transpose because groupby(axis=1) is deprecated in pandas.
    # Labels missing from true_dict are left out of the result.
    row_merged = current_coconet.groupby(by=true_dict).mean()
    final_coconet = row_merged.T.groupby(by=true_dict).mean().T

    print('Saving')
    final_coconet.to_csv(f'{merged_net_dir}/{species_name}_merged_cococonet.csv')
    # The context manager flushes and closes the pickle file; passing a bare
    # open(...) to pickle.dump left the handle open.
    with open(f'{merged_net_dir}/{species_name}_merged_cococonet_pickle.p', "wb") as pickle_file:
        pickle.dump(final_coconet, pickle_file)

inner_loop:   0%|          | 0/13182 [00:00<?, ?it/s]

                                                                  

rice_jp
Calculating Table
Saving


                                                                  

apple
Calculating Table
Saving


                                                                  

sorghum
Calculating Table
Saving


                                                                  

maize
Calculating Table
Saving


                                                                  

grape
Calculating Table
Saving


                                                                  

mustard
Calculating Table
Saving


                                                                  

soybean
Calculating Table
Saving


                                                                  

medicago
Calculating Table
Saving


                                                                  

tobacco
Calculating Table
Saving


                                                                  

potato
Calculating Table
Saving


                                                                  

tomato
Calculating Table
Saving


                                                                  

arabidopsis
Calculating Table
Saving


                                                                  

peanut
Calculating Table
Saving


                                                                

cucumber
Calculating Table
Saving


                                                                  

brome
Calculating Table
Saving
