In [2]:
import pandas as pd
import sys

sys.path.append('../../')
from utils import clustering_utils, dataframe_utils

In [3]:
import requests
from bs4 import BeautifulSoup

def get_ttherm_id_from_keyword(term):
    url = f'https://tet.ciliate.org/search.php?gene_name={term}'
    try:
        # Get the HTML content of the web page
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        html_content = response.content

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find the specific table with class 'main'
        table = soup.find('table', class_='main')
        rows = table.find_all('tr')[1:]
        num_rows = len(rows)

        ttherm_id = None

        for row in rows:
            tds = row.find_all('td')
            first_td = tds[0].get_text().strip()
            second_td = tds[1].get_text().strip()

            if second_td == term:
                if ttherm_id is not None:
                    raise ValueError(f'The term \'{term}\' returned multiple TTHERM_ID entries.')
                ttherm_id = first_td

        return ttherm_id

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching the HTML content: {e}")

In [4]:
keys = [
'HHF1',
'HHF2',
'HHT2',
'HTA1',
'HTA2',
'HTB1',
'HTB2',
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

HHF1,HHF2,HHT2,HTA1,HTA2,HTB1,HTB2
TTHERM_00498190,TTHERM_00189170,TTHERM_00189180,TTHERM_00790790,TTHERM_00316500,TTHERM_00633360,TTHERM_00283180


In [5]:
keys = [
'HHT4',
'HHT3',
'HTA3',
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

HHT4,HHT3,HTA3
TTHERM_00016200,TTHERM_00016170,TTHERM_00143660


In [6]:
keys = [
'CNA1', # TTHERM_00146340
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

CNA1
TTHERM_00146340


In [7]:
keys = [
'HHO1', # TTHERM_00823720
'MLH1', # TTHERM_00471820
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

HHO1,MLH1
TTHERM_00823720,TTHERM_00471820


In [8]:
keys = [
'EZL2',
'EZL3',
'HAT1',
'HAT2',
'TXR1',
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

EZL2,EZL3,HAT1,HAT2,TXR1
TTHERM_00300320,TTHERM_00499660,TTHERM_00046760,TTHERM_00248390,TTHERM_00256950


In [9]:
keys = [
'CPD1',
'CPD2',
'CPG1',
'CPH1',
'CPH2',
'CPH3',
'CPH4',
'SMC2',
'SMC4',
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

CPD1,CPD2,CPG1,CPH1,CPH2,CPH3,CPH4,SMC2,SMC4
TTHERM_00486070,TTHERM_00392760,TTHERM_00919690,TTHERM_00728870,TTHERM_00540340,TTHERM_00554600,TTHERM_01299730,TTHERM_00812950,TTHERM_00446400


In [10]:
keys = [
    'REC8', 
    'SCC2', 
    'SCC3', 
    'SMC1', 
    'SMC3'
    ]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

REC8,SCC2,SCC3,SMC1,SMC3
TTHERM_00245660,TTHERM_00678460,TTHERM_00225630,TTHERM_01048090,TTHERM_00294810


In [11]:
keys = [
    'IMA1', 
    'IMA5', 
    'IMA9', 
    ]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

IMA1,IMA5,IMA9
TTHERM_01016220,TTHERM_00703970,TTHERM_00756470


In [12]:
keys = [
    'IMA10', 
    'IMA12', 
    'IMA13', 
    'IMA3', 
    'IMA5', 
    'IMA8', 
    ]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

IMA10,IMA12,IMA13,IMA3,IMA5,IMA8
TTHERM_00295520,TTHERM_00938930,TTHERM_00492970,TTHERM_00335950,TTHERM_00703970,TTHERM_00161640


In [13]:
keys = [
    'IMB2', 
    'IMB4',
    'IMB6',
    'IMB8',
    ]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

IMB2,IMB4,IMB6,IMB8


TypeError: sequence item 0: expected str instance, NoneType found

In [14]:
keys = [
'LIA1',
'LIA2',
'LIA3',
'LIA4',
'LIA5',
'LIA6',
'LIA7',
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

LIA1,LIA2,LIA3,LIA4,LIA5,LIA6,LIA7
TTHERM_00675900,TTHERM_00569290,TTHERM_00675850,TTHERM_00085600,TTHERM_00653910,TTHERM_00849260,TTHERM_00193390


In [15]:
modified_annotation = pd.read_csv('../../active_files/complete_annotation.csv')
modified_annotation_no_muco = modified_annotation.copy()

In [16]:
xls_list = []

for n in list(range(2, 10)):
    xls_list.append(pd.read_excel(f'../../new_raw_data/rna_seq_processed/xls_files/mc-e22-08-0326-s0{n}.xls', sheet_name=0))

In [17]:
for idx, df in enumerate(xls_list):
    print(idx, len(set(df.iloc[:, 0].values)))

0 350
1 284
2 34
3 38
4 41
5 74
6 342
7 117


In [18]:
first_column_values = {}

for n in list(range(2, 10)):
    file_path = f'../../new_raw_data/rna_seq_processed/xls_files/mc-e22-08-0326-s0{n}.xls'

    excel_file = pd.ExcelFile(file_path)

    for sheet_name in excel_file.sheet_names:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        
        if n not in first_column_values:
            first_column_values[n] = []
        first_column_values[n] += df.iloc[:, 0].astype(str).to_list()


        if n == 3 and sheet_name == 'genes':
            print(sheet_name)
            print(df)
            print(first_column_values[n])

genes
              TTHERM
0             g19412
1             g24741
2               g371
3    TTHERM_00000070
4    TTHERM_00001090
..               ...
279  TTHERM_01179960
280  TTHERM_01251290
281  TTHERM_01276400
282  TTHERM_01299730
283  TTHERM_01358410

[284 rows x 1 columns]
['g19412', 'g24741', 'g371', 'TTHERM_00000070', 'TTHERM_00001090', 'TTHERM_000056049', 'TTHERM_000109313', 'TTHERM_00013120', 'TTHERM_00013900', 'TTHERM_00014940', 'TTHERM_000188352', 'TTHERM_000191179', 'TTHERM_000218898', 'TTHERM_00024350', 'TTHERM_00024380', 'TTHERM_00024410', 'TTHERM_00028890', 'TTHERM_00030580', 'TTHERM_000357059', 'TTHERM_00037210', 'TTHERM_000378989', 'TTHERM_000384969', 'TTHERM_000438819', 'TTHERM_000446389', 'TTHERM_000457099', 'TTHERM_00046490', 'TTHERM_000486559', 'TTHERM_00048840', 'TTHERM_000503740', 'TTHERM_00052240', 'TTHERM_000523058', 'TTHERM_000529532', 'TTHERM_000570549', 'TTHERM_00059040', 'TTHERM_000675819', 'TTHERM_000695779', 'TTHERM_000711791', 'TTHERM_000770709', 'TTH

In [19]:
for n in list(range(2, 10)):
    print(n, len(set(first_column_values[n])))

2 352
3 284
4 53
5 38
6 41
7 74
8 344
9 117


In [20]:
_n = 9
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(117, 117)

In [21]:
_n = 8
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(344, 342)

In [22]:
_n = 7
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(74, 74)

In [23]:
_n = 6
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(41, 41)

In [24]:
_n = 5
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(38, 38)

In [25]:
_n = 4
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(53, 34)

In [26]:
_n = 3
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(284, 284)

In [27]:
gid_to_tthermid = {}

for num, xls in first_column_values.items():
    for id in set(xls):
        if id[: len('TTHERM_')] != 'TTHERM_':
            print(id)
            if id[: len('g')] == 'g':
                gid_to_tthermid[id] = get_ttherm_id_from_keyword(id)

nan
X
g19412
g371
g24741
g20657
nan
X
g18920
g1455


In [28]:
gid_to_tthermid

{'g19412': None,
 'g371': None,
 'g24741': None,
 'g20657': None,
 'g18920': None,
 'g1455': None}

In [29]:
{
'Cluster 5': {'mc-e22-08-0326-s03.xls': ['genes']}, # FIXME: translate gIDs
'Cluster 5A': {'mc-e22-08-0326-s04.xls': [0, 1, 2, 'telomere_maintenance']}, # unclear whether this cluster has 53 or 34 genes based on xls (assuming 53)
'Cluster 5B': {'mc-e22-08-0326-s05.xls': ['analysis2_annotated', 'DNA_repair']},
'Cluster 5C': {'mc-e22-08-0326-s06.xls': ['genes', 'chromosome_condensation', 'chromatin assembly', 'mitotic nuclear division', 'mitotic cell cycle process']},
'Cluster 5D': {'mc-e22-08-0326-s07.xls': ['genes']},
'Cluster 1': {'mc-e22-08-0326-s08.xls': ['genes']}, # only first tab
'Cluster 2': {'mc-e22-08-0326-s09.xls': ['genes']} # there are only 117 genes in here despite the paper saying there are 564 in the cluster
}

{'Cluster 5': {'mc-e22-08-0326-s03.xls': ['genes']},
 'Cluster 5A': {'mc-e22-08-0326-s04.xls': [0, 1, 2, 'telomere_maintenance']},
 'Cluster 5B': {'mc-e22-08-0326-s05.xls': ['analysis2_annotated',
   'DNA_repair']},
 'Cluster 5C': {'mc-e22-08-0326-s06.xls': ['genes',
   'chromosome_condensation',
   'chromatin assembly',
   'mitotic nuclear division',
   'mitotic cell cycle process']},
 'Cluster 5D': {'mc-e22-08-0326-s07.xls': ['genes']},
 'Cluster 1': {'mc-e22-08-0326-s08.xls': ['genes']},
 'Cluster 2': {'mc-e22-08-0326-s09.xls': ['genes']}}

In [30]:
muco_exp = [
'TTHERM_00052190',
'TTHERM_00011710',
'TTHERM_00321680',
'TTHERM_00445920',
'TTHERM_00410180',
'THERM_00410210',
'TTHERM_00313130',
'TTHERM_00059370',
'TTHERM_00283800',
'TTHERM_00227750',
'TTHERM_00241790',
'TTHERM_00318900',
'TTHERM_00852790',
'TTHERM_00467390',
'TTHERM_01332070',
'TTHERM_00497590',
'TTHERM_00013410',
'TTHERM_00527180',
'TTHERM_00624730',
'TTHERM_00624720',
'TTHERM_00378890',
'TTHERM_00522600',
'TTHERM_01055600',
'TTHERM_00558350',
'TTHERM_00221120',
'TTHERM_00572100',
'TTHERM_00420770',
'TTHERM_00566910',
'TTHERM_00658810',
'TTHERM_00047330',
'TTHERM_000193469',
'TTHERM_000486279',
'TTHERM_00141040',
'TTHERM_00227750',
'TTHERM_00317390',
'TTHERM_00670750',
'TTHERM_01122800',
'TTHERM_01213910',
'TTHERM_00886960',
]

In [31]:
de_regranulation_df = pd.read_csv('de_regranulation_logFC_0.2_adj_P_Val_0.0005_B_1.csv')

In [32]:
de_muco = list(set(list(de_regranulation_df['Unnamed: 0'].values)).intersection(set(muco_exp)))
de = list(set(list(de_regranulation_df['Unnamed: 0'].values)) - set(muco_exp))
muco = list(set(muco_exp) - set(list(de_regranulation_df['Unnamed: 0'].values)))

In [33]:
de_regranulation_annot_df = pd.DataFrame({
    'TTHERM_ID': de_muco + de + muco,
    'mucocysts': ['DE,EV' for _ in range(len(de_muco))] + ['DE' for _ in range(len(de))] + ['EV' for _ in range(len(muco))],
    })

In [34]:
modified_annotation = modified_annotation.merge(de_regranulation_annot_df, on='TTHERM_ID', how='outer')
modified_annotation

Unnamed: 0,TTHERM_ID,seed_ortholog,evalue,score,eggNOG_OGs,max_annot_lvl,COG_category,Description,Preferred_name,GOs,...,KEGG_TC,CAZy,BiGG_Reaction,PFAMs,TGD2021_description,peptide,common_name,InterPro,InterPro_description,mucocysts
0,TTHERM_00840110,5911.EAS05042,0.0,1278.5,"2E5M0@1|root,2SCE9@2759|Eukaryota,3ZEBN@5878|C...",5878|Ciliophora,-,-,-,-,...,-,-,-,-,hypothetical protein,MISSNQTADQENKVENKVANAEHVNQQSYDSIPQSLSPAVIAQIMD...,Unnamed,-,-,
1,TTHERM_01082930,31033.ENSTRUP00000031008,1.9999999999999997e-28,94.2,"COG5078@1|root,KOG0418@2759|Eukaryota,38KYZ@33...",33208|Metazoa,O,Belongs to the ubiquitin-conjugating enzyme fa...,UBE2K,"GO:0000209,GO:0003674,GO:0003824,GO:0004842,GO...",...,-,-,-,"UBA,UQ_con",ubiquitin-conjugating enzyme E2,MHKNILIILFQIFCCQIYTTIYTFYYFMANIVFIIHNVKLDLFSCF...,Unnamed,"IPR000608,IPR016135,IPR023313,IPR050113","Ubiquitin-conjugating enzyme E2,Ubiquitin-conj...",
2,TTHERM_01081610,5911.EAR82090,0.0,2110.7,"2A5FX@1|root,2RY9I@2759|Eukaryota",2759|Eukaryota,-,-,-,-,...,-,-,-,PRESAN,transmembrane protein putative,MSSQSPAKLNNQNCAAANQYYNDLESCVQGYCIKQQSGSGARGCFP...,Unnamed,-,-,
3,TTHERM_00059210,5911.EAR87408,3.2999999999999998e-298,983.9,"COG0575@1|root,KOG1440@2759|Eukaryota,3ZAR9@58...",5878|Ciliophora,I,Cytidylyltransferase family,-,-,...,-,-,-,CTP_transf_1,phosphatidate cytidylyltransferase,MSQVTNRSQKKSHQKRDEKSEEDSSDEKTDDFSEEELDKLQEAQKK...,Unnamed,"IPR000374,IPR016720","Phosphatidate cytidylyltransferase,Phosphatida...",
4,TTHERM_00535200,5911.EAS03184,0.0,2448.3,"2E77S@1|root,2SDUU@2759|Eukaryota",5911.EAS03184|-,S,Src homology 3 domains,-,-,...,-,-,-,-,beta-Pak interactive eXchange factor Src-like ...,MFTKSNSRSALAGLNSIVNSQNDSLTSRAQHQNYAKKDLTISNSTS...,Unnamed,"IPR001452,IPR036028,IPR051569","SH3 domain,SH3-like domain superfamily,SH3 and...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26705,TTHERM_00422230,,,,,,,,,,...,,,,,,,,,,DE
26706,TTHERM_00085120,,,,,,,,,,...,,,,,,,,,,DE
26707,TTHERM_000193469,,,,,,,,,,...,,,,,,,,,,EV
26708,THERM_00410210,,,,,,,,,,...,,,,,,,,,,EV


In [35]:
modified_annotation.isna().sum()

TTHERM_ID                   0
seed_ortholog              23
evalue                     23
score                      23
eggNOG_OGs                 23
max_annot_lvl              23
COG_category               23
Description                23
Preferred_name             23
GOs                        23
EC                         23
KEGG_ko                    23
KEGG_Pathway               23
KEGG_Module                23
KEGG_Reaction              23
KEGG_rclass                23
BRITE                      23
KEGG_TC                    23
CAZy                       23
BiGG_Reaction              23
PFAMs                      23
TGD2021_description        23
peptide                    23
common_name                23
InterPro                   23
InterPro_description       23
mucocysts               24739
dtype: int64

In [36]:
modified_annotation = modified_annotation.fillna('-')

In [37]:
modified_annotation.isna().sum()

TTHERM_ID               0
seed_ortholog           0
evalue                  0
score                   0
eggNOG_OGs              0
max_annot_lvl           0
COG_category            0
Description             0
Preferred_name          0
GOs                     0
EC                      0
KEGG_ko                 0
KEGG_Pathway            0
KEGG_Module             0
KEGG_Reaction           0
KEGG_rclass             0
BRITE                   0
KEGG_TC                 0
CAZy                    0
BiGG_Reaction           0
PFAMs                   0
TGD2021_description     0
peptide                 0
common_name             0
InterPro                0
InterPro_description    0
mucocysts               0
dtype: int64

In [38]:
modified_annotation.to_csv('../../active_files/complete_annotation.csv', index=False)

In [39]:
microarray = pd.read_csv('./test_nn3_leiden_label_df_round_1.csv')

rna_seq = pd.read_csv('./rna_seq_label_df_round_1.csv')

microarray_enrich = clustering_utils.compute_enrichment(microarray)

rna_seq_enrich = clustering_utils.compute_enrichment(rna_seq)

In [40]:
modified_annotation_no_muco.to_csv('../../active_files/complete_annotation.csv', index=False)

In [41]:
rna_ev = set([(int(m)) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['EV'])]['module'].values])
microarray_ev = set([(int(m)) for m in microarray_enrich.loc[microarray_enrich['term'].isin(['EV'])]['module'].values])

rna_de = set([(int(m)) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE'])]['module'].values])
microarray_de = set([(int(m)) for m in microarray_enrich.loc[microarray_enrich['term'].isin(['DE'])]['module'].values])

len(rna_ev), len(microarray_ev), len(rna_de), len(microarray_de)

(5, 6, 46, 63)

In [42]:
len(rna_ev.intersection(rna_de))

4

In [43]:
len(microarray_ev.intersection(microarray_de))

4

In [44]:
rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE', 'EV'])].shape, microarray_enrich.loc[microarray_enrich['term'].isin(['DE', 'EV'])].shape

((51, 7), (69, 7))

In [45]:
rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['EV'])].shape, microarray_enrich.loc[microarray_enrich['term'].isin(['EV'])].shape

((5, 7), (6, 7))

In [46]:
microarray_enrich.loc[microarray_enrich['term'].isin(['DE'])]

Unnamed: 0,module,term,info,fold_change,bonferroni,term_count,module_size
0,0.0,DE,differentially expressed in mucocyst regranula...,4.866983,4.006131e-07,15.0,42.0
0,1.0,DE,differentially expressed in mucocyst regranula...,11.222689,1.550976e-25,28.0,34.0
0,2.0,DE,differentially expressed in mucocyst regranula...,11.623499,2.580347e-27,29.0,34.0
0,3.0,DE,differentially expressed in mucocyst regranula...,12.217804,1.922427e-51,52.0,58.0
0,4.0,DE,differentially expressed in mucocyst regranula...,11.680758,4.686624e-34,36.0,42.0
...,...,...,...,...,...,...,...
0,549.0,DE,differentially expressed in mucocyst regranula...,3.028345,2.708612e-03,10.0,45.0
0,552.0,DE,differentially expressed in mucocyst regranula...,3.028345,8.130434e-03,8.0,36.0
0,579.0,DE,differentially expressed in mucocyst regranula...,2.839073,4.337479e-03,10.0,48.0
0,591.0,DE,differentially expressed in mucocyst regranula...,2.962511,3.185764e-03,10.0,46.0


In [47]:
rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE'])]

Unnamed: 0,module,term,info,fold_change,bonferroni,term_count,module_size
0,57.0,DE,differentially expressed in mucocyst regranula...,3.406888,0.01499986,6.0,24.0
0,68.0,DE,differentially expressed in mucocyst regranula...,3.028345,0.0250912,6.0,27.0
0,107.0,DE,differentially expressed in mucocyst regranula...,3.586198,0.02444771,5.0,19.0
0,194.0,DE,differentially expressed in mucocyst regranula...,2.197992,0.02364824,10.0,62.0
0,296.0,DE,differentially expressed in mucocyst regranula...,2.92019,0.02923657,6.0,28.0
0,338.0,DE,differentially expressed in mucocyst regranula...,3.244655,0.0351935,5.0,21.0
0,354.0,DE,differentially expressed in mucocyst regranula...,2.417791,0.008917865,11.0,62.0
0,369.0,DE,differentially expressed in mucocyst regranula...,2.72551,0.03877211,6.0,30.0
0,376.0,DE,differentially expressed in mucocyst regranula...,2.868958,0.01109702,8.0,38.0
0,407.0,DE,differentially expressed in mucocyst regranula...,3.634014,0.04930764,4.0,15.0


In [48]:
microarray_enrich.loc[microarray_enrich['term'].isin(['EV'])]

Unnamed: 0,module,term,info,fold_change,bonferroni,term_count,module_size
1,1.0,EV,experimentally validated mucocyst-associated gene,41.346749,0.04699679,2.0,34.0
1,2.0,EV,experimentally validated mucocyst-associated gene,41.346749,0.04699679,2.0,34.0
1,3.0,EV,experimentally validated mucocyst-associated gene,109.069873,1.207551e-14,9.0,58.0
1,4.0,EV,experimentally validated mucocyst-associated gene,167.35589,2.919967e-18,10.0,42.0
0,5.0,EV,experimentally validated mucocyst-associated gene,281.157895,0.002849297,2.0,5.0
0,6.0,EV,experimentally validated mucocyst-associated gene,585.745614,8.707998e-12,5.0,6.0


In [49]:
print(','.join(['m'+str(int(m)).zfill(3) for m in microarray_enrich.loc[microarray_enrich['term'].isin(['EV'])]['module'].values]))

m001,m002,m003,m004,m005,m006


In [50]:
rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['EV'])]

Unnamed: 0,module,term,info,fold_change,bonferroni,term_count,module_size
1,617.0,EV,experimentally validated mucocyst-associated gene,50.206767,0.03845631,2.0,28.0
1,632.0,EV,experimentally validated mucocyst-associated gene,46.859649,0.04130345,2.0,30.0
1,634.0,EV,experimentally validated mucocyst-associated gene,124.040248,1.029184e-09,6.0,34.0
0,636.0,EV,experimentally validated mucocyst-associated gene,136.673977,8.6823e-12,7.0,36.0
1,679.0,EV,experimentally validated mucocyst-associated gene,79.874402,4.175311e-07,5.0,44.0


In [51]:
rna_seq.loc[rna_seq['label'].isin([int(m) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['EV'])]['module'].values])]

Unnamed: 0,TTHERM_ID,label
1810,TTHERM_01302830,679
1811,TTHERM_01277506,679
1812,TTHERM_01213910,679
1813,TTHERM_01106040,679
1814,TTHERM_00938950,679
...,...,...
3793,TTHERM_00133640,617
3794,TTHERM_00129820,617
3795,TTHERM_00113000,617
3796,TTHERM_00077760,617


In [60]:
microarray.loc[microarray['label'].isin([int(m) for m in microarray_enrich.loc[microarray_enrich['term'].isin(['EV'])]['module'].values])]

Unnamed: 0,TTHERM_ID,label
20207,TTHERM_01055600,6
20208,TTHERM_01002860,6
20209,TTHERM_00624730,6
20210,TTHERM_00624720,6
20211,TTHERM_00522600,6
...,...,...
20381,TTHERM_00138495,1
20382,TTHERM_00136330,1
20383,TTHERM_00129415,1
20384,TTHERM_00120780,1


In [52]:
print(','.join(['m'+str(int(m)).zfill(3) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['EV'])]['module'].values]))

m617,m632,m634,m636,m679


In [61]:
rna_seq.loc[rna_seq['label'].isin([int(m) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['EV'])]['module'].values])].to_csv('./rna_seq_ev_enriched_module_genes.csv', index=False)

In [62]:
microarray.loc[microarray['label'].isin([int(m) for m in microarray_enrich.loc[microarray_enrich['term'].isin(['EV'])]['module'].values])].to_csv('./microarray_ev_enriched_module_genes.csv', index=False)

In [55]:
rna_ev = set([(int(m)) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['EV'])]['module'].values])
microarray_ev = set([(int(m)) for m in microarray_enrich.loc[microarray_enrich['term'].isin(['EV'])]['module'].values])

In [56]:
'select * from rna_seq_enrich where info LIKE "%ribo%"'

'select * from rna_seq_enrich where info LIKE "%ribo%"'

In [57]:
dataframe_utils.sql_query_df({'rna_seq_enrich': rna_seq_enrich}, 'select * from rna_seq_enrich where info LIKE "%ribosom%"').sort_values(by='module', ascending=False)

Unnamed: 0,module,term,info,fold_change,bonferroni,term_count,module_size
51,612.0,J,"Translation, ribosomal structure and biogenesis",9.843013,0.006940094,5.0,32.0
50,492.0,ko:K04373,ribosomal protein S6 kinase alpha-1/2/3/6,55.128999,1.369426e-06,6.0,51.0
49,425.0,J,"Translation, ribosomal structure and biogenesis",13.780218,4.786231e-05,7.0,32.0
48,279.0,ko:K04373,ribosomal protein S6 kinase alpha-1/2/3/6,29.287281,0.0281795,3.0,48.0
47,250.0,GO:0043232,Intracellular non-membrane-bounded organelle: ...,21.759674,4.635706e-08,10.0,25.0
46,250.0,GO:0043229,Intracellular organelle: Organized structure o...,8.137091,0.0002377703,10.0,25.0
45,250.0,GO:0043228,Non-membrane-bounded organelle: Organized stru...,21.62753,4.892925e-08,10.0,25.0
44,250.0,GO:0043226,Organelle: Organized structure of distinctive ...,7.967189,0.0002841145,10.0,25.0
43,250.0,GO:0006996,Organelle organization: A process that is carr...,17.675735,3.187546e-06,9.0,25.0
42,245.0,J,"Translation, ribosomal structure and biogenesis",10.755292,0.0002661936,7.0,41.0


In [58]:
print(', '.join([f'm{str(int(mod)).zfill(3)}' for mod in (dataframe_utils.sql_query_df({'rna_seq_enrich': rna_seq_enrich}, 'select distinct(module) from rna_seq_enrich where info LIKE "%ribosom%"').sort_values(by='module', ascending=False)['module'].values)]))

m612, m492, m425, m279, m250, m245, m239, m220, m218, m216, m202, m154, m099, m097, m096, m090, m084, m074
