In [1]:
import pandas as pd
import sys

sys.path.append('../../')
from utils import clustering_utils, dataframe_utils

In [2]:
import requests
from bs4 import BeautifulSoup

def get_ttherm_id_from_keyword(term):
    url = f'https://tet.ciliate.org/search.php?gene_name={term}'
    try:
        # Get the HTML content of the web page
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        html_content = response.content

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find the specific table with class 'main'
        table = soup.find('table', class_='main')
        rows = table.find_all('tr')[1:]
        num_rows = len(rows)

        ttherm_id = None

        for row in rows:
            tds = row.find_all('td')
            first_td = tds[0].get_text().strip()
            second_td = tds[1].get_text().strip()

            if second_td == term:
                if ttherm_id is not None:
                    raise ValueError(f'The term \'{term}\' returned multiple TTHERM_ID entries.')
                ttherm_id = first_td

        return ttherm_id

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching the HTML content: {e}")

In [3]:
keys = [
'HHF1',
'HHF2',
'HHT2',
'HTA1',
'HTA2',
'HTB1',
'HTB2',
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

HHF1,HHF2,HHT2,HTA1,HTA2,HTB1,HTB2
TTHERM_00498190,TTHERM_00189170,TTHERM_00189180,TTHERM_00790790,TTHERM_00316500,TTHERM_00633360,TTHERM_00283180


In [4]:
keys = [
'HHT4',
'HHT3',
'HTA3',
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

HHT4,HHT3,HTA3
TTHERM_00016200,TTHERM_00016170,TTHERM_00143660


In [5]:
keys = [
'CNA1', # TTHERM_00146340
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

CNA1
TTHERM_00146340


In [6]:
keys = [
'HHO1', # TTHERM_00823720
'MLH1', # TTHERM_00471820
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

HHO1,MLH1
TTHERM_00823720,TTHERM_00471820


In [7]:
keys = [
'EZL2',
'EZL3',
'HAT1',
'HAT2',
'TXR1',
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

EZL2,EZL3,HAT1,HAT2,TXR1
TTHERM_00300320,TTHERM_00499660,TTHERM_00046760,TTHERM_00248390,TTHERM_00256950


In [8]:
keys = [
'CPD1',
'CPD2',
'CPG1',
'CPH1',
'CPH2',
'CPH3',
'CPH4',
'SMC2',
'SMC4',
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

CPD1,CPD2,CPG1,CPH1,CPH2,CPH3,CPH4,SMC2,SMC4
TTHERM_00486070,TTHERM_00392760,TTHERM_00919690,TTHERM_00728870,TTHERM_00540340,TTHERM_00554600,TTHERM_01299730,TTHERM_00812950,TTHERM_00446400


In [9]:
keys = [
    'REC8', 
    'SCC2', 
    'SCC3', 
    'SMC1', 
    'SMC3'
    ]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

REC8,SCC2,SCC3,SMC1,SMC3
TTHERM_00245660,TTHERM_00678460,TTHERM_00225630,TTHERM_01048090,TTHERM_00294810


In [10]:
keys = [
    'IMA1', 
    'IMA5', 
    'IMA9', 
    ]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

IMA1,IMA5,IMA9
TTHERM_01016220,TTHERM_00703970,TTHERM_00756470


In [11]:
keys = [
    'IMA10', 
    'IMA12', 
    'IMA13', 
    'IMA3', 
    'IMA5', 
    'IMA8', 
    ]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

IMA10,IMA12,IMA13,IMA3,IMA5,IMA8
TTHERM_00295520,TTHERM_00938930,TTHERM_00492970,TTHERM_00335950,TTHERM_00703970,TTHERM_00161640


In [12]:
# keys = [
#     'IMB2', 
#     'IMB4',
#     'IMB6',
#     'IMB8',
#     ]

# print(','.join(keys))
# print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

In [13]:
keys = [
'LIA1',
'LIA2',
'LIA3',
'LIA4',
'LIA5',
'LIA6',
'LIA7',
]

print(','.join(keys))
print(','.join([get_ttherm_id_from_keyword(key) for key in keys]))

LIA1,LIA2,LIA3,LIA4,LIA5,LIA6,LIA7
TTHERM_00675900,TTHERM_00569290,TTHERM_00675850,TTHERM_00085600,TTHERM_00653910,TTHERM_00849260,TTHERM_00193390


In [14]:
modified_annotation = pd.read_csv('../../active_files/complete_annotation.csv')
modified_annotation_no_muco = modified_annotation.copy()

In [15]:
xls_list = []

for n in list(range(2, 10)):
    xls_list.append(pd.read_excel(f'../../new_raw_data/rna_seq_processed/xls_files/mc-e22-08-0326-s0{n}.xls', sheet_name=0))

In [16]:
for idx, df in enumerate(xls_list):
    print(idx, len(set(df.iloc[:, 0].values)))

0 350
1 284
2 34
3 38
4 41
5 74
6 342
7 117


In [17]:
first_column_values = {}

for n in list(range(2, 10)):
    file_path = f'../../new_raw_data/rna_seq_processed/xls_files/mc-e22-08-0326-s0{n}.xls'

    excel_file = pd.ExcelFile(file_path)

    for sheet_name in excel_file.sheet_names:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        
        if n not in first_column_values:
            first_column_values[n] = []
        first_column_values[n] += df.iloc[:, 0].astype(str).to_list()


        if n == 3 and sheet_name == 'genes':
            print(sheet_name)
            print(df)
            print(first_column_values[n])

genes
              TTHERM
0             g19412
1             g24741
2               g371
3    TTHERM_00000070
4    TTHERM_00001090
..               ...
279  TTHERM_01179960
280  TTHERM_01251290
281  TTHERM_01276400
282  TTHERM_01299730
283  TTHERM_01358410

[284 rows x 1 columns]
['g19412', 'g24741', 'g371', 'TTHERM_00000070', 'TTHERM_00001090', 'TTHERM_000056049', 'TTHERM_000109313', 'TTHERM_00013120', 'TTHERM_00013900', 'TTHERM_00014940', 'TTHERM_000188352', 'TTHERM_000191179', 'TTHERM_000218898', 'TTHERM_00024350', 'TTHERM_00024380', 'TTHERM_00024410', 'TTHERM_00028890', 'TTHERM_00030580', 'TTHERM_000357059', 'TTHERM_00037210', 'TTHERM_000378989', 'TTHERM_000384969', 'TTHERM_000438819', 'TTHERM_000446389', 'TTHERM_000457099', 'TTHERM_00046490', 'TTHERM_000486559', 'TTHERM_00048840', 'TTHERM_000503740', 'TTHERM_00052240', 'TTHERM_000523058', 'TTHERM_000529532', 'TTHERM_000570549', 'TTHERM_00059040', 'TTHERM_000675819', 'TTHERM_000695779', 'TTHERM_000711791', 'TTHERM_000770709', 'TTH

In [18]:
for n in list(range(2, 10)):
    print(n, len(set(first_column_values[n])))

2 352
3 284
4 53
5 38
6 41
7 74
8 344
9 117


In [19]:
_n = 9
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(117, 117)

In [20]:
_n = 8
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(344, 342)

In [21]:
_n = 7
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(74, 74)

In [22]:
_n = 6
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(41, 41)

In [23]:
_n = 5
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(38, 38)

In [24]:
_n = 4
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(53, 34)

In [25]:
_n = 3
len(set(first_column_values[_n])), len(set(xls_list[_n-2].iloc[:, 0].values))

(284, 284)

In [26]:
gid_to_tthermid = {}

for num, xls in first_column_values.items():
    for id in set(xls):
        if id[: len('TTHERM_')] != 'TTHERM_':
            print(id)
            if id[: len('g')] == 'g':
                gid_to_tthermid[id] = get_ttherm_id_from_keyword(id)

nan
X
g371
g19412
g24741
nan
g18920
g1455
X
g20657


In [27]:
gid_to_tthermid

{'g371': None,
 'g19412': None,
 'g24741': None,
 'g18920': None,
 'g1455': None,
 'g20657': None}

In [28]:
{
'Cluster 5': {'mc-e22-08-0326-s03.xls': ['genes']}, # FIXME: translate gIDs
'Cluster 5A': {'mc-e22-08-0326-s04.xls': [0, 1, 2, 'telomere_maintenance']}, # unclear whether this cluster has 53 or 34 genes based on xls (assuming 53)
'Cluster 5B': {'mc-e22-08-0326-s05.xls': ['analysis2_annotated', 'DNA_repair']},
'Cluster 5C': {'mc-e22-08-0326-s06.xls': ['genes', 'chromosome_condensation', 'chromatin assembly', 'mitotic nuclear division', 'mitotic cell cycle process']},
'Cluster 5D': {'mc-e22-08-0326-s07.xls': ['genes']},
'Cluster 1': {'mc-e22-08-0326-s08.xls': ['genes']}, # only first tab
'Cluster 2': {'mc-e22-08-0326-s09.xls': ['genes']} # there are only 117 genes in here despite the paper saying there are 564 in the cluster
}

{'Cluster 5': {'mc-e22-08-0326-s03.xls': ['genes']},
 'Cluster 5A': {'mc-e22-08-0326-s04.xls': [0, 1, 2, 'telomere_maintenance']},
 'Cluster 5B': {'mc-e22-08-0326-s05.xls': ['analysis2_annotated',
   'DNA_repair']},
 'Cluster 5C': {'mc-e22-08-0326-s06.xls': ['genes',
   'chromosome_condensation',
   'chromatin assembly',
   'mitotic nuclear division',
   'mitotic cell cycle process']},
 'Cluster 5D': {'mc-e22-08-0326-s07.xls': ['genes']},
 'Cluster 1': {'mc-e22-08-0326-s08.xls': ['genes']},
 'Cluster 2': {'mc-e22-08-0326-s09.xls': ['genes']}}

In [29]:
muco_exp = [
'TTHERM_00052190',
'TTHERM_00011710',
'TTHERM_00321680',
'TTHERM_00445920',
'TTHERM_00410180',
'TTHERM_00410210',
'TTHERM_00313130',
'TTHERM_00059370',
'TTHERM_00283800',
'TTHERM_00241790',
'TTHERM_00318900',
'TTHERM_00852790',
'TTHERM_00467390',
'TTHERM_01332070',
'TTHERM_00497590',
'TTHERM_00013410',
'TTHERM_00527180',
'TTHERM_00624730',
'TTHERM_00624720',
'TTHERM_00378890',
'TTHERM_00522600',
'TTHERM_01055600',
'TTHERM_00558350',
'TTHERM_00221120',
'TTHERM_00572100',
'TTHERM_00420770',
'TTHERM_00566910',
'TTHERM_00658810',
'TTHERM_00047330',
'TTHERM_000193469',
'TTHERM_000486279',
'TTHERM_00141040',
'TTHERM_00227750',
'TTHERM_00317390',
'TTHERM_00670750',
'TTHERM_01122800',
'TTHERM_01213910',
'TTHERM_00886960',
'TTHERM_01287970',
'TTHERM_00498010',
]

In [30]:
len(muco_exp)

40

In [31]:
de_regranulation_df = pd.read_csv('../../regranulation/de_regranulation_FC_1.5_adj_P_Val_0.0005_B_1.csv')

In [32]:
de_muco = list(set(list(de_regranulation_df['Unnamed: 0'].values)).intersection(set(muco_exp)))
de = list(set(list(de_regranulation_df['Unnamed: 0'].values)) - set(muco_exp))
muco = list(set(muco_exp) - set(list(de_regranulation_df['Unnamed: 0'].values)))

In [33]:
de_regranulation_annot_df = pd.DataFrame({
    'TTHERM_ID': de_muco + de + muco,
    'mucocysts': ['DE,EV' for _ in range(len(de_muco))] + ['DE' for _ in range(len(de))] + ['EV' for _ in range(len(muco))],
    })

In [34]:
de_regranulation_annot_df

Unnamed: 0,TTHERM_ID,mucocysts
0,TTHERM_00467390,"DE,EV"
1,TTHERM_00572100,"DE,EV"
2,TTHERM_00317390,"DE,EV"
3,TTHERM_00558350,"DE,EV"
4,TTHERM_00013410,"DE,EV"
...,...,...
1648,TTHERM_000193469,EV
1649,TTHERM_00624720,EV
1650,TTHERM_00378890,EV
1651,TTHERM_00497590,EV


In [35]:
modified_annotation = modified_annotation.merge(de_regranulation_annot_df, on='TTHERM_ID', how='outer')
modified_annotation

Unnamed: 0,TTHERM_ID,seed_ortholog,evalue,score,eggNOG_OGs,max_annot_lvl,COG_category,Description,Preferred_name,GOs,...,KEGG_TC,CAZy,BiGG_Reaction,PFAMs,TGD2021_description,peptide,common_name,InterPro,InterPro_description,mucocysts
0,TTHERM_00840110,5911.EAS05042,0.0,1278.5,"2E5M0@1|root,2SCE9@2759|Eukaryota,3ZEBN@5878|C...",5878|Ciliophora,-,-,-,-,...,-,-,-,-,hypothetical protein,MISSNQTADQENKVENKVANAEHVNQQSYDSIPQSLSPAVIAQIMD...,Unnamed,-,-,
1,TTHERM_01082930,31033.ENSTRUP00000031008,1.9999999999999997e-28,94.2,"COG5078@1|root,KOG0418@2759|Eukaryota,38KYZ@33...",33208|Metazoa,O,Belongs to the ubiquitin-conjugating enzyme fa...,UBE2K,"GO:0000209,GO:0003674,GO:0003824,GO:0004842,GO...",...,-,-,-,"UBA,UQ_con",ubiquitin-conjugating enzyme E2,MHKNILIILFQIFCCQIYTTIYTFYYFMANIVFIIHNVKLDLFSCF...,Unnamed,"IPR000608,IPR016135,IPR023313,IPR050113","Ubiquitin-conjugating enzyme E2,Ubiquitin-conj...",
2,TTHERM_01081610,5911.EAR82090,0.0,2110.7,"2A5FX@1|root,2RY9I@2759|Eukaryota",2759|Eukaryota,-,-,-,-,...,-,-,-,PRESAN,transmembrane protein putative,MSSQSPAKLNNQNCAAANQYYNDLESCVQGYCIKQQSGSGARGCFP...,Unnamed,-,-,
3,TTHERM_00059210,5911.EAR87408,3.2999999999999998e-298,983.9,"COG0575@1|root,KOG1440@2759|Eukaryota,3ZAR9@58...",5878|Ciliophora,I,Cytidylyltransferase family,-,-,...,-,-,-,CTP_transf_1,phosphatidate cytidylyltransferase,MSQVTNRSQKKSHQKRDEKSEEDSSDEKTDDFSEEELDKLQEAQKK...,Unnamed,"IPR000374,IPR016720","Phosphatidate cytidylyltransferase,Phosphatida...",
4,TTHERM_00535200,5911.EAS03184,0.0,2448.3,"2E77S@1|root,2SDUU@2759|Eukaryota",5911.EAS03184|-,S,Src homology 3 domains,-,-,...,-,-,-,-,beta-Pak interactive eXchange factor Src-like ...,MFTKSNSRSALAGLNSIVNSQNDSLTSRAQHQNYAKKDLTISNSTS...,Unnamed,"IPR001452,IPR036028,IPR051569","SH3 domain,SH3-like domain superfamily,SH3 and...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26701,TTHERM_00732475,,,,,,,,,,...,,,,,,,,,,DE
26702,TTHERM_00519812,,,,,,,,,,...,,,,,,,,,,DE
26703,TTHERM_01892110,,,,,,,,,,...,,,,,,,,,,DE
26704,TTHERM_000486279,,,,,,,,,,...,,,,,,,,,,EV


In [36]:
modified_annotation.isna().sum()

TTHERM_ID                   0
seed_ortholog              19
evalue                     19
score                      19
eggNOG_OGs                 19
max_annot_lvl              19
COG_category               19
Description                19
Preferred_name             19
GOs                        19
EC                         19
KEGG_ko                    19
KEGG_Pathway               19
KEGG_Module                19
KEGG_Reaction              19
KEGG_rclass                19
BRITE                      19
KEGG_TC                    19
CAZy                       19
BiGG_Reaction              19
PFAMs                      19
TGD2021_description        19
peptide                    19
common_name                19
InterPro                   19
InterPro_description       19
mucocysts               25053
dtype: int64

In [37]:
modified_annotation = modified_annotation.fillna('-')

In [38]:
modified_annotation.isna().sum()

TTHERM_ID               0
seed_ortholog           0
evalue                  0
score                   0
eggNOG_OGs              0
max_annot_lvl           0
COG_category            0
Description             0
Preferred_name          0
GOs                     0
EC                      0
KEGG_ko                 0
KEGG_Pathway            0
KEGG_Module             0
KEGG_Reaction           0
KEGG_rclass             0
BRITE                   0
KEGG_TC                 0
CAZy                    0
BiGG_Reaction           0
PFAMs                   0
TGD2021_description     0
peptide                 0
common_name             0
InterPro                0
InterPro_description    0
mucocysts               0
dtype: int64

In [39]:
modified_annotation.to_csv('../../active_files/complete_annotation.csv', index=False)

In [40]:
# MIN-MAX
microarray = pd.read_csv('./test_nn3_leiden_label_df_round_1.csv')
rna_seq = pd.read_csv('./rna_seq_label_df_round_1.csv')

# Z-SCORE
# microarray = pd.read_csv('./testz_nn3_leiden_label_df_round_1.csv')
# rna_seq = pd.read_csv('./rnaz_seq_label_df_round_1.csv')

microarray_enrich = clustering_utils.compute_enrichment(microarray)

rna_seq_enrich = clustering_utils.compute_enrichment(rna_seq)

In [41]:
modified_annotation_no_muco.to_csv('../../active_files/complete_annotation.csv', index=False)

In [42]:
rna_ev = set([(int(m)) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values])
microarray_ev = set([(int(m)) for m in microarray_enrich.loc[microarray_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values])

rna_de = set([(int(m)) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'DE'])]['module'].values])
microarray_de = set([(int(m)) for m in microarray_enrich.loc[microarray_enrich['term'].isin(['DE,EV', 'DE'])]['module'].values])

len(rna_ev), len(microarray_ev), len(rna_de), len(microarray_de)

(5, 7, 40, 54)

In [43]:
len(rna_ev.intersection(rna_de))

3

In [44]:
rna_ev

{40, 194, 199, 219, 294}

In [45]:
print(*['m'+str(m).zfill(3) for m in rna_ev], sep=', ')

m194, m294, m199, m040, m219


In [46]:
print(*['m'+str(m).zfill(3) for m in microarray_ev], sep=', ')

m001, m002, m003, m004, m005, m006, m378


In [47]:
(rna_ev.intersection(rna_de))

{194, 199, 294}

In [48]:
print(*['m'+str(m).zfill(3) for m in rna_de], sep=', ')

m131, m280, m412, m029, m543, m294, m168, m296, m553, m044, m432, m561, m178, m564, m565, m568, m313, m570, m444, m445, m575, m064, m193, m194, m067, m580, m199, m072, m585, m586, m331, m594, m595, m091, m096, m360, m488, m109, m110, m254


In [49]:
len(microarray_ev.intersection(microarray_de))

5

In [50]:
rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'DE', 'EV'])].shape, microarray_enrich.loc[microarray_enrich['term'].isin(['DE,EV', 'DE', 'EV'])].shape

((45, 7), (61, 7))

In [51]:
rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'EV'])].shape, microarray_enrich.loc[microarray_enrich['term'].isin(['DE,EV', 'EV'])].shape

((5, 7), (7, 7))

In [52]:
microarray_enrich.loc[microarray_enrich['term'].isin(['DE,EV', 'DE'])]

Unnamed: 0,module,term,info,fold_change,bonferroni,term_count,module_size
0,0.0,DE,differentially expressed in mucocyst regranula...,6.192157,5.148742e-09,16.0,42.0
0,1.0,DE,differentially expressed in mucocyst regranula...,13.385987,1.3632650000000001e-27,28.0,34.0
0,2.0,DE,differentially expressed in mucocyst regranula...,13.385987,1.3632650000000001e-27,28.0,34.0
0,3.0,DE,differentially expressed in mucocyst regranula...,14.572922,2.244722e-55,52.0,58.0
0,4.0,DE,differentially expressed in mucocyst regranula...,13.545344,7.680706999999999e-35,35.0,42.0
0,8.0,DE,differentially expressed in mucocyst regranula...,3.075159,0.02801302,7.0,37.0
0,9.0,DE,differentially expressed in mucocyst regranula...,3.125849,0.0419013,5.0,26.0
0,29.0,DE,differentially expressed in mucocyst regranula...,4.063603,0.007192166,6.0,24.0
0,30.0,DE,differentially expressed in mucocyst regranula...,3.464055,0.000160334,13.0,61.0
0,47.0,DE,differentially expressed in mucocyst regranula...,3.458386,0.001098435,10.0,47.0


In [53]:
rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'DE'])]

Unnamed: 0,module,term,info,fold_change,bonferroni,term_count,module_size
0,29.0,DE,differentially expressed in mucocyst regranula...,2.786471,0.03697979,6.0,35.0
0,44.0,DE,differentially expressed in mucocyst regranula...,3.694185,0.02322341,5.0,22.0
0,64.0,DE,differentially expressed in mucocyst regranula...,3.421982,0.004263893,8.0,38.0
0,67.0,DE,differentially expressed in mucocyst regranula...,2.370435,0.04673207,7.0,48.0
0,72.0,DE,differentially expressed in mucocyst regranula...,2.709069,0.04132424,6.0,36.0
0,91.0,DE,differentially expressed in mucocyst regranula...,3.16058,0.01221931,7.0,36.0
0,96.0,DE,differentially expressed in mucocyst regranula...,3.334238,0.004982285,8.0,39.0
0,109.0,DE,differentially expressed in mucocyst regranula...,2.554265,0.006395223,11.0,70.0
0,110.0,DE,differentially expressed in mucocyst regranula...,3.125849,0.002353673,10.0,52.0
0,131.0,DE,differentially expressed in mucocyst regranula...,2.994234,0.01596917,7.0,38.0


In [54]:
microarray_enrich.loc[microarray_enrich['term'].isin(['DE,EV', 'EV'])]

Unnamed: 0,module,term,info,fold_change,bonferroni,term_count,module_size
1,1.0,EV,experimentally validated mucocyst-associated gene,39.273529,0.04947943,2.0,34.0
1,2.0,EV,experimentally validated mucocyst-associated gene,58.910294,0.001139528,3.0,34.0
1,3.0,EV,experimentally validated mucocyst-associated gene,103.600862,1.895944e-14,9.0,58.0
1,4.0,EV,experimentally validated mucocyst-associated gene,158.964286,4.895547e-18,10.0,42.0
0,5.0,EV,experimentally validated mucocyst-associated gene,267.06,0.002999931,2.0,5.0
0,6.0,EV,experimentally validated mucocyst-associated gene,556.375,1.078811e-11,5.0,6.0
1,378.0,EV,experimentally validated mucocyst-associated gene,54.133784,0.001357697,3.0,37.0


In [55]:
print(','.join(['m'+str(int(m)).zfill(3) for m in microarray_enrich.loc[microarray_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values]))

m001,m002,m003,m004,m005,m006,m378


In [56]:
rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'EV'])]

Unnamed: 0,module,term,info,fold_change,bonferroni,term_count,module_size
0,40.0,EV,experimentally validated mucocyst-associated gene,86.148387,1.241951e-05,4.0,31.0
1,194.0,EV,experimentally validated mucocyst-associated gene,159.655435,5.1634399999999997e-20,11.0,46.0
1,199.0,EV,experimentally validated mucocyst-associated gene,46.580233,0.001850141,3.0,43.0
0,219.0,EV,experimentally validated mucocyst-associated gene,49.455556,0.03898948,2.0,27.0
1,294.0,EV,experimentally validated mucocyst-associated gene,43.074194,0.04498428,2.0,31.0


In [57]:
rna_seq.loc[rna_seq['label'].isin([int(m) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values])]

Unnamed: 0,TTHERM_ID,label
13861,TTHERM_01297430,294
13862,TTHERM_01213910,294
13863,TTHERM_01142782,294
13864,TTHERM_01002740,294
13865,TTHERM_00833627,294
...,...,...
22002,TTHERM_00218460,40
22003,TTHERM_00101440,40
22004,TTHERM_00046372,40
22005,TTHERM_00041538,40


In [58]:
microarray.loc[microarray['label'].isin([int(m) for m in microarray_enrich.loc[microarray_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values])]

Unnamed: 0,TTHERM_ID,label
8281,TTHERM_01123950,378
8282,TTHERM_01044620,378
8283,TTHERM_00962070,378
8284,TTHERM_00962020,378
8285,TTHERM_00939090,378
...,...,...
20381,TTHERM_00138495,1
20382,TTHERM_00136330,1
20383,TTHERM_00129415,1
20384,TTHERM_00120780,1


In [59]:
print(','.join(['m'+str(int(m)).zfill(3) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values]))

m040,m194,m199,m219,m294


In [60]:
rna_seq.loc[rna_seq['label'].isin([int(m) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values])]

Unnamed: 0,TTHERM_ID,label
13861,TTHERM_01297430,294
13862,TTHERM_01213910,294
13863,TTHERM_01142782,294
13864,TTHERM_01002740,294
13865,TTHERM_00833627,294
...,...,...
22002,TTHERM_00218460,40
22003,TTHERM_00101440,40
22004,TTHERM_00046372,40
22005,TTHERM_00041538,40


In [61]:
rna_seq.loc[rna_seq['label'].isin([int(m) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values])].to_csv('./rna_seq_ev_enriched_module_genes.csv', index=False)

In [62]:
rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values

array([ 40., 194., 199., 219., 294.])

In [63]:
rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'EV'])]

Unnamed: 0,module,term,info,fold_change,bonferroni,term_count,module_size
0,40.0,EV,experimentally validated mucocyst-associated gene,86.148387,1.241951e-05,4.0,31.0
1,194.0,EV,experimentally validated mucocyst-associated gene,159.655435,5.1634399999999997e-20,11.0,46.0
1,199.0,EV,experimentally validated mucocyst-associated gene,46.580233,0.001850141,3.0,43.0
0,219.0,EV,experimentally validated mucocyst-associated gene,49.455556,0.03898948,2.0,27.0
1,294.0,EV,experimentally validated mucocyst-associated gene,43.074194,0.04498428,2.0,31.0


In [64]:
[int(m) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values]

[40, 194, 199, 219, 294]

In [65]:
microarray.loc[microarray['label'].isin([int(m) for m in microarray_enrich.loc[microarray_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values])].to_csv('./microarray_ev_enriched_module_genes.csv', index=False)

In [66]:
rna_ev = set([(int(m)) for m in rna_seq_enrich.loc[rna_seq_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values])
microarray_ev = set([(int(m)) for m in microarray_enrich.loc[microarray_enrich['term'].isin(['DE,EV', 'EV'])]['module'].values])

In [67]:
'select * from rna_seq_enrich where info LIKE "%ribo%"'

'select * from rna_seq_enrich where info LIKE "%ribo%"'

In [68]:
dataframe_utils.sql_query_df({'rna_seq_enrich': rna_seq_enrich}, 'select * from rna_seq_enrich where info LIKE "%ribosom%"').sort_values(by='module', ascending=False)

Unnamed: 0,module,term,info,fold_change,bonferroni,term_count,module_size
53,725.0,J,"Translation, ribosomal structure and biogenesis",13.031555,0.0003466374,6.0,29.0
52,724.0,GO:0016071,Mrna metabolic process: The chemical reactions...,87.848684,0.001577549,4.0,19.0
51,721.0,GO:0043232,Intracellular non-membrane-bounded organelle: ...,25.382485,6.548972e-06,7.0,15.0
50,721.0,GO:0043229,Intracellular organelle: Organized structure o...,9.491851,0.002089845,7.0,15.0
49,721.0,GO:0043228,Non-membrane-bounded organelle: Organized stru...,25.22834,6.790286e-06,7.0,15.0
48,721.0,GO:0043226,Organelle: Organized structure of distinctive ...,9.293661,0.002359568,7.0,15.0
47,721.0,GO:0006996,Organelle organization: A process that is carr...,22.909559,1.20323e-05,7.0,15.0
46,692.0,GO:0043232,Intracellular non-membrane-bounded organelle: ...,10.119263,0.004884781,8.0,43.0
45,692.0,GO:0043228,Non-membrane-bounded organelle: Organized stru...,10.05781,0.00508401,8.0,43.0
44,462.0,J,"Translation, ribosomal structure and biogenesis",13.996855,1.365697e-07,10.0,45.0


In [69]:
print(', '.join([f'm{str(int(mod)).zfill(3)}' for mod in (dataframe_utils.sql_query_df({'rna_seq_enrich': rna_seq_enrich}, 'select distinct(module) from rna_seq_enrich where info LIKE "%ribosom%"').sort_values(by='module', ascending=False)['module'].values)]))

m725, m724, m721, m692, m462, m461, m460, m458, m454, m452, m413, m394, m393, m392, m389, m359, m250, m237, m234, m199


In [70]:
rna_ev

{40, 194, 199, 219, 294}