In [49]:
import re
import pandas as pd
import numpy as np

In [197]:
# Input CSV (path is relative to the notebook's working directory); it is
# loaded with pd.read_csv(file) in the labeling cells further down.
file = '../ppi_ml/results/walktrap/LinearSVC_100feats_fdr10_4steps_nochloro.csv'

In [270]:
def get_top(group):
    """Return the one row of ``group`` with the largest value in its 'count' column."""
    best_row = group.nlargest(n=1, columns='count')
    return best_row

def flatten(row):
    """Collapse one level of nesting in ``row``.

    Every item of every sub-iterable in ``row`` is collected, in order, into a
    single list. If nothing is collected, the string 'Uncharacterized' is
    returned instead of an empty list.
    """
    merged = []
    for chunk in row:
        merged.extend(chunk)
    return merged if merged else 'Uncharacterized'

def get_cluster_counts(df, cut_col):
    """Count how many rows of ``df`` fall into each cluster of one cut.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the cluster-assignment column ``cut_col``.
    cut_col : str
        Name of the cluster-assignment column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``cut_col`` value with two columns: ``cut_col``
        itself and ``n_ppis`` (the number of rows in that group). The index is
        the default 0..n-1 range.
    """
    # An earlier version also aggregated a per-cluster count of
    # 'cp_cmplx_name' and joined it onto the counts, but the join result was
    # never assigned, so it had no effect on the returned frame. That dead
    # step is dropped; the output is unchanged and 'cp_cmplx_name' is no
    # longer required to be present in df.
    return df.groupby(cut_col).size().to_frame(name='n_ppis').reset_index()

def fmt_db_labels(string):
    """Normalise a ', '-joined string of database complex labels.

    * Missing values (anything ``pd.isna`` reports as NA) are returned as-is.
    * A string without ', ' is returned capitalised (``str.capitalize``).
    * A string containing ', ' is split on ', ' and each part capitalised.
      If one of the marker labels below is among the parts, the whole entry
      collapses to that marker's canonical name (a plain string); otherwise
      the list of capitalised parts is returned.
    """
    if pd.isna(string):
        return string
    if ', ' not in string:
        return string.capitalize()

    parts = [piece.capitalize() for piece in string.split(', ')]

    # (marker label, canonical name) pairs. Order matters: the first marker
    # found among the parts decides the result.
    collapse_rules = (
        ('Bzip transcription factor complex',
         'Bzip transcription factor complex'),
        ('Scf(coi1) ubiquitin ligase complex',
         'Scf(coi1) ubiquitin ligase complex'),
        ('Lsm1-7-pat1 complex',
         'Lsm1-7-pat1 complex'),
        ('Muscle cell-specific swi/snf atp-dependent chromatin remodeling complex',
         'Swi/snf chromatin remodelling complex'),
        ('Neuronal ap-3 adaptor complex',
         'Ap-3 adaptor complex'),
        ('Atg-5-atg-12-atg-16.1-atg-16.2 complex',
         'Atg12-atg5 complex'),
        ('Cct-prefoldin complex',
         'Prefoldin complex'),
    )
    for marker, canonical in collapse_rules:
        if marker in parts:
            return canonical
    return parts

# get_melted_label_counts: potentially deprecated in favor of get_exploded_label_counts
# def get_melted_label_counts(df, cut_col):
#     cols = ['cp_cmplx_name', 'go_cmplx_name', 'corum_cmplx_name']
#     df_melted = pd.melt(df, id_vars=cut_col, value_vars=['cp_cmplx_name', 'go_cmplx_name', 'corum_cmplx_name'], var_name='complex_type')
#     for i in range(len(df_melted)):
#         cmplx = df_melted['value'][i]
#         if 'spliceosom' in str(cmplx):
#             df_melted.loc[i, 'value'] = 'Spliceosome'
            
#     result = df_melted.groupby([cut_col, 'complex_type', 'value']).size().reset_index(name='count')
#     result = result[result['value'] != 'protein-containing complex']
#     result = result[result['value'] != 'ribonucleoprotein complex']
    
#     top = result.groupby(cut_col, group_keys=False).apply(get_top).reset_index(drop=True)
#     top = top.merge(counts, how='left', left_on=cut_col, right_on=cut_col)
#     top['prop'] = top['count']/top['n_ppis']
#     # replace sparsely labeled cluster names
#     # change singleton labels to "unclustered"
#     for i in range(len(top)):
#         if top['prop'][i] == 1 and top['n_ppis'][i] == 1:
#             top.loc[i, 'value'] = 'Unclustered'
#         if top['prop'][i] < 0.7:
#             if top['n_ppis'][i] <= 10:
#                 top.loc[i, 'value'] = 'Small heterogenous complex'
#             else:
#                 top.loc[i, 'value'] = 'Large heterogeneous complex'
#     return top

def get_exploded_label_counts(df, df_counts, cut_col):
    """Pick the most frequent database label for each cluster of one cut.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cut_col`` and the three label columns
        'cp_cmplx_name', 'go_cmplx_name' and 'corum_cmplx_name'.
        NOTE: ``df`` is modified in place. The three label columns are
        rewritten with qualifiers stripped, and the columns 'all_db_labels'
        and 'value' are added (or overwritten).
    df_counts : pandas.DataFrame
        Per-cluster sizes with at least the columns ``cut_col`` and 'n_ppis'.
    cut_col : str
        Name of the cluster-assignment column.

    Returns
    -------
    pandas.DataFrame
        One row per cluster that has at least one counted label, with the
        columns ``cut_col``, 'value' (chosen label), 'count' (occurrences of
        that label in the cluster), the columns merged in from ``df_counts``
        and 'prop' (= count / n_ppis). Where prop < 0.7 the label is replaced
        by 'Uncharacterized complex' (n_ppis <= 10) or
        'Large heterogeneous complex' (n_ppis > 10).
    """
    cols = ['cp_cmplx_name', 'go_cmplx_name', 'corum_cmplx_name']

    # Qualifiers removed from the raw labels, applied in this order. The first
    # pattern is a raw string: '\(' and '\[' are invalid escape sequences in
    # an ordinary string literal and trigger a warning on recent Pythons.
    strip_patterns = [
        r" [\(\[].*?[\)\]]",
        ", cytoplasmic",
        ", mitochondrial",
        ", chromatin",
        ", EGF induced",
        ", V1 domain",
    ]
    for pattern in strip_patterns:
        df[cols] = df[cols].replace(to_replace=pattern, value="", regex=True)

    # Join the non-missing labels of the three databases into one string per
    # row, normalise it, then give every individual label its own row.
    df['all_db_labels'] = df[cols].apply(lambda x: ', '.join(x.dropna()), axis=1)
    df['value'] = [fmt_db_labels(i) for i in df['all_db_labels']]
    df_exp = df.explode('value')

    result = df_exp.groupby([cut_col, 'value']).size().reset_index(name='count')
    # These two labels are excluded from the ranking.
    excluded = ['Protein-containing complex', 'Ribonucleoprotein complex']
    result = result[~result['value'].isin(excluded)]

    # Top label per cluster. idxmax returns the first row holding the maximum
    # count, i.e. the same row nlargest(1, 'count') picks, without going
    # through groupby.apply, whose handling of the grouping column is
    # deprecated in recent pandas.
    top_idx = result.groupby(cut_col)['count'].idxmax()
    top = result.loc[top_idx].reset_index(drop=True)
    top = top.merge(df_counts, how='left', on=cut_col)
    top['prop'] = top['count'] / top['n_ppis']

    # Relabel clusters whose top label covers less than 70% of the cluster.
    # Rows with a missing prop compare False and are left untouched.
    low_support = top['prop'] < 0.7
    small_cluster = top['n_ppis'] <= 10
    top.loc[low_support & small_cluster, 'value'] = 'Uncharacterized complex'
    top.loc[low_support & ~small_cluster, 'value'] = 'Large heterogeneous complex'
    return top

def make_label_dict(df_top, df_counts, cut_col):
    """Build a ``{cluster: {label: count}}`` lookup for one cut.

    Parameters
    ----------
    df_top : pandas.DataFrame
        Needs the columns ``cut_col``, 'value' (label) and 'count'.
    df_counts : pandas.DataFrame
        Needs the columns ``cut_col`` and 'n_ppis'.
    cut_col : str
        Name of the cluster-assignment column.

    Returns
    -------
    dict
        For each cluster in ``df_top`` an inner dict mapping label -> count
        (the first row seen wins if a cluster/label pair repeats). Every
        cluster in ``df_counts`` with ``n_ppis == 1`` is then set to
        ``{'Unclustered': 1}``, replacing any entry built from ``df_top``.
    """
    cmplx_dict = {}

    # Iterate over the column values positionally. The previous df[col][i]
    # lookup with i in range(len(df)) was label-based and only worked when the
    # frame carried a default 0..n-1 index.
    top_rows = zip(
        df_top[cut_col].to_numpy(),
        df_top['value'].to_numpy(),
        df_top['count'].to_numpy(),
    )
    for cluster, value, count in top_rows:
        cmplx_dict.setdefault(cluster, {}).setdefault(value, count)

    # Singleton clusters are always reported as 'Unclustered'.
    singletons = df_counts.loc[df_counts['n_ppis'] == 1, cut_col]
    for cluster in singletons.to_numpy():
        cmplx_dict[cluster] = {'Unclustered': 1}
    return cmplx_dict
            
def get_cluster_labels(df, cmplx_dict, cut_col):
    """Return one label per row of ``df`` by looking up its cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the cluster-assignment column ``cut_col``.
    cmplx_dict : dict
        ``{cluster: {label: count}}`` lookup; the first key of the inner dict
        is used as the cluster's label.
    cut_col : str
        Name of the cluster-assignment column.

    Returns
    -------
    list
        Labels in row order. 'Uncharacterized complex' is used when the
        cluster is missing from ``cmplx_dict``, when its inner dict is empty,
        or when its label is the empty string.
    """
    fallback = 'Uncharacterized complex'
    cmplx_lst = []
    # Walk the cluster IDs positionally so the result does not depend on df
    # having a default 0..n-1 index.
    for cluster in df[cut_col].to_numpy():
        labels = cmplx_dict.get(cluster, {})
        # First key of the inner dict; '' stands in for "no label available"
        # (unknown cluster or empty inner dict) instead of raising IndexError.
        label = next(iter(labels), '')
        if len(label) < 1:
            label = fallback
        cmplx_lst.append(label)
    return cmplx_lst

## Get complex labels based on melting method

In [156]:
# df = pd.read_csv(file)
# print(df.columns.values)
# keep_cols = ['ID', 'cut_199', 'cut_318', 'cut_398', 'cut_437', 'cut_557',
#        'cut_676', 'cut_796', 'cut_1202', 'cut_1608', 'cut_2014', 'old_cmplx_assignment', 'go_cmplx_name',
#        'corum_cmplx_name', 'cp_cmplx_name', 'human_gene_names_primary',
#        'human_gene_names_synonym', 'human_protein_names']
# df = df[keep_cols]
# print(df.columns.values)
# df_out = df.copy()
# cuts = ['cut_557', 'cut_676', 'cut_796', 'cut_1202', 'cut_1608', 'cut_2014']
# for cut_col in cuts:
#     counts = get_cluster_counts(df, cut_col)
#     top_labels = get_melted_label_counts(df, cut_col)
#     label_dict = make_label_dict(top_labels, counts, cut_col)
#     labels = get_cluster_labels(df, label_dict, cut_col)
#     df_out[f'{cut_col}_algo_label'] = labels
# #df_out.to_csv('../ppi_ml/results/walktrap/LinearSVC_100feats_fdr10_4steps_nochloro_algo_labels_091323.csv', index=False)
# df_out

['ID' 'cut_199' 'cut_318' 'cut_398' 'cut_437' 'cut_557' 'cut_676'
 'cut_796' 'cut_1202' 'cut_1608' 'cut_2014' 'amor_11' 'exca_2' 'tsar_5'
 'viri_13' 'old_status' 'old_cmplx_assignment' 'go_gene_name'
 'go_cmplx_name' 'corum_cmplx_name' 'cp_cmplx_name' 'corum_cmplx_synonym'
 'cp_cmplx_synonym' 'go_up_ids' 'corum_up_ids' 'cp_up_ids'
 'corum_cmplx_members' 'corum_cmplx_comment' 'corum_cmplx_disease'
 'cp_cmplx_members' 'cp_cmplx_assembly' 'human_entry'
 'human_gene_names_primary' 'human_gene_names_synonym'
 'human_protein_names' 'human_length' 'human_function_cc'
 'human_annotscore_1to5' 'human_subcellular_location_cc' 'arath_entry'
 'arath_gene_names_primary' 'arath_protein_names' 'arath_function_cc'
 'arath_subcellular_location_cc']
['ID' 'cut_199' 'cut_318' 'cut_398' 'cut_437' 'cut_557' 'cut_676'
 'cut_796' 'cut_1202' 'cut_1608' 'cut_2014' 'old_cmplx_assignment'
 'go_cmplx_name' 'corum_cmplx_name' 'cp_cmplx_name'
 'human_gene_names_primary' 'human_gene_names_synonym'
 'human_protein_na

Unnamed: 0,ID,cut_199,cut_318,cut_398,cut_437,cut_557,cut_676,cut_796,cut_1202,cut_1608,...,cp_cmplx_name,human_gene_names_primary,human_gene_names_synonym,human_protein_names,cut_557_algo_label,cut_676_algo_label,cut_796_algo_label,cut_1202_algo_label,cut_1608_algo_label,cut_2014_algo_label
0,KOG0357,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,"CCT5, BBS10","CCTE KIAA0098, C12orf58",T-complex protein 1 subunit epsilon (TCP-1-eps...,Large heterogeneous complex,26S Proteasome complex,26S Proteasome complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex
1,KOG0358,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,CCT4,CCTD SRB,T-complex protein 1 subunit delta (TCP-1-delta...,Large heterogeneous complex,26S Proteasome complex,26S Proteasome complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex
2,KOG0359,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,"CCT6A, CCT6B","CCT6 CCTZ, NA",T-complex protein 1 subunit zeta (TCP-1-zeta) ...,Large heterogeneous complex,26S Proteasome complex,26S Proteasome complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex
3,KOG0360,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,"TCP1, BBS10, MKKS","CCT1 CCTA, C12orf58, BBS6",T-complex protein 1 subunit alpha (TCP-1-alpha...,Large heterogeneous complex,26S Proteasome complex,26S Proteasome complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex
4,KOG0361,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,CCT7,CCTH NIP7-1,T-complex protein 1 subunit eta (TCP-1-eta) (C...,Large heterogeneous complex,26S Proteasome complex,26S Proteasome complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,KOG3210,194,301,380,419,539,657,777,1183,1589,...,SNO2-SNZ1 pyridoxal 5'-phosphate synthase comp...,,,,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered
3189,ENOG502QPN1,195,303,382,421,541,659,779,1185,1591,...,,,,,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered
3190,KOG1379,196,307,386,425,545,663,783,1189,1595,...,,PPTC7,TAPP2C,Protein phosphatase PTC7 homolog (EC 3.1.3.16)...,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered
3191,KOG4067,197,313,393,432,552,671,791,1197,1603,...,,HIKESHI,C11orf73,Protein Hikeshi,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered


## Get complex labels based on exploding method

In [271]:
# Load the clustering table, keep the ID / cut / annotation columns, and add
# one '<cut>_algo_label' column per selected cut using the exploding method.
df = pd.read_csv(file)
keep_cols = ['ID', 'cut_199', 'cut_318', 'cut_398', 'cut_437', 'cut_557',
       'cut_676', 'cut_796', 'cut_1202', 'cut_1608', 'cut_2014', 'old_cmplx_assignment', 'go_cmplx_name',
       'corum_cmplx_name', 'cp_cmplx_name', 'human_gene_names_primary',
       'human_gene_names_synonym', 'human_protein_names']
df = df[keep_cols]
# df_out is copied before the loop: get_exploded_label_counts rewrites the
# label columns of df and adds 'all_db_labels' / 'value' to it in place, and
# none of that should end up in the exported table.
df_out = df.copy()
cuts = ['cut_557', 'cut_676', 'cut_796', 'cut_1202', 'cut_1608', 'cut_2014']
for cut_col in cuts:
    counts = get_cluster_counts(df, cut_col) # rows per cluster ('n_ppis')
    top_labels = get_exploded_label_counts(df, counts, cut_col) # most frequent label per cluster
    # Debug output: top-label row for cluster 18 of cut_1608.
    if cut_col == 'cut_1608':
        print(top_labels[top_labels['cut_1608']==18])
    label_dict = make_label_dict(top_labels, counts, cut_col) # {cluster: {label: count}}
    labels = get_cluster_labels(df, label_dict, cut_col) # one label per row of df
    df_out[f'{cut_col}_algo_label'] = labels
# Write the labeled table without the index column, then display it.
df_out.to_csv('../ppi_ml/results/walktrap/LinearSVC_100feats_fdr10_4steps_nochloro_explode_algo_labels_091923_fixedquestionmark.csv', index=False)
df_out

    cut_1608 value  count  n_ppis     prop
18        18           46      64  0.71875


Unnamed: 0,ID,cut_199,cut_318,cut_398,cut_437,cut_557,cut_676,cut_796,cut_1202,cut_1608,...,cp_cmplx_name,human_gene_names_primary,human_gene_names_synonym,human_protein_names,cut_557_algo_label,cut_676_algo_label,cut_796_algo_label,cut_1202_algo_label,cut_1608_algo_label,cut_2014_algo_label
0,KOG0357,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,"CCT5, BBS10","CCTE KIAA0098, C12orf58",T-complex protein 1 subunit epsilon (TCP-1-eps...,Large heterogeneous complex,Proteasome regulatory particle,Proteasome regulatory particle,Cct complex,Cct complex,Cct complex
1,KOG0358,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,CCT4,CCTD SRB,T-complex protein 1 subunit delta (TCP-1-delta...,Large heterogeneous complex,Proteasome regulatory particle,Proteasome regulatory particle,Cct complex,Cct complex,Cct complex
2,KOG0359,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,"CCT6A, CCT6B","CCT6 CCTZ, NA",T-complex protein 1 subunit zeta (TCP-1-zeta) ...,Large heterogeneous complex,Proteasome regulatory particle,Proteasome regulatory particle,Cct complex,Cct complex,Cct complex
3,KOG0360,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,"TCP1, BBS10, MKKS","CCT1 CCTA, C12orf58, BBS6",T-complex protein 1 subunit alpha (TCP-1-alpha...,Large heterogeneous complex,Proteasome regulatory particle,Proteasome regulatory particle,Cct complex,Cct complex,Cct complex
4,KOG0361,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,CCT7,CCTH NIP7-1,T-complex protein 1 subunit eta (TCP-1-eta) (C...,Large heterogeneous complex,Proteasome regulatory particle,Proteasome regulatory particle,Cct complex,Cct complex,Cct complex
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,KOG3210,194,301,380,419,539,657,777,1183,1589,...,SNO2-SNZ1 pyridoxal 5'-phosphate synthase comp...,,,,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered
3189,ENOG502QPN1,195,303,382,421,541,659,779,1185,1591,...,,,,,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered
3190,KOG1379,196,307,386,425,545,663,783,1189,1595,...,,PPTC7,TAPP2C,Protein phosphatase PTC7 homolog (EC 3.1.3.16)...,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered
3191,KOG4067,197,313,393,432,552,671,791,1197,1603,...,,HIKESHI,C11orf73,Protein Hikeshi,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered


In [255]:
# Number of distinct algorithmic labels assigned at each of these four cuts.
cols = ['cut_796_algo_label','cut_1202_algo_label','cut_1608_algo_label','cut_2014_algo_label']
df_out[cols].nunique()

cut_796_algo_label      79
cut_1202_algo_label    109
cut_1608_algo_label    125
cut_2014_algo_label    107
dtype: int64

In [272]:
# Spot-check: show the rows of df_out (cluster IDs and assigned labels) for a
# hand-picked set of IDs.
test_list = ['ENOG502S2J2','KOG0142','KOG0257','KOG0409','KOG0452','KOG0702','KOG0813','KOG0910']
df_out[df_out['ID'].isin(test_list)]

Unnamed: 0,ID,cut_199,cut_318,cut_398,cut_437,cut_557,cut_676,cut_796,cut_1202,cut_1608,...,cp_cmplx_name,human_gene_names_primary,human_gene_names_synonym,human_protein_names,cut_557_algo_label,cut_676_algo_label,cut_796_algo_label,cut_1202_algo_label,cut_1608_algo_label,cut_2014_algo_label
2499,ENOG502S2J2,3,3,4,6,7,9,11,15,18,...,,DNPH1,C6orf108 RCL,2'-deoxynucleoside 5'-phosphate N-hydrolase 1 ...,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Uncharacterized complex,Uncharacterized complex
2500,KOG0142,3,3,4,6,7,9,11,15,18,...,,"IDI1, IDI2",,Isopentenyl-diphosphate Delta-isomerase 1 (EC ...,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Uncharacterized complex,Uncharacterized complex
2501,KOG0257,3,3,4,6,7,9,11,15,18,...,,"KYAT1, KYAT3","CCBL1, CCBL2 KAT3",Kynurenine--oxoglutarate transaminase 1 (EC 2....,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Uncharacterized complex,Uncharacterized complex
2502,KOG0409,3,3,4,6,7,9,11,15,18,...,,"HIBADH, GLYR1","NA, HIBDL NDF NP60","3-hydroxyisobutyrate dehydrogenase, mitochondr...",Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Uncharacterized complex,Uncharacterized complex
2503,KOG0452,3,3,4,6,7,9,11,15,18,...,,"ACO1, IREB2","IREB1, NA",Cytoplasmic aconitate hydratase (Aconitase) (E...,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Uncharacterized complex,Uncharacterized complex
2504,KOG0702,3,3,4,6,7,9,11,15,18,...,,"AGFG2, AGFG1","HRBL RABR, HRB RAB RIP",Arf-GAP domain and FG repeat-containing protei...,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Uncharacterized complex,Uncharacterized complex
2505,KOG0813,3,3,4,6,7,9,11,15,18,...,,"HAGH, LACTB2, MBLAC2, HAGHL, PNKD","GLO2 HAGH1, NA, KIAA1184 MR1 TAHCCP2","Hydroxyacylglutathione hydrolase, mitochondria...",Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Uncharacterized complex,Uncharacterized complex
2506,KOG0910,3,3,4,6,7,9,11,15,18,...,,TXN2,TRX2,"Thioredoxin, mitochondrial (MTRX) (Mt-Trx) (Th...",Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Uncharacterized complex,Uncharacterized complex


In [242]:
# Inspect the label entry stored for cluster 2011. label_dict is the variable
# left over from the last iteration of the labeling loop.
# NOTE(review): this cell's execution count is lower than the labeling cell's,
# so the recorded output may come from an earlier run - re-run to confirm.
label_dict[2011]

{'': 1}

In [235]:
# Display the working frame. After get_exploded_label_counts has run on it, it
# also carries the 'all_db_labels' and 'value' columns that function adds.
df

Unnamed: 0,ID,cut_199,cut_318,cut_398,cut_437,cut_557,cut_676,cut_796,cut_1202,cut_1608,cut_2014,old_cmplx_assignment,go_cmplx_name,corum_cmplx_name,cp_cmplx_name,human_gene_names_primary,human_gene_names_synonym,human_protein_names,all_db_labels,value
0,KOG0357,0,0,0,0,0,0,0,0,0,0,CCT complex,chaperonin-containing T-complex,"CCT complex, BBS-chaperonin complex, CCT compl...",Chaperonin-containing T-complex,"CCT5, BBS10","CCTE KIAA0098, C12orf58",T-complex protein 1 subunit epsilon (TCP-1-eps...,"Chaperonin-containing T-complex, chaperonin-co...","[Chaperonin-containing t-complex, Chaperonin-c..."
1,KOG0358,0,0,0,0,0,0,0,0,0,0,CCT complex,"chaperonin-containing T-complex, zona pellucid...","CCT complex, BBS-chaperonin complex, CCT compl...",Chaperonin-containing T-complex,CCT4,CCTD SRB,T-complex protein 1 subunit delta (TCP-1-delta...,"Chaperonin-containing T-complex, chaperonin-co...","[Chaperonin-containing t-complex, Chaperonin-c..."
2,KOG0359,0,0,0,0,0,0,0,0,0,0,CCT complex,chaperonin-containing T-complex,"CCT complex, CCT complex, testis specific",Chaperonin-containing T-complex,"CCT6A, CCT6B","CCT6 CCTZ, NA",T-complex protein 1 subunit zeta (TCP-1-zeta) ...,"Chaperonin-containing T-complex, chaperonin-co...","[Chaperonin-containing t-complex, Chaperonin-c..."
3,KOG0360,0,0,0,0,0,0,0,0,0,0,CCT complex,"chaperonin-containing T-complex, zona pellucid...","CCT complex, CCT complex, testis specific, CCT...",Chaperonin-containing T-complex,"TCP1, BBS10, MKKS","CCT1 CCTA, C12orf58, BBS6",T-complex protein 1 subunit alpha (TCP-1-alpha...,"Chaperonin-containing T-complex, chaperonin-co...","[Chaperonin-containing t-complex, Chaperonin-c..."
4,KOG0361,0,0,0,0,0,0,0,0,0,0,CCT complex,chaperonin-containing T-complex,"CCT complex, CCT complex, testis specific, CCT...",Chaperonin-containing T-complex,CCT7,CCTH NIP7-1,T-complex protein 1 subunit eta (TCP-1-eta) (C...,"Chaperonin-containing T-complex, chaperonin-co...","[Chaperonin-containing t-complex, Chaperonin-c..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,KOG3210,194,301,380,419,539,657,777,1183,1589,1995,,,,SNO2-SNZ1 pyridoxal 5'-phosphate synthase comp...,,,,SNO2-SNZ1 pyridoxal 5'-phosphate synthase comp...,[Sno2-snz1 pyridoxal 5'-phosphate synthase com...
3189,ENOG502QPN1,195,303,382,421,541,659,779,1185,1591,1997,,,,,,,,,
3190,KOG1379,196,307,386,425,545,663,783,1189,1595,2001,Uncharacterized mystery complex 9,,,,PPTC7,TAPP2C,Protein phosphatase PTC7 homolog (EC 3.1.3.16)...,,
3191,KOG4067,197,313,393,432,552,671,791,1197,1603,2009,Prune complex,,,,HIKESHI,C11orf73,Protein Hikeshi,,


In [236]:
# Display the labeled output frame.
# NOTE(review): this cell's execution count is lower than the labeling cell's,
# so the recorded output may not match the current df_out - re-run to refresh.
df_out

Unnamed: 0,ID,cut_199,cut_318,cut_398,cut_437,cut_557,cut_676,cut_796,cut_1202,cut_1608,...,cp_cmplx_name,human_gene_names_primary,human_gene_names_synonym,human_protein_names,cut_557_algo_label,cut_676_algo_label,cut_796_algo_label,cut_1202_algo_label,cut_1608_algo_label,cut_2014_algo_label
0,KOG0357,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,"CCT5, BBS10","CCTE KIAA0098, C12orf58",T-complex protein 1 subunit epsilon (TCP-1-eps...,Large heterogeneous complex,Proteasome regulatory particle,Proteasome regulatory particle,Cct complex,Cct complex,Cct complex
1,KOG0358,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,CCT4,CCTD SRB,T-complex protein 1 subunit delta (TCP-1-delta...,Large heterogeneous complex,Proteasome regulatory particle,Proteasome regulatory particle,Cct complex,Cct complex,Cct complex
2,KOG0359,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,"CCT6A, CCT6B","CCT6 CCTZ, NA",T-complex protein 1 subunit zeta (TCP-1-zeta) ...,Large heterogeneous complex,Proteasome regulatory particle,Proteasome regulatory particle,Cct complex,Cct complex,Cct complex
3,KOG0360,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,"TCP1, BBS10, MKKS","CCT1 CCTA, C12orf58, BBS6",T-complex protein 1 subunit alpha (TCP-1-alpha...,Large heterogeneous complex,Proteasome regulatory particle,Proteasome regulatory particle,Cct complex,Cct complex,Cct complex
4,KOG0361,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,CCT7,CCTH NIP7-1,T-complex protein 1 subunit eta (TCP-1-eta) (C...,Large heterogeneous complex,Proteasome regulatory particle,Proteasome regulatory particle,Cct complex,Cct complex,Cct complex
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,KOG3210,194,301,380,419,539,657,777,1183,1589,...,SNO2-SNZ1 pyridoxal 5'-phosphate synthase comp...,,,,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex
3189,ENOG502QPN1,195,303,382,421,541,659,779,1185,1591,...,,,,,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex
3190,KOG1379,196,307,386,425,545,663,783,1189,1595,...,,PPTC7,TAPP2C,Protein phosphatase PTC7 homolog (EC 3.1.3.16)...,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex
3191,KOG4067,197,313,393,432,552,671,791,1197,1603,...,,HIKESHI,C11orf73,Protein Hikeshi,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex,Sno1-snz1 pyridoxal 5'-phosphate synthase complex
