In [49]:
import re
import pandas as pd
import numpy as np

In [197]:
# Input CSV for this notebook; the labelling cell loads it with pd.read_csv(file).
file = '../ppi_ml/results/walktrap/LinearSVC_100feats_fdr10_4steps_nochloro.csv'

In [418]:
def get_top(group):
    """Return the single row of `group` with the largest 'count'.

    The result is a one-row DataFrame (not a Series); on ties the row that
    appears first in `group` is kept.
    """
    best_row = group.nlargest(n=1, columns='count')
    return best_row

def flatten(row):
    """Concatenate the items of every sub-iterable in `row` into one list.

    Returns the flattened list, or the string 'Uncharacterized' when there
    is nothing to return (no sub-iterables, or all of them empty).
    """
    merged = []
    for sublist in row:
        merged.extend(sublist)
    return merged if merged else 'Uncharacterized'

def get_cluster_counts(df, cut_col):
    """Count how many rows of `df` fall into each cluster of `cut_col`.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per protein/orthogroup; must contain `cut_col`.
    cut_col : str
        Name of the column holding the cluster id for one cut.

    Returns
    -------
    pandas.DataFrame
        Columns `[cut_col, 'n_ppis']`, one row per cluster id, default
        RangeIndex.

    Notes
    -----
    An earlier version also built a per-cluster count of 'cp_cmplx_name' and
    joined it onto the result, but the joined frame was never assigned, so it
    had no effect on the return value (and required that column to exist).
    That dead computation has been removed; the returned frame is unchanged.
    """
    return df.groupby(cut_col).size().reset_index(name='n_ppis')

def fmt_db_labels(string):
    """Normalise a comma-joined string of database complex names.

    * Missing input (NaN/None) is returned unchanged.
    * A string without ', ' is treated as one label: it becomes
      'Spliceosome' if it mentions 'spliceosom', otherwise it is simply
      capitalised.
    * A string containing ', ' is split into capitalised labels. The first
      rule below that matches collapses the whole set to a single canonical
      string; if no rule matches, the list of capitalised labels is returned.

    Note the return type therefore varies: str for a collapsed or single
    label, list of str otherwise.
    """
    if pd.isna(string):
        return string

    if ', ' not in string:
        if 'spliceosom' in string.lower():
            string = 'Spliceosome'
        return string.capitalize()

    labels = [part.capitalize() for part in string.split(', ')]

    # (exact capitalised label to look for, canonical name), in priority order.
    exact_rules = (
        ('Bzip transcription factor complex', 'Bzip transcription factor complex'),
        ('Scf(coi1) ubiquitin ligase complex', 'Scf(coi1) ubiquitin ligase complex'),
        ('Lsm1-7-pat1 complex', 'Lsm1-7-pat1 complex'),
        ('Muscle cell-specific swi/snf atp-dependent chromatin remodeling complex',
         'Swi/snf chromatin remodelling complex'),
        ('Neuronal ap-3 adaptor complex', 'Ap-3 adaptor complex'),
        ('Atg-5-atg-12-atg-16.1-atg-16.2 complex', 'Atg12-atg5 complex'),
        ('Cct-prefoldin complex', 'Prefoldin complex'),
        ('Nucleosome', 'Nucleosome'),
        ('Eif3 complex', 'Eif3 complex'),
    )
    for wanted, canonical in exact_rules:
        if wanted in labels:
            return canonical

    # (case-insensitive fragment, canonical name); ribosome beats spliceosome.
    fragment_rules = (
        ('ribosomal', 'Ribosome'),
        ('ribosome', 'Ribosome'),
        ('preribosome', 'Ribosome'),
        ('spliceosom', 'Spliceosome'),
    )
    lowered = [label.lower() for label in labels]
    for fragment, canonical in fragment_rules:
        if any(fragment in text for text in lowered):
            return canonical

    return labels

# get_melted_label_counts: potentially deprecated in favor of get_exploded_label_counts
# def get_melted_label_counts(df, cut_col):
#     cols = ['cp_cmplx_name', 'go_cmplx_name', 'corum_cmplx_name']
#     df_melted = pd.melt(df, id_vars=cut_col, value_vars=['cp_cmplx_name', 'go_cmplx_name', 'corum_cmplx_name'], var_name='complex_type')
#     for i in range(len(df_melted)):
#         cmplx = df_melted['value'][i]
#         if 'spliceosom' in str(cmplx):
#             df_melted.loc[i, 'value'] = 'Spliceosome'
            
#     result = df_melted.groupby([cut_col, 'complex_type', 'value']).size().reset_index(name='count')
#     result = result[result['value'] != 'protein-containing complex']
#     result = result[result['value'] != 'ribonucleoprotein complex']
    
#     top = result.groupby(cut_col, group_keys=False).apply(get_top).reset_index(drop=True)
#     top = top.merge(counts, how='left', left_on=cut_col, right_on=cut_col)
#     top['prop'] = top['count']/top['n_ppis']
#     # replace sparsely labeled cluster names
#     # change singleton labels to "unclustered"
#     for i in range(len(top)):
#         if top['prop'][i] == 1 and top['n_ppis'][i] == 1:
#             top.loc[i, 'value'] = 'Unclustered'
#         if top['prop'][i] < 0.7:
#             if top['n_ppis'][i] <= 10:
#                 top.loc[i, 'value'] = 'Small heterogenous complex'
#             else:
#                 top.loc[i, 'value'] = 'Large heterogeneous complex'
#     return top

def get_combined_labels(df):
    """Clean the three database complex-name columns and derive combined labels.

    The columns 'cp_cmplx_name', 'go_cmplx_name' and 'corum_cmplx_name' are
    stripped of bracketed/parenthesised qualifiers and a few fixed suffixes,
    then:

    * 'all_db_labels' is set to the non-missing names of the three columns
      joined with ', ' (an empty string when all three are missing);
    * 'value' is set to fmt_db_labels(all_db_labels).

    NOTE: `df` is modified in place and the same object is returned. The
    calling cell in this notebook relies on that, so it is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three complex-name columns listed above.

    Returns
    -------
    pandas.DataFrame
        The input frame, with the cleaned columns plus 'all_db_labels' and
        'value'.
    """
    cols = ['cp_cmplx_name', 'go_cmplx_name', 'corum_cmplx_name']
    # Raw strings: the first pattern contains backslash escapes that are not
    # valid Python string escapes and would otherwise trigger a warning.
    # The patterns are applied one after another, in this order.
    patterns = (
        r" [\(\[].*?[\)\]]",   # " (…)" / " […]" qualifiers
        r", cytoplasmic",
        r", mitochondrial",
        r", chromatin",
        r", EGF induced",
        r", V1 domain",
    )
    cleaned = df[cols]
    for pattern in patterns:
        cleaned = cleaned.replace(to_replace=pattern, value="", regex=True)
    df[cols] = cleaned
    df['all_db_labels'] = df[cols].apply(lambda x: ', '.join(x.dropna()), axis=1)
    df['value'] = [fmt_db_labels(i) for i in df['all_db_labels']]
    return df

def get_exploded_label_counts(df, df_counts, cut_col, min_prop=0.6, small_max=10):
    """Find the most frequent database label in each cluster of `cut_col`.

    Steps:
      1. Explode the 'value' column so each label sits on its own row.
      2. Count occurrences of every (cluster, label) pair.
      3. Drop the generic labels 'Protein-containing complex' and
         'Ribonucleoprotein complex'.
      4. Keep, per cluster, the label with the highest count (on ties, the
         first one in the sorted (cluster, label) order).
      5. Attach cluster sizes from `df_counts` and compute
         prop = count / n_ppis.
      6. Where prop < `min_prop`, overwrite the label with
         'Uncharacterized complex' (n_ppis <= `small_max`) or
         'Large heterogeneous complex' (otherwise).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `cut_col` and 'value' (str or list of str per row).
    df_counts : pandas.DataFrame
        Must contain `cut_col` and 'n_ppis' (cluster sizes).
    cut_col : str
        Cluster-id column to summarise.
    min_prop : float, default 0.6
        Minimum share of the cluster the top label must reach to be kept.
    small_max : int, default 10
        Largest cluster size still relabelled 'Uncharacterized complex'.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with columns `cut_col`, 'value', 'count',
        the columns of `df_counts`, and 'prop'; default RangeIndex.
    """
    exploded = df.explode('value')
    label_counts = exploded.groupby([cut_col, 'value']).size().reset_index(name='count')

    generic_labels = ['Protein-containing complex', 'Ribonucleoprotein complex']
    label_counts = label_counts[~label_counts['value'].isin(generic_labels)]

    # idxmax returns the first row holding each cluster's maximum, which is the
    # same row nlargest(1, 'count') would pick. It avoids groupby.apply, whose
    # handling of the grouping column is deprecated in recent pandas.
    top_rows = label_counts.groupby(cut_col)['count'].idxmax()
    top = label_counts.loc[top_rows].reset_index(drop=True)

    top = top.merge(df_counts, how='left', on=cut_col)
    top['prop'] = top['count'] / top['n_ppis']

    # Relabel clusters whose best label covers too little of the cluster.
    # Comparisons with a missing prop are False, so such rows stay untouched.
    sparse = top['prop'] < min_prop
    small = top['n_ppis'] <= small_max
    top.loc[sparse & small, 'value'] = 'Uncharacterized complex'
    top.loc[sparse & ~small, 'value'] = 'Large heterogeneous complex'
    return top

def make_label_dict(df_top, df_counts, cut_col):
    """Build a nested {cluster: {label: count}} dictionary for one cut.

    Parameters
    ----------
    df_top : pandas.DataFrame
        Must contain `cut_col`, 'value' (label) and 'count'.
    df_counts : pandas.DataFrame
        Must contain `cut_col` and 'n_ppis' (cluster sizes).
    cut_col : str
        Cluster-id column.

    Returns
    -------
    dict
        For every cluster in `df_top`, an inner dict mapping each label to
        its count (the first count seen for a label is kept). Every cluster
        whose 'n_ppis' equals 1 is then set to {'Unclustered': 1}, replacing
        any entry it already had.

    Notes
    -----
    Rows are walked by position, so the frames do not need a default
    RangeIndex (the earlier `df[col][i]` lookups were label-based).
    """
    cmplx_dict = {}
    for cluster, value, count in zip(df_top[cut_col], df_top['value'], df_top['count']):
        cmplx_dict.setdefault(cluster, {}).setdefault(value, count)

    # Singleton clusters are always labelled 'Unclustered'.
    singletons = df_counts.loc[df_counts['n_ppis'] == 1, cut_col]
    for cluster in singletons:
        cmplx_dict[cluster] = {'Unclustered': 1}
    return cmplx_dict
            
def _labels_as_text(value):
    """Return a row's own database labels as one ', '-joined string."""
    if isinstance(value, str):
        # A single label is stored as a plain string; joining it would
        # interleave ', ' between its characters and defeat substring matching.
        return value
    if isinstance(value, (list, tuple, set)):
        return ', '.join(str(item) for item in value)
    return ''  # missing value (e.g. NaN): the row has no labels of its own


def get_cluster_labels(df, cmplx_dict, cut_col):
    """Give every row its cluster's label and a status for that label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `cut_col` and 'value'. 'value' holds the row's own
        formatted labels: a list of str, a single str, or missing.
    cmplx_dict : dict
        {cluster: {label: count}}; the first key of the inner dict is taken
        as the cluster label.
    cut_col : str
        Cluster-id column.

    Returns
    -------
    (list of str, list of str)
        Cluster label and status per row, in row order. The label falls back
        to 'Uncharacterized complex' when the cluster is absent from
        `cmplx_dict`, has no labels, or has an empty top label. Status is:

        * 'Uncharacterized', 'Unclustered' or 'Large heterogeneous complex'
          for the corresponding labels;
        * 'Known' when the label occurs (case-insensitive substring) in the
          row's own labels, or mentions 'ribosom'/'spliceosom';
        * 'Novel association' otherwise.
    """
    cmplx_lst = []
    status_lst = []
    # Iterate by position so the result does not depend on df's index labels.
    for cluster, own_labels in zip(df[cut_col], df['value']):
        # First label recorded for the cluster; '' if the cluster is unknown
        # or its inner dict is empty.
        label = next(iter(cmplx_dict.get(cluster, {})), '')
        if len(label) < 1:
            label = 'Uncharacterized complex'
        cmplx_lst.append(label)

        label_lc = label.lower()
        if label == 'Uncharacterized complex':
            status = 'Uncharacterized'
        elif label == 'Unclustered':
            status = 'Unclustered'
        elif label == 'Large heterogeneous complex':
            status = 'Large heterogeneous complex'
        elif label_lc in _labels_as_text(own_labels).lower():
            status = 'Known'
        elif 'ribosom' in label_lc or 'spliceosom' in label_lc:
            status = 'Known'
        else:
            status = 'Novel association'
        status_lst.append(status)
    return cmplx_lst, status_lst

## Get complex labels based on melting method

In [156]:
# df = pd.read_csv(file)
# print(df.columns.values)
# keep_cols = ['ID', 'cut_199', 'cut_318', 'cut_398', 'cut_437', 'cut_557',
#        'cut_676', 'cut_796', 'cut_1202', 'cut_1608', 'cut_2014', 'old_cmplx_assignment', 'go_cmplx_name',
#        'corum_cmplx_name', 'cp_cmplx_name', 'human_gene_names_primary',
#        'human_gene_names_synonym', 'human_protein_names']
# df = df[keep_cols]
# print(df.columns.values)
# df_out = df.copy()
# cuts = ['cut_557', 'cut_676', 'cut_796', 'cut_1202', 'cut_1608', 'cut_2014']
# for cut_col in cuts:
#     counts = get_cluster_counts(df, cut_col)
#     top_labels = get_melted_label_counts(df, cut_col)
#     label_dict = make_label_dict(top_labels, counts, cut_col)
#     labels = get_cluster_labels(df, label_dict, cut_col)
#     df_out[f'{cut_col}_algo_label'] = labels
# #df_out.to_csv('../ppi_ml/results/walktrap/LinearSVC_100feats_fdr10_4steps_nochloro_algo_labels_091323.csv', index=False)
# df_out

['ID' 'cut_199' 'cut_318' 'cut_398' 'cut_437' 'cut_557' 'cut_676'
 'cut_796' 'cut_1202' 'cut_1608' 'cut_2014' 'amor_11' 'exca_2' 'tsar_5'
 'viri_13' 'old_status' 'old_cmplx_assignment' 'go_gene_name'
 'go_cmplx_name' 'corum_cmplx_name' 'cp_cmplx_name' 'corum_cmplx_synonym'
 'cp_cmplx_synonym' 'go_up_ids' 'corum_up_ids' 'cp_up_ids'
 'corum_cmplx_members' 'corum_cmplx_comment' 'corum_cmplx_disease'
 'cp_cmplx_members' 'cp_cmplx_assembly' 'human_entry'
 'human_gene_names_primary' 'human_gene_names_synonym'
 'human_protein_names' 'human_length' 'human_function_cc'
 'human_annotscore_1to5' 'human_subcellular_location_cc' 'arath_entry'
 'arath_gene_names_primary' 'arath_protein_names' 'arath_function_cc'
 'arath_subcellular_location_cc']
['ID' 'cut_199' 'cut_318' 'cut_398' 'cut_437' 'cut_557' 'cut_676'
 'cut_796' 'cut_1202' 'cut_1608' 'cut_2014' 'old_cmplx_assignment'
 'go_cmplx_name' 'corum_cmplx_name' 'cp_cmplx_name'
 'human_gene_names_primary' 'human_gene_names_synonym'
 'human_protein_na

Unnamed: 0,ID,cut_199,cut_318,cut_398,cut_437,cut_557,cut_676,cut_796,cut_1202,cut_1608,...,cp_cmplx_name,human_gene_names_primary,human_gene_names_synonym,human_protein_names,cut_557_algo_label,cut_676_algo_label,cut_796_algo_label,cut_1202_algo_label,cut_1608_algo_label,cut_2014_algo_label
0,KOG0357,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,"CCT5, BBS10","CCTE KIAA0098, C12orf58",T-complex protein 1 subunit epsilon (TCP-1-eps...,Large heterogeneous complex,26S Proteasome complex,26S Proteasome complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex
1,KOG0358,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,CCT4,CCTD SRB,T-complex protein 1 subunit delta (TCP-1-delta...,Large heterogeneous complex,26S Proteasome complex,26S Proteasome complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex
2,KOG0359,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,"CCT6A, CCT6B","CCT6 CCTZ, NA",T-complex protein 1 subunit zeta (TCP-1-zeta) ...,Large heterogeneous complex,26S Proteasome complex,26S Proteasome complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex
3,KOG0360,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,"TCP1, BBS10, MKKS","CCT1 CCTA, C12orf58, BBS6",T-complex protein 1 subunit alpha (TCP-1-alpha...,Large heterogeneous complex,26S Proteasome complex,26S Proteasome complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex
4,KOG0361,0,0,0,0,0,0,0,0,0,...,Chaperonin-containing T-complex,CCT7,CCTH NIP7-1,T-complex protein 1 subunit eta (TCP-1-eta) (C...,Large heterogeneous complex,26S Proteasome complex,26S Proteasome complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex,Chaperonin-containing T-complex
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,KOG3210,194,301,380,419,539,657,777,1183,1589,...,SNO2-SNZ1 pyridoxal 5'-phosphate synthase comp...,,,,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered
3189,ENOG502QPN1,195,303,382,421,541,659,779,1185,1591,...,,,,,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered
3190,KOG1379,196,307,386,425,545,663,783,1189,1595,...,,PPTC7,TAPP2C,Protein phosphatase PTC7 homolog (EC 3.1.3.16)...,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered
3191,KOG4067,197,313,393,432,552,671,791,1197,1603,...,,HIKESHI,C11orf73,Protein Hikeshi,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered


## Get complex labels based on exploding method

In [419]:
# Load the cluster table and keep only the columns needed for labelling.
df = pd.read_csv(file)
keep_cols = ['ID', 'cut_199', 'cut_318', 'cut_398', 'cut_437', 'cut_557',
       'cut_676', 'cut_796', 'cut_1202', 'cut_1608', 'cut_2014', 'old_cmplx_assignment', 'go_cmplx_name',
       'corum_cmplx_name', 'cp_cmplx_name', 'human_gene_names_primary',
       'human_gene_names_synonym', 'human_protein_names']
# .copy() so the column assignments below act on an independent frame rather
# than on a selection of the frame returned by read_csv.
df = df[keep_cols].copy()

# The combined database labels do not depend on the cut, so build them once
# instead of once per loop iteration. Everything below reads and writes df_out
# only, so it does not matter whether get_combined_labels hands back the frame
# it was given or a new one.
df_out = get_combined_labels(df)

# Label every cluster of every cut of interest.
cuts = ['cut_557', 'cut_676', 'cut_796', 'cut_1202', 'cut_1608', 'cut_2014']
for cut_col in cuts:
    counts = get_cluster_counts(df_out, cut_col)
    top_labels = get_exploded_label_counts(df_out, counts, cut_col)
    label_dict = make_label_dict(top_labels, counts, cut_col)
    labels, statuses = get_cluster_labels(df_out, label_dict, cut_col)
    df_out[f'{cut_col}_algo_label'] = labels
    df_out[f'{cut_col}_algo_status'] = statuses


cut_choice = 'cut_1202'
alt_cuts = ['cut_1608', 'cut_2014']

# replace large heterogeneous labels where possible:
# start from the chosen cut, then fall back to each alternative cut in turn for
# rows still carrying the placeholder.
label2replace = 'Large heterogeneous complex'
df_out['final_label'] = df_out[f'{cut_choice}_algo_label']
df_out['final_status'] = df_out[f'{cut_choice}_algo_status']

for alt_cut in alt_cuts:
    df_out['final_label'] = np.where(df_out['final_label'] == label2replace,
                                     df_out[f'{alt_cut}_algo_label'],
                                     df_out['final_label'])
    df_out['final_status'] = np.where(df_out['final_status'] == label2replace,
                                      df_out[f'{alt_cut}_algo_status'],
                                      df_out['final_status'])

# One integer id per (final_label, cluster-at-cut_choice) pair, appended to the
# label to give each cluster a distinct name.
# NOTE(review): rows relabelled from an alternative cut are still grouped by
# their cut_choice cluster id here -- confirm that is intended.
comb_cols = ['final_label', 'grp_idx']
df_out['grp_idx'] = df_out.groupby(['final_label', cut_choice]).ngroup()
df_out['granulated_cmplx_name'] = df_out[comb_cols].astype(str).agg('_'.join, axis=1)
df_out

Unnamed: 0,ID,cut_199,cut_318,cut_398,cut_437,cut_557,cut_676,cut_796,cut_1202,cut_1608,...,cut_1202_algo_label,cut_1202_algo_status,cut_1608_algo_label,cut_1608_algo_status,cut_2014_algo_label,cut_2014_algo_status,final_label,final_status,grp_idx,granulated_cmplx_name
0,KOG0357,0,0,0,0,0,0,0,0,0,...,Cct complex,Known,Cct complex,Known,Cct complex,Known,Cct complex,Known,25,Cct complex_25
1,KOG0358,0,0,0,0,0,0,0,0,0,...,Cct complex,Known,Cct complex,Known,Cct complex,Known,Cct complex,Known,25,Cct complex_25
2,KOG0359,0,0,0,0,0,0,0,0,0,...,Cct complex,Known,Cct complex,Known,Cct complex,Known,Cct complex,Known,25,Cct complex_25
3,KOG0360,0,0,0,0,0,0,0,0,0,...,Cct complex,Known,Cct complex,Known,Cct complex,Known,Cct complex,Known,25,Cct complex_25
4,KOG0361,0,0,0,0,0,0,0,0,0,...,Cct complex,Known,Cct complex,Known,Cct complex,Known,Cct complex,Known,25,Cct complex_25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,KOG3210,194,301,380,419,539,657,777,1183,1589,...,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,1218,Unclustered_1218
3189,ENOG502QPN1,195,303,382,421,541,659,779,1185,1591,...,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,1220,Unclustered_1220
3190,KOG1379,196,307,386,425,545,663,783,1189,1595,...,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,1224,Unclustered_1224
3191,KOG4067,197,313,393,432,552,671,791,1197,1603,...,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,Unclustered,1232,Unclustered_1232


In [420]:
# Write the labelled table (every column of df_out, without the index) to CSV.
df_out.to_csv('../ppi_ml/results/walktrap/LinearSVC_100feats_fdr10_4steps_nochloro_dynamic_algo_labels_092623.csv', index=False)

In [426]:
# Write the distinct granulated complex names, one per line.
# Sorted so the file content is identical from run to run; iterating a bare
# set gives an arbitrary order.
cmplx_list = sorted(set(df_out['granulated_cmplx_name'].to_list()))
cmplx_out = '../ppi_ml/figures/cplot_cmplx_labels/cmplx_list.txt'
with open(cmplx_out, 'w') as f:
    f.write("\n".join(map(str, cmplx_list)))

In [421]:
# Number of distinct labels produced at each listed cut and in the merged final_label column.
cols = ['cut_796_algo_label','cut_1202_algo_label','cut_1608_algo_label','cut_2014_algo_label', 'final_label']
df_out[cols].nunique()

cut_796_algo_label      84
cut_1202_algo_label    110
cut_1608_algo_label    119
cut_2014_algo_label     96
final_label            137
dtype: int64

In [422]:
# Spot check: show the labelled rows for a hand-picked set of IDs.
test_list = ['ENOG502S2J2','KOG0142','KOG0257','KOG0409','KOG0452','KOG0702','KOG0813','KOG0910']
df_out[df_out['ID'].isin(test_list)]

Unnamed: 0,ID,cut_199,cut_318,cut_398,cut_437,cut_557,cut_676,cut_796,cut_1202,cut_1608,...,cut_1202_algo_label,cut_1202_algo_status,cut_1608_algo_label,cut_1608_algo_status,cut_2014_algo_label,cut_2014_algo_status,final_label,final_status,grp_idx,granulated_cmplx_name
2499,ENOG502S2J2,3,3,4,6,7,9,11,15,18,...,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,198,Uncharacterized complex_198
2500,KOG0142,3,3,4,6,7,9,11,15,18,...,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,198,Uncharacterized complex_198
2501,KOG0257,3,3,4,6,7,9,11,15,18,...,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,198,Uncharacterized complex_198
2502,KOG0409,3,3,4,6,7,9,11,15,18,...,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,198,Uncharacterized complex_198
2503,KOG0452,3,3,4,6,7,9,11,15,18,...,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,198,Uncharacterized complex_198
2504,KOG0702,3,3,4,6,7,9,11,15,18,...,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,198,Uncharacterized complex_198
2505,KOG0813,3,3,4,6,7,9,11,15,18,...,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,198,Uncharacterized complex_198
2506,KOG0910,3,3,4,6,7,9,11,15,18,...,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,Uncharacterized complex,Uncharacterized,198,Uncharacterized complex_198


In [423]:
# Spot check: show the labelled rows for a second hand-picked set of IDs.
ribo_test_list = ['KOG0600','KOG0279','KOG0330','KOG0378','KOG0397','KOG0407','KOG0463']
df_out[df_out['ID'].isin(ribo_test_list)]

Unnamed: 0,ID,cut_199,cut_318,cut_398,cut_437,cut_557,cut_676,cut_796,cut_1202,cut_1608,...,cut_1202_algo_label,cut_1202_algo_status,cut_1608_algo_label,cut_1608_algo_status,cut_2014_algo_label,cut_2014_algo_status,final_label,final_status,grp_idx,granulated_cmplx_name
1031,KOG0600,0,0,1,1,1,1,10,14,17,...,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Large heterogeneous complex,Unclustered,Unclustered,Unclustered,Unclustered,289,Unclustered_289
1032,KOG0279,0,0,1,1,1,1,10,14,33,...,Large heterogeneous complex,Large heterogeneous complex,Ribosome,Known,Ribosome,Known,Ribosome,Known,153,Ribosome_153
1033,KOG0330,0,0,1,1,1,1,10,14,33,...,Large heterogeneous complex,Large heterogeneous complex,Ribosome,Known,Ribosome,Known,Ribosome,Known,153,Ribosome_153
1034,KOG0378,0,0,1,1,1,1,10,14,33,...,Large heterogeneous complex,Large heterogeneous complex,Ribosome,Known,Ribosome,Known,Ribosome,Known,153,Ribosome_153
1035,KOG0397,0,0,1,1,1,1,10,14,33,...,Large heterogeneous complex,Large heterogeneous complex,Ribosome,Known,Ribosome,Known,Ribosome,Known,153,Ribosome_153
1036,KOG0407,0,0,1,1,1,1,10,14,33,...,Large heterogeneous complex,Large heterogeneous complex,Ribosome,Known,Ribosome,Known,Ribosome,Known,153,Ribosome_153
1037,KOG0463,0,0,1,1,1,1,10,14,33,...,Large heterogeneous complex,Large heterogeneous complex,Ribosome,Known,Ribosome,Known,Ribosome,Known,153,Ribosome_153


In [417]:
# Number of rows whose final_label is still the 'Large heterogeneous complex' placeholder.
len(df_out[df_out['final_label']=='Large heterogeneous complex'])

201

In [235]:
# Display df in its current state.
# NOTE(review): get_combined_labels writes 'all_db_labels' and 'value' into the
# frame it is given, so derived columns appear here alongside the loaded ones.
df

Unnamed: 0,ID,cut_199,cut_318,cut_398,cut_437,cut_557,cut_676,cut_796,cut_1202,cut_1608,cut_2014,old_cmplx_assignment,go_cmplx_name,corum_cmplx_name,cp_cmplx_name,human_gene_names_primary,human_gene_names_synonym,human_protein_names,all_db_labels,value
0,KOG0357,0,0,0,0,0,0,0,0,0,0,CCT complex,chaperonin-containing T-complex,"CCT complex, BBS-chaperonin complex, CCT compl...",Chaperonin-containing T-complex,"CCT5, BBS10","CCTE KIAA0098, C12orf58",T-complex protein 1 subunit epsilon (TCP-1-eps...,"Chaperonin-containing T-complex, chaperonin-co...","[Chaperonin-containing t-complex, Chaperonin-c..."
1,KOG0358,0,0,0,0,0,0,0,0,0,0,CCT complex,"chaperonin-containing T-complex, zona pellucid...","CCT complex, BBS-chaperonin complex, CCT compl...",Chaperonin-containing T-complex,CCT4,CCTD SRB,T-complex protein 1 subunit delta (TCP-1-delta...,"Chaperonin-containing T-complex, chaperonin-co...","[Chaperonin-containing t-complex, Chaperonin-c..."
2,KOG0359,0,0,0,0,0,0,0,0,0,0,CCT complex,chaperonin-containing T-complex,"CCT complex, CCT complex, testis specific",Chaperonin-containing T-complex,"CCT6A, CCT6B","CCT6 CCTZ, NA",T-complex protein 1 subunit zeta (TCP-1-zeta) ...,"Chaperonin-containing T-complex, chaperonin-co...","[Chaperonin-containing t-complex, Chaperonin-c..."
3,KOG0360,0,0,0,0,0,0,0,0,0,0,CCT complex,"chaperonin-containing T-complex, zona pellucid...","CCT complex, CCT complex, testis specific, CCT...",Chaperonin-containing T-complex,"TCP1, BBS10, MKKS","CCT1 CCTA, C12orf58, BBS6",T-complex protein 1 subunit alpha (TCP-1-alpha...,"Chaperonin-containing T-complex, chaperonin-co...","[Chaperonin-containing t-complex, Chaperonin-c..."
4,KOG0361,0,0,0,0,0,0,0,0,0,0,CCT complex,chaperonin-containing T-complex,"CCT complex, CCT complex, testis specific, CCT...",Chaperonin-containing T-complex,CCT7,CCTH NIP7-1,T-complex protein 1 subunit eta (TCP-1-eta) (C...,"Chaperonin-containing T-complex, chaperonin-co...","[Chaperonin-containing t-complex, Chaperonin-c..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,KOG3210,194,301,380,419,539,657,777,1183,1589,1995,,,,SNO2-SNZ1 pyridoxal 5'-phosphate synthase comp...,,,,SNO2-SNZ1 pyridoxal 5'-phosphate synthase comp...,[Sno2-snz1 pyridoxal 5'-phosphate synthase com...
3189,ENOG502QPN1,195,303,382,421,541,659,779,1185,1591,1997,,,,,,,,,
3190,KOG1379,196,307,386,425,545,663,783,1189,1595,2001,Uncharacterized mystery complex 9,,,,PPTC7,TAPP2C,Protein phosphatase PTC7 homolog (EC 3.1.3.16)...,,
3191,KOG4067,197,313,393,432,552,671,791,1197,1603,2009,Prune complex,,,,HIKESHI,C11orf73,Protein Hikeshi,,


In [279]:
# Show five example rows of the cluster-id and database-name columns.
gpt_cols = ['cut_1608','cut_2014','old_cmplx_assignment','go_cmplx_name','corum_cmplx_name','cp_cmplx_name']
# Fixed seed so the same five rows are drawn on every run of the notebook.
df_out[gpt_cols].sample(n=5, random_state=0).reset_index(drop=True)

Unnamed: 0,cut_1608,cut_2014,old_cmplx_assignment,go_cmplx_name,corum_cmplx_name,cp_cmplx_name
0,1559,1965,,,,
1,210,288,BBSome complex,BBSome,"BBSome, BBSome core complex",BBSome complex
2,952,1315,,,,
3,284,408,Exocyst complex,exocyst,"Exocyst Sec6/8 complex, Exocyst complex","Exocyst, Exocyst, EXOC6 variant, Exocyst, EXOC..."
4,523,749,LAS1 RNA processome complex,"ESC/E(Z) complex, ribonucleoprotein complex, c...","Polycomb repressive complex 2 (PRC 2), EED-EZH...",
