In [6]:
import pandas as pd
import numpy as np

In [7]:
# Folders:
# Every folder path ends with '/' because the file paths below are built by
# plain string concatenation (folder + file name).
folder_datafiles = '/mbshome/nvelthuijs/Cofactors/20200402_Transcriptionregulationlist/Datafiles/'
folder_corrected_names = '/mbshome/nvelthuijs/Cofactors/20200402_Transcriptionregulationlist/Datafiles_corrected_names/'
folder_output = '/mbshome/nvelthuijs/Cofactors/20200402_Transcriptionregulationlist/Output_files/'

# Files:
# Selected GO-terms, one file per aspect (bp / mf / cc); each line is read as one GO-term.
selected_bp_file = folder_datafiles + 'selected_GO_terms_bp.txt'
selected_mf_file = folder_datafiles + 'selected_GO_terms_mf.txt'
selected_cc_file = folder_datafiles + 'selected_GO_terms_cc.txt'
# Tab-separated annotation table; it is read with the first 38 rows skipped.
annotations_file = folder_corrected_names + 'goa_human_corrected_names.txt'
# Coregulator list; each line is read as one coregulator name.
coregulators_file = folder_datafiles + 'gocofs.txt'
# Full GO file; its 'id:', 'name:' and 'namespace:' lines are parsed.
go_full_file = folder_datafiles + 'go_full.txt'
# Regulator lists (one per bait set) that receive the score columns.
coregulators_list1 = folder_output + '06_Transcription_regulatorlist_baitset1_without_scores.csv'
coregulators_list2 = folder_output + '07_Transcription_regulatorlist_baitset2_without_scores.csv'
coregulators_list3 = folder_output + '08_Transcription_regulatorlist_baitset3_without_scores.csv'

In [8]:
# Calculate weight of selected GO-terms:

# Create set of all selected GO terms.
# Blank lines are skipped so that an empty string is never treated as a term.
selected_terms = set()
for selected_file in [selected_bp_file, selected_mf_file, selected_cc_file]:
    with open(selected_file) as selected_handle:
        for line in selected_handle:
            stripped = line.strip()
            if stripped:
                selected_terms.add(stripped)
print('Number of selected GO-terms:', len(selected_terms))

# Create set of coregulators.
# Blank lines are skipped here as well: an empty entry would inflate
# coregs_number and thereby lower every weight computed below.
coregs = set()
with open(coregulators_file) as coregulators:
    for line in coregulators:
        stripped = line.strip()
        if stripped:
            coregs.add(stripped)
coregs_number = len(coregs)
print('Number of coregulators:', coregs_number)

# Read file with annotations into dataframe,
# select rows with a selected annotation
# to a coregulator (each protein/term pair is kept once):
with open(annotations_file) as annotations:
    annotations_df = pd.read_table(annotations, sep = '\t', skiprows = 38, header = None, usecols = [2, 4])
annotations_df.columns = ['Protein', 'GO-term']
is_coreg = annotations_df['Protein'].isin(coregs)
is_selected = annotations_df['GO-term'].isin(selected_terms)
annotations_selected_coregs_df = annotations_df.loc[is_coreg & is_selected].drop_duplicates().copy()

# Create dictionary where every selected GO-term
# is a key, which value is a tuple of
# (number of times used on a coregulator, weight).
# The usage counts are tallied in a single pass instead of scanning the
# whole annotation list once per term.
# weight = 0.5 + count / (2 * number of coregulators), so a term that is never
# used gets 0.5 and a term used on every coregulator gets 1.0.
terms_annotations = list(annotations_selected_coregs_df['GO-term'])
term_usage = annotations_selected_coregs_df['GO-term'].value_counts().to_dict()
counts_dict = {}
for term in selected_terms:
    count = int(term_usage.get(term, 0))
    counts_dict[term] = (count, count / (coregs_number * 2) + 0.5)

# Create dictionary where every GO-term is a key,
# which value is a tuple of (term name, namespace).
# An entry is stored when the 'namespace:' line is reached, using the most
# recently seen 'id:' and 'name:' lines; parsing stops at the first [Typedef].
term_definitions = {}
with open(go_full_file) as go_full:
    for line in go_full:
        if line.startswith('id:'):
            last_id = line.strip()
        elif line.startswith('name:'):
            last_name = line.strip()
        elif line.startswith('namespace:'):
            last_namespace = line.strip()
            # The slices drop the 'id: ', 'name: ' and 'namespace: ' prefixes.
            term_definitions[last_id[4:]] = (last_name[6:], last_namespace[11:])
        elif line.startswith('[Typedef]'):
            break

# Write weight of selected terms to a file and, in the same pass,
# create sets of selected GO-terms for each aspect.
# Terms that are missing from the full GO file are written with 'ERROR' as
# aspect and definition, are reported below, and end up in none of the sets.
selected_cc = set()
selected_mf = set()
selected_bp = set()
aspect_sets = {
    'biological_process': selected_bp,
    'cellular_component': selected_cc,
    'molecular_function': selected_mf,
}

print('Missed terms in full GO:')

with open(folder_output + '09_Selected_terms_weight.tab', 'w') as outfile:
    for term, (count, weight) in counts_dict.items():
        if term in term_definitions:
            definition, aspect = term_definitions[term]
        else:
            aspect = 'ERROR'
            definition = 'ERROR'
            print('\t', term, count)
        newline = '\t'.join([term, str(count), str(weight), aspect, definition])
        outfile.write(newline + '\n')
        if aspect in aspect_sets:
            aspect_sets[aspect].add(term)

Number of selected GO-terms: 478
Number of coregulators: 440
Missed terms in full GO:
	 GO:0001158 0
	 GO:0090568 0
	 GO:0044798 0
	 GO:0001012 0
	 GO:0035389 0
	 GO:0030702 0
	 GO:1902368 0
	 GO:0035390 0
	 GO:0000980 0
	 GO:0044213 0
	 GO:1990141 0
	 GO:0070870 0
	 GO:1904497 0
	 GO:0001047 0
	 GO:0099114 0
	 GO:0070869 0
	 GO:0070924 0
	 GO:0006344 0
	 GO:0044212 0
	 GO:0035326 0
	 GO:1990152 0
	 GO:0006343 0


In [9]:
def select_branch(branch, df):
    """
    Return the annotations of a single GO branch.

    branch: value to match in the 'Branch' column ('F', 'C' or 'P').
    df: GO dataframe that has a 'Branch' column.

    The result is a new dataframe holding only the matching rows, without
    the 'Branch' column and with a fresh 0..n-1 index; df is not modified.
    """
    in_branch = df['Branch'] == branch
    branch_rows = df.loc[in_branch].drop(columns = ['Branch'])
    return branch_rows.copy().reset_index(drop = True)

def go_dict(df):
    """
    Build a dictionary that maps each UniProt ID to the set of GO-terms
    annotated to it.

    df: dataframe whose columns are, by position, UniProt ID, gene symbol
        and GO-term. Only the first and third column are used.

    Returns a dict {UniProt ID: set of GO-terms}; an empty dataframe gives
    an empty dict.
    """
    out_dict = {}
    # The columns are addressed explicitly by position with .iloc: integer
    # keys on a Series that has string labels (row[0], row[1], ...) rely on
    # a deprecated positional fallback in pandas.
    for uniprot_id, term in zip(df.iloc[:, 0], df.iloc[:, 2]):
        out_dict.setdefault(uniprot_id, set()).add(term)
    return out_dict

# Load the annotation table, keeping four columns of the file and naming
# them after their content (columns keep the file's order: 1, 2, 4, 8).
annotation_columns = {1: 'UniProtID',
                      2: 'Gene Symbol',
                      4: 'GO-term',
                      8: 'Branch'}
go_df = pd.read_table(annotations_file,
                      skiprows = 38,
                      header = None,
                      usecols = [1, 2, 8, 4])
go_df = go_df.rename(columns = annotation_columns)

# One annotation table per branch letter ...
go_cc = select_branch('C', go_df)
go_mf = select_branch('F', go_df)
go_bp = select_branch('P', go_df)

# ... and per branch a dictionary {UniProt ID: set of GO-terms}.
cc_dict = go_dict(go_cc)
mf_dict = go_dict(go_mf)
bp_dict = go_dict(go_bp)

In [10]:
def go_terms(id_list, go_dict):
    """
    Look up the GO-terms for a list of UniProt IDs.

    id_list: the UniProt ID column (as a list).
    go_dict: dictionary translating IDs to their corresponding GO-terms.

    Returns a list, in the order of id_list, that can be used as a new
    dataframe column: the dictionary value for every known ID and np.nan
    for every ID that is not in the dictionary.
    """
    terms_column = []
    for uniprot_id in id_list:
        terms_column.append(go_dict.get(uniprot_id, np.nan))
    return terms_column

def go_terms_hits(all_terms_column, go_interesting):
    """
    Reduce every set of GO-terms to the terms that are of interest.

    all_terms_column: column whose entries are sets of GO-terms; entries
        that are not a set (such as np.nan) are treated as "no terms".
    go_interesting: collection of the interesting GO-terms.

    Returns a list that can be used as a new column: for each entry the set
    of its terms that are in go_interesting, or np.nan when no term matches.
    """
    hits_column = []
    for annotations in list(all_terms_column):
        if isinstance(annotations, set):
            matched = {term for term in annotations if term in go_interesting}
        else:
            matched = set()
        # An empty result is stored as np.nan rather than as an empty set.
        hits_column.append(matched if matched else np.nan)
    return hits_column

def interesting_go_counts(df, go_interesting, branch):
    """
    Count, per row, the selected and the total GO annotations of one branch.

    df: dataframe with a '<branch> terms' column whose entries are sets of
        GO-terms; entries that are not a set (such as np.nan) count as zero
        annotations.
    go_interesting: collection of the selected GO-terms of this branch.
    branch: 'MF', 'CC' or 'BP'.

    Returns a new dataframe: df with two columns appended,
    '<branch> selected terms number' (terms of the row that are in
    go_interesting) and '<branch> total annotations' (size of the row's set).
    df itself is not modified.
    """
    hits_col = []
    tot_ann_col = []
    # Iterate over the dataframe that was passed in, not over a global.
    for gos in df['{} terms'.format(branch)]:
        if isinstance(gos, set):
            hits_col.append(sum(1 for term in gos if term in go_interesting))
            tot_ann_col.append(len(gos))
        else:
            hits_col.append(0)
            tot_ann_col.append(0)
    # Reuse df's index so that the column-wise concat lines the counts up
    # with their rows even when df does not have a default 0..n-1 index.
    outdf = pd.DataFrame({
        '{} selected terms number'.format(branch) : hits_col,
        '{} total annotations'.format(branch) : tot_ann_col
    }, index = df.index)
    return pd.concat([df, outdf], axis = 1)

def domain_scores_col(df, weightdict, branch):
    """
    Add the GO score and the GO score penalty of one branch to df.

    df: dataframe with the columns '<branch> selected terms' (a set of
        terms, or something that is not a set when there are none) and
        '<branch> total annotations'.
    weightdict: dictionary {term: (count, weight)}; only the weight is used.
    branch: prefix of the column names, e.g. 'CC', 'BP' or 'MF'.

    For a row with selected terms the score is the sum of their weights,
    limited to the five highest weights, and the penalty is 0. For a row
    without selected terms the score is 0 and the penalty is minus the
    total number of annotations, limited to -5.

    The columns '<branch> GO score' and '<branch> GO score penalty' are
    written into df itself, which is also returned.
    """
    selected_key = '{} selected terms'.format(branch)
    total_key = '{} total annotations'.format(branch)
    scores = []
    penalties = []
    for _, record in df.iterrows():
        selected = record[selected_key]
        n_annotations = record[total_key]
        if not isinstance(selected, set):
            # No selected terms: no score, penalty capped at -5.
            scores.append(0)
            penalties.append(-5 if n_annotations > 5 else -n_annotations)
            continue
        weights = [weightdict[term][1] for term in selected]
        if len(weights) > 5:
            # Only the five heaviest terms contribute to the score.
            weights.sort(reverse=True)
            weights = weights[:5]
        scores.append(sum(weights))
        penalties.append(0)
    df['{} GO score'.format(branch)] = scores
    df['{} GO score penalty'.format(branch)] = penalties
    return df

def domainsums(df):
    """
    Combine the per-branch GO scores and penalties into totals.

    df: dataframe with the columns '<branch> GO score' and
        '<branch> GO score penalty' for the branches CC, BP and MF.

    Three columns are written into df itself, which is also returned:
    'Total GO score' (sum of the three scores), 'Total GO score penalty'
    (sum of the three penalties) and 'GO score' (both totals added up and
    divided by 3).
    """
    branches = ['CC', 'BP', 'MF']
    score_keys = ['{} GO score'.format(branch) for branch in branches]
    penalty_keys = ['{} GO score penalty'.format(branch) for branch in branches]

    score_totals = []
    penalty_totals = []
    combined = []
    for _, record in df.iterrows():
        score_total = sum(record[key] for key in score_keys)
        penalty_total = sum(record[key] for key in penalty_keys)
        score_totals.append(score_total)
        penalty_totals.append(penalty_total)
        combined.append((score_total + penalty_total) / 3)

    df['Total GO score'] = score_totals
    df['Total GO score penalty'] = penalty_totals
    df['GO score'] = combined
    return df

def mine_score(df):
    """
    Add the 'Mine score' column to df.

    The score is the sum of five parts:
      - 'ComplexID_huMAP' is not null (counted as 1, otherwise 0)
      - 'ComplexID_CORUM' is not null (counted as 1, otherwise 0)
      - the value of 'Interaction_TF_BioGRID'
      - the value of 'Interaction_TF_IntACT'
      - the element-wise OR of 'In_bait_crems', 'In_bait_snfs',
        'In_bait_nursa', 'In_bait_gocofs' and 'In_NVS'

    The column is written into df itself, which is also returned.
    """
    parts = [
        df['ComplexID_huMAP'].notnull().astype('int'),
        df['ComplexID_CORUM'].notnull().astype('int'),
        df['Interaction_TF_BioGRID'],
        df['Interaction_TF_IntACT'],
    ]

    # Being part of any of the bait sets counts once.
    any_bait = df['In_bait_crems']
    for bait_column in ['In_bait_snfs', 'In_bait_nursa', 'In_bait_gocofs', 'In_NVS']:
        any_bait = any_bait | df[bait_column]
    parts.append(any_bait)

    total = parts[0]
    for part in parts[1:]:
        total = total + part

    df['Mine score'] = total
    return df

# Add the GO scores and the Mine score to each regulator list and write the
# result to the output folder under the matching '..._with_scores.csv' name.
scoring_jobs = [
    (coregulators_list1, '10_Transcription_regulatorlist_baitset1_with_scores.csv'),
    (coregulators_list2, '11_Transcription_regulatorlist_baitset2_with_scores.csv'),
    (coregulators_list3, '12_Transcription_regulatorlist_baitset3_with_scores.csv'),
]

# The branches are always handled in this order, which fixes the order of
# the columns in the output files.
branches = ['CC', 'BP', 'MF']
annotations_by_branch = {'CC': cc_dict, 'BP': bp_dict, 'MF': mf_dict}
selected_by_branch = {'CC': selected_cc, 'BP': selected_bp, 'MF': selected_mf}

for coregulator_list_file, name in scoring_jobs:
    with open(coregulator_list_file) as coregulatorlist:
        list_df = pd.read_csv(coregulatorlist)

    # All GO-terms per protein, per branch:
    unips = list(list_df['UniProt ID'])
    for branch in branches:
        list_df['{} terms'.format(branch)] = go_terms(unips, annotations_by_branch[branch])

    # The subset of those terms that was selected:
    for branch in branches:
        list_df['{} selected terms'.format(branch)] = go_terms_hits(list_df['{} terms'.format(branch)],
                                                                    selected_by_branch[branch])

    # Number of selected terms and of all annotations:
    for branch in branches:
        list_df = interesting_go_counts(list_df, selected_by_branch[branch], branch)

    # Score and penalty per branch, then the totals over the branches:
    for branch in branches:
        list_df = domain_scores_col(list_df, counts_dict, branch)
    list_df = domainsums(list_df)

    list_df = mine_score(list_df)

    list_df['Final score'] = list_df['GO score'] + list_df['Mine score']

    list_df.to_csv(folder_output + name, index = False)

print('DONE!')

DONE!
