In [2]:
import pandas as pd
import numpy as np

In [1]:
def should_drop(seq, min_len, max_len):
    """Decide whether a protein sequence should be discarded.

    A sequence is dropped when it is not a string (for example the NaN
    pandas stores for a missing cell), when its length is below ``min_len``
    or above ``max_len``, or when it contains any character other than the
    20 upper-case letters listed in ``aa``.

    Parameters
    ----------
    seq : str
        Candidate sequence. Any non-string value is treated as invalid.
    min_len, max_len : int
        Inclusive bounds on the accepted sequence length.

    Returns
    -------
    bool
        True if the sequence should be dropped, False if it should be kept.
    """
    aa = "AGVILFPYMTSHNQWRKDEC"

    # Missing cells arrive as float NaN / None; len() on those would raise,
    # so treat anything that is not a string as a bad sequence.
    if not isinstance(seq, str):
        return True

    if len(seq) < min_len or len(seq) > max_len:
        return True

    # Every distinct character must belong to the accepted alphabet.
    return not set(seq).issubset(aa)

In [None]:
def drop_bad_seqs(df, min_len, max_len):
    """Return a copy of ``df`` without the rows whose sequence is rejected.

    Each value of the ``"Sequence"`` column is passed to ``should_drop``
    together with ``min_len`` and ``max_len``; rows for which it returns
    True are removed.

    The input frame is left untouched: the flags are kept in a standalone
    mask instead of being written into a temporary ``"should_drop"`` column
    on the caller's DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"Sequence"`` column.
    min_len, max_len : int
        Length bounds forwarded to ``should_drop``.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the rows that are kept.
    """
    drop_mask = df["Sequence"].apply(should_drop, args=(min_len, max_len))
    # astype(bool) keeps the negation well-defined even for an empty frame,
    # where apply() may return a non-boolean dtype.
    keep_mask = ~drop_mask.astype(bool)
    return df.loc[keep_mask].copy()

In [None]:
def clean_df(path,has_interactors=True, excel=True, min_len=31, max_len=5000):
    """Load a protein table from ``path`` and clean it.

    Steps:
      1. Read the file with ``pd.read_excel`` (``excel=True``) or as a
         tab-separated file with ``pd.read_csv``.
      2. Use the ``"Entry"`` column as the index.
      3. Remove rows rejected by ``drop_bad_seqs`` for the given length
         bounds.
      4. Replace remaining missing values with the empty string.
      5. If ``has_interactors`` is true, split each ``"Interacts with"``
         cell on ``";"`` into a list of stripped identifiers.

    Parameters
    ----------
    path : path-like or file-like
        Source accepted by ``pd.read_excel`` / ``pd.read_csv``.
    has_interactors : bool, default True
        Whether the table has an ``"Interacts with"`` column to parse.
    excel : bool, default True
        Read as Excel when true, otherwise as tab-separated text.
    min_len, max_len : int
        Sequence length bounds forwarded to ``drop_bad_seqs``.

    Returns
    -------
    pandas.DataFrame
        Cleaned table indexed by ``"Entry"``.
    """
    def split_interactors(cell):
        # An empty cell (filled with "" above) must become an empty list.
        # "".split(";") yields [""], which would otherwise register a
        # phantom interactor whose identifier is the empty string.
        tokens = (part.strip() for part in cell.split(";"))
        return [token for token in tokens if token]

    if excel:
        df = pd.read_excel(path)
    else:
        df = pd.read_csv(path,sep='\t')
    df = df.set_index("Entry")

    df = drop_bad_seqs(df,min_len,max_len)
    df = df.fillna("")

    if has_interactors:
        df["Interacts with"] = df["Interacts with"].apply(split_interactors)

    return df

In [None]:
def get_interacting_pairs(df):
    """Build a table of interacting and randomly sampled non-interacting pairs.

    Every index label of ``df`` is paired with each entry of its
    ``"Interacts with"`` list to form the positive set (stored as
    ``frozenset`` objects, so order does not matter and a self-interaction
    collapses to a one-element set). Negative pairs are then drawn at random
    with ``np.random.choice`` from all proteins seen, rejecting any pair
    that is in the positive set, until there are as many negatives as
    positives.

    If fewer distinct non-interacting pairs exist than positives, only the
    available number is produced; without this cap the sampling loop would
    never terminate (for example with two proteins that interact with each
    other).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``"Interacts with"`` column holds iterables of partner
        identifiers.

    Returns
    -------
    pandas.DataFrame
        Columns ``"proteins"`` (frozenset) and ``"interacts"`` (1 or 0),
        positives first, then negatives. Each part keeps its own 0-based
        index, as produced by ``pd.concat`` without ``ignore_index``.
    """
    interacting_pairs = set()
    all_proteins = set()

    for protein1, partners in zip(df.index, df['Interacts with']):
        all_proteins.add(protein1)
        for protein2 in partners:
            all_proteins.add(protein2)
            interacting_pairs.add(frozenset([protein1, protein2]))

    # Materialise the candidates once, in a stable order, so that seeding
    # np.random gives the same negatives on every run (set iteration order
    # of strings varies between interpreter sessions).
    protein_list = sorted(all_proteins, key=str)
    n_proteins = len(protein_list)

    # Self-interactions are one-element sets and can never collide with a
    # sampled two-element pair, so only genuine pairs reduce what is left.
    n_two_member = sum(1 for pair in interacting_pairs if len(pair) == 2)
    n_available = n_proteins * (n_proteins - 1) // 2 - n_two_member
    n_negative = min(len(interacting_pairs), n_available)

    noninteracting_pairs = set()

    while len(noninteracting_pairs) < n_negative:
        # Sample positions rather than the labels themselves so the
        # identifiers keep their original Python type.
        first, second = np.random.choice(n_proteins, 2, replace=False)
        pair_pick = frozenset((protein_list[first], protein_list[second]))
        if pair_pick not in interacting_pairs:
            noninteracting_pairs.add(pair_pick)

    interact_df = pd.DataFrame({"proteins" : list(interacting_pairs), "interacts" : 1})
    noninteract_df = pd.DataFrame({"proteins" : list(noninteracting_pairs), "interacts" : 0})

    return pd.concat([interact_df,noninteract_df])

In [None]:
def generate_protein_pairs(df):
    """Attach both sequences to every pair from ``get_interacting_pairs``.

    The pair table is indexed by its ``"proteins"`` frozensets and gains
    two columns, ``"seq1"`` and ``"seq2"``, looked up in ``df["Sequence"]``.
    Pairs that cannot be resolved (a member is missing from ``df``, or the
    set has a single member because it is a self-interaction) are given
    missing values and removed by the final ``dropna``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by protein identifier with ``"Sequence"`` and
        ``"Interacts with"`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``"proteins"`` with columns ``"interacts"``, ``"seq1"``
        and ``"seq2"``; contains only fully resolved pairs.

    Raises
    ------
    KeyError
        If ``df`` has no ``"Sequence"`` column.
    """
    pairs_df = get_interacting_pairs(df).set_index("proteins")

    # One dict lookup per member instead of a .loc call wrapped in a
    # blanket ``except``: unresolved pairs are handled explicitly and
    # unrelated errors are no longer silently swallowed.
    sequence_of = df["Sequence"].to_dict()

    # The columns are built as plain lists and assigned once. Writing
    # strings cell-by-cell into NaN-initialised float columns is an
    # incompatible-dtype assignment that newer pandas warns about or rejects.
    first_seqs = []
    second_seqs = []
    for pair in pairs_df.index:
        members = list(pair)
        if len(members) == 2 and all(member in sequence_of for member in members):
            first_seqs.append(sequence_of[members[0]])
            second_seqs.append(sequence_of[members[1]])
        else:
            first_seqs.append(np.nan)
            second_seqs.append(np.nan)

    pairs_df["seq1"] = first_seqs
    pairs_df["seq2"] = second_seqs

    return pairs_df.dropna()

In [3]:
def fill_interactors(df,all_prots):
    """Return a copy of ``df`` whose interaction lists are made reciprocal.

    For every row ``id1`` of ``df`` and every ``id2`` in its
    ``"Interacts with"`` list:

    * if ``id2`` is already a row of the result, ``id1`` is appended to
      ``id2``'s list unless it is already there;
    * otherwise, if ``id2`` is a row of ``all_prots``, a new row ``id2`` is
      added with the ``"Sequence"`` taken from ``all_prots`` and
      ``"Interacts with"`` set to ``[id1]``;
    * otherwise ``id2`` is ignored.

    Rows added this way are not themselves expanded. Any other column of
    ``df`` is left missing for the added rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by protein identifier; ``"Interacts with"`` holds lists.
    all_prots : pandas.DataFrame
        Indexed by protein identifier, with a ``"Sequence"`` column.

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` and the lists stored in it are not modified.
    """
    filled = df.copy()
    # DataFrame.copy() does not duplicate the Python lists held in object
    # cells, so give every row its own list before any of them is mutated.
    filled["Interacts with"] = filled["Interacts with"].apply(list)

    # identifier -> interactor list (the same list objects stored in `filled`)
    partners_of = dict(zip(filled.index, filled["Interacts with"]))
    new_sequences = {}

    for id1 in list(filled.index):
        # Iterate over a snapshot: the underlying lists grow during the loop.
        for id2 in list(partners_of[id1]):
            if id2 in partners_of:
                # Record the reverse link. The previous check looked for id2
                # in id1's own list, which is always true, so nothing was
                # ever appended.
                if id1 not in partners_of[id2]:
                    partners_of[id2].append(id1)
            elif id2 in all_prots.index:
                new_sequences[id2] = all_prots.loc[id2,"Sequence"]
                partners_of[id2] = [id1]

    if not new_sequences:
        return filled

    # Add all new proteins in one concat instead of enlarging row by row.
    new_ids = list(new_sequences)
    new_rows = pd.DataFrame({
        "Sequence": pd.Series([new_sequences[i] for i in new_ids], index=new_ids, dtype=object),
        "Interacts with": pd.Series([partners_of[i] for i in new_ids], index=new_ids, dtype=object),
    })
    new_rows.index.name = filled.index.name

    return pd.concat([filled, new_rows])