In [1]:
import pandas as pd
import scipy
from scipy import stats
from scipy.stats import mannwhitneyu
import math
import numpy as np
import re
from monster.import_data import *

# Importing data

In [2]:
### importing data
#

# motifs
lst_motifs = import_list_motifs('../data/lst_motifs')
df_motifs_features = pd.read_csv('df_motifs_features.tsv')

# positive dataset
seqs_path_pos = '../data/datasets/minc_nr_positive_dataset.fasta'
pos_dict = import_fasta_sequences_as_dict(seqs_path_pos)
df_pos_features = pd.read_csv('df_pos_features.tsv')

# negative dataset
seqs_path_neg = '../data/datasets/minc_nr_negative_dataset.fasta'
neg_dict = import_fasta_sequences_as_dict(seqs_path_neg)
df_neg_features = pd.read_csv('df_neg_features.tsv')

## motifs

In [3]:
lst_motifs

['GHWT',
 'GHWTQ',
 'HWT',
 'HWTQ',
 'PGNV',
 'PKPK',
 'PTHP',
 'ADAE',
 'TQLA',
 'YPSG',
 'PYPGQ',
 'CGIGG',
 'CGCCG',
 'KEEKK',
 'CGIGR',
 'FSLFL',
 'PYPSG',
 'EEEKK',
 'KEGKK',
 'FSLEL',
 'KKEKK',
 'CGNCG',
 'GYPSG',
 'CGSGG',
 'KNEKK',
 'FLLFL',
 'KEERK',
 'FKLFL',
 'RYPSG',
 'CCCGG',
 'KEEIK',
 'KEETK',
 'CGDGF',
 'FSLSL',
 'LYPSG',
 'GEEKK',
 'CGGGF',
 'KEEKG',
 'CLIGG',
 'CNICG',
 'EEGKK',
 'PKPK',
 'FSIFL',
 'EKEKK',
 'CGGGY',
 'KDKK',
 'KKGKK',
 'EEERK',
 'EEETK',
 'LSLFL',
 'KDKC',
 'PKYK',
 'FLIFL',
 'CCCGF',
 'CNGGG',
 'CGCCA',
 'WKPK',
 'KDKM',
 'CGDSG',
 'GEGKK',
 'KKETK',
 'KDCK',
 'CQNGG',
 'CGEGE',
 'PKEK',
 'CGGGA',
 'GEETK',
 'EDKK',
 'KEETG',
 'PKPF',
 'CGKAG',
 'EKEKG',
 'FLISL',
 'PTPK',
 'YYPGG',
 'KDAK',
 'PKPY',
 'VYPSG',
 'KDMK',
 'CCSGY',
 'DEEKK',
 'KEENK',
 'KELKK',
 'PSPK',
 'KEDKK',
 'KDEKK',
 'GDKK',
 'KEEEK',
 'PYQSG',
 'EEERG',
 'KKPK',
 'KEEDK',
 'KEESK',
 'KDEK',
 'PKPN',
 'KEEKI',
 'PNPGG',
 'KEEKE',
 'TDKK',
 'PPPK',
 'PSPGG',
 'CGIG',
 'DAP',
 'HW

In [4]:
df_motifs_CLUMPs = pd.read_csv('df_motifs_CLUMPs.tsv')
df_motifs_CLUMPs

Unnamed: 0,motif,CLUMP
0,GHWT,0
1,GHWTQ,0
2,HWT,0
3,HWTQ,0
4,PGNV,1
...,...,...
193,VHAA,4
194,VKSY,2
195,KTD,8
196,DKE,8


In [5]:
df_motifs_CLUMPs.rename(columns = {'motif' : 'id'}, inplace= True)
df_all_motifs_all_features = df_motifs_CLUMPs.merge(df_motifs_features)
df_all_motifs_all_features.head()

Unnamed: 0,id,CLUMP,seq_len,gravy,tiny,small,aliphatic,aromatic,non_polar,polar,charged,basic,acidic,helix,turn,sheet
0,GHWT,0,4,-1.3,0.5,0.5,0.0,0.5,0.5,0.5,0.25,0.25,0.0,0.25,0.25,0.0
1,GHWTQ,0,5,-1.74,0.4,0.4,0.0,0.4,0.4,0.6,0.2,0.2,0.0,0.2,0.2,0.0
2,HWT,0,3,-1.6,0.333333,0.333333,0.0,0.666667,0.333333,0.666667,0.333333,0.333333,0.0,0.333333,0.0,0.0
3,HWTQ,0,4,-2.075,0.25,0.25,0.0,0.5,0.25,0.75,0.25,0.25,0.0,0.25,0.0,0.0
4,PGNV,1,4,-0.325,0.25,0.75,0.25,0.0,0.75,0.25,0.0,0.0,0.0,0.25,0.75,0.0


In [6]:
df_motifs_features

Unnamed: 0,id,seq_len,gravy,tiny,small,aliphatic,aromatic,non_polar,polar,charged,basic,acidic,helix,turn,sheet
0,GHWT,4,-1.300000,0.500000,0.500000,0.00,0.500000,0.500000,0.500000,0.250000,0.250000,0.000000,0.250000,0.250000,0.000000
1,GHWTQ,5,-1.740000,0.400000,0.400000,0.00,0.400000,0.400000,0.600000,0.200000,0.200000,0.000000,0.200000,0.200000,0.000000
2,HWT,3,-1.600000,0.333333,0.333333,0.00,0.666667,0.333333,0.666667,0.333333,0.333333,0.000000,0.333333,0.000000,0.000000
3,HWTQ,4,-2.075000,0.250000,0.250000,0.00,0.500000,0.250000,0.750000,0.250000,0.250000,0.000000,0.250000,0.000000,0.000000
4,PGNV,4,-0.325000,0.250000,0.750000,0.25,0.000000,0.750000,0.250000,0.000000,0.000000,0.000000,0.250000,0.750000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,VHAA,4,1.150000,0.500000,0.750000,0.75,0.250000,0.750000,0.250000,0.250000,0.250000,0.000000,0.250000,0.000000,0.500000
194,VKSY,4,-0.450000,0.250000,0.500000,0.25,0.250000,0.500000,0.500000,0.250000,0.250000,0.000000,0.500000,0.250000,0.000000
195,KTD,3,-2.700000,0.333333,0.000000,0.00,0.000000,0.000000,1.000000,0.666667,0.333333,0.333333,0.000000,0.000000,0.000000
196,DKE,3,-3.633333,0.000000,0.000000,0.00,0.000000,0.000000,1.000000,1.000000,0.333333,0.666667,0.000000,0.000000,0.333333


## positive dataset

In [7]:
pos_dict

{'Minc3s00007g00481': 'MKYYILSAILKRIILIKIIFLETTNFALSQTITPPCSCSNVKPNFGTNSNIPQQLCVPPLAYDQKSVWLTWNKPDNYENIADFNVYMAGKKIGSAKANSAVNTLSGPYIQNFYKNDLNNFHTKILFTTYLVTGLNPNTIYTFTVRAVDANGAESGNSNQVVVKTAENYGKIVDITTFGATGDGTTLNTQAIQKAIDSCSSSTSAFGCKVLIPKGIFLSGPLFLRSQMTFELANGAILRATSNPSKFPNQYGNTPSAFLNALNGSLTNIRVIGPGSVDGNGWKLASNAIDELGRQIPVYAKGSPSTVNNLGILAANQVQTHGNNYYSRSRLANFNFVTNLHIGGGITFINPSMTTIGLADSKNVSIISVRFQTYNINNGDGIDIGRSSNIQIIGSFFDTGDDCIAMGTGCGSNAGQGAPVQCILIKNNYFRHGHGAPAFGGSAGDGIKDVLVEDNVAFLTDNGIRFKSSPQCGGGAQNVYARDIAMQSVGSYNNFTFGGRQFSGDTTAGHPFVFMLDYDSNPSGNAKIPAQFKDITITRCSVDNIKPTKSGEILYAVGHDGGNIYQPVYNKNIVRFFNKLIAYSTPYGLDRDVLYFWGWDLDWR',
 'Minc3s00008g00574': 'MSNKLIISILILTIIYTVVNSLTVPEQNAVVDCINKYRSQLANGKTKNKNGGNFPSGKDILEVSYSKDLEKSAQRWANKCIFDHNGTDLYSGGKFYGENLYLDGDFEHKNITQLMIDACNAWWGESTTDGVPPSWINNFLPTDNKENDEKFEAVGHWTQMAWAKTYQIGCALKVCHKPDCNGNLIDCRYYPGGNGMGSPIYQQGKPASGCGKAGPSTKYSGLCKPDPHQNN',
 'Minc3s00011g00761': 'MIIYKDVFTEDELSSDSYPMKLVDDLIFEFKGRQVVRKEGDIALAGANPSAEEMDEGTEEHVERGIDFVLNHRLQEMNCYEDQATFKAYIK

In [8]:
df_pos_features.head()

Unnamed: 0,id,seq_len,gravy,tiny,small,aliphatic,aromatic,non_polar,polar,charged,basic,acidic,helix,turn,sheet
0,Minc3s00007g00481,603,-0.140962,0.338308,0.558872,0.276949,0.119403,0.558872,0.441128,0.154229,0.089552,0.064677,0.313433,0.323383,0.160862
1,Minc3s00008g00574,231,-0.580952,0.307359,0.528139,0.220779,0.121212,0.528139,0.471861,0.220779,0.121212,0.099567,0.268398,0.311688,0.17316
2,Minc3s00011g00761,179,-0.444693,0.206704,0.502793,0.27933,0.106145,0.502793,0.497207,0.357542,0.150838,0.206704,0.301676,0.139665,0.312849
3,Minc3s00013g00811,65,0.627692,0.384615,0.646154,0.323077,0.061538,0.646154,0.353846,0.169231,0.061538,0.107692,0.323077,0.169231,0.2
4,Minc3s00020g01281,87,0.113793,0.448276,0.724138,0.149425,0.218391,0.724138,0.275862,0.114943,0.068966,0.045977,0.356322,0.367816,0.149425


## negative dataset

In [9]:
neg_dict

{'Minc3s00001g00059': 'MSELDQLRQEAEQLKSQIREARKQANDTTLASVAANLEPIGRIQMRTRRTLRGHLAKIYAMHWASDSRNLVSASQDGKLIVWDSYTTNKVHAIPLRSSWVMTCAYAPSGSFVACGGLDNICSIYSLKTREGNVRVSRELPGHTGYLSCCRFLDDNQIVTSSGDMTCALWDIETGQQLTTFTGHTGDVMSLSLSPDMRTFISGACDASAKLWDIREGMCKQTFPGHESDINAVSFFPNGHAFATGSDDATCRLFDIRADQELAMYSHDNIICGITSVAFSKSGRLLFAGYDDFNCNVWDSMRQERAGVLAGHDNRVSCLGVTDDGMAVCTGSWDSFLKIWN',
 'Minc3s00002g00155': 'MASNSKKSRKSLNNSAHSDNEDDESSRSSMYDRDDSERKNKEQFGHWECTVCTFQNKQEAFKCLMCDTRKGTSTRKPRLNPSVVQQQTLVQKLAVEVERQKKQRNAEAQSSPDPLSSPYSNAGLNFVNNETALQSGSYSNGGKQQNSLLHRRMTFRDSLVVRSSAKKTIVTVGGKNFTITEFKPRISSRGRKKSTNGNNIVQ',
 'Minc3s00006g00398': 'MPAEAANNGGNTTTEKSENVTEQTKKAPVIFLFDVDGTLTMPRQKITDEMLQFMRNLSQRVPIAVVGGSDLCKIFEQLPNEDNELLKLFSFIFAENGLMGFEGVEELPRASITKELGEKRLQDLTNFCLRYMSEIDLPLKRGTFIELRNGMMNVCPIGRSCTQEERMSFVEYENKFPVRQDFVKALEQRFPVNENSLKFSIGGQISIDIFPSGWDKTFCMRYLEPKYEAIHFFGDKTTPGGNDYEIFIHPGTVGHSVTDPIDCCKQVTETLGQLGL',
 'Minc3s00006g00399': 'MTTKNLKNEIFVKEEIVEEEEEEMRIDEDGKLKKGGEGERGEIQNGIDGGGILVDKNTKITSTQTATNGRPPRKNGILFAA

In [10]:
df_neg_features.head()

Unnamed: 0,id,seq_len,gravy,tiny,small,aliphatic,aromatic,non_polar,polar,charged,basic,acidic,helix,turn,sheet
0,Minc3s00001g00059,340,-0.241176,0.367647,0.517647,0.267647,0.108824,0.517647,0.482353,0.235294,0.120588,0.114706,0.264706,0.238235,0.232353
1,Minc3s00002g00155,202,-1.108911,0.316832,0.351485,0.178218,0.069307,0.351485,0.648515,0.287129,0.188119,0.09901,0.188119,0.306931,0.173267
2,Minc3s00006g00398,276,-0.324275,0.253623,0.525362,0.235507,0.105072,0.525362,0.474638,0.26087,0.115942,0.144928,0.297101,0.235507,0.25
3,Minc3s00006g00399,588,-0.768878,0.29932,0.452381,0.227891,0.090136,0.452381,0.547619,0.277211,0.139456,0.137755,0.232993,0.30102,0.234694
4,Minc3s00006g00405,1296,-0.564506,0.251543,0.455247,0.26929,0.091821,0.455247,0.544753,0.301698,0.13966,0.162037,0.278549,0.20679,0.296296


# Occurrences of each motif in each sequence of the datasets

## Start and end position

In [11]:
def start_end_position(lst_motifs, dict_seqs, dataset):
    """start_end_position
       ------------------
       This function calculates the start and end position 
       of the motifs in the sequences.
       
       Arguments:
       lst_motifs -- list of motifs
       dict_seqs -- dictionary of fasta sequences where the key is the
                    id and the value is the sequence
       dataset -- 'positive' or 'negative'
       
       Output:
       df_start_end_position -- pandas dataframe where:
                                first column is the motif
                                second column is the sequence id
                                third column is the start position
                                fourth column is the end position
                    
    """
    
    lst_dict = [] 
    
    # Iterate the list of motifs
    # For each motif, go through the dictionary of sequences,
    for motif in lst_motifs:
            for seq_id in dict_seqs:
                # Assign the sequence to the variable record
                record = dict_seqs[seq_id]
                # Run the finditer (to find the start and end positions)
                for match in re.finditer(motif, record):
                    # append the motif, the sequence id, 
                    # the start and end position to the list.
                    lst_dict.append({'motif':motif, 'seq_id':seq_id, 
                                     'start':match.start(), 'end':match.end()})
    df_start_end_position = pd.DataFrame(lst_dict)
    df_start_end_position['dataset'] = dataset
    
    return df_start_end_position

In [12]:
df_start_end_position_pos = start_end_position(lst_motifs, pos_dict, 'positive')
df_start_end_position_pos

Unnamed: 0,motif,seq_id,start,end,dataset
0,GHWT,Minc3s00008g00574,154,158,positive
1,GHWT,Minc3s00736g16684,140,144,positive
2,GHWT,Minc3s01051g20218,140,144,positive
3,GHWT,Minc3s01143g21148,86,90,positive
4,GHWT,Minc3s01152g21216,154,158,positive
...,...,...,...,...,...
1292,KCS,Minc3s00139g05823,69,72,positive
1293,KCS,Minc3s00520g13673,69,72,positive
1294,KCS,Minc3s03136g32914,23,26,positive
1295,KCS,Minc3s10875g44433,69,72,positive


In [13]:
df_start_end_position_neg = start_end_position(lst_motifs, neg_dict, 'negative')
df_start_end_position_neg

Unnamed: 0,motif,seq_id,start,end,dataset
0,KEEKK,Minc3s01269g22302,90,95,negative
1,KEEKK,Minc3s01269g22302,95,100,negative
2,EEEKK,Minc3s00961g19324,561,566,negative
3,KEGKK,Minc3s02467g30196,618,623,negative
4,KKEKK,Minc3s00530g13810,347,352,negative
...,...,...,...,...,...
741,KCS,Minc3s01070g20414,330,333,negative
742,KCS,Minc3s01536g24546,215,218,negative
743,KCS,Minc3s02273g29198,330,333,negative
744,KCS,Minc3s03208g33140,98,101,negative


## Occurrence of each motif in each sequence

In [14]:
def occ_each_mot_in_each_seq(df_start_end_position, dataset):
    """occ_each_mot_in_each_seq
       ------------------------
       This function calculates the occurrences
       of each motif in each sequence.
       
       Arguments: 
       df_start_end_position -- pandas dataframe where:
                                first column is the motif
                                second column is the sequence id
                                third column is the start position
                                fourth column is the end position
       dataset -- 'positive' or 'negative'

       Output:
       df_occ_seq -- pandas dataframe where: 
                     first column is the motif
                     second column is the sequence id 
                     third column is the number of occurrences
    """
    
    df_occ_seq = df_start_end_position.groupby(
        ['motif','seq_id']).size().reset_index(name='occ') 
    
    df_occ_seq['dataset'] = dataset
    
    if df_occ_seq.shape == df_occ_seq.drop_duplicates().shape:
        return df_occ_seq

In [15]:
df_occ_seq_pos = occ_each_mot_in_each_seq(df_start_end_position_pos, "positive")
df_occ_seq_pos

Unnamed: 0,motif,seq_id,occ,dataset
0,AAIE,AAR37375.1,1,positive
1,AAIE,Minc3s00139g05823,1,positive
2,AAIE,Minc3s00520g13673,1,positive
3,AAIE,Minc3s00736g16684,1,positive
4,AAIE,Minc3s01051g20218,1,positive
...,...,...,...,...
1123,YVIL,Minc3s00520g13673,1,positive
1124,YVIL,Minc3s03136g32914,1,positive
1125,YVIL,Minc3s10875g44433,1,positive
1126,YYPGG,Minc3s00008g00574,1,positive


In [16]:
df_occ_seq_neg = occ_each_mot_in_each_seq(df_start_end_position_neg, "negative")
df_occ_seq_neg

Unnamed: 0,motif,seq_id,occ,dataset
0,AAIE,Minc3s00019g01192,1,negative
1,AAIE,Minc3s01070g20414,1,negative
2,AAIE,Minc3s02273g29198,1,negative
3,AAIE,Minc3s02858g31835,1,negative
4,AAIE,Minc3s02910g32038,1,negative
...,...,...,...,...
682,WNT,Minc3s00590g14758,1,negative
683,WNT,Minc3s01624g25187,1,negative
684,WNT,Minc3s02991g32337,1,negative
685,WNT,Minc3s08790g42601,1,negative


In [17]:
df_motifs_CLUMPs.rename(columns = {'id' : 'motif'}, inplace = True)
df_general = pd.concat([
    pd.merge(df_motifs_CLUMPs, df_occ_seq_pos, on='motif'),
    pd.merge(df_motifs_CLUMPs, df_occ_seq_neg, on='motif')
])
df_general

Unnamed: 0,motif,CLUMP,seq_id,occ,dataset
0,GHWT,0,Minc3s00008g00574,1,positive
1,GHWT,0,Minc3s00736g16684,1,positive
2,GHWT,0,Minc3s01051g20218,1,positive
3,GHWT,0,Minc3s01143g21148,1,positive
4,GHWT,0,Minc3s01152g21216,1,positive
...,...,...,...,...,...
682,KCS,9,Minc3s01070g20414,1,negative
683,KCS,9,Minc3s01536g24546,1,negative
684,KCS,9,Minc3s02273g29198,1,negative
685,KCS,9,Minc3s03208g33140,1,negative


## find extended motifs 

In [18]:
def find_extended_motifs(lst_motifs):
    """find_extended_motifs
       --------------------
       This function identifies extended motifs.
       
       Arguments:
       
    """
    dict_extended_motifs = {}
    lst_motifs = sorted(lst_motifs, key=len)
    print(lst_motifs)
    lst_known_motifs = []
    lst_all_extended_motifs = []
    for motif in lst_motifs:
        if motif not in lst_known_motifs:
            lst_extended_motifs = [m for m in lst_motifs if motif in m and motif != m]
            lst_known_motifs.append(motif)
            lst_known_motifs+=lst_extended_motifs
            if len(lst_extended_motifs)>0:
                lst_all_extended_motifs+=[[motif]+lst_extended_motifs]
    return lst_all_extended_motifs

In [19]:
def non_redundant_motifs(df_motifs_CLUMPs, lst_motifs):
    """non_redundant_motifs
       --------------------
       This function identifies non redundant extended motifs.
       Non redundant motifs (e.g. root motifs and non-extended motifs) 
       are stored in a list called lst_motifs_mask.
       
       Arguments:
       
    """
    ### Identify extended motifs
    #
    #   Non redundant motifs (e.g. root motifs and non-extended motifs) are stored
    #   in a list called lst_motifs_mask
    lst_motifs_mask = []
    for c in df_motifs_CLUMPs.CLUMP.unique():
        lst_motifs = df_motifs_CLUMPs.loc[df_motifs_CLUMPs.CLUMP==c, 'motif'].unique()
        print('--------------------------------------')
        print(f"Cluster {c} :")
        lst_ext_motifs = find_extended_motifs(lst_motifs)
        lst_all_ext_motifs = [j for i in lst_ext_motifs for j in i]
        lst_non_ext_motifs = [m for m in lst_motifs if m not in lst_all_ext_motifs]
        lst_root_motifs = [el[0] for el in lst_ext_motifs]
        lst_motifs_mask += lst_root_motifs+lst_non_ext_motifs
        print('Root-motifs', lst_root_motifs)
        print('Extended-motifs', lst_ext_motifs)
        print('Non-extended motifs', lst_non_ext_motifs)
        print('\n')
    
    return lst_motifs_mask

In [20]:
lst_motifs_mask = non_redundant_motifs(df_motifs_CLUMPs, lst_motifs)

--------------------------------------
Cluster 0 :
['HWT', 'WWS', 'WNT', 'CQY', 'FSL', 'NVY', 'WNS', 'HWF', 'GHWT', 'HWTQ', 'YSHS', 'FSVF', 'FTNS', 'GHWTQ']
Root-motifs ['HWT']
Extended-motifs [['HWT', 'GHWT', 'HWTQ', 'GHWTQ']]
Non-extended motifs ['WWS', 'WNT', 'YSHS', 'CQY', 'FSVF', 'FSL', 'NVY', 'WNS', 'FTNS', 'HWF']


--------------------------------------
Cluster 1 :
['PGNV', 'PTHP', 'YPSG', 'PKPF', 'PTPK', 'PKPY', 'PSPK', 'PKPN', 'PPPK', 'KPPG', 'RGIG', 'FPSP', 'KYPN', 'NGQP', 'PYPGQ', 'PYPSG', 'GYPSG', 'RYPSG', 'LYPSG', 'YYPGG', 'VYPSG', 'PYQSG']
Root-motifs ['YPSG']
Extended-motifs [['YPSG', 'PYPSG', 'GYPSG', 'RYPSG', 'LYPSG', 'VYPSG']]
Non-extended motifs ['PGNV', 'PTHP', 'PYPGQ', 'PKPF', 'PTPK', 'YYPGG', 'PKPY', 'PSPK', 'PYQSG', 'PKPN', 'PPPK', 'KPPG', 'RGIG', 'FPSP', 'KYPN', 'NGQP']


--------------------------------------
Cluster 2 :
['KHP', 'HGD', 'PKPK', 'PKYK', 'WKPK', 'KYKS', 'KQAQ', 'KTKL', 'PKAK', 'QEAF', 'AYKN', 'KMKG', 'FKAK', 'IKNN', 'KKIS', 'MDKF', 'VKSY']
Root-mo

In [21]:
### Creates a subset from df_general with only the non-redundant motifs 
#
df_general_non_redundant = df_general[df_general.motif.isin(lst_motifs_mask)]
df_general_non_redundant

Unnamed: 0,motif,CLUMP,seq_id,occ,dataset
22,HWT,0,Minc3s00008g00574,2,positive
23,HWT,0,Minc3s00736g16684,2,positive
24,HWT,0,Minc3s01051g20218,2,positive
25,HWT,0,Minc3s01143g21148,2,positive
26,HWT,0,Minc3s01152g21216,2,positive
...,...,...,...,...,...
682,KCS,9,Minc3s01070g20414,1,negative
683,KCS,9,Minc3s01536g24546,1,negative
684,KCS,9,Minc3s02273g29198,1,negative
685,KCS,9,Minc3s03208g33140,1,negative


### Calculus on non_redundant df

In [22]:
motifs_clu_non_redundant = df_general_non_redundant.loc[:, 
                                                        ['motif', 'CLUMP']]
print(motifs_clu_non_redundant.shape)
motifs_clu_non_redundant.head()

(1758, 2)


Unnamed: 0,motif,CLUMP
22,HWT,0
23,HWT,0
24,HWT,0
25,HWT,0
26,HWT,0


In [23]:
motifs_clu_non_redundant_counts = pd.DataFrame(
    motifs_clu_non_redundant.groupby(
        ['motif', 'CLUMP']).size()).sort_values( by = 0, ascending = False)
motifs_clu_non_redundant_counts.reset_index(inplace = True)
motifs_clu_non_redundant_counts.head()

Unnamed: 0,motif,CLUMP,0
0,CK,9,214
1,AEG,3,62
2,FSL,0,60
3,DKE,8,58
4,NVY,0,42


In [24]:
## which is the most frequent motif for each cluster?
lst_CLUMPs = list(set(motifs_clu_non_redundant_counts.CLUMP.unique()))
for CLUMP in lst_CLUMPs:
    df_mask = motifs_clu_non_redundant_counts['CLUMP'] == CLUMP
    filtered_df = motifs_clu_non_redundant_counts[df_mask].iloc[0, :]
    print(filtered_df)
    print('-----------')
    print('\n')

motif    FSL
CLUMP      0
0         60
Name: 2, dtype: object
-----------


motif    RGIG
CLUMP       1
0          11
Name: 34, dtype: object
-----------


motif    HGD
CLUMP      2
0         23
Name: 9, dtype: object
-----------


motif    AEG
CLUMP      3
0         62
Name: 1, dtype: object
-----------


motif    MIE
CLUMP      4
0         19
Name: 12, dtype: object
-----------


motif    FGGG
CLUMP       5
0          11
Name: 36, dtype: object
-----------


motif    EEER
CLUMP       6
0          12
Name: 25, dtype: object
-----------


motif    ELIY
CLUMP       7
0          11
Name: 38, dtype: object
-----------


motif    DKE
CLUMP      8
0         58
Name: 3, dtype: object
-----------


motif     CK
CLUMP      9
0        214
Name: 0, dtype: object
-----------




In [25]:
motifs_counts = pd.DataFrame(df_general_non_redundant.groupby(
    ['CLUMP', 'dataset'])['occ'].sum())
motifs_counts = motifs_counts.reset_index()
motifs_counts

Unnamed: 0,CLUMP,dataset,occ
0,0,negative,81
1,0,positive,121
2,1,negative,10
3,1,positive,126
4,2,negative,36
5,2,positive,144
6,3,negative,85
7,3,positive,129
8,4,negative,81
9,4,positive,154


In [26]:
motifs_counts = motifs_counts.pivot(
    index='CLUMP', columns='dataset', values='occ')
motifs_counts = motifs_counts.rename_axis(None,axis=1)
motifs_counts = motifs_counts.reset_index()
motifs_counts

Unnamed: 0,CLUMP,negative,positive
0,0,81,121
1,1,10,126
2,2,36,144
3,3,85,129
4,4,81,154
5,5,10,62
6,6,25,85
7,7,5,58
8,8,124,190
9,9,288,168


In [27]:
df_cnt_seq_per_cluster = df_general_non_redundant.groupby([
    'CLUMP', 'seq_id', 'dataset']).size().reset_index(name='temporary')
df_cnt_seq_per_cluster = df_cnt_seq_per_cluster.drop(columns = 'temporary')
df_cnt_seq_per_cluster = df_cnt_seq_per_cluster.drop_duplicates()
df_cnt_seq_per_cluster = df_cnt_seq_per_cluster.groupby(['CLUMP', 'dataset']
                                                       ).size().reset_index()
df_cnt_seq_per_cluster = df_cnt_seq_per_cluster.pivot(
    index='CLUMP', columns='dataset', values = 0)
df_cnt_seq_per_cluster = df_cnt_seq_per_cluster.rename_axis(None,axis=1)
df_cnt_seq_per_cluster = df_cnt_seq_per_cluster.reset_index()

df_cnt_seq_per_cluster

Unnamed: 0,CLUMP,negative,positive
0,0,72,54
1,1,10,50
2,2,30,58
3,3,66,57
4,4,75,66
5,5,10,40
6,6,20,28
7,7,5,44
8,8,109,76
9,9,208,95


# Jaccard index

In [28]:
motifs_counts['norm_negative'] = motifs_counts.negative/len(df_neg_features)
motifs_counts['norm_positive'] = motifs_counts.positive/len(df_pos_features)
motifs_counts['jaccard_1'] = motifs_counts.negative/motifs_counts.positive
motifs_counts['jaccard_norm_1']=motifs_counts.norm_negative/motifs_counts.norm_positive
motifs_counts

Unnamed: 0,CLUMP,negative,positive,norm_negative,norm_positive,jaccard_1,jaccard_norm_1
0,0,81,121,0.163636,0.751553,0.669421,0.217731
1,1,10,126,0.020202,0.782609,0.079365,0.025814
2,2,36,144,0.072727,0.89441,0.25,0.081313
3,3,85,129,0.171717,0.801242,0.658915,0.214314
4,4,81,154,0.163636,0.956522,0.525974,0.171074
5,5,10,62,0.020202,0.385093,0.16129,0.05246
6,6,25,85,0.050505,0.52795,0.294118,0.095663
7,7,5,58,0.010101,0.360248,0.086207,0.028039
8,8,124,190,0.250505,1.180124,0.652632,0.21227
9,9,288,168,0.581818,1.043478,1.714286,0.557576


In [29]:
# here we normalize the counts of the clusters in the datasets
# by dividing the numbers by the number of sequences in the respective
# dataset
df_cnt_seq_per_cluster['neg_norm'] = df_cnt_seq_per_cluster.negative / len(
    df_neg_features)
df_cnt_seq_per_cluster['pos_norm'] = df_cnt_seq_per_cluster.positive / len(
    df_pos_features)
df_cnt_seq_per_cluster

Unnamed: 0,CLUMP,negative,positive,neg_norm,pos_norm
0,0,72,54,0.145455,0.335404
1,1,10,50,0.020202,0.310559
2,2,30,58,0.060606,0.360248
3,3,66,57,0.133333,0.354037
4,4,75,66,0.151515,0.409938
5,5,10,40,0.020202,0.248447
6,6,20,28,0.040404,0.173913
7,7,5,44,0.010101,0.273292
8,8,109,76,0.220202,0.47205
9,9,208,95,0.420202,0.590062


In [30]:
df_cnt_seq_per_cluster[
    'jaccard_2'] = df_cnt_seq_per_cluster.negative/df_cnt_seq_per_cluster.positive
df_cnt_seq_per_cluster[
    'jaccard_norm_2'] = df_cnt_seq_per_cluster.neg_norm/df_cnt_seq_per_cluster.pos_norm
df_cnt_seq_per_cluster

Unnamed: 0,CLUMP,negative,positive,neg_norm,pos_norm,jaccard_2,jaccard_norm_2
0,0,72,54,0.145455,0.335404,1.333333,0.43367
1,1,10,50,0.020202,0.310559,0.2,0.065051
2,2,30,58,0.060606,0.360248,0.517241,0.168234
3,3,66,57,0.133333,0.354037,1.157895,0.376608
4,4,75,66,0.151515,0.409938,1.136364,0.369605
5,5,10,40,0.020202,0.248447,0.25,0.081313
6,6,20,28,0.040404,0.173913,0.714286,0.232323
7,7,5,44,0.010101,0.273292,0.113636,0.036961
8,8,109,76,0.220202,0.47205,1.434211,0.466481
9,9,208,95,0.420202,0.590062,2.189474,0.712132


In [31]:
motifs_counts

Unnamed: 0,CLUMP,negative,positive,norm_negative,norm_positive,jaccard_1,jaccard_norm_1
0,0,81,121,0.163636,0.751553,0.669421,0.217731
1,1,10,126,0.020202,0.782609,0.079365,0.025814
2,2,36,144,0.072727,0.89441,0.25,0.081313
3,3,85,129,0.171717,0.801242,0.658915,0.214314
4,4,81,154,0.163636,0.956522,0.525974,0.171074
5,5,10,62,0.020202,0.385093,0.16129,0.05246
6,6,25,85,0.050505,0.52795,0.294118,0.095663
7,7,5,58,0.010101,0.360248,0.086207,0.028039
8,8,124,190,0.250505,1.180124,0.652632,0.21227
9,9,288,168,0.581818,1.043478,1.714286,0.557576


In [32]:
df_cnt_seq_per_cluster

Unnamed: 0,CLUMP,negative,positive,neg_norm,pos_norm,jaccard_2,jaccard_norm_2
0,0,72,54,0.145455,0.335404,1.333333,0.43367
1,1,10,50,0.020202,0.310559,0.2,0.065051
2,2,30,58,0.060606,0.360248,0.517241,0.168234
3,3,66,57,0.133333,0.354037,1.157895,0.376608
4,4,75,66,0.151515,0.409938,1.136364,0.369605
5,5,10,40,0.020202,0.248447,0.25,0.081313
6,6,20,28,0.040404,0.173913,0.714286,0.232323
7,7,5,44,0.010101,0.273292,0.113636,0.036961
8,8,109,76,0.220202,0.47205,1.434211,0.466481
9,9,208,95,0.420202,0.590062,2.189474,0.712132


In [33]:
jaccard_norm_1 = pd.DataFrame(motifs_counts.loc[:, 'jaccard_norm_1']).sort_values(
    by = 'jaccard_norm_1', ascending = True)
jaccard_norm_1['jaccard_norm_1_rank'] = np.arange(1, len(jaccard_norm_1)+1)
jaccard_norm_1 = jaccard_norm_1.sort_index()
jaccard_norm_1

Unnamed: 0,jaccard_norm_1,jaccard_norm_1_rank
0,0.217731,9
1,0.025814,1
2,0.081313,4
3,0.214314,8
4,0.171074,6
5,0.05246,3
6,0.095663,5
7,0.028039,2
8,0.21227,7
9,0.557576,10


In [34]:
jaccard_norm_2 = pd.DataFrame(df_cnt_seq_per_cluster.loc[:, 'jaccard_norm_2']).sort_values(
    by = 'jaccard_norm_2', ascending = True)
jaccard_norm_2['jaccard_norm_2_rank'] = np.arange(1, len(jaccard_norm_2)+1)
jaccard_norm_2 = jaccard_norm_2.sort_index()
jaccard_norm_2

Unnamed: 0,jaccard_norm_2,jaccard_norm_2_rank
0,0.43367,8
1,0.065051,2
2,0.168234,4
3,0.376608,7
4,0.369605,6
5,0.081313,3
6,0.232323,5
7,0.036961,1
8,0.466481,9
9,0.712132,10


In [35]:
# here we are putting the values 1 - jaccard
# so that we can directly sum these values to the ones 
# of our score

df_jaccard_index = pd.concat([jaccard_norm_1, jaccard_norm_2], axis = 1)
df_jaccard_index.insert(0, 'CLUMP', np.arange(0, len(df_jaccard_index)))

df_jaccard_index = df_jaccard_index.loc[:, ['jaccard_norm_1', 'jaccard_norm_2']]
df_jaccard_index = 1 - df_jaccard_index
df_jaccard_index

Unnamed: 0,jaccard_norm_1,jaccard_norm_2
0,0.782269,0.56633
1,0.974186,0.934949
2,0.918687,0.831766
3,0.785686,0.623392
4,0.828926,0.630395
5,0.94754,0.918687
6,0.904337,0.767677
7,0.971961,0.963039
8,0.78773,0.533519
9,0.442424,0.287868


# Co-occurrence matrix

In [36]:
### Create co-occurence matrix
#

##  positive dataset
tmp_positive = df_general_non_redundant.loc[df_general_non_redundant.dataset=='positive',
                                            ['CLUMP', 'seq_id']].copy().drop_duplicates()

tmp_positive['cnt']=1
tmp_positive = tmp_positive.pivot(index='seq_id', columns='CLUMP', values='cnt').fillna(0)
tmp_positive.columns.name = None
cooccurence_matrix_positive = tmp_positive.T.dot(tmp_positive)
np.fill_diagonal(cooccurence_matrix_positive.values, 0)

##  negative dataset
tmp_negative = df_general_non_redundant.loc[df_general_non_redundant.dataset=='negative',
                                            ['CLUMP', 'seq_id']].copy().drop_duplicates()

tmp_negative['cnt']=1
tmp_negative = tmp_negative.pivot(index='seq_id', columns='CLUMP', values='cnt').fillna(0)
tmp_negative.columns.name = None
cooccurence_matrix_negative = tmp_negative.T.dot(tmp_negative)
np.fill_diagonal(cooccurence_matrix_negative.values, 0)

In [37]:
cooccurence_matrix_positive

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,28.0,25.0,31.0,33.0,25.0,6.0,28.0,28.0,41.0
1,28.0,0.0,29.0,28.0,36.0,20.0,3.0,32.0,28.0,43.0
2,25.0,29.0,0.0,33.0,35.0,16.0,11.0,25.0,44.0,47.0
3,31.0,28.0,33.0,0.0,29.0,16.0,13.0,19.0,44.0,44.0
4,33.0,36.0,35.0,29.0,0.0,20.0,12.0,30.0,36.0,48.0
5,25.0,20.0,16.0,16.0,20.0,0.0,2.0,15.0,16.0,30.0
6,6.0,3.0,11.0,13.0,12.0,2.0,0.0,1.0,27.0,14.0
7,28.0,32.0,25.0,19.0,30.0,15.0,1.0,0.0,22.0,35.0
8,28.0,28.0,44.0,44.0,36.0,16.0,27.0,22.0,0.0,55.0
9,41.0,43.0,47.0,44.0,48.0,30.0,14.0,35.0,55.0,0.0


In [38]:
cooccurence_matrix_negative

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,1.0,3.0,8.0,9.0,2.0,4.0,0.0,16.0,33.0
1,1.0,0.0,3.0,1.0,5.0,0.0,0.0,0.0,3.0,4.0
2,3.0,3.0,0.0,8.0,6.0,1.0,0.0,2.0,13.0,14.0
3,8.0,1.0,8.0,0.0,9.0,0.0,2.0,1.0,20.0,31.0
4,9.0,5.0,6.0,9.0,0.0,2.0,4.0,1.0,15.0,40.0
5,2.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,4.0,8.0
6,4.0,0.0,0.0,2.0,4.0,1.0,0.0,0.0,8.0,6.0
7,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,3.0,1.0
8,16.0,3.0,13.0,20.0,15.0,4.0,8.0,3.0,0.0,51.0
9,33.0,4.0,14.0,31.0,40.0,8.0,6.0,1.0,51.0,0.0


# Feature weight

In [39]:
# Creating a list with the name of the features
feature_lst = []
for col in df_pos_features.columns:
    feature_lst.append(col)
feature_lst.pop(0)
feature_lst.pop(0)
print('the number of candidate features is:', len(feature_lst))
feature_lst

the number of candidate features is: 13


['gravy',
 'tiny',
 'small',
 'aliphatic',
 'aromatic',
 'non_polar',
 'polar',
 'charged',
 'basic',
 'acidic',
 'helix',
 'turn',
 'sheet']

In [40]:
pos_dset_feat = df_pos_features.drop(columns = ['id', 'seq_len'])
print(pos_dset_feat.shape)
pos_dset_feat.head(3)

(161, 13)


Unnamed: 0,gravy,tiny,small,aliphatic,aromatic,non_polar,polar,charged,basic,acidic,helix,turn,sheet
0,-0.140962,0.338308,0.558872,0.276949,0.119403,0.558872,0.441128,0.154229,0.089552,0.064677,0.313433,0.323383,0.160862
1,-0.580952,0.307359,0.528139,0.220779,0.121212,0.528139,0.471861,0.220779,0.121212,0.099567,0.268398,0.311688,0.17316
2,-0.444693,0.206704,0.502793,0.27933,0.106145,0.502793,0.497207,0.357542,0.150838,0.206704,0.301676,0.139665,0.312849


In [41]:
neg_dset_feat = df_neg_features.drop(columns = ['id', 'seq_len'])
print(neg_dset_feat.shape)
neg_dset_feat.head(3)

(495, 13)


Unnamed: 0,gravy,tiny,small,aliphatic,aromatic,non_polar,polar,charged,basic,acidic,helix,turn,sheet
0,-0.241176,0.367647,0.517647,0.267647,0.108824,0.517647,0.482353,0.235294,0.120588,0.114706,0.264706,0.238235,0.232353
1,-1.108911,0.316832,0.351485,0.178218,0.069307,0.351485,0.648515,0.287129,0.188119,0.09901,0.188119,0.306931,0.173267
2,-0.324275,0.253623,0.525362,0.235507,0.105072,0.525362,0.474638,0.26087,0.115942,0.144928,0.297101,0.235507,0.25


In [42]:
# Creating a list with the p-values in it
p_values_lst = []
m = len(feature_lst)
for i in range(m):
    pos_values = pos_dset_feat.iloc[:, i]
    pos_values = list(pos_values)
    neg_values = neg_dset_feat.iloc[:, i]
    neg_values = list(neg_values)
    s, p = mannwhitneyu(pos_values, neg_values)
    p_values_lst.append(p)
p_values_lst

[0.0019459942180807517,
 0.012607265122709778,
 0.9597189018935256,
 2.2651361264259894e-15,
 0.9507567359602388,
 0.9597189018935256,
 0.9606726003379649,
 0.15458821514417756,
 0.26057984850797355,
 0.05205278804143554,
 2.768943582561357e-08,
 2.5991190933300425e-06,
 4.329771481105767e-06]

In [43]:
# Creating a dictionary with the feature as the key and the p-value
# as the value.
# The zip iterator is useful to pair each feature with its p-value
# to then create the dictionary 
zip_iterator = zip(feature_lst, p_values_lst)
dict_feat_p = dict(zip_iterator)
dict_feat_p

{'gravy': 0.0019459942180807517,
 'tiny': 0.012607265122709778,
 'small': 0.9597189018935256,
 'aliphatic': 2.2651361264259894e-15,
 'aromatic': 0.9507567359602388,
 'non_polar': 0.9597189018935256,
 'polar': 0.9606726003379649,
 'charged': 0.15458821514417756,
 'basic': 0.26057984850797355,
 'acidic': 0.05205278804143554,
 'helix': 2.768943582561357e-08,
 'turn': 2.5991190933300425e-06,
 'sheet': 4.329771481105767e-06}

In [44]:
# Creating a dictionary with the significant features their p-values

dict_significant_feat = {}
for feature, p_value in dict_feat_p.items():
    if p_value < 0.05:
        dict_significant_feat[feature] = p_value
dict_significant_feat

{'gravy': 0.0019459942180807517,
 'tiny': 0.012607265122709778,
 'aliphatic': 2.2651361264259894e-15,
 'helix': 2.768943582561357e-08,
 'turn': 2.5991190933300425e-06,
 'sheet': 4.329771481105767e-06}

In [45]:
# Sort features in order of significance
sign_feat = pd.DataFrame(dict_significant_feat, index = [0]).sort_values(
    by = 0, axis = 1, ascending = False).transpose().to_dict()
dict_significant_features = sign_feat[0]
dict_significant_features

{'tiny': 0.012607265122709778,
 'gravy': 0.0019459942180807517,
 'sheet': 4.329771481105767e-06,
 'turn': 2.5991190933300425e-06,
 'helix': 2.768943582561357e-08,
 'aliphatic': 2.2651361264259894e-15}

In [46]:
# Creating a list of the significant features in order of significance
lst_signif_features = list(dict_significant_features)
lst_signif_features

['tiny', 'gravy', 'sheet', 'turn', 'helix', 'aliphatic']

In [47]:
# Here we are creating a dictionary with the features and their p-values
# and scores from the -log10(p-value)
dict_feat_p_value_score_log = {} 
for feature in lst_signif_features:
    dict_feat_p_value_score_log[feature] = {}
    dict_feat_p_value_score_log[feature]['p-value'] = dict_significant_features[feature]
    dict_feat_p_value_score_log[feature]['score'] = -math.log10(dict_feat_p_value_score_log[feature]['p-value'])
dict_feat_p_value_score_log

{'tiny': {'p-value': 0.012607265122709778, 'score': 1.8993791141354173},
 'gravy': {'p-value': 0.0019459942180807517, 'score': 2.710858454437325},
 'sheet': {'p-value': 4.329771481105767e-06, 'score': 5.363535024458117},
 'turn': {'p-value': 2.5991190933300425e-06, 'score': 5.585173820386985},
 'helix': {'p-value': 2.768943582561357e-08, 'score': 7.557685892947779},
 'aliphatic': {'p-value': 2.2651361264259894e-15, 'score': 14.644905693351284}}

# CLUMPs voting

In [48]:
# Here we are creating a dictionary with the features and their p-values
# and scores from the -log10(p-value)
dict_feat_p_value_score_log

{'tiny': {'p-value': 0.012607265122709778, 'score': 1.8993791141354173},
 'gravy': {'p-value': 0.0019459942180807517, 'score': 2.710858454437325},
 'sheet': {'p-value': 4.329771481105767e-06, 'score': 5.363535024458117},
 'turn': {'p-value': 2.5991190933300425e-06, 'score': 5.585173820386985},
 'helix': {'p-value': 2.768943582561357e-08, 'score': 7.557685892947779},
 'aliphatic': {'p-value': 2.2651361264259894e-15, 'score': 14.644905693351284}}

In [49]:
# Creating a list with the positive dataset means of the significant 
# features
dict_pos_means = dict(pos_dset_feat.loc[:, lst_signif_features].mean())
dict_pos_means

{'tiny': 0.2834034891909498,
 'gravy': -0.4896251650801114,
 'sheet': 0.23068133309964053,
 'turn': 0.25494566616703496,
 'helix': 0.27613178308669706,
 'aliphatic': 0.24658259690574177}

In [50]:
# Creating a list with the negaive dataset means of the significant 
# features
dict_neg_means = dict(neg_dset_feat.loc[:, lst_signif_features].mean())
dict_neg_means

{'tiny': 0.26091760814193554,
 'gravy': -0.3444228952769298,
 'sheet': 0.25155914031139265,
 'turn': 0.22734984235805394,
 'helix': 0.30783187617867197,
 'aliphatic': 0.2829720294472124}

In [51]:
# Here we are creating a dictionary with the features and their means
# for the positive and the negative datasets
dict_pos_neg_means = {} 
for feature in lst_signif_features:
    dict_pos_neg_means[feature] = {}
    dict_pos_neg_means[feature]['pos_mean'] = dict_pos_means[feature]
    dict_pos_neg_means[feature]['neg_mean'] = dict_neg_means[feature]
dict_pos_neg_means

{'tiny': {'pos_mean': 0.2834034891909498, 'neg_mean': 0.26091760814193554},
 'gravy': {'pos_mean': -0.4896251650801114, 'neg_mean': -0.3444228952769298},
 'sheet': {'pos_mean': 0.23068133309964053, 'neg_mean': 0.25155914031139265},
 'turn': {'pos_mean': 0.25494566616703496, 'neg_mean': 0.22734984235805394},
 'helix': {'pos_mean': 0.27613178308669706, 'neg_mean': 0.30783187617867197},
 'aliphatic': {'pos_mean': 0.24658259690574177,
  'neg_mean': 0.2829720294472124}}

## clusters data

In [56]:
lst_signif_features = list(dict_significant_features)
lst_signif_features.insert(0, 'CLUMP')
df_all_motifs_signif_features= df_all_motifs_all_features.loc[
    :, lst_signif_features]
lst_signif_features.pop(0)
lst_signif_features

['tiny', 'gravy', 'sheet', 'turn', 'helix', 'aliphatic']

In [57]:
df_clusters_means = df_all_motifs_signif_features.groupby(
    'CLUMP').mean().reset_index()
df_clusters_means

Unnamed: 0,CLUMP,tiny,gravy,sheet,turn,helix,aliphatic
0,0,0.314286,-0.73619,0.02381,0.264286,0.425,0.065476
1,1,0.270455,-1.375,0.009091,0.645455,0.179545,0.040909
2,2,0.166667,-1.737745,0.132353,0.215686,0.176471,0.132353
3,3,0.352778,-1.231111,0.494444,0.212037,0.083333,0.259259
4,4,0.37197,1.054924,0.376515,0.188636,0.37803,0.525758
5,5,0.788636,0.345682,0.027273,0.568182,0.084091,0.054545
6,6,0.1,-3.238333,0.411667,0.08,0.02,0.02
7,7,0.117857,2.289643,0.392857,0.135714,0.8,0.510714
8,8,0.112667,-3.0348,0.174667,0.056,0.02,0.02
9,9,0.541667,-0.742262,0.077381,0.297619,0.059524,0.119048


In [58]:
dict_feat_scores = {}
higher_scores = np.arange(1, len(df_clusters_means)+1)
for feature in lst_signif_features:
    df_clu_feature = pd.DataFrame(df_clusters_means.loc[:, feature])
    lst_scores_feature = []
    lst_higher_scores_feature = []
    if dict_pos_neg_means[feature][
        'pos_mean'] - dict_pos_neg_means[feature]['neg_mean'] > 0:
        df_clu_feature = df_clu_feature.sort_values(ascending = True, by = feature)
        for i in range(len(df_clu_feature)):
            if float(df_clu_feature.iloc[i]) > dict_pos_neg_means[
                feature]['pos_mean']:
                feat_higher_score = i+1
                lst_higher_scores_feature.append(feat_higher_score)
                new_list_higher_scores_features = list(np.arange(1, len(lst_higher_scores_feature)+1))
                new_list_higher_scores_features
            else:
                feat_score = 0
                lst_scores_feature.append(feat_score)
        lst_intermediate_scores = lst_scores_feature + new_list_higher_scores_features
    else:
        df_clu_feature = df_clu_feature.sort_values(ascending = False, by = feature)
        for i in range(len(df_clu_feature)):
            if float(df_clu_feature.iloc[i]) < dict_pos_neg_means[
                feature]['pos_mean']:
                feat_score = i+1
                lst_higher_scores_feature.append(feat_higher_score)
                new_list_higher_scores_features = list(np.arange(1, len(lst_higher_scores_feature)+1))
                new_list_higher_scores_features
            else:
                feat_score = 0
                lst_scores_feature.append(feat_score)
        lst_intermediate_scores = lst_scores_feature + new_list_higher_scores_features
    dict_feat_scores[feature] = lst_intermediate_scores

In [59]:
dict_feat_scores

{'tiny': [0, 0, 0, 0, 0, 1, 2, 3, 4, 5],
 'gravy': [0, 0, 0, 1, 2, 3, 4, 5, 6, 7],
 'sheet': [0, 0, 0, 0, 1, 2, 3, 4, 5, 6],
 'turn': [0, 0, 0, 0, 0, 0, 1, 2, 3, 4],
 'helix': [0, 0, 0, 1, 2, 3, 4, 5, 6, 7],
 'aliphatic': [0, 0, 0, 1, 2, 3, 4, 5, 6, 7]}

In [60]:
for feature in lst_signif_features:
    df_clu_feature = pd.DataFrame(df_clusters_means.loc[:, feature])
    if dict_pos_neg_means[feature][
        'pos_mean'] - dict_pos_neg_means[feature]['neg_mean'] > 0:
        df_clu_feature = df_clu_feature.sort_values(ascending = True, by = feature)
        df_clu_feature['score_'+feature] = dict_feat_scores[feature]
        df_clu_feature = df_clu_feature.sort_index()
        print(df_clu_feature)
    else : 
        df_clu_feature = df_clu_feature.sort_values(ascending = False, by = feature)
        df_clu_feature['score_'+feature] = dict_feat_scores[feature]
        df_clu_feature = df_clu_feature.sort_index()
        print(df_clu_feature)
        

       tiny  score_tiny
0  0.314286           1
1  0.270455           0
2  0.166667           0
3  0.352778           2
4  0.371970           3
5  0.788636           5
6  0.100000           0
7  0.117857           0
8  0.112667           0
9  0.541667           4
      gravy  score_gravy
0 -0.736190            1
1 -1.375000            4
2 -1.737745            5
3 -1.231111            3
4  1.054924            0
5  0.345682            0
6 -3.238333            7
7  2.289643            0
8 -3.034800            6
9 -0.742262            2
      sheet  score_sheet
0  0.023810            5
1  0.009091            6
2  0.132353            2
3  0.494444            0
4  0.376515            0
5  0.027273            4
6  0.411667            0
7  0.392857            0
8  0.174667            1
9  0.077381            3
       turn  score_turn
0  0.264286           1
1  0.645455           4
2  0.215686           0
3  0.212037           0
4  0.188636           0
5  0.568182           3
6  0.080000       

In [61]:
lst_final_scores = []
for feature in lst_signif_features:
    df_clu_feature = pd.DataFrame(df_clusters_means.loc[:, feature])
    if dict_pos_neg_means[feature][
        'pos_mean'] - dict_pos_neg_means[feature]['neg_mean'] > 0:
        df_clu_feature = df_clu_feature.sort_values(ascending = True, by = feature)
        df_clu_feature['score_'+feature] = dict_feat_scores[feature]
        df_clu_feature = df_clu_feature.sort_index()
        df_clu_feature = pd.DataFrame(df_clu_feature.iloc[:, 1])
        df_clu_feature = df_clu_feature.rename(columns = {'score_'+feature : feature})
        lst_clu_feature = list(df_clu_feature[feature])
        print(lst_clu_feature)
    else : 
        df_clu_feature = df_clu_feature.sort_values(ascending = False, by = feature)
        df_clu_feature['score_'+feature] = dict_feat_scores[feature]
        df_clu_feature = df_clu_feature.sort_index()
        df_clu_feature = pd.DataFrame(df_clu_feature.iloc[:, 1])
        df_clu_feature = df_clu_feature.rename(columns = {'score_'+feature : feature})
        lst_clu_feature = list(df_clu_feature[feature])
        print(lst_clu_feature)
    lst_final_scores.append(lst_clu_feature)
print('------------------------------')
print(lst_final_scores)
final_scores = pd.DataFrame(lst_final_scores).transpose()
final_scores.columns = lst_signif_features
final_scores= final_scores.transpose()
final_scores

[1, 0, 0, 2, 3, 5, 0, 0, 0, 4]
[1, 4, 5, 3, 0, 0, 7, 0, 6, 2]
[5, 6, 2, 0, 0, 4, 0, 0, 1, 3]
[1, 4, 0, 0, 0, 3, 0, 0, 0, 2]
[0, 1, 2, 4, 0, 3, 6, 0, 7, 5]
[3, 5, 1, 0, 0, 4, 6, 0, 7, 2]
------------------------------
[[1, 0, 0, 2, 3, 5, 0, 0, 0, 4], [1, 4, 5, 3, 0, 0, 7, 0, 6, 2], [5, 6, 2, 0, 0, 4, 0, 0, 1, 3], [1, 4, 0, 0, 0, 3, 0, 0, 0, 2], [0, 1, 2, 4, 0, 3, 6, 0, 7, 5], [3, 5, 1, 0, 0, 4, 6, 0, 7, 2]]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
tiny,1,0,0,2,3,5,0,0,0,4
gravy,1,4,5,3,0,0,7,0,6,2
sheet,5,6,2,0,0,4,0,0,1,3
turn,1,4,0,0,0,3,0,0,0,2
helix,0,1,2,4,0,3,6,0,7,5
aliphatic,3,5,1,0,0,4,6,0,7,2


# Final results

In [62]:
dict_feat_p_value_score_log

{'tiny': {'p-value': 0.012607265122709778, 'score': 1.8993791141354173},
 'gravy': {'p-value': 0.0019459942180807517, 'score': 2.710858454437325},
 'sheet': {'p-value': 4.329771481105767e-06, 'score': 5.363535024458117},
 'turn': {'p-value': 2.5991190933300425e-06, 'score': 5.585173820386985},
 'helix': {'p-value': 2.768943582561357e-08, 'score': 7.557685892947779},
 'aliphatic': {'p-value': 2.2651361264259894e-15, 'score': 14.644905693351284}}

In [63]:
final_scores

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
tiny,1,0,0,2,3,5,0,0,0,4
gravy,1,4,5,3,0,0,7,0,6,2
sheet,5,6,2,0,0,4,0,0,1,3
turn,1,4,0,0,0,3,0,0,0,2
helix,0,1,2,4,0,3,6,0,7,5
aliphatic,3,5,1,0,0,4,6,0,7,2


In [64]:
feat_scores = []
for feature in lst_signif_features:
    feat_scores.append(dict_feat_p_value_score_log[feature]['score'])
feat_scores

[1.8993791141354173,
 2.710858454437325,
 5.363535024458117,
 5.585173820386985,
 7.557685892947779,
 14.644905693351284]

In [65]:
df_clusters_weights_final = final_scores.mul(feat_scores, axis = 0)
df_clusters_weights_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
tiny,1.899379,0.0,0.0,3.798758,5.698137,9.496896,0.0,0.0,0.0,7.597516
gravy,2.710858,10.843434,13.554292,8.132575,0.0,0.0,18.976009,0.0,16.265151,5.421717
sheet,26.817675,32.18121,10.72707,0.0,0.0,21.45414,0.0,0.0,5.363535,16.090605
turn,5.585174,22.340695,0.0,0.0,0.0,16.755521,0.0,0.0,0.0,11.170348
helix,0.0,7.557686,15.115372,30.230744,0.0,22.673058,45.346115,0.0,52.903801,37.788429
aliphatic,43.934717,73.224528,14.644906,0.0,0.0,58.579623,87.869434,0.0,102.51434,29.289811


In [66]:
monster_score = pd.DataFrame(df_clusters_weights_final.sum()).rename(
    columns = {0 : 'monster_score'}).sort_values(by = 'monster_score', ascending = False)
monster_score

Unnamed: 0,monster_score
8,177.046827
6,152.191559
1,146.147554
5,128.959238
9,107.358427
0,80.947804
2,54.04164
3,42.162077
4,5.698137
7,0.0


In [67]:
monster_score= (monster_score - monster_score.min()) / (
    monster_score.max() - monster_score.min())
monster_score

Unnamed: 0,monster_score
8,1.0
6,0.859612
1,0.825474
5,0.728391
9,0.606384
0,0.457211
2,0.305239
3,0.238141
4,0.032184
7,0.0


In [69]:
def test_ranking(test_dataframe, ranking_col_name):
    ranking = np.arange(1, len(test_dataframe)+1)
    test_dataframe[ranking_col_name] = ranking
    df_test = test_dataframe.sort_index()
    #df_test = pd.DataFrame(df_test.loc[:, ranking_col_name])
    return df_test

In [68]:
monster_score_jacc1 = pd.concat([pd.DataFrame(B3.loc[
    :, 'monster_score']), pd.DataFrame(df_jaccard_index.loc[:, [
    'jaccard_1', 'jaccard_norm_1']])], axis = 1)
monster_score_jacc1 

NameError: name 'B3' is not defined