In [1]:
import pandas as pd
import scipy
from scipy import stats
from scipy.stats import mannwhitneyu
import math
import numpy as np
import re
from monster.import_data import *

# Importing data

In [2]:
### importing data
# All paths below are relative to the notebook's working directory.

# motifs
# import_list_motifs is provided by the star import of monster.import_data.
lst_motifs = import_list_motifs('../data/lst_motifs')
# NOTE(review): these files have a .tsv extension but are read with
# read_csv's default (comma) separator -- confirm they are comma-delimited.
df_motifs_features = pd.read_csv('df_motifs_features.tsv')
df_clusters = pd.read_csv('df_motifs_CLUMPs_standard_scaling.tsv')

# positive dataset: fasta sequences (as a dict) and per-sequence features
seqs_path_pos = '../data/datasets/minc_nr_positive_dataset.fasta'
pos_dict = import_fasta_sequences_as_dict(seqs_path_pos)
pos_dset_feat = pd.read_csv('df_pos_features.tsv')

# negative dataset: fasta sequences (as a dict) and per-sequence features
seqs_path_neg = '../data/datasets/minc_nr_negative_dataset.fasta'
neg_dict = import_fasta_sequences_as_dict(seqs_path_neg)
neg_dset_feat = pd.read_csv('df_neg_features.tsv')

# Formatting data

In [3]:
def format_input_data(df_motifs_features, df_clusters,
    pos_dset_feat, neg_dset_feat):
    """format_input_data
       -----------------
       This function formats input data in the correct
       way for it to work with the following functions.

       The 'id' column of df_motifs_features is renamed to 'motif' and
       the 'id' column of the two dataset dataframes is dropped.
       These changes are applied IN PLACE on the dataframes received,
       which are also the objects returned.
       The function is safe to run more than once on the same
       dataframes: an 'id' column that was already renamed or dropped
       is simply ignored.

       Arguments:
       df_motifs_features -- pandas dataframe with data of
                             feature values of the motifs, but no
                             information about the CLUMPs
       df_clusters -- pandas dataframe of the motif and corresponding CLUMP
       pos_dset_feat -- pandas dataframe with data of feature values
                        of the positive dataset
       neg_dset_feat -- pandas dataframe with data of feature values
                        of the negative dataset

       Output:
       df_all_motifs_all_features -- pandas dataframe with data of
                                     feature values of the motifs
       pos_dset_feat -- pandas dataframe with data of feature values
                        of the positive dataset
       neg_dset_feat -- pandas dataframe with data of feature values
                        of the negative dataset
    """
    # Renaming a column that is absent is a no-op, so this is idempotent.
    df_motifs_features.rename(columns={'id': 'motif'}, inplace=True)

    # No explicit key is given: pandas merges on every column name
    # shared by the two dataframes.
    df_all_motifs_all_features = df_clusters.merge(df_motifs_features)

    # errors='ignore' keeps a second run of the calling cell from
    # failing with a KeyError once 'id' has already been dropped.
    for df_dataset in (pos_dset_feat, neg_dset_feat):
        df_dataset.drop(columns='id', inplace=True, errors='ignore')

    return df_all_motifs_all_features, pos_dset_feat, neg_dset_feat

In [4]:
# Build the merged motif/CLUMP feature table and rebind the two dataset
# frames to the ones returned by format_input_data (which modifies its
# inputs in place).
df_all_motifs_all_features, pos_dset_feat, neg_dset_feat = format_input_data(df_motifs_features, df_clusters, pos_dset_feat, neg_dset_feat)

# occ_each_mot_in_each_seq

In [5]:
def start_end_position(lst_motifs, dict_seqs, dataset):
    """start_end_position
       ------------------
       This function calculates the start and end position 
       of the motifs in the sequences.

       Each motif is used as a regular expression pattern and searched
       with finditer, hence only non-overlapping matches are reported.

       Arguments:
       lst_motifs -- list of motifs
       dict_seqs -- dictionary of fasta sequences where the key is the
                    id and the value is the sequence
       dataset -- 'positive' or 'negative'

       Output:
       df_start_end_position -- pandas dataframe where:
                                first column is the motif
                                second column is the sequence id
                                third column is the start position
                                fourth column is the end position
                                fifth column is the dataset
                                The columns are present even when no
                                motif is found in any sequence.
    """
    columns = ['motif', 'seq_id', 'start', 'end']
    lst_dict = []

    for motif in lst_motifs:
        # Compile once per motif instead of once per sequence.
        pattern = re.compile(motif)
        for seq_id, record in dict_seqs.items():
            for match in pattern.finditer(record):
                lst_dict.append({'motif': motif, 'seq_id': seq_id,
                                 'start': match.start(), 'end': match.end()})

    # Passing the column names explicitly keeps the schema stable when
    # lst_dict is empty, so later groupby(['motif', 'seq_id']) calls
    # still find their key columns.
    df_start_end_position = pd.DataFrame(lst_dict, columns=columns)
    df_start_end_position['dataset'] = dataset

    return df_start_end_position

In [6]:
def occ_each_mot_in_each_seq(df_start_end_position, dataset):
    """occ_each_mot_in_each_seq
       ------------------------
       This function calculates the occurrences
       of each motif in each sequence.

       Arguments: 
       df_start_end_position -- pandas dataframe where:
                                first column is the motif
                                second column is the sequence id
                                third column is the start position
                                fourth column is the end position
       dataset -- 'positive' or 'negative'

       Output:
       df_occ_seq -- pandas dataframe where: 
                     first column is the motif
                     second column is the sequence id 
                     third column is the number of occurrences
                     fourth column is the dataset
    """
    # One row per (motif, seq_id) pair; 'occ' is the number of matches
    # recorded for that pair.
    df_occ_seq = df_start_end_position.groupby(
        ['motif', 'seq_id']).size().reset_index(name='occ')
    df_occ_seq['dataset'] = dataset

    # The groupby already yields unique (motif, seq_id) rows, so the
    # dataframe is always returned (the previous duplicate check could
    # only ever fall through to an implicit None).
    return df_occ_seq

In [7]:
def df_occs_mots_CLUMPs_both_dsets(df_start_end_position_pos, 
                                   df_start_end_position_neg, df_clusters):
    """df_occs_mots_CLUMPs_both_dsets
       ------------------------------
       This function counts the occurrences of each motif in each
       sequence for the positive and the negative dataset, attaches
       the CLUMP of each motif and stacks the two results.

       Note: an 'id' column of df_clusters, if present, is renamed
       to 'motif' in place.

       Arguments:
       df_start_end_position_pos -- pandas dataframe (positive dataset)
                                    with the columns: motif, sequence id,
                                    start position, end position
       df_start_end_position_neg -- pandas dataframe (negative dataset)
                                    with the columns: motif, sequence id,
                                    start position, end position
       df_clusters -- pandas dataframe of the motif and corresponding CLUMP

       Output:
       df_general -- pandas dataframe with: motif, CLUMP, seq_id, occ, dataset
                     (positive rows first, then negative rows; the row
                     index of each part is kept as it is)
    """
    labelled_positions = (
        (df_start_end_position_pos, "positive"),
        (df_start_end_position_neg, "negative"),
    )
    # Occurrences per (motif, sequence) for each dataset
    lst_occ_per_dataset = [
        occ_each_mot_in_each_seq(df_positions, label)
        for df_positions, label in labelled_positions
    ]

    df_clusters.rename(columns={'id': 'motif'}, inplace=True)

    # Attach the CLUMP information to each dataset and stack them
    lst_with_clumps = [
        pd.merge(df_clusters, df_occ, on='motif')
        for df_occ in lst_occ_per_dataset
    ]
    df_general = pd.concat(lst_with_clumps)

    return df_general

In [8]:
def find_occurrences_of_mots_in_datasets(df_clusters, pos_dict, neg_dict):
    """find_occurrences_of_mots_in_datasets
       ------------------------------------
       This function locates the motifs listed in df_clusters in the
       sequences of the positive and of the negative dataset (start
       and end position of every match) and summarises the result
       in a single pandas dataframe.

       Arguments:
       df_clusters -- pandas dataframe of the motif and corresponding CLUMP
       pos_dict -- dictionary of fasta sequences where the key is the
                   id and the value is the sequence of positive dataset
       neg_dict -- dictionary of fasta sequences where the key is the
                   id and the value is the sequence of negative dataset

       Output:
       df_general -- pandas dataframe with: motif, CLUMP, seq_id, occ, dataset
    """
    motifs_to_search = df_clusters.motif.tolist()

    positions_pos = start_end_position(motifs_to_search, pos_dict, 'positive')
    positions_neg = start_end_position(motifs_to_search, neg_dict, 'negative')

    return df_occs_mots_CLUMPs_both_dsets(
        positions_pos, positions_neg, df_clusters)

# non_redundant_motifs

In [9]:
def find_extended_motifs(lst_motifs):
    """find_extended_motifs
       --------------------
       This function identifies extended motifs in CLUMPs.
       A motif is an extension of another motif (the root) when it
       contains the root as a substring and differs from it.
       The motifs are examined from the shortest to the longest, and a
       motif already assigned as root or as extension is not used as a
       root again.
       The sorted list of motifs is printed.

       Arguments:
       lst_motifs -- list of motifs

       Output: 
       lst_all_extended_motifs -- list of lists. Each inner list starts
                                  with a root motif and is followed by
                                  all its extended motifs. Motifs
                                  without any extension do not appear.
    """
    lst_motifs = sorted(lst_motifs, key=len)
    print(lst_motifs)

    # A set gives constant-time "already seen" checks.
    known_motifs = set()
    lst_all_extended_motifs = []
    for motif in lst_motifs:
        if motif in known_motifs:
            continue
        lst_extended_motifs = [
            m for m in lst_motifs if motif in m and motif != m]
        known_motifs.add(motif)
        known_motifs.update(lst_extended_motifs)
        if lst_extended_motifs:
            lst_all_extended_motifs.append([motif] + lst_extended_motifs)

    return lst_all_extended_motifs

In [10]:
def non_redundant_motifs(df_clusters, df_general):
    """non_redundant_motifs
       --------------------
       This function identifies non redundant extended motifs.
       Non redundant motifs (e.g. root motifs and non-extended motifs) 
       are stored in a list called lst_motifs_mask.
       For each CLUMP a short report (root, extended and non-extended
       motifs) is printed.

       Arguments:
       df_clusters -- pandas dataframe of the motif and corresponding CLUMP.
       df_general -- pandas dataframe with: motif, CLUMP, seq_id, occ, dataset

       Output:
       lst_motifs_mask -- list of non redundant motifs.
       df_general_non_redundant -- pandas dataframe where the occurrences
       belong only to the non redundant motifs

    """
    lst_motifs_mask = []
    for c in df_clusters.CLUMP.unique():
        lst_motifs = df_clusters.loc[
            df_clusters.CLUMP==c, 'motif'].unique()
        print('--------------------------------------')
        print(f"CLUMP {c} :")
        lst_ext_motifs = find_extended_motifs(lst_motifs)
        # Every motif that is part of a root/extension group
        set_all_ext_motifs = {j for i in lst_ext_motifs for j in i}
        lst_non_ext_motifs = [
            m for m in lst_motifs if m not in set_all_ext_motifs]
        # The first element of each group is its root motif
        lst_root_motifs = [el[0] for el in lst_ext_motifs]
        lst_motifs_mask += lst_root_motifs+lst_non_ext_motifs
        print('Root-motifs', lst_root_motifs)
        print('Extended-motifs', lst_ext_motifs)
        print('Non-extended motifs', lst_non_ext_motifs)
        print('\n')

    # Selecting the rows of the df_general where the occurrences
    # belong only to the non redundant motifs.
    df_general_non_redundant = df_general[df_general.motif.isin(
        lst_motifs_mask)]

    return lst_motifs_mask, df_general_non_redundant

In [11]:
def count_all_occurrences_mots_of_CLUMPs(df_general_non_redundant):
    """count_all_occurrences_mots_of_CLUMPs
       ------------------------------------
       This function sums, for each CLUMP and each dataset, the
       occurrences ('occ') of its motifs. Multiple occurrences of a
       motif in the same sequence are all counted.

       Arguments:
       df_general_non_redundant -- pandas dataframe where the occurrences
       belong only to the non redundant motifs.

       Output:
       motif_counts -- pandas dataframe with one row per CLUMP, a
                       'CLUMP' column and one column per dataset
                       holding the total number of occurrences.

    """
    occ_per_clump_dataset = (
        df_general_non_redundant
        .groupby(['CLUMP', 'dataset'])['occ']
        .sum()
        .reset_index()
    )
    # Long to wide: one column per dataset, then CLUMP back as a column
    wide_counts = occ_per_clump_dataset.pivot(
        index='CLUMP', columns='dataset', values='occ')
    wide_counts = wide_counts.rename_axis(None, axis=1).reset_index()

    return wide_counts

In [12]:
def count_nb_seqs_containing_mots_of_CLUMPs(df_general_non_redundant):
    """count_nb_seqs_containing_mots_of_CLUMPs
       ---------------------------------------
       This function counts, for each CLUMP and each dataset, the
       number of distinct sequences that contain at least one motif
       of the CLUMP. A sequence is counted once, no matter how many
       motifs of the CLUMP it contains or how often they occur.

       Arguments:
       df_general_non_redundant -- pandas dataframe where the occurrences
       belong only to the non redundant motifs.

       Output:
       df_cnt_seq_per_cluster -- pandas dataframe with one row per
                                 CLUMP, a 'CLUMP' column and one column
                                 per dataset holding the number of
                                 sequences found by the CLUMP.

    """
    # One row per distinct (CLUMP, sequence, dataset) combination;
    # the group size itself is not needed.
    distinct_hits = (
        df_general_non_redundant
        .groupby(['CLUMP', 'seq_id', 'dataset'])
        .size()
        .reset_index(name='temporary')
        .drop(columns='temporary')
    )

    # Number of distinct sequences per (CLUMP, dataset)
    seqs_per_clump = (
        distinct_hits
        .groupby(['CLUMP', 'dataset'])
        .size()
        .reset_index(name='nb_seqs')
    )

    # Long to wide: one column per dataset, then CLUMP back as a column
    wide_counts = seqs_per_clump.pivot(
        index='CLUMP', columns='dataset', values='nb_seqs')
    wide_counts = wide_counts.rename_axis(None, axis=1).reset_index()

    return wide_counts

In [13]:
def find_occ_and_nb_seqs(df_clusters, df_general):
    """find_occ_and_nb_seqs
       --------------------
       This function finds the non redundant motifs from the 
       original df_clusters.
       Then finds the subset of df_general where only the 
       non redundant motifs are considered.
       Finally calculates the occurrences of the motifs of the CLUMPs 
       and the number of sequences found by the CLUMP. 
       (for more information about the last two outputs, read doc of
       following two functions:
       count_all_occurrences_mots_of_CLUMPs()  
       count_nb_seqs_containing_mots_of_CLUMPs()).

       Arguments:
       df_clusters -- pandas dataframe of the motif and corresponding CLUMP.
       df_general -- pandas dataframe with: motif, CLUMP, seq_id, occ, dataset.

       Output (four values, in this order):
       lst_motifs_mask -- list of non redundant motifs.
       df_general_non_redundant -- pandas dataframe where the occurrences
                                   belong only to the non redundant
                                   motifs.
       motif_counts -- pandas dataframe with the number of
                       occurrences of the CLUMP in the two 
                       datasets.
       df_cnt_seq_per_cluster -- pandas dataframe with the number of
                                 sequences found by the CLUMP in the
                                 two datasets.
    """
    lst_motifs_mask, df_general_non_redundant = non_redundant_motifs(
        df_clusters, df_general)
    motif_counts = count_all_occurrences_mots_of_CLUMPs(
        df_general_non_redundant)
    df_cnt_seq_per_cluster = count_nb_seqs_containing_mots_of_CLUMPs(
        df_general_non_redundant)

    return (lst_motifs_mask, df_general_non_redundant,
            motif_counts, df_cnt_seq_per_cluster)

# Test last 2 functions

In [14]:
# Occurrences of every motif of df_clusters in each sequence of the
# positive and negative datasets, together with the motif's CLUMP.
df_general = find_occurrences_of_mots_in_datasets(
    df_clusters, pos_dict, neg_dict)
df_general

Unnamed: 0,motif,CLUMP,seq_id,occ,dataset
0,GHWT,0,Minc3s00008g00574,1,positive
1,GHWT,0,Minc3s00736g16684,1,positive
2,GHWT,0,Minc3s01051g20218,1,positive
3,GHWT,0,Minc3s01143g21148,1,positive
4,GHWT,0,Minc3s01152g21216,1,positive
...,...,...,...,...,...
682,KCS,10,Minc3s01070g20414,1,negative
683,KCS,10,Minc3s01536g24546,1,negative
684,KCS,10,Minc3s02273g29198,1,negative
685,KCS,10,Minc3s03208g33140,1,negative


In [15]:
# find_occ_and_nb_seqs returns four values: the list of non redundant
# motifs, the subset of df_general restricted to them, the occurrence
# counts per CLUMP and the sequence counts per CLUMP.
(lst_motifs_mask, df_general_non_redundant,
 motifs_counts, df_cnt_seq_per_cluster) = find_occ_and_nb_seqs(
    df_clusters, df_general)
lst_motifs_mask

--------------------------------------
CLUMP 0 :
['HWT', 'WWS', 'WNT', 'CQY', 'FSL', 'NVY', 'WNS', 'HWF', 'GHWT', 'HWTQ', 'YSHS', 'FSVF', 'FTNS', 'GHWTQ']
Root-motifs ['HWT']
Extended-motifs [['HWT', 'GHWT', 'HWTQ', 'GHWTQ']]
Non-extended motifs ['WWS', 'WNT', 'YSHS', 'CQY', 'FSVF', 'FSL', 'NVY', 'WNS', 'FTNS', 'HWF']


--------------------------------------
CLUMP 1 :
['PGNV', 'PTHP', 'YPSG', 'PKPF', 'PTPK', 'PKPY', 'PSPK', 'PKPN', 'PPPK', 'KPPG', 'RGIG', 'FPSP', 'KYPN', 'NGQP', 'PYPGQ', 'PYPSG', 'GYPSG', 'RYPSG', 'LYPSG', 'YYPGG', 'VYPSG', 'PYQSG']
Root-motifs ['YPSG']
Extended-motifs [['YPSG', 'PYPSG', 'GYPSG', 'RYPSG', 'LYPSG', 'VYPSG']]
Non-extended motifs ['PGNV', 'PTHP', 'PYPGQ', 'PKPF', 'PTPK', 'YYPGG', 'PKPY', 'PSPK', 'PYQSG', 'PKPN', 'PPPK', 'KPPG', 'RGIG', 'FPSP', 'KYPN', 'NGQP']


--------------------------------------
CLUMP 2 :
['KHP', 'HGD', 'PKPK', 'PKYK', 'WKPK', 'KYKS', 'KQAQ', 'KTKL', 'PKAK', 'QEAF', 'AYKN', 'KMKG', 'FKAK', 'IKNN', 'KKIS', 'MDKF', 'VKSY']
Root-motifs [

['HWT',
 'WWS',
 'WNT',
 'YSHS',
 'CQY',
 'FSVF',
 'FSL',
 'NVY',
 'WNS',
 'FTNS',
 'HWF',
 'YPSG',
 'PGNV',
 'PTHP',
 'PYPGQ',
 'PKPF',
 'PTPK',
 'YYPGG',
 'PKPY',
 'PSPK',
 'PYQSG',
 'PKPN',
 'PPPK',
 'KPPG',
 'RGIG',
 'FPSP',
 'KYPN',
 'NGQP',
 'PKPK',
 'PKYK',
 'WKPK',
 'KHP',
 'KYKS',
 'KQAQ',
 'KTKL',
 'PKAK',
 'QEAF',
 'HGD',
 'AYKN',
 'KMKG',
 'FKAK',
 'IKNN',
 'KKIS',
 'MDKF',
 'VKSY',
 'ADAE',
 'CGEGE',
 'DAP',
 'ACGE',
 'ATEN',
 'DQAG',
 'KDLE',
 'MGVE',
 'QEKL',
 'SNEE',
 'AEG',
 'AEAD',
 'AETD',
 'DENL',
 'NEAK',
 'NEPL',
 'TEAK',
 'VDEG',
 'TQLA',
 'NLVG',
 'AYAR',
 'DIAK',
 'NLTL',
 'QLAK',
 'AAIE',
 'ILDN',
 'RNLL',
 'MIE',
 'IANG',
 'ILAN',
 'QVAS',
 'TILG',
 'TML',
 'VEAV',
 'VHAA',
 'CGGG',
 'CGIGG',
 'CGCCG',
 'CGIGR',
 'CGNCG',
 'CGSGG',
 'CCCGG',
 'CGDGF',
 'CNICG',
 'CCCGF',
 'CNGGG',
 'CGCCA',
 'CGDSG',
 'CQNGG',
 'CGKAG',
 'CCSGY',
 'PNPGG',
 'PSPGG',
 'FGGG',
 'EEER',
 'KEEKK',
 'EEEKK',
 'KKEKK',
 'KNEKK',
 'KEERK',
 'KEEIK',
 'KEETK',
 'GEEKK',
 'KEEKG',
 'E

In [16]:
# Total occurrences of the motifs of each CLUMP, per dataset.
motifs_counts

Unnamed: 0,CLUMP,negative,positive
0,0,81,110
1,1,10,112
2,2,36,133
3,3,85,129
4,4,73,119
5,5,10,62
6,6,25,85
7,7,5,58
8,8,124,190
9,9,8,35


In [17]:
# Number of distinct sequences found by each CLUMP, per dataset.
df_cnt_seq_per_cluster

Unnamed: 0,CLUMP,negative,positive
0,0,72,54
1,1,10,50
2,2,30,58
3,3,66,57
4,4,69,53
5,5,10,40
6,6,20,28
7,7,5,44
8,8,109,76
9,9,8,23


# J1 and J2

## J1

In [18]:
def calculate_J1(motifs_counts, neg_dset_feat, pos_dset_feat):
    """calculate_J1
       ------------
       This function calculates J1, the modified Jaccard Index based
       on the occurrences of the motifs of the CLUMPs: the occurrences
       in each dataset are divided by the number of rows of that
       dataset, and J1 is the negative ratio over the positive ratio.

       The columns 'norm_negative', 'norm_positive' and
       'jaccard_norm_1' are added to motifs_counts in place.

       Arguments:
       motif_counts -- pandas dataframe with the number of
                       occurrences of the CLUMP in the two 
                       datasets.
       neg_dset_feat -- pandas dataframe with data of feature values
                        of the negative dataset (only its length is used)
       pos_dset_feat -- pandas dataframe with data of feature values
                        of the positive dataset (only its length is used)

       Output:
       J1 -- pandas dataframe with the single column 'jaccard_norm_1'
    """
    nb_neg_seqs = len(neg_dset_feat)
    nb_pos_seqs = len(pos_dset_feat)

    # Occurrences normalised by the size of each dataset
    motifs_counts['norm_negative'] = motifs_counts['negative'] / nb_neg_seqs
    motifs_counts['norm_positive'] = motifs_counts['positive'] / nb_pos_seqs

    ## J1
    motifs_counts['jaccard_norm_1'] = (
        motifs_counts['norm_negative'] / motifs_counts['norm_positive'])

    return motifs_counts['jaccard_norm_1'].to_frame()

## J2

In [19]:
def calculate_J2(df_cnt_seq_per_cluster, neg_dset_feat, pos_dset_feat):
    """calculate_J2
       ------------
       This function calculates J2, the modified Jaccard Index based
       on the number of sequences found by a CLUMP: the counts of each
       dataset are divided by the number of rows of that dataset, and
       J2 is the negative ratio over the positive ratio.

       The columns 'norm_negative', 'norm_positive' and
       'jaccard_norm_2' are added to df_cnt_seq_per_cluster in place.

       Arguments:
       df_cnt_seq_per_cluster -- pandas dataframe with the number of
                                 sequences found by the CLUMP in the
                                 two datasets.
       neg_dset_feat -- pandas dataframe with data of feature values
                        of the negative dataset (only its length is used)
       pos_dset_feat -- pandas dataframe with data of feature values
                        of the positive dataset (only its length is used)

       Output:
       J2 -- pandas dataframe with the single column 'jaccard_norm_2'
    """
    nb_neg_seqs = len(neg_dset_feat)
    nb_pos_seqs = len(pos_dset_feat)

    # Sequence counts normalised by the size of each dataset
    df_cnt_seq_per_cluster['norm_negative'] = (
        df_cnt_seq_per_cluster['negative'] / nb_neg_seqs)
    df_cnt_seq_per_cluster['norm_positive'] = (
        df_cnt_seq_per_cluster['positive'] / nb_pos_seqs)

    ## J2
    df_cnt_seq_per_cluster['jaccard_norm_2'] = (
        df_cnt_seq_per_cluster['norm_negative']
        / df_cnt_seq_per_cluster['norm_positive'])

    return df_cnt_seq_per_cluster['jaccard_norm_2'].to_frame()

## calculate_J1_and_J2

In [20]:
def calculate_J1_and_J2(
    motifs_counts, df_cnt_seq_per_cluster, neg_dset_feat, pos_dset_feat):
    """calculate_J1_and_J2
       -------------------
       This function calculates the J1 and J2. Two modified Jaccard Indexes.
       (for more information about the input of the two indexes
       please read doc of the two following functions:
       calculate_J1()
       calculate_J2())

       Arguments:
       motif_counts -- pandas dataframe with the number of
                       occurrences of the CLUMP in the two 
                       datasets.
       df_cnt_seq_per_cluster -- pandas dataframe with the number of
                                 sequences found by the CLUMP in the
                                 two datasets.
       neg_dset_feat -- pandas dataframe with data of feature values
                        of the negative dataset.
       pos_dset_feat -- pandas dataframe with data of feature values
                        of the positive dataset.

       Output:
       df_jaccard_index -- pandas dataframe with results of
                           calculation of 1-J1 and 1-J2, with the
                           CLUMP identifier as first column.
    """
    J1 = calculate_J1(motifs_counts, neg_dset_feat, pos_dset_feat)
    J2 = calculate_J2(df_cnt_seq_per_cluster, neg_dset_feat, pos_dset_feat)

    # here we are calculating the values of 1 - jaccard
    # since we want a score by maximization. With values to directly 
    # sum the ones of the CLUMP score
    df_jaccard_index = 1 - pd.concat([J1, J2], axis=1)

    # Label the rows with the real CLUMP identifiers: a plain 0..n-1
    # range would be wrong as soon as a CLUMP is missing from the counts
    # or the identifiers are not consecutive.
    if 'CLUMP' in motifs_counts.columns:
        clump_ids = motifs_counts['CLUMP'].reindex(df_jaccard_index.index)
    else:
        # Previous behaviour, kept for inputs without a CLUMP column.
        clump_ids = np.arange(0, len(df_jaccard_index))
    df_jaccard_index.insert(0, 'CLUMP', clump_ids)

    return df_jaccard_index

In [21]:
# 1 - J1 and 1 - J2 for every CLUMP. The call also adds the normalised
# and jaccard columns to motifs_counts and df_cnt_seq_per_cluster.
calculate_J1_and_J2(
    motifs_counts, df_cnt_seq_per_cluster, neg_dset_feat, pos_dset_feat)

Unnamed: 0,CLUMP,jaccard_norm_1,jaccard_norm_2
0,0,0.760496,0.56633
1,1,0.97096,0.934949
2,2,0.911962,0.831766
3,3,0.785686,0.623392
4,4,0.800475,0.576558
5,5,0.94754,0.918687
6,6,0.904337,0.767677
7,7,0.971961,0.963039
8,8,0.78773,0.533519
9,9,0.925657,0.886869


# CLUMP_score

## feature_selection

In [22]:
def feature_weight(pos_dset_feat, neg_dset_feat, feature_lst, alpha=0.05):
    """feature_weight
       --------------
       This function calculates which features are significant.
       For each feature the values of the positive and of the negative
       dataset are compared with the Mann-Whitney U test
       (scipy.stats.mannwhitneyu, called with its default options),
       which works on unpaired samples and gives a p-value.

       null hypothesis (H0) : the difference between the two datasets
       is not statistically significant. p-value >= alpha
       alternative hypothesis (H1): the difference between the two
       datasets is statistically significant. p-value < alpha

       Only the features with p-value < alpha are returned.

       The values of a feature are selected BY NAME in both dataframes,
       so the two dataframes do not need the same column order.
       If a name of feature_lst is not a column of a dataframe, the
       column at the same position of the name in feature_lst is used.

       Arguments:
       neg_dset_feat -- pandas dataframe with data of feature values
                        of the negative dataset.
       pos_dset_feat -- pandas dataframe with data of feature values
                        of the positive dataset. 
       feature_lst -- list of features
       alpha -- significance threshold (default 0.05)

       Output:
       dict_significant_features -- dictionary with significant
                                    features as the keys and 
                                    the p-values as the values,
                                    ordered from the highest to the
                                    lowest p-value
    """
    def _feature_values(df_features, feature, position):
        # Select by name when possible, by position otherwise.
        if feature in df_features.columns:
            return list(df_features[feature])
        return list(df_features.iloc[:, position])

    # p-value of each candidate feature
    dict_feat_p = {}
    for position, feature in enumerate(feature_lst):
        pos_values = _feature_values(pos_dset_feat, feature, position)
        neg_values = _feature_values(neg_dset_feat, feature, position)
        _, p_value = mannwhitneyu(pos_values, neg_values)
        dict_feat_p[feature] = float(p_value)

    # Keeping only the significant features
    dict_significant_feat = {
        feature: p_value
        for feature, p_value in dict_feat_p.items()
        if p_value < alpha
    }

    # Sort features from the highest to the lowest p-value
    # (the most significant feature comes last)
    dict_significant_features = dict(sorted(
        dict_significant_feat.items(),
        key=lambda item: item[1], reverse=True))

    return dict_significant_features

In [23]:
def feature_selection(pos_dset_feat, neg_dset_feat):
    """feature_selection
       -----------------
       This function finds the features that best distinguish the
       positive from the negative dataset (find an enrichment).
       Every column except 'seq_len' is a candidate feature; the
       significant ones are selected by feature_weight() and receive
       a score equal to -log10(p-value).
       The number of candidate and significant features is printed.

       Arguments:
       neg_dset_feat -- pandas dataframe with data of feature values
                        of the negative dataset.
       pos_dset_feat -- pandas dataframe with data of feature values
                        of the positive dataset.    

       Output (in this order):
       lst_signif_features -- list of significant features, in the
                              order returned by feature_weight()
       dict_feat_p_value_score_log -- dictionary of significant features
                                      as keys and dictionaries as values.
                                      These dictionaries contain the
                                      'p-value' and the 'score'.
    """
    # Candidate features: all the columns but the sequence length
    pos_candidates = pos_dset_feat.drop(columns=['seq_len'])
    neg_candidates = neg_dset_feat.drop(columns=['seq_len'])
    feature_lst = list(pos_candidates.columns)
    print('the number of candidate features is:', len(feature_lst))

    # Significant features with their p-values
    dict_significant_features = feature_weight(
        pos_candidates, neg_candidates, feature_lst)
    lst_signif_features = list(dict_significant_features)
    print('the number of significant features is:', len(lst_signif_features))
    print('significant features to calculate the CLUMP_score are:', 
          lst_signif_features)

    # p-value and -log10(p-value) score of each significant feature
    dict_feat_p_value_score_log = {
        feature: {'p-value': p_value, 'score': -math.log10(p_value)}
        for feature, p_value in dict_significant_features.items()
    }

    return lst_signif_features, dict_feat_p_value_score_log

In [24]:
# Significant features and, for each of them, p-value and -log10 score.
lst_signif_features, dict_feat_p_value_score_log = feature_selection(pos_dset_feat, neg_dset_feat)
lst_signif_features

the number of candidate features is: 13
the number of significant features is: 6
significant features to calculate the CLUMP_score are: ['tiny', 'gravy', 'sheet', 'turn', 'helix', 'aliphatic']


['tiny', 'gravy', 'sheet', 'turn', 'helix', 'aliphatic']

In [25]:
# p-value and score of every significant feature.
dict_feat_p_value_score_log

{'tiny': {'p-value': 0.012607265122709778, 'score': 1.8993791141354173},
 'gravy': {'p-value': 0.0019459942180807517, 'score': 2.710858454437325},
 'sheet': {'p-value': 4.329771481105767e-06, 'score': 5.363535024458117},
 'turn': {'p-value': 2.5991190933300425e-06, 'score': 5.585173820386985},
 'helix': {'p-value': 2.768943582561357e-08, 'score': 7.557685892947779},
 'aliphatic': {'p-value': 2.2651361264259894e-15, 'score': 14.644905693351284}}

## Average_calculation

In [26]:
def dataset_average_calculation(
    neg_dset_feat, pos_dset_feat, lst_signif_features):
    """dataset_average_calculation
       ---------------------------
       This function calculates, for the positive and for the negative
       dataset, the average of each significant feature.

       Arguments:
       neg_dset_feat -- pandas dataframe with data of feature values
                        of the negative dataset.
       pos_dset_feat -- pandas dataframe with data of feature values
                        of the positive dataset.
       lst_signif_features -- list of significant features

       Output:
       dict_pos_neg_means -- dictionary with the significant feature
                             as the key and a dictionary as the value,
                             with the keys 'pos_mean' and 'neg_mean'
                             holding the average of that feature in
                             the positive and negative dataset.
    """
    # Dictionaries feature -> mean, one per dataset
    pos_means = dict(pos_dset_feat.loc[:, lst_signif_features].mean())
    neg_means = dict(neg_dset_feat.loc[:, lst_signif_features].mean())

    # Pairing the two means of each significant feature
    return {
        feature: {'pos_mean': pos_means[feature],
                  'neg_mean': neg_means[feature]}
        for feature in lst_signif_features
    }

In [27]:
def CLUMPs_average_calculation(df_all_motifs_all_features, 
                               lst_signif_features):
    """CLUMPs_average_calculation
       --------------------------
       This function calculates the average of each significant
       feature for the CLUMPs.
       lst_signif_features is never modified, not even when one of
       the features is missing from the dataframe.

       Arguments:
       df_all_motifs_all_features -- pandas dataframe with data of 
                                     feature values of the motifs.
       lst_signif_features -- list of significant features

       Output:
       df_clusters_means -- pandas dataframe with data of average of the CLUMPs
                            values for significant features.

    """
    # Building a new list instead of inserting 'CLUMP' in the list
    # received: the caller's list stays intact even if the selection
    # below raises.
    selected_columns = ['CLUMP'] + list(lst_signif_features)
    df_all_motifs_signif_features = df_all_motifs_all_features.loc[
        :, selected_columns]

    # Calculating the averages
    df_clusters_means = df_all_motifs_signif_features.groupby(
        'CLUMP').mean().reset_index()

    return df_clusters_means

In [28]:
def average_calculation(pos_dset_feat, neg_dset_feat, lst_signif_features,
    df_all_motifs_all_features):
    """average_calculation
       -------------------
       This function gathers, for the significant features only, the
       averages of the two datasets and the averages of the CLUMPs.
       
       Arguments:
       pos_dset_feat -- pandas dataframe with data of feature values
                        of the positive dataset.
       neg_dset_feat -- pandas dataframe with data of feature values
                        of the negative dataset.
       lst_signif_features -- list of significant features
       df_all_motifs_all_features -- pandas dataframe with data of
                                     feature values of the motifs.
       
       Output:
       A tuple of two elements:
       dict_pos_neg_means -- dictionary with the significant feature
                             as the key and dictionaries as the values.
                             The dictionaries contain information
                             of the positive and negative average value
                             for that feature.
       df_clusters_means -- pandas dataframe with data of average of the CLUMPs
                            values for significant features.
       
    """
    
    # Dataset-level means.
    # NOTE(review): the datasets are passed negative-first here; confirm
    # that this matches the parameter order of dataset_average_calculation.
    dataset_means = dataset_average_calculation(
        neg_dset_feat, pos_dset_feat, lst_signif_features)
    
    # CLUMP-level means
    clump_means = CLUMPs_average_calculation(
        df_all_motifs_all_features, lst_signif_features)
    
    return dataset_means, clump_means

In [29]:
# Computing the means of the significant features for the two datasets
# and for the CLUMPs, then displaying the dataset means.
dict_pos_neg_means, df_clusters_means = average_calculation(pos_dset_feat, neg_dset_feat, lst_signif_features,
    df_all_motifs_all_features)
dict_pos_neg_means

{'tiny': {'pos_mean': 0.2834034891909498, 'neg_mean': 0.26091760814193554},
 'gravy': {'pos_mean': -0.4896251650801114, 'neg_mean': -0.3444228952769298},
 'sheet': {'pos_mean': 0.23068133309964053, 'neg_mean': 0.25155914031139265},
 'turn': {'pos_mean': 0.25494566616703496, 'neg_mean': 0.22734984235805394},
 'helix': {'pos_mean': 0.27613178308669706, 'neg_mean': 0.30783187617867197},
 'aliphatic': {'pos_mean': 0.24658259690574177,
  'neg_mean': 0.2829720294472124}}

In [30]:
# Displaying the per-CLUMP means of the significant features
df_clusters_means

Unnamed: 0,CLUMP,tiny,gravy,sheet,turn,helix,aliphatic
0,0,0.314286,-0.73619,0.02381,0.264286,0.425,0.065476
1,1,0.270455,-1.375,0.009091,0.645455,0.179545,0.040909
2,2,0.166667,-1.737745,0.132353,0.215686,0.176471,0.132353
3,3,0.352778,-1.231111,0.494444,0.212037,0.083333,0.259259
4,4,0.313725,0.721078,0.446078,0.147059,0.362745,0.539216
5,5,0.788636,0.345682,0.027273,0.568182,0.084091,0.054545
6,6,0.1,-3.238333,0.411667,0.08,0.02,0.02
7,7,0.117857,2.289643,0.392857,0.135714,0.8,0.510714
8,8,0.112667,-3.0348,0.174667,0.056,0.02,0.02
9,9,0.57,2.19,0.14,0.33,0.43,0.48


## CLUMPs_sorting

In [31]:
def CLUMPs_sorting(
    df_clusters_means, lst_signif_features, dict_pos_neg_means):
    """CLUMPs_sorting
       --------------
       This function sorts CLUMPs and gives them votes
       in accordance with the following criteria.
       
        if average_positive_dataset - average_negative_dataset > 0:
            for clusters with average > average_positive_dataset: 
                                        ranking from 1 to r
                r = number of clusters beyond the positive average
            for clusters with average <= average_positive_dataset: 0

        otherwise (difference < 0, or exactly 0):
            for clusters with average < average_positive_dataset: 
                                        ranking from 1 to r
                r = number of clusters beyond the positive average
            for clusters with average >= average_positive_dataset: 0
        
       Arguments: 
       df_clusters_means -- pandas dataframe with data of average of the CLUMPs
                            values for significant features.
       lst_signif_features -- list of significant features
       dict_pos_neg_means -- dictionary with the significant feature
                             as the key and dictionaries as the values.
                             The dictionaries contain information
                             of the positive and negative average value
                             for that feature ('pos_mean', 'neg_mean').
       Output:
       dict_feat_scores -- dictionary with the features as the keys
                           and a list of the votes as the values.
                           Each list has one vote per CLUMP and is
                           made of the zeros followed by 1, 2, ..., r.
                           N.B. The votes in the lists are not sorted
                           by CLUMP, but by vote.
                           It will be in the function CLUMPs_voting
                           that the votes will be assigned to the 
                           corresponding CLUMPs.
        
    """
    dict_feat_scores = {}
    
    for feature in lst_signif_features:
        pos_mean = dict_pos_neg_means[feature]['pos_mean']
        neg_mean = dict_pos_neg_means[feature]['neg_mean']
        
        # CLUMP averages of this feature as a plain float array
        clump_values = df_clusters_means.loc[:, feature].to_numpy(
            dtype = float)
        
        # Counting the CLUMPs that lie beyond the positive dataset average,
        # in the direction in which the positive dataset is enriched.
        # The comparisons are strict; NaN averages never pass them and
        # therefore receive a vote of 0.
        if pos_mean - neg_mean > 0:
            nb_ranked = int((clump_values > pos_mean).sum())
        else:
            nb_ranked = int((clump_values < pos_mean).sum())
        nb_zero = len(clump_values) - nb_ranked
        
        # The list is rebuilt from scratch for every feature, so a feature
        # for which no CLUMP passes the threshold gets only zeros (it can
        # no longer inherit the ranks computed for a previous feature).
        dict_feat_scores[feature] = [0] * nb_zero + list(
            range(1, nb_ranked + 1))
        
    return dict_feat_scores

In [32]:
# Computing, for each significant feature, the list of votes
# (sorted by vote, not yet assigned to the CLUMPs)
dict_feat_scores = CLUMPs_sorting(
    df_clusters_means, lst_signif_features, dict_pos_neg_means)

## CLUMPs_voting

In [33]:
def CLUMPs_voting(lst_signif_features, df_clusters_means, 
                  dict_pos_neg_means, dict_feat_scores):
    """CLUMPs_voting
       -------------
       This function hands the votes computed by CLUMPs_sorting
       back to the CLUMPs they belong to.
       
       For every feature the CLUMP averages are sorted (ascendingly when
       the positive dataset average is greater than the negative one,
       descendingly otherwise), the votes are attached in that order and
       the original row order is then restored.
       
       Arguments:
       lst_signif_features -- list of significant features
       df_clusters_means -- pandas dataframe with data of average of the CLUMPs
                            values for significant features.
       dict_pos_neg_means -- dictionary with the significant feature
                             as the key and dictionaries as the values.
                             The dictionaries contain information
                             of the positive and negative average value
                             for that feature.
       dict_feat_scores -- dictionary with the features as the keys
                           and a list of the votes as the values.
                           N.B. The votes in the lists are not sorted
                           by CLUMP, but by vote.
       
       Output: 
       final_votes -- pandas DataFrame with the vote of each CLUMP
                      for each feature (one row per feature, one
                      column per row of df_clusters_means).
    """
    votes_per_feature = []
    for feature in lst_signif_features:
        feature_means = dict_pos_neg_means[feature]
        # Ascending sort when the positive dataset is enriched,
        # descending sort in every other case
        sort_ascending = bool(
            feature_means['pos_mean'] - feature_means['neg_mean'] > 0)
        
        # CLUMP averages ordered in the direction of the enrichment
        ranked = pd.DataFrame(df_clusters_means.loc[:, feature]).sort_values(
            by = feature, ascending = sort_ascending)
        
        # Votes are attached positionally to the sorted rows, then the
        # rows go back to their original (index) order
        ranked['score_' + feature] = dict_feat_scores[feature]
        ranked = ranked.sort_index()
        votes_per_feature.append(list(ranked.iloc[:, 1]))

    # One row per feature, one column per CLUMP position
    votes_by_clump = pd.DataFrame(votes_per_feature).transpose()
    votes_by_clump.columns = lst_signif_features
    
    return votes_by_clump.transpose()

In [34]:
# Assigning the votes to the corresponding CLUMPs and displaying them
# (one row per feature, one column per CLUMP)
final_votes = CLUMPs_voting(lst_signif_features, df_clusters_means, 
                            dict_pos_neg_means, dict_feat_scores)
final_votes

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
tiny,2,0,0,3,1,6,0,0,0,5,4
gravy,1,4,5,3,0,0,7,0,6,0,2
sheet,6,7,3,0,0,5,0,0,1,2,4
turn,1,5,0,0,0,4,0,0,0,3,2
helix,0,1,2,4,0,3,6,0,7,0,5
aliphatic,3,5,1,0,0,4,6,0,7,0,2


## CLUMPs_scoring

In [35]:
def CLUMPs_scoring(
    final_votes, lst_signif_features, dict_feat_p_value_score_log):
    """CLUMPs_scoring
       --------------
       This function weights the votes of every feature by the score of
       that feature, adds up the weighted votes of each CLUMP and
       rescales the totals with a min-max normalization (range 0 to 1).
       
       Arguments: 
       final_votes -- pandas DataFrame with the vote of each CLUMP
                      for each feature (one row per feature, in the
                      same order as lst_signif_features).
       lst_signif_features -- list of significant features
       dict_feat_p_value_score_log -- dictionary of significant features
                                      as keys and dictionaries as values.
                                      Only the 'score' entry of these
                                      dictionaries is read here.
                                      
       Output:
       CLUMP_score_results -- pandas DataFrame with the normalized results of 
                              the CLUMP_scoring for each CLUMP (single
                              column 'CLUMP_score', sorted by index).
    """
    
    # Feature weights, in the order of the significant features
    feature_weights = [
        dict_feat_p_value_score_log[feature]['score']
        for feature in lst_signif_features]
    
    # Row-wise multiplication: each feature row times its weight
    weighted_votes = final_votes.mul(feature_weights, axis = 0)
    
    # Total of the weighted votes of each CLUMP, highest first
    clump_totals = pd.DataFrame({'CLUMP_score' : weighted_votes.sum()})
    clump_totals = clump_totals.sort_values(
        by = 'CLUMP_score', ascending = False)
    
    # Min-max normalization of the totals
    lowest = clump_totals.min()
    highest = clump_totals.max()
    normalized_totals = (clump_totals - lowest) / (highest - lowest)
    
    # Back to the CLUMP (index) order
    return normalized_totals.sort_index()

## CLUMP_score

In [36]:
def CLUMP_score(pos_dset_feat, neg_dset_feat, df_all_motifs_all_features):
    """CLUMP_score
       -----------
       This function runs the whole CLUMP_score pipeline: feature
       selection, calculation of the averages, sorting, voting and
       scoring of the CLUMPs.
       The CLUMP_score is based on the averages of the CLUMPs' and of
       the two datasets' values for a set of significant features.
       Significant features are those for which the function finds an
       enrichment in one of the two datasets, that is significant
       compared to the other.
       
       Arguments: 
       pos_dset_feat -- pandas dataframe with data of feature values
                        of the positive dataset. 
       neg_dset_feat -- pandas dataframe with data of feature values
                        of the negative dataset.
       df_all_motifs_all_features -- pandas dataframe with data of
                                     feature values of the motifs.
                        
       Output:
       CLUMP_score_results -- pandas DataFrame with the normalized results of 
                              the CLUMP_scoring for each CLUMP
    """
    # Step 1: significant features and their weights
    signif_features, feature_weights = feature_selection(
        pos_dset_feat, neg_dset_feat)
    
    # Step 2: averages of the datasets and of the CLUMPs
    dataset_means, clump_means = average_calculation(
        pos_dset_feat, neg_dset_feat, signif_features,
        df_all_motifs_all_features)
    
    # Step 3: votes of each feature, sorted by vote
    votes_by_rank = CLUMPs_sorting(
        clump_means, signif_features, dataset_means)
    
    # Step 4: votes assigned to the corresponding CLUMPs
    votes_by_clump = CLUMPs_voting(
        signif_features, clump_means, dataset_means, votes_by_rank)
    
    # Step 5: weighted and normalized score of each CLUMP
    return CLUMPs_scoring(votes_by_clump, signif_features, feature_weights)

In [37]:
# Running the whole CLUMP_score pipeline (feature selection included)
# and displaying the normalized score of each CLUMP
CLUMP_score_results = CLUMP_score(pos_dset_feat, neg_dset_feat, df_all_motifs_all_features)
CLUMP_score_results

the number of candidate features is: 13
the number of significant features is: 6
significant features to calculate the CLUMP_score are: ['tiny', 'gravy', 'sheet', 'turn', 'helix', 'aliphatic']


Unnamed: 0,CLUMP_score
0,0.498234
1,0.887315
2,0.335534
3,0.248869
4,0.010728
5,0.800959
6,0.859612
7,0.0
8,1.0
9,0.208868


# MOnSTER_score

In [38]:
def monster_ranking(MOnSTER_score_results):
    """monster_ranking
       ---------------
       This function ranks the CLUMPs based on the 
       the results of the MOnSTER_score calculation.
       Rank 1 is given to the highest 'monster_score'.
       The input dataframe is NOT modified: a new dataframe is returned.
       
       Arguments:
       MOnSTER_score_results -- pandas dataframe with the MOnSTER_score
                                results (column 'monster_score').
       Output:    
       MOnSTER_score_results -- new pandas dataframe with the MOnSTER_score
                                results and the additional column
                                'ranking', sorted by index.
                                N.B. CLUMPs with equal scores still get
                                distinct consecutive ranks, in the order
                                produced by the sort.
    """
       
    # sort_values returns a new dataframe, so adding the 'ranking' column
    # below does not touch the dataframe owned by the caller
    ranked_results = MOnSTER_score_results.sort_values(
        by = 'monster_score', ascending = False)
    ranked_results['ranking'] = np.arange(1, len(ranked_results)+1)
    
    # Back to the original (index) order
    return ranked_results.sort_index()

In [39]:
def MOnSTER_score(
    pos_dset_feat, neg_dset_feat, motifs_counts, df_cnt_seq_per_cluster, 
    df_clusters, df_all_motifs_all_features):
    """MOnSTER_score
       -------------
       This function calculates the MOnSTER score, which is meant to
       range between 0 and 2.
       It calculates the CLUMP_score (from 0 to 1), J1 (from 0 to 1) and
       J2 (from 0 to 1), multiplies J1 and J2 by 0.5 and then sums the
       indexes of each CLUMP.
       
       For further information about the calculation of these 3 indexes
       please read doc of the following functions:
       CLUMP_score()
       calculate_J1_and_J2()
       
       Arguments:
       pos_dset_feat -- pandas dataframe with data of feature values
                        of the positive dataset.
       neg_dset_feat -- pandas dataframe with data of feature values
                        of the negative dataset.
       motifs_counts -- pandas dataframe with the number of
                        occurrences of the CLUMP in the two 
                        datasets.
       df_cnt_seq_per_cluster -- pandas dataframe with the number of
                                 sequences found by the CLUMP in the
                                 two datasets.
       df_clusters -- pandas dataframe of the motif and corresponding CLUMP.
       df_all_motifs_all_features -- pandas dataframe with data of
                                     feature values of the motifs.

       Output:
       MOnSTER_score_results -- pandas dataframe with the MOnSTER_score
                                results (columns 'CLUMP', 'monster_score'
                                and 'ranking').
    """
    
    # CLUMP_score of each CLUMP
    clump_scores = CLUMP_score(pos_dset_feat, neg_dset_feat, 
                               df_all_motifs_all_features)
    
    # J1 and J2 of each CLUMP
    jaccard_indexes = calculate_J1_and_J2(
        motifs_counts, df_cnt_seq_per_cluster, neg_dset_feat, pos_dset_feat)
    
    # Halving J1 and J2 on a copy, so that the dataframe returned by
    # calculate_J1_and_J2 is left as it is
    halved_jaccard = jaccard_indexes.copy()
    for jaccard_column in ('jaccard_norm_1', 'jaccard_norm_2'):
        halved_jaccard[jaccard_column] = halved_jaccard[jaccard_column]*0.5
    
    # Row-wise sum of every remaining column once 'CLUMP' is dropped.
    # NOTE(review): any extra column returned by calculate_J1_and_J2
    # would be summed too -- confirm it only carries 'CLUMP', J1 and J2.
    all_indexes = pd.concat(
        [clump_scores, halved_jaccard], axis =1).drop(columns = 'CLUMP')
    monster_results = pd.DataFrame(
        {'monster_score' : all_indexes.sum(axis =1)})
    
    # CLUMP identifiers, in their order of first appearance in df_clusters
    monster_results.insert(0, 'CLUMP', list(df_clusters.CLUMP.unique()))
    
    return monster_ranking(monster_results)


In [40]:
# Calculating the MOnSTER score and the ranking of each CLUMP.
# motifs_counts and df_cnt_seq_per_cluster are expected to come from
# earlier cells of the notebook.
MOnSTER_score_results = MOnSTER_score(pos_dset_feat, neg_dset_feat, 
                                      motifs_counts, df_cnt_seq_per_cluster, 
                                      df_clusters, df_all_motifs_all_features)
MOnSTER_score_results

the number of candidate features is: 13
the number of significant features is: 6
significant features to calculate the CLUMP_score are: ['tiny', 'gravy', 'sheet', 'turn', 'helix', 'aliphatic']


Unnamed: 0,CLUMP,monster_score,ranking
0,0,1.161647,6
1,1,1.840269,1
2,2,1.207398,5
3,3,0.953408,10
4,4,0.699245,11
5,5,1.734073,2
6,6,1.695619,3
7,7,0.9675,9
8,8,1.660625,4
9,9,1.115131,7
