In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import sys
import os

In [2]:
from Bio import SeqIO
from Bio.Seq import Seq

In [3]:
from monster.import_data import *

In [4]:
from monster.features_extraction import MotifProperties
from monster.features_extraction import from_lst_to_dict
from monster.features_extraction import feature_calculation

%load_ext autoreload
%autoreload 2

# Data loading

In [5]:
### Load the input datasets and the motif list
#

# FASTA files holding the positive and the negative sequence sets
seqs_path_pos = '../data/datasets/minc_nr_positive_dataset.fasta'
seqs_path_neg = '../data/datasets/minc_nr_negative_dataset.fasta'

# Parse both FASTA files with the same helper.
# NOTE(review): the helper is presumably provided by the wildcard import of
# monster.import_data and presumably returns {sequence id: sequence} -- confirm.
pos_dict = import_fasta_sequences_as_dict(seqs_path_pos)
neg_dict = import_fasta_sequences_as_dict(seqs_path_neg)

# Motif list shared by the feature-extraction cells
lst_motifs = import_list_motifs('../data/lst_motifs')

# Motif clustering based on physicochemical properties

## Feature extraction from the dataset sequences

In [6]:
### Read the motif file of each tool (no header row, single 'motif' column)
#
motif_files = {
    'merci': '../data/merci_lst_motifs.txt',
    'streme': '../data/lst_motifs_fimo_streme.txt',
    'dimotif': '../data/dimotif_lst_motifs.txt',
}

# The 'tool' column keeps track of which tool each motif came from
motif_tables = {
    tool_name: pd.read_csv(motif_path,
                           header=None,
                           names=['motif']).assign(tool=tool_name)
    for tool_name, motif_path in motif_files.items()
}

motif_merci = motif_tables['merci']
motif_streme = motif_tables['streme']
motif_dimotif = motif_tables['dimotif']

### Report the number of motifs per tool
#
for tool_name in ('dimotif', 'streme', 'merci'):
    print(tool_name, motif_tables[tool_name].shape[0])


dimotif 100
streme 91
merci 10


## Feature extraction

### test 

In [7]:
# Convert the motif list loaded from '../data/lst_motifs' into the dict form
# that is passed to feature_calculation in the next cell.
# NOTE(review): presumably keyed by the motif string itself -- confirm against
# from_lst_to_dict in monster.features_extraction.
dict_motifs = from_lst_to_dict(lst_motifs)

In [8]:
# Compute the feature table for the motifs. The rendered output shows one row
# per motif with the columns id, seq_len, gravy, tiny, small, aliphatic,
# aromatic, non_polar, polar, charged, basic, acidic, helix, turn and sheet.
df_motifs_features = feature_calculation(dict_motifs)
# A bare name on the last line displays the table as the cell output
df_motifs_features

Unnamed: 0,id,seq_len,gravy,tiny,small,aliphatic,aromatic,non_polar,polar,charged,basic,acidic,helix,turn,sheet
0,GHWT,4,-1.300000,0.500000,0.500000,0.00,0.500000,0.500000,0.500000,0.250000,0.250000,0.000000,0.250000,0.250000,0.000000
1,GHWTQ,5,-1.740000,0.400000,0.400000,0.00,0.400000,0.400000,0.600000,0.200000,0.200000,0.000000,0.200000,0.200000,0.000000
2,HWT,3,-1.600000,0.333333,0.333333,0.00,0.666667,0.333333,0.666667,0.333333,0.333333,0.000000,0.333333,0.000000,0.000000
3,HWTQ,4,-2.075000,0.250000,0.250000,0.00,0.500000,0.250000,0.750000,0.250000,0.250000,0.000000,0.250000,0.000000,0.000000
4,PGNV,4,-0.325000,0.250000,0.750000,0.25,0.000000,0.750000,0.250000,0.000000,0.000000,0.000000,0.250000,0.750000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,VHAA,4,1.150000,0.500000,0.750000,0.75,0.250000,0.750000,0.250000,0.250000,0.250000,0.000000,0.250000,0.000000,0.500000
194,VKSY,4,-0.450000,0.250000,0.500000,0.25,0.250000,0.500000,0.500000,0.250000,0.250000,0.000000,0.500000,0.250000,0.000000
195,KTD,3,-2.700000,0.333333,0.000000,0.00,0.000000,0.000000,1.000000,0.666667,0.333333,0.333333,0.000000,0.000000,0.000000
196,DKE,3,-3.633333,0.000000,0.000000,0.00,0.000000,0.000000,1.000000,1.000000,0.333333,0.666667,0.000000,0.000000,0.333333


In [9]:
# Sanity check: number of rows in the motif feature table
len(df_motifs_features)

198

In [10]:
# Save the motif feature table to the current working directory, with a header
# row and without the row index. `index` is a boolean option, so pass False
# explicitly; None only has the same effect because it is falsy.
# NOTE(review): the file is named .tsv but no `sep` is given, so pandas writes
# comma-separated values. The on-disk format is deliberately left unchanged in
# case other code reads this file as CSV -- confirm, then either pass sep='\t'
# or rename the file to .csv.
df_motifs_features.to_csv('df_motifs_features.tsv',
                          header=True,
                          index=False)

In [11]:
# Compute the same feature table for the positive dataset sequences. The
# rendered output shows one row per sequence id with the same columns as the
# motif table (id, seq_len, gravy, ..., helix, turn, sheet).
df_pos_features = feature_calculation(pos_dict)
# A bare name on the last line displays the table as the cell output
df_pos_features

Unnamed: 0,id,seq_len,gravy,tiny,small,aliphatic,aromatic,non_polar,polar,charged,basic,acidic,helix,turn,sheet
0,Minc3s00007g00481,603,-0.140962,0.338308,0.558872,0.276949,0.119403,0.558872,0.441128,0.154229,0.089552,0.064677,0.313433,0.323383,0.160862
1,Minc3s00008g00574,231,-0.580952,0.307359,0.528139,0.220779,0.121212,0.528139,0.471861,0.220779,0.121212,0.099567,0.268398,0.311688,0.173160
2,Minc3s00011g00761,179,-0.444693,0.206704,0.502793,0.279330,0.106145,0.502793,0.497207,0.357542,0.150838,0.206704,0.301676,0.139665,0.312849
3,Minc3s00013g00811,65,0.627692,0.384615,0.646154,0.323077,0.061538,0.646154,0.353846,0.169231,0.061538,0.107692,0.323077,0.169231,0.200000
4,Minc3s00020g01281,87,0.113793,0.448276,0.724138,0.149425,0.218391,0.724138,0.275862,0.114943,0.068966,0.045977,0.356322,0.367816,0.149425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,Minc04584,209,-0.843541,0.296651,0.492823,0.181818,0.076555,0.492823,0.507177,0.248804,0.124402,0.124402,0.200957,0.344498,0.181818
157,Minc06775,77,-0.570130,0.207792,0.545455,0.233766,0.064935,0.545455,0.454545,0.402597,0.207792,0.194805,0.246753,0.168831,0.337662
158,CL5Contig2_1-EST,225,-0.424444,0.248889,0.480000,0.328889,0.075556,0.480000,0.520000,0.320000,0.173333,0.146667,0.288889,0.191111,0.324444
159,CL312Contig1_1-EST,429,-1.394872,0.282051,0.368298,0.191142,0.020979,0.368298,0.631702,0.400932,0.125874,0.275058,0.125874,0.263403,0.342657


In [12]:
# Save the positive-dataset feature table to the current working directory,
# with a header row and without the row index. `index` is a boolean option, so
# pass False explicitly; None only has the same effect because it is falsy.
# NOTE(review): the file is named .tsv but no `sep` is given, so pandas writes
# comma-separated values. The on-disk format is deliberately left unchanged in
# case other code reads this file as CSV -- confirm, then either pass sep='\t'
# or rename the file to .csv.
df_pos_features.to_csv('df_pos_features.tsv',
                       header=True,
                       index=False)

In [13]:
# Compute the same feature table for the negative dataset sequences. The
# rendered output shows one row per sequence id with the same columns as the
# motif table (id, seq_len, gravy, ..., helix, turn, sheet).
df_neg_features = feature_calculation(neg_dict)
# A bare name on the last line displays the table as the cell output
df_neg_features

Unnamed: 0,id,seq_len,gravy,tiny,small,aliphatic,aromatic,non_polar,polar,charged,basic,acidic,helix,turn,sheet
0,Minc3s00001g00059,340,-0.241176,0.367647,0.517647,0.267647,0.108824,0.517647,0.482353,0.235294,0.120588,0.114706,0.264706,0.238235,0.232353
1,Minc3s00002g00155,202,-1.108911,0.316832,0.351485,0.178218,0.069307,0.351485,0.648515,0.287129,0.188119,0.099010,0.188119,0.306931,0.173267
2,Minc3s00006g00398,276,-0.324275,0.253623,0.525362,0.235507,0.105072,0.525362,0.474638,0.260870,0.115942,0.144928,0.297101,0.235507,0.250000
3,Minc3s00006g00399,588,-0.768878,0.299320,0.452381,0.227891,0.090136,0.452381,0.547619,0.277211,0.139456,0.137755,0.232993,0.301020,0.234694
4,Minc3s00006g00405,1296,-0.564506,0.251543,0.455247,0.269290,0.091821,0.455247,0.544753,0.301698,0.139660,0.162037,0.278549,0.206790,0.296296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,Minc3s11519g44955,186,-0.308065,0.231183,0.505376,0.311828,0.102151,0.505376,0.494624,0.209677,0.129032,0.080645,0.354839,0.252688,0.220430
491,Minc3s11623g45023,247,-0.367611,0.251012,0.506073,0.271255,0.113360,0.506073,0.493927,0.295547,0.137652,0.157895,0.295547,0.202429,0.263158
492,Minc3s11692g45078,170,-0.741176,0.258824,0.447059,0.276471,0.017647,0.447059,0.552941,0.305882,0.135294,0.170588,0.176471,0.188235,0.352941
493,Minc3s11874g45210,267,-0.747191,0.314607,0.498127,0.202247,0.127341,0.498127,0.501873,0.280899,0.161049,0.119850,0.258427,0.262172,0.224719


In [14]:
# Save the negative-dataset feature table to the current working directory,
# with a header row and without the row index. `index` is a boolean option, so
# pass False explicitly; None only has the same effect because it is falsy.
# NOTE(review): the file is named .tsv but no `sep` is given, so pandas writes
# comma-separated values. The on-disk format is deliberately left unchanged in
# case other code reads this file as CSV -- confirm, then either pass sep='\t'
# or rename the file to .csv.
df_neg_features.to_csv('df_neg_features.tsv',
                       header=True,
                       index=False)