In [1]:
%load_ext autoreload
%autoreload 2

In [15]:
from src.data import make_dataset
from src.features import build_features
from src.features import original
from src.utils import utils

# Input CSV files, one per year, handed to make_dataset.read_strains_from below.
data_files = [
    '2011.csv', '2012.csv', '2013.csv',
    '2014.csv', '2015.csv', '2016.csv',
]

# Unpack the two values returned by make_dataset.read_trigram_vecs(); the
# first is only measured with len() here, the second is indexed at [0].
print('---> Reading in TrigramVecs...')
trigram_to_idx, trigram_vecs_data = make_dataset.read_trigram_vecs()
n_known_trigrams = len(trigram_to_idx)
print(f'Number of possible 3-grams: {n_known_trigrams}')
trigram_vec_dim = len(trigram_vecs_data[0])
print(f'Dimension of TrigramVecs: {trigram_vec_dim}')

# Load the strains for every listed file. Note that the "years" count printed
# below is the number of input files, not a value derived from the loaded data.
print('\n---> Reading in strains...')
strains_by_year = make_dataset.read_strains_from(data_files)
n_input_files = len(data_files)
print(f'Strains from {n_input_files} years were read.')
first_strain = strains_by_year[0][0]  # first entry of the first group
print(f'Example strain:\n{first_strain}')

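# NOTE(review): the commented-out block below is an alternative pipeline built
# on `build_features`. It references `raw_amino_sequences`, which is not defined
# in the visible cells, so it will not run as-is if it is re-enabled.
# print('\n---> Constructing training data...')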
# num_of_samples = 2

# strains_by_year = build_features.sample_strains(raw_amino_sequences, num_of_samples)
# print(f'Picked {len(strains_by_year[0])} strains by year')

# trigrams_by_year = build_features.split_to_trigrams(strains_by_year)
# print(f'Each of {len(trigrams_by_year[0])} year strains were split into {len(trigrams_by_year[0][0])} trigrams.')

# trigrams_series = build_features.to_time_series(trigrams_by_year)
# # print(f'{len(trigrams_series[0])/564} strains sampled from each year to create {len(trigrams_series)} training examples.')
# # print(f'Example: {trigrams_series}, length: {len(trigrams_series)}')

# training_indexes = build_features.trigrams_to_indexes(trigrams_series, trigram_to_idx)
# print('\nIndex conversion performed.')
# print(f'Example: {training_indexes[:2]}')
# trigram_vecs = build_features.indexes_to_trigram_vecs(training_indexes, trigram_vecs_data)
# print('\nTrigramVec conversion performed.')
# print(f'It\'s {len(trigram_vecs)} trigrams in strains of each of {len(trigram_vecs[0])} years where' +
#     f' each Trigram is encoded by {len(trigram_vecs[0][0])} numbers')


# Build the training data in three stages with src.features.original. After
# each stage, a sample and/or the nested lengths are printed as a sanity check.

# Stage 1: construct the trigram training examples from the loaded strains.
print('\n---> Constructing training data...')
num_of_samples = 100  # passed to construct_training_data; reported as the per-year sample count
training_trigrams = original.construct_training_data(strains_by_year, num_of_samples)
print(f'{num_of_samples} amino acid sequences sampled from each year '
      f'to create {len(training_trigrams)} training examples.')
print(f'Examples: {training_trigrams[:2]}')
print(f'Shape: {len(training_trigrams)}x{len(training_trigrams[0])}')

# Stage 2: convert the trigram examples using the trigram_to_idx lookup
# (presumably trigram string -> integer index; confirm in src.features.original).
training_indexes = original.convert_to_indexes(training_trigrams, trigram_to_idx)
print('\nIndex conversion performed.')
print(f'Examples: {training_indexes[:2]}')
print(f'Shape: {len(training_indexes)}x{len(training_indexes[0])}')

# Stage 3: convert the indexes using trigram_vecs_data. The result is nested
# three levels deep, so three lengths are reported (":"-separated, as before).
training_vecs = original.convert_to_prot_vecs(training_indexes, trigram_vecs_data)
print('\nProtVec conversion performed.')
print(f'Shape: {len(training_vecs)}:{len(training_vecs[0])}:{len(training_vecs[0][0])}')

---> Reading in TrigramVecs...
Number of possible 3-grams: 9048
Dimension of TrigramVecs: 100

---> Reading in strains...
Strains from 6 years were read.
Example strain:
MKTIIALSYILCLVFAQKLPGNDNSTATLCLGHHAVPNGTIVKTITNDQIEVTNATELVQSSSTGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVTQNGTSSACIRRSNSSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRNIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCNSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIESIRNGTYNHDVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI

---> Constructing training data...
100 amino acid sequences sampled from each year to create 56400 training examples.
Examples: [['MKT', 'MKT', 'MKT', 'MKT', 'MKT', 'MKT'], ['KTI', 'KTI', 'KTI', 'KTI', 'KTI', 'KTI']]
Shape: 56400x6

Index conversion performed.
E