In [9]:
%load_ext autoreload
%autoreload 2

In [53]:
from src.data import make_dataset
from src.features import build_features
from src.features import original

# One CSV file name per year, 2011 through 2016 inclusive.
data_files = [f'{year}.csv' for year in range(2011, 2017)]

# Load the trigram -> index lookup together with the trigram vector table.
print('---> Reading in TrigramVecs...')
trigram_to_idx, trigram_vecs_data = make_dataset.read_trigram_vecs()
print('Number of possible 3-grams: {}'.format(len(trigram_to_idx)))
print('Dimension of TrigramVecs: {}'.format(len(trigram_vecs_data[0])))

# Load the strains from the listed files and report the nested sizes
# (outer length x length of first entry x length of first strain).
print()
print('---> Reading in strains...')
strains_by_year = make_dataset.read_strains_from(data_files)
print('Strains from {} years were read.'.format(len(data_files)))
print('Shape: ' + 'x'.join(
    str(dim)
    for dim in (
        len(strains_by_year),
        len(strains_by_year[0]),
        len(strains_by_year[0][0]),
    )
))
print('Example strain:', strains_by_year[0][0], sep='\n')

# Build the training features step by step, printing the nested sizes after
# every stage so each transformation can be checked by eye.
print('\n---> Constructing training data...')
num_of_samples = 100

# NOTE(review): this rebinds strains_by_year to the sampled subset, so every
# later use of the name (including the comparison step further down) receives
# the sampled strains, not the full data that was read in -- confirm intended.
strains_by_year = build_features.sample_strains(strains_by_year, num_of_samples)

# Split every sampled strain into its trigrams.
trigrams_by_year = build_features.split_to_trigrams(strains_by_year)
print(f'Each of {len(trigrams_by_year[0])} year strains were split into {len(trigrams_by_year[0][0])} trigrams.')
print(f'Shape: {len(trigrams_by_year)}x{len(trigrams_by_year[0])}x{len(trigrams_by_year[0][0])}')

# Map the trigrams to indexes using the trigram_to_idx lookup.
trigram_idxs_by_year = build_features.trigrams_to_indexes(trigrams_by_year, trigram_to_idx)
print('\nIndex conversion performed.')
print(f'Shape: {len(trigram_idxs_by_year)}x{len(trigram_idxs_by_year[0])}x{len(trigram_idxs_by_year[0][0])}')

# Concatenate the per-strain index lists; the result is printed as a
# two-level shape.
concated_trigrams_by_year = build_features.concat_trigrams(trigram_idxs_by_year)
print('\nIndex concatenation performed.')
print(f'Shape: {len(concated_trigrams_by_year)}x{len(concated_trigrams_by_year[0])}')

# Convert the concatenated indexes to vectors using trigram_vecs_data.
trigram_vecs = build_features.indexes_to_trigram_vecs(concated_trigrams_by_year, trigram_vecs_data)
print('\nTrigramVec conversion performed.')
# Report the shape of trigram_vecs, the value computed just above. This line
# used to read training_vecs, which is only assigned later by the comparison
# step: on a fresh kernel that raised NameError, and on a reused kernel it
# printed the shape of a stale value left over from an earlier run.
print(f'Shape: {len(trigram_vecs)}:{len(trigram_vecs[0])}:{len(trigram_vecs[0][0])}')

# Run the implementation in src.features.original on the same inputs and print
# the nested sizes of its result for comparison with the pipeline above.
print()
print('---> Compering to original code...')
num_of_samples = 100
training_trigrams = original.construct_training_data(
    strains_by_year,
    num_of_samples,
)
training_indexes = original.convert_to_indexes(
    training_trigrams,
    trigram_to_idx,
)
training_vecs = original.convert_to_prot_vecs(
    training_indexes,
    trigram_vecs_data,
)
print('ProtVec conversion performed.')
print('Shape: ' + ':'.join(
    str(dim)
    for dim in (
        len(training_vecs),
        len(training_vecs[0]),
        len(training_vecs[0][0]),
    )
))

---> Reading in TrigramVecs...
Number of possible 3-grams: 9048
Dimension of TrigramVecs: 100

---> Reading in strains...
Strains from 6 years were read.
Shape: 6x577x566
Example strain:
MKTIIALSYILCLVFAQKLPGNDNSTATLCLGHHAVPNGTIVKTITNDQIEVTNATELVQSSSTGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVTQNGTSSACIRRSNSSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRNIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCNSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIESIRNGTYNHDVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI

---> Constructing training data...
Each of 100 year strains were split into 564 trigrams.
Shape: 6x100x564

Index conversion performed.
Shape: 6x100x564

Index concatenation performed.
Shape: 6x56400

TrigramVec conversion performed.
Shape: 6:56