# Establishing a Good Distance Between Flu Sequences

First we need to establish the meaning of a distance between Influenza sequences. We will look at a special subset of the data where we know roughly what the similarity/distance is bewtween a pair. 

## Jaccard with k-grams

We will run k-grams and Jaccard similarity on the sequences and cross validate and see if the ones who are similar match our domain knowledge.

In [1]:
import pickle
import gzip
import numpy as np
import pandas as pd
import numba
from collections import Counter
from itertools import product
from FASTA_parse import sequence

In [2]:
# read in the sequences
with gzip.open('influenza.fna.pklz', 'rb') as fin:
    fna = pickle.load(fin)

In [18]:
fna_meta, fna_seq = fna.get_dataframe()

In [19]:
fna_seq.drop_duplicates(subset='Sequence', inplace=True)

In [5]:
test = fna_seq.Sequence[0]

In [47]:
@numba.njit
def tokenize(seq, grams=2):
    split = [seq[i:i+grams] for i in range(len(seq)-grams+1)]
    return split

@numba.jit
def generate_vocab(grams=2):
     # generate general "vocab" list
    vocab = Counter()
    for t in product('ACTG', 'ACTG', repeat=grams):
        s = []
        for i in range(grams):
            s.append(t[i])
        vocab[''.join(s)] = 0
        
    return vocab

@numba.jit
def generate_vec(seq_list, vocab, grams, r):
    new_vec_col = []
    for seq in seq_list:
        vec = vocab.copy()
        tokens = tokenize(seq, grams=grams)
        for j, gram in enumerate(tokens):
            vec[gram] += 1
            if r > 0:
                for i in range(r):
                    vec[tokens[j-i]] += 1
                    vec[tokens[j+i]] += 1
        new_vec_col.append(np.array(vec.values()))
        
    return new_vec_col
    
def generate_bag(seq_df, grams=2, r=0):
    # generate general "vocab" list
    vocab = generate_vocab(grams)
    new_vec_col = generate_vec(seq_df.Sequence.tolist(), vocab, grams, r)
        
    seq_df['BOW_vector_' + str(grams) + '_' + str(r)] = new_vec_col
    return seq_df
                    

In [None]:
fna_seq = generate_bag(fna_seq, grams=5)

In [None]:
fna_seq = generate_bag(fna_seq, grams=6)

In [None]:
fna_seq.head()

In [26]:
fna_seq.to_pickle('./fna_df_bow.pkl')

In [30]:
from sklearn.cluster import KMeans

In [46]:
np.array(fna_seq.BOW_vector_5_)

array([list([5, 2, 7, 4, 2, 1, 2, 0, 4, 4, 2, 4, 2, 2, 1, 3, 1, 2, 2, 4, 3, 1, 2, 1, 0, 1, 0, 3, 0, 1, 0, 0, 7, 2, 1, 1, 5, 2, 2, 2, 3, 3, 2, 3, 4, 2, 4, 9, 1, 2, 2, 2, 5, 1, 1, 1, 4, 0, 1, 2, 1, 0, 2, 2, 3, 0, 5, 1, 0, 2, 1, 0, 2, 1, 0, 0, 3, 2, 1, 3, 1, 2, 5, 0, 2, 0, 1, 0, 1, 0, 3, 0, 1, 0, 1, 0, 0, 1, 1, 3, 2, 1, 0, 0, 0, 0, 0, 1, 4, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 3, 4, 8, 2, 2, 0, 1, 1, 2, 0, 3, 1, 2, 1, 0, 2, 4, 1, 0, 3, 2, 1, 1, 0, 0, 1, 0, 3, 1, 0, 0, 2, 3, 2, 1, 1, 4, 1, 0, 0, 2, 3, 1, 2, 2, 1, 1, 5, 2, 2, 3, 2, 6, 0, 1, 0, 0, 0, 2, 3, 6, 2, 7, 5, 3, 3, 1, 2, 3, 0, 1, 2, 3, 0, 0, 1, 1, 1, 1, 0, 4, 1, 4, 5, 1, 1, 1, 0, 0, 1, 1, 2, 1, 0, 0, 2, 3, 0, 1, 1, 2, 0, 1, 0, 0, 2, 3, 2, 1, 1, 1, 5, 1, 1, 1, 2, 2, 2, 0, 0, 1, 0, 3, 1, 2, 1, 1, 3, 7, 1, 5, 1, 1, 2, 0, 0, 3, 2, 3, 5, 3, 1, 4, 1, 1, 0, 0, 0, 3, 1, 1, 0, 4, 1, 1, 0, 1, 0, 0, 0, 2, 0, 2, 0, 1, 0, 1, 0, 1, 2, 3, 3, 3, 2, 1, 4, 4, 3, 1, 0, 3, 1, 2, 1, 1, 0, 1, 4, 3, 3, 1, 4, 3, 2, 2, 2, 1, 0, 1, 1, 0, 

In [45]:
km = KMeans(10)
km.fit_predict(fna_seq.sample(100).BOW_vector_5_.values)

ValueError: setting an array element with a sequence.