In [163]:
import pandas as pd
import networkx as nx
from itertools import combinations

def hamming_distance(a, b):
    return sum(x != y for x, y in zip(a, b))

def average_hamming(seqs):
    seqs = list(seqs)
    n = len(seqs)
    accumulative_mismatches = 0
    if n == 1:
        return 0
    for i in range(n-1):
        for j in range(i+1, n):
            accumulative_mismatches += mismatches(seqs[i],seqs[j])

    return accumulative_mismatches/math.comb(n,2)

def mismatches(s1,s2):
    n = 0
    for i in range(len(s1)):
        if s1[i] != s2[i]:
            n += 1

    return n

def phred_to_prob(phred):
    val = ord(phred) - 33
    return 10 ** -(val/10)

def overall_phred(qual_str):
    P = 1
    for q in qual_str:
        P *= 1 - phred_to_prob(q)

    return 1 - P

In [92]:
import math
def error_prob_to_phred33(p):
    """
    Convert an error probability to a Phred+33 ASCII character.

    Parameters:
        p (float): Error probability (between 0 and 1)

    Returns:
        str: Single-character Phred+33 quality symbol

    Raises:
        ValueError: If p is not between 0 and 1
    """
    if not (0 < p <= 1):
        raise ValueError("Error probability must be in (0, 1].")

    # Calculate Phred score
    phred = -10 * math.log10(p)
    phred = int(round(phred))

    # Clamp to typical Phred+33 range [0, 41]
    phred = max(0, min(phred, 41))
    return phred


def phred33_to_symbol(phred):
    # Convert to ASCII character
    return chr(phred + 33)

def error_prob_to_phred33_symbol(p):
    return phred33_to_symbol(error_prob_to_phred33(p))

In [94]:
print(phred_to_prob('A'))
print(error_prob_to_phred33(overall_phred('FFFFFFFFFF:')))

0.000630957344480193
23


In [135]:
import glob
infiles = {}
for f in glob.glob("wnv_mg*.tsv"):
    infiles[f.lstrip("wnv_").rstrip(".tsv")] = f 

    
print(infiles)
print(sorted(infiles.keys()))

{'mg7': 'wnv_mg7.tsv', 'mg6': 'wnv_mg6.tsv', 'mg4': 'wnv_mg4.tsv', 'mg3': 'wnv_mg3.tsv', 'mg8': 'wnv_mg8.tsv', 'mg9': 'wnv_mg9.tsv'}
['mg3', 'mg4', 'mg6', 'mg7', 'mg8', 'mg9']


In [154]:
header = ['ID', 'cellbarcode', 'umi', 'viral', 'cellbarcode_quality', 'umi_quality', 'viral_quality', 'min_qual_cellbarcode', 'min_qual_umi', 'min_qual_virus']
dfs = []
for mg in sorted(infiles.keys()):
    mg_data = pd.read_csv(infiles[mg], delimiter="\t", names=header, on_bad_lines='skip')
    mg_data.insert(0, 'midgut', mg)
    dfs.append(mg_data)
    
wnv = pd.concat(dfs, ignore_index=True)

In [155]:

print(len(wnv), "rows")
print(wnv["midgut"].value_counts())

10454 rows
midgut
mg3    3885
mg8    3121
mg7    1835
mg4     701
mg6     640
mg9     272
Name: count, dtype: int64


In [156]:
wnv['overall_quality'] = wnv[['min_qual_cellbarcode','min_qual_umi','min_qual_virus']].min(axis=1)

In [157]:
wnv

Unnamed: 0,midgut,ID,cellbarcode,umi,viral,cellbarcode_quality,umi_quality,viral_quality,min_qual_cellbarcode,min_qual_umi,min_qual_virus,overall_quality
0,mg3,A00405:682:HV27LDSX5:1:1101:7898:2143,TGGGCGTAGTCGAGT,GTGCCGCTCTA,CTGACCGTGACGGTTACTGCGGCTACCCTGCTT,FFFFFFFFFFFFFFF,FFFFFFF:FFF,FFFFFFFFFFF:FFFFFFFFFFF::FFFFFFFF,37,25,25,25
1,mg3,A00405:682:HV27LDSX5:1:1101:10022:4539,CAGGTGCGCTCTAAT,TCTGGGACGTC,CTTACCGTTACTGTGACTGCGGCGACACTCCTT,FFFFFFFFF:FFFFF,FFFFFFFFFF:,FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF,25,25,37,25
2,mg3,A00405:682:HV27LDSX5:1:1101:26576:5619,GTAACTGTCTTCACA,GACGGGTTCAC,CTCACCGTTACCGTTACAGCAGCCACTCTACTC,FFFFFFFFFFFFFFF,FFFFFFFFFFF,FFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFF,37,37,25,25
3,mg3,A00405:682:HV27LDSX5:1:1101:24343:7764,ACGGGTGTGGTGCGC,TCTAATTCTGG,CTGACAGTCACAGTGACTGCCGCGACTCTTCTT,F:FFF::::FFFF:F,FF:FFFFF:FF,FFFFF:FFF:FFFFFFFFFFF:F:FFFFFFFFF,25,25,25,25
4,mg3,A00405:682:HV27LDSX5:1:1101:23149:8234,ATTGGTGTGGTGCGC,TCTAATTCTGG,CTAACTGTAACCGTCACTGCGGCAACTCTGCTC,"F,FFFFFF,:,F:::","FFFFFF,,F,F",":F:F:F::FF:F,F:FF:::F:,,,FF:,FF::",11,11,11,11
...,...,...,...,...,...,...,...,...,...,...,...,...
10449,mg9,A00405:732:HCVWNDSX7:2:1201:27326:21919,GGGACCTCCGTGGCC,ACGATGCCATC,CTAACTGTAACAGTTACTGCTGCTACTCTACTT,"F,,FFFFFFFFF:FF",",FFFFFFFFFF","FF,F,,,:FFFFF,FF:::FFF,,F,F,FF,FF",11,11,11,11
10450,mg9,A00405:732:HCVWNDSX7:2:1201:28583:35900,GTTCCAACTTTCTTC,TGCATGATGGG,CTTACTGTTACAGTCACTGCAGCCACTCTGCTT,"FFFFFFFFFF,FFFF",FFFFF:FFF::,"FFFFFFFFFF:F:FFFFFFFFF,FFFFFFFFF:",11,25,11,11
10451,mg9,A00405:732:HCVWNDSX7:2:1202:9914:11209,AAACGGGTTCACTAC,TACTGCAGCTA,CTCACCGTAACCGTCACAGCTGCTACGCTGCTT,"FFFF,::FFFF:FFF",:FFFFFFFF:F,"F:FFF:F:FFF,FF:F::,FFFF:FFFFFFFFF",11,25,11,11
10452,mg9,A00405:732:HCVWNDSX7:2:1202:27471:23171,TTTGCGCTCTAATTC,TGGGACGTCCG,CTGACTGTGACTGTAACTGCTGCTACCCTTCTC,FFFFFFFFFFFFFFF,"F:F,FFFFFFF",FFFFF:FFFFFFFFF:FFFFFFFF:FFFFFFFF,37,11,25,11


## Look at representation by viral sequence

In [166]:
def cluster_barcodes(barcodes, max_distance=1):
    G = nx.Graph()
    G.add_nodes_from(barcodes)
    for a, b in combinations(barcodes, 2):
        if mismatches(a, b) <= max_distance:
            G.add_edge(a, b)
    return list(nx.connected_components(G))

In [209]:
viral_barcode_counts = wnv['viral'].value_counts()
len(viral_barcode_counts)

398

In [216]:
ct = pd.crosstab(wnv['viral'], wnv['midgut'])
ct_sorted = ct.assign(row_sum=ct.sum(axis=1)).sort_values('row_sum', ascending=False).drop(columns='row_sum')
for col in ct_sorted.columns:
    ct_sorted[f'{col}_rank'] = ct_sorted[col].rank(ascending=False, method='min')

print(len(ct_sorted))
ct_sorted.head(20)

398


midgut,mg3,mg4,mg6,mg7,mg8,mg9,mg3_rank,mg4_rank,mg6_rank,mg7_rank,mg8_rank,mg9_rank
viral,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CTCACCGTTACTGTCACTGCAGCAACCCTACTT,324,61,43,132,242,25,3.0,1.0,1.0,2.0,1.0,1.0
CTGACGGTAACAGTGACAGCAGCAACCCTACTT,378,15,20,137,95,17,1.0,12.0,6.0,1.0,5.0,3.0
CTGACAGTTACCGTTACAGCAGCCACGCTTCTG,235,29,43,92,205,22,6.0,7.0,1.0,3.0,2.0,2.0
CTGACAGTCACAGTGACTGCCGCGACTCTTCTT,283,14,28,53,146,15,4.0,13.0,4.0,6.0,3.0,4.0
CTGACGGTCACAGTTACTGCTGCTACCCTGCTC,329,4,0,7,41,1,2.0,30.0,102.0,59.0,28.0,43.0
CTAACTGTAACCGTCACTGCGGCAACTCTGCTC,237,8,12,79,30,14,5.0,21.0,17.0,4.0,38.0,5.0
CTGACTGTGACCGTGACTGCTGCCACTCTACTA,218,0,1,20,49,0,7.0,98.0,68.0,25.0,21.0,80.0
CTTACCGTCACCGTAACCGCAGCCACTCTACTC,131,50,18,34,37,3,10.0,3.0,9.0,12.0,32.0,22.0
CTTACCGTAACGGTAACTGCCGCCACGCTGCTT,181,1,2,35,45,4,9.0,57.0,55.0,11.0,24.0,17.0
CTGACTGTTACTGTCACGGCTGCCACTCTCCTT,9,8,17,71,131,5,46.0,21.0,10.0,5.0,4.0,12.0


In [171]:
clusters = cluster_barcodes(ct_sorted.index)
print(len(clusters), "clusters formed")

In [210]:
mapping = {} # map the sequence to its cluster number
collapsed = {} # same as mapping but it's to the representative sequence
repr_seqs = []
for i,cluster in enumerate(clusters):
    seqs = list(cluster)
    repr_seqs.append(f"{seqs[0]}_{i}")
    for seq in seqs:
        mapping[seq] = i
        collapsed[seq] = repr_seqs[-1]

collapsed

{'CTCACCGTTACTGTGACTGCAGCAACCCTACTT': 'CTCACCGTTACTGTGACTGCAGCAACCCTACTT_0',
 'CTCACCGTTACTGTCACTGCAGCAACCCTACTT': 'CTCACCGTTACTGTGACTGCAGCAACCCTACTT_0',
 'CTCACCGTAACTGTCACTGCAGCAACCCTACTT': 'CTCACCGTTACTGTGACTGCAGCAACCCTACTT_0',
 'CTCACCGTTACTGTCACTGCAGCAACCCTACTG': 'CTCACCGTTACTGTGACTGCAGCAACCCTACTT_0',
 'CTCACTGTTACTGTCACTGCAGCAACCCTACTT': 'CTCACCGTTACTGTGACTGCAGCAACCCTACTT_0',
 'CTCACCGTTACTGTCACTGCAGCAACCCTCCTT': 'CTCACCGTTACTGTGACTGCAGCAACCCTACTT_0',
 'CTCACCGTTACTGTCACAGCAGCAACCCTACTT': 'CTCACCGTTACTGTGACTGCAGCAACCCTACTT_0',
 'CTCACCGTTACAGTCACTGCAGCAACCCTACTT': 'CTCACCGTTACTGTGACTGCAGCAACCCTACTT_0',
 'CTCACCGTTACTGTCACTGCAGCAACCCTACTA': 'CTCACCGTTACTGTGACTGCAGCAACCCTACTT_0',
 'CTCACCGTTACTGTCACTGCAGCAACACTACTT': 'CTCACCGTTACTGTGACTGCAGCAACCCTACTT_0',
 'CTCACCGTTACTGTCACTGCAGCAACCCTTCTT': 'CTCACCGTTACTGTGACTGCAGCAACCCTACTT_0',
 'CTGACGGTAACAGTGACCGCAGCAACCCTACTA': 'CTGACGGTAACAGTGACCGCAGCAACCCTACTA_1',
 'CTGACGGTAACAGTGACAGCAGCAACCCTACTA': 'CTGACGGTAACAGTGACCGCAGCAACCCTACTA_1',

In [213]:
representative_viral_barcode = [ collapsed[s] for s in wnv['viral']]
wnv.insert(0, "rep_wnv", representative_viral_barcode)
wnv

Unnamed: 0,rep_wnv,midgut,ID,cellbarcode,umi,viral,cellbarcode_quality,umi_quality,viral_quality,min_qual_cellbarcode,min_qual_umi,min_qual_virus,overall_quality
0,CTGACCGTGACGGTTACTGCGGCTACCCTGCTG_30,mg3,A00405:682:HV27LDSX5:1:1101:7898:2143,TGGGCGTAGTCGAGT,GTGCCGCTCTA,CTGACCGTGACGGTTACTGCGGCTACCCTGCTT,FFFFFFFFFFFFFFF,FFFFFFF:FFF,FFFFFFFFFFF:FFFFFFFFFFF::FFFFFFFF,37,25,25,25
1,CTAACCGTTACTGTGACTGCGGCGACACTCCTT_10,mg3,A00405:682:HV27LDSX5:1:1101:10022:4539,CAGGTGCGCTCTAAT,TCTGGGACGTC,CTTACCGTTACTGTGACTGCGGCGACACTCCTT,FFFFFFFFF:FFFFF,FFFFFFFFFF:,FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF,25,25,37,25
2,CTCACCGTTACCGTTACAGCAGCCACTCTACTC_16,mg3,A00405:682:HV27LDSX5:1:1101:26576:5619,GTAACTGTCTTCACA,GACGGGTTCAC,CTCACCGTTACCGTTACAGCAGCCACTCTACTC,FFFFFFFFFFFFFFF,FFFFFFFFFFF,FFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFF,37,37,25,25
3,CTGACAGTCACAGTGACTGCCGCGACACTTCTG_3,mg3,A00405:682:HV27LDSX5:1:1101:24343:7764,ACGGGTGTGGTGCGC,TCTAATTCTGG,CTGACAGTCACAGTGACTGCCGCGACTCTTCTT,F:FFF::::FFFF:F,FF:FFFFF:FF,FFFFF:FFF:FFFFFFFFFFF:F:FFFFFFFFF,25,25,25,25
4,CTAACTGTAACCGTCACTGCTGCAACTCTGCTC_5,mg3,A00405:682:HV27LDSX5:1:1101:23149:8234,ATTGGTGTGGTGCGC,TCTAATTCTGG,CTAACTGTAACCGTCACTGCGGCAACTCTGCTC,"F,FFFFFF,:,F:::","FFFFFF,,F,F",":F:F:F::FF:F,F:FF:::F:,,,FF:,FF::",11,11,11,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10449,CTAACTGTAACAGTTACTGCTGCTACTCTACTT_21,mg9,A00405:732:HCVWNDSX7:2:1201:27326:21919,GGGACCTCCGTGGCC,ACGATGCCATC,CTAACTGTAACAGTTACTGCTGCTACTCTACTT,"F,,FFFFFFFFF:FF",",FFFFFFFFFF","FF,F,,,:FFFFF,FF:::FFF,,F,F,FF,FF",11,11,11,11
10450,CTTACTGTTACAGTCACTGCAGCCACTCTGCTT_101,mg9,A00405:732:HCVWNDSX7:2:1201:28583:35900,GTTCCAACTTTCTTC,TGCATGATGGG,CTTACTGTTACAGTCACTGCAGCCACTCTGCTT,"FFFFFFFFFF,FFFF",FFFFF:FFF::,"FFFFFFFFFF:F:FFFFFFFFF,FFFFFFFFF:",11,25,11,11
10451,CTCACCGTAACCGTCACAGCTGCTACGCTGCTT_78,mg9,A00405:732:HCVWNDSX7:2:1202:9914:11209,AAACGGGTTCACTAC,TACTGCAGCTA,CTCACCGTAACCGTCACAGCTGCTACGCTGCTT,"FFFF,::FFFF:FFF",:FFFFFFFF:F,"F:FFF:F:FFF,FF:F::,FFFF:FFFFFFFFF",11,25,11,11
10452,CTGACTGTGACTGTAACTGCTGCTACCCTTCTC_51,mg9,A00405:732:HCVWNDSX7:2:1202:27471:23171,TTTGCGCTCTAATTC,TGGGACGTCCG,CTGACTGTGACTGTAACTGCTGCTACCCTTCTC,FFFFFFFFFFFFFFF,"F:F,FFFFFFF",FFFFF:FFFFFFFFF:FFFFFFFF:FFFFFFFF,37,11,25,11


In [215]:
ct = pd.crosstab(wnv['rep_wnv'], wnv['midgut'])
ct_sorted = ct.assign(row_sum=ct.sum(axis=1)).sort_values('row_sum', ascending=False).drop(columns='row_sum')
for col in ct_sorted.columns:
    ct_sorted[f'{col}_rank'] = ct_sorted[col].rank(ascending=False, method='min')

print(len(ct_sorted))
ct_sorted.head(20)


185


midgut,mg3,mg4,mg6,mg7,mg8,mg9,mg3_rank,mg4_rank,mg6_rank,mg7_rank,mg8_rank,mg9_rank
rep_wnv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CTCACCGTTACTGTGACTGCAGCAACCCTACTT_0,331,61,45,135,247,25,2.0,1.0,1.0,2.0,1.0,1.0
CTGACGGTAACAGTGACCGCAGCAACCCTACTA_1,389,17,20,142,99,17,1.0,12.0,8.0,1.0,6.0,3.0
CTGACCGTTACCGTTACAGCAGCCACGCTTCTG_2,240,29,45,96,210,24,6.0,7.0,1.0,3.0,2.0,2.0
CTGACAGTCACAGTGACTGCCGCGACACTTCTG_3,294,15,29,54,159,16,4.0,14.0,5.0,7.0,3.0,4.0
CTAACTGTAACCGTCACTGCTGCAACTCTGCTC_5,245,8,12,80,30,15,5.0,21.0,18.0,4.0,37.0,5.0
CTGACGGTCACAGTTACTGCTGCAACCCTGCTC_4,331,4,0,7,45,1,2.0,30.0,81.0,60.0,25.0,43.0
CTGACTGTGACCGTGACTGCTGCCACTCTGCTA_6,227,0,1,21,50,0,7.0,81.0,67.0,24.0,21.0,71.0
CTTACCGTCACCGTAACCGCAGCCACTCTACTC_7,133,51,19,35,37,3,10.0,3.0,10.0,11.0,31.0,23.0
CTTACCGTAACGGTAACTGCCGCCACGCTGCTG_8,187,1,3,35,46,4,9.0,55.0,46.0,11.0,24.0,18.0
CTGACTGTTACAGTCACGGCTGCCACTCTCCTT_9,9,8,20,74,135,5,47.0,21.0,8.0,5.0,4.0,13.0


In [203]:
#
# this was to test the hamming functions
#
seqs = list(clusters[0])
N = 0
n = len(seqs)
accumulative_mismatches = 0
for i in range(n-1):
    for j in range(i, n):
        
        k = mismatches(seqs[i],seqs[j])
        accumulative_mismatches += k
        if False: 
            print(f"{i:2d} ", seqs[i])
            print(f"{j:2d} ", end=" ")
            for a,b in zip(seqs[i], seqs[j]):
                if a == b: 
                    print(b, end="")
                else:
                    print("*", end="")
            print(f"\t{k}\t{accumulative_mismatches}\n")
print(accumulative_mismatches, math.comb(n, 2))

import itertools
accumulative_mismatches = 0
for i,j in itertools.combinations(range(n),2):

    k = mismatches(seqs[i],seqs[j])
    accumulative_mismatches += k

print(accumulative_mismatches, math.comb(n, 2), accumulative_mismatches/math.comb(n, 2))
        
#hamming_distance(seqs[1],seqs[2])
#average_hamming(clusters[0])

98 55
98 55 1.7818181818181817


In [124]:
result = wnv.groupby("cellbarcode").agg(
        ave_mismatch_wnv = ("viral", average_hamming),
        n_seqs = ("viral", "count"),
        min_qual = ("overall_quality", "min")
    ).reset_index()
result

Unnamed: 0,cellbarcode,ave_mismatch_wnv,n_seqs,min_qual
0,AAACCTGCGCTAACC,0.0,2,11.0
1,AAACCTGCTTCTCGT,0.0,1,25.0
2,AAACCTGGTGATAAA,0.0,2,11.0
3,AAACGGGAACCATGT,0.0,1,25.0
4,AAACGGGGTCCATCC,0.0,1,11.0
...,...,...,...,...
1619,TTTGTCAAGACACTA,0.0,1,11.0
1620,TTTGTCAAGCTCCTT,0.0,2,11.0
1621,TTTGTCACACATGTG,0.0,1,11.0
1622,TTTGTCACATGCGCA,0.0,1,25.0


In [125]:
top_candidates = result[(result["ave_mismatch_wnv"] > 0) & (result["min_qual"] >= 25)].sort_values(by="min_qual", ascending=False)
top_candidates

Unnamed: 0,cellbarcode,ave_mismatch_wnv,n_seqs,min_qual
580,CCGTTCACTACTACT,4.5,2,37.0
1610,TTTCTCGTACTGTCT,6.0,3,37.0
1022,GCTTCCAAACAGAGC,6.0,3,37.0
195,AGACGTTCGTGGCCA,6.666667,3,25.0
206,AGAGTGGTGCGCTCT,11.0,4,25.0
242,AGGCCGTTGTCCGCC,4.0,2,25.0
284,AGTTGGGTGTGGTGC,10.25,4,25.0
386,CAACCTCTCAATCTC,4.5,2,25.0
314,ATCCACCACAGCGTT,4.0,3,25.0
403,CACAAACAGCTACCT,3.5,2,25.0


In [113]:
top_qual_barcodes = list(top_candidates['cellbarcode'])

In [114]:
top_qual_barcodes

['TTTACTGCATGATGG',
 'ACATGGGTGTGGTGC',
 'ATCCAACTTTCTTCT',
 'AGCTCTAATTCTGGG',
 'GCTGGGTGTGGTGCG',
 'GGGTGTGGTGCGCTC',
 'GTCTAACTTTCTTCT',
 'GTCACAACTTTCTTC',
 'GTGTGGTGCGCTCTA',
 'TCACAGACGGGTTCA',
 'TGCGTGGTGCGCTCT',
 'TGTCCAACTTTCTTC']

In [115]:
all_top_qual = wnv[wnv["cellbarcode"].isin(top_qual_barcodes)].copy()

In [116]:
all_top_qual["cellbarcode"] = pd.Categorical(
    all_top_qual["cellbarcode"],
    categories=top_qual_barcodes,
    ordered=True
)

## Order by quality score and evaluate viral divergence

In [117]:
all_top_qual[["cellbarcode","umi","viral","overall_quality"]].sort_values("cellbarcode").reset_index().head(20)

Unnamed: 0,index,cellbarcode,umi,viral,overall_quality
0,634,TTTACTGCATGATGG,GTGTGGTGCGC,CTCACAGTCACAGTAACTGCTGCTACGCTCCTG,37.0
1,609,TTTACTGCATGATGG,GTGTGGTGCGC,CTGACCGTCACTGTAACTGCCGCTACTCTCCTT,37.0
2,657,ACATGGGTGTGGTGC,GCTCTAATTCT,CTTACCGTCACCGTAACCGCAGCCACTCTACTC,37.0
3,484,ACATGGGTGTGGTGC,GCTCTAATTCT,CTAACTGTTACGGTCACTGCAGCGACACTCCTA,25.0
4,618,ATCCAACTTTCTTCT,GCATGATGGGT,CTTACCGTTACTGTGACTGCGGCGACACTCCTT,25.0
5,328,ATCCAACTTTCTTCT,GCATGATGGGT,CTCACAGTGACCGTCACAGCCGCCACTCTCCTA,25.0
6,320,AGCTCTAATTCTGGG,ACGTCCGTGGC,CTCACCGTTACTGTCACTGCAGCAACCCTACTT,25.0
7,569,AGCTCTAATTCTGGG,ACGTCCGTGGC,CTCACAGTCACAGTAACCGCTGCAACGCTACTT,25.0
8,133,GCTGGGTGTGGTGCG,CTCTAATTCTG,CTCACAGTGACTGTAACAGCAGCTACGCTGCTA,25.0
9,492,GCTGGGTGTGGTGCG,CTCTAATTCTG,CTTACCGTTACTGTGACTGCGGCGACACTCCTT,37.0


## Verify the divergence of the viral sequences sharing the same cell bar code and UMI

In [119]:
x = all_top_qual[["cellbarcode","umi","viral","overall_quality"]].sort_values("cellbarcode").reset_index()
x.groupby(["cellbarcode", "umi"], observed=True).agg(
    min_quality = ("overall_quality", "min"),
    ave_mismatch_wnv=("viral", average_hamming),
    n_seqs=("viral", "count")
).sort_values(by="cellbarcode", ascending=True)
   

Unnamed: 0_level_0,Unnamed: 1_level_0,min_quality,ave_mismatch_wnv,n_seqs
cellbarcode,umi,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TTTACTGCATGATGG,GTGTGGTGCGC,37.0,3.0,2
ACATGGGTGTGGTGC,GCTCTAATTCT,25.0,5.0,2
ATCCAACTTTCTTCT,GCATGATGGGT,25.0,5.0,2
AGCTCTAATTCTGGG,ACGTCCGTGGC,25.0,3.5,2
GCTGGGTGTGGTGCG,CTCTAATTCTG,25.0,13.0,4
GGGTGTGGTGCGCTC,TAATTCTGGGA,25.0,5.333333,3
GTCTAACTTTCTTCT,GCATGATGGGT,25.0,7.333333,3
GTCACAACTTTCTTC,TGCATGATGGG,25.0,6.666667,3
GTGTGGTGCGCTCTA,ATTCTGGGACG,25.0,3.5,2
TCACAGACGGGTTCA,CTACTACTGCA,25.0,4.0,2
