In [59]:
import pandas as pd

from opengsync_server.tools import tools

In [None]:
def min_hamming_distance(ref: str, s: list[str]) -> int:
    res = len(ref)
    for seq in s:
        res = min(res, _hamming_distance_shared_bases(ref, seq))
    return res



In [65]:
df = pd.DataFrame({
    'sample': ['S1', 'S2', 'S3', 'S4', 'S5', 'S6'],
    'sequence_i7': ['AAAA', 'ATTT', 'AATG', "ATTT", "CCCC", "TTTT"],
    'sequence_i5': ['TTTT', 'TTTC', 'CTTT', None, "TCCC", None],
    "pool": ['P1', 'P1', 'P1', 'P2', 'P2', 'P2']
})
df

Unnamed: 0,sample,sequence_i7,sequence_i5,pool
0,S1,AAAA,TTTT,P1
1,S2,ATTT,TTTC,P1
2,S3,AATG,CTTT,P1
3,S4,ATTT,,P2
4,S5,CCCC,TCCC,P2
5,S6,TTTT,,P2


In [90]:
def check_indices(df: pd.DataFrame, groupby: str | None = None) -> pd.DataFrame:
    df["error"] = None
    df["warning"] = None

    indices = ["sequence_i7"]
    if "sequence_i5" in df.columns and not df["sequence_i5"].isna().all():
        indices.append("sequence_i5")

    df["combined_index"] = ""
    for index in indices:
        df[index] = df[index].apply(lambda x: x.strip() if pd.notna(x) else "")
        _max = int(df[index].str.len().max())
        df["combined_index"] += df[index].str.ljust(_max, "N")
        
    if len(df) > 1:
        if "sequence_i5" in df.columns:
            same_barcode_in_different_indices = df["sequence_i7"] == df["sequence_i5"]
            df.loc[same_barcode_in_different_indices, "warning"] = "Same barcode in different indices"
        
        df["min_hamming_bases"] = None
        if groupby is None:
            df["min_hamming_bases"] = min_hamming_distances(df["combined_index"].tolist())
        else:
            for _, _df in df.groupby(groupby):
                if len(_df) < 2:
                    _df["min_hamming_bases"] = _df["combined_index"].apply(lambda x: len(x))
                else:
                    _df["min_hamming_bases"] = min_hamming_distances(_df["combined_index"].tolist())
                df.loc[_df.index, "min_hamming_bases"] = _df["min_hamming_bases"]
            
    else:
        df["min_hamming_bases"] = 1

    df.loc[df["min_hamming_bases"] < 1, "error"] = "Hamming distance of 0 between barcode combination in two or more libraries on same lane."
    df.loc[df["min_hamming_bases"] < 3, "warning"] = "Small hamming distance between barcode combination in two or more libraries on same lane."

    return df

In [91]:
check_indices(df, groupby=None)[["sample", "combined_index", "min_hamming_dist", "min_hamming_bases", "error", "warning"]]

Unnamed: 0,sample,combined_index,min_hamming_dist,min_hamming_bases,error,warning
0,S1,AAAATTTT,0.375,3,,
1,S2,ATTTTTTC,0.0,0,Hamming distance of 0 between barcode combinat...,Small hamming distance between barcode combina...
2,S3,AATGCTTT,0.25,2,,Small hamming distance between barcode combina...
3,S4,ATTTNNNN,0.0,0,Hamming distance of 0 between barcode combinat...,Small hamming distance between barcode combina...
4,S5,CCCCTCCC,0.5,4,,
5,S6,TTTTNNNN,0.25,1,,Small hamming distance between barcode combina...


In [34]:

df["i7_hamming_distance"] = min_hamming_distances(df["sequence_i7"].fillna('').tolist())
df["i5_hamming_distance"] = min_hamming_distances(df["sequence_i5"].fillna('').tolist())
df["combined_hamming_distance"] = min_hamming_distances(df["combined_index"].tolist())
df

Unnamed: 0,sample,sequence_i7,sequence_i5,combined_index,i7_hamming_distance,i5_hamming_distance,combined_hamming_distance
0,S1,AAAA,TTTT,AAAATTTT,2,0,3
1,S2,ATTT,TTTC,ATTTTTTC,0,0,0
2,S3,AATG,CTTT,AATGCTTT,2,0,2
3,S4,ATTT,,ATTTNNNN,0,0,0
4,S5,CCCC,TCCC,CCCCTCCC,4,0,4
5,S6,TTTT,,TTTTNNNN,1,0,1


In [None]:
tools.check_indices()