In [None]:
!pip install biopython
!pip install python-Levenshtein

In [None]:
from tqdm import tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from Bio import SeqIO
import os
from collections import Counter
import torch
from Levenshtein import distance


In [None]:
def readDataFromFile(filenames):
    """Load the records of several FASTA files into one DataFrame.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the FASTA files to read, processed in the given order.

    Returns
    -------
    pandas.DataFrame
        One row per FASTA record, indexed 0..n-1, with two columns:
        ``"ID"``  - the record description with its first ``|``-separated
        field dropped, and ``"Sequence"`` - the record sequence as a string.
        Both columns are always present, even when ``filenames`` is empty or
        no file contains a record, so callers can index them unconditionally.
    """
    columns = ["ID", "Sequence"]
    records = []  # One dict per FASTA record, across all files

    for filename in filenames:
        file_path = os.path.abspath(filename)  # Ensure absolute path
        for record in SeqIO.parse(file_path, "fasta"):
            records.append(
                {
                    # Keep everything after the first "|" of the description
                    "ID": "|".join(record.description.split("|")[1:]),
                    "Sequence": str(record.seq),
                }
            )

    # Passing `columns` guarantees the schema even for an empty record list
    return pd.DataFrame(records, columns=columns)

In [None]:
def removeDuplicateSeq(datas, ids):
    """Drop repeated sequences, pairing each distinct sequence with one ID.

    Sequences are returned in order of first appearance; when a sequence
    occurs several times, the ID of its last occurrence is the one kept.

    Returns a tuple ``(sequences, ids)`` of two equally long lists.
    """
    # dict() keeps a key at its first insertion position while later
    # duplicates overwrite the stored value.
    seq_to_id = dict(zip(datas, ids))
    return list(seq_to_id), list(seq_to_id.values())

In [None]:
def removeDuplicateIds(datas, ids):
    """Keep a single sequence per ID, preferring copies without 'X'.

    Processing happens in two steps:

    1. Every entry whose ID occurs more than once AND whose sequence
       contains the character ``'X'`` is discarded. (If all copies of a
       duplicated ID contain ``'X'``, that ID disappears entirely.)
    2. Of the remaining entries, only the first occurrence of each ID is kept.

    Parameters
    ----------
    datas : sequence of str
        Sequences.
    ids : sequence
        IDs aligned with ``datas``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Filtered sequences and their IDs. Because ``np.unique`` is used for
        step 2, the result is ordered by sorted ID, not by input order.
        Empty input yields two empty arrays.
    """
    datas = np.asarray(datas)
    ids = np.asarray(ids)

    # IDs that appear more than once
    unique_ids, counts = np.unique(ids, return_counts=True)
    is_duplicated = np.isin(ids, unique_ids[counts > 1])

    # Explicit bool dtype keeps the mask invertible even for empty input
    has_x = np.array(["X" in seq for seq in datas], dtype=bool)

    # Step 1: drop duplicated-ID entries whose sequence contains 'X'
    keep = ~(is_duplicated & has_x)
    filtered_data = datas[keep]
    filtered_id = ids[keep]

    # Step 2: keep only the first occurrence of each remaining ID
    _, first_indices = np.unique(filtered_id, return_index=True)

    return filtered_data[first_indices], filtered_id[first_indices]

In [None]:
def removeXSeq(datas, ids, max_x_fraction=0.0):
    """Drop sequences that contain too many 'X' characters.

    Parameters
    ----------
    datas : sequence of str
        Sequences.
    ids : sequence
        IDs aligned with ``datas``.
    max_x_fraction : float, optional
        Largest tolerated fraction of ``'X'`` characters in a sequence.
        The default ``0.0`` keeps only sequences without any ``'X'``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The kept sequences and their IDs, in input order.
    """
    def _x_fraction(seq):
        # An empty sequence has no 'X'; guard against division by zero
        return seq.count("X") / len(seq) if len(seq) else 0.0

    # Step 1: compute the X fraction of every sequence
    x_fractions = np.array([_x_fraction(seq) for seq in datas], dtype=float)

    # Step 2: True for sequences to keep
    mask = x_fractions <= max_x_fraction

    # Step 3: apply the mask to both aligned arrays
    return (
        np.array(datas)[mask],
        np.array(ids)[mask]
    )

In [None]:
def removeIncompleteSeq(datas, ids, min_length=100):
    """Drop sequences shorter than ``min_length`` characters.

    Parameters
    ----------
    datas : iterable of str
        Sequences.
    ids : iterable
        IDs aligned with ``datas``.
    min_length : int, optional
        Minimum length (inclusive) a sequence needs to be kept.
        Defaults to 100.

    Returns
    -------
    (list, list)
        The kept sequences and their IDs, in input order. Plain Python lists
        are returned regardless of the input container type.
    """
    filtered_datas = []
    filtered_ids = []
    for data, id_ in zip(datas, ids):
        if len(data) >= min_length:
            filtered_datas.append(data)
            filtered_ids.append(id_)

    return filtered_datas, filtered_ids
    

In [None]:
from tqdm import tqdm
import numpy as np
from collections import defaultdict

def hamming_distance_np(arr1, arr2):
    """Count, for each row, the positions at which arr1 and arr2 differ.

    The element-wise comparison follows NumPy broadcasting and the
    mismatches are summed along axis 1.
    """
    mismatches = arr1 != arr2
    return mismatches.sum(axis=1)

def process_length_group(sequences, ids, threshold):
    """Greedily remove near-duplicate sequences from a same-length group.

    Entries whose ID contains the (case-sensitive) substring ``"Human"`` are
    considered first; the sort is stable, so input order is otherwise kept.
    A sequence is retained only if its normalized Hamming distance
    (mismatches / sequence length) to every previously retained sequence is
    at least ``threshold``.

    Parameters
    ----------
    sequences : sequence of str
        Sequences, all of the same length.
    ids : sequence of str
        IDs aligned with ``sequences``.
    threshold : float
        Minimum normalized Hamming distance to all retained sequences.

    Returns
    -------
    (list of str, list)
        Retained sequences and their IDs, in the order they were accepted.
    """
    # len() instead of truthiness so NumPy arrays are accepted as well
    if len(sequences) == 0:
        return [], []

    # Stable sort: IDs containing "Human" (key False) come before the rest
    sorted_data = sorted(zip(sequences, ids), key=lambda x: "Human" not in x[1])
    sorted_sequences, sorted_ids = zip(*sorted_data)

    # One row per sequence, one column per character
    np_sequences = np.array([list(seq) for seq in sorted_sequences], dtype="<U1")

    # Preallocated buffer for retained rows; avoids rebuilding an array from
    # a Python list on every iteration.
    kept = np.empty_like(np_sequences)
    kept_count = 0
    unique_ids = []

    for i in tqdm(range(len(np_sequences))):
        seq = np_sequences[i]

        # The first sequence is always retained; later ones must be far
        # enough from every row retained so far.
        if kept_count:
            dists = hamming_distance_np(kept[:kept_count], seq)
            normalized_dists = dists / len(seq)
            if not np.all(normalized_dists >= threshold):
                continue

        kept[kept_count] = seq
        unique_ids.append(sorted_ids[i])
        kept_count += 1

    # Convert the retained character rows back to strings
    unique_sequences = ["".join(row) for row in kept[:kept_count]]

    return unique_sequences, unique_ids

def remove_similar_sequences(sequences, ids, threshold=0.01/2):
    """Remove near-duplicate sequences, comparing only equal-length ones.

    Sequences are grouped by length and each group is filtered independently
    with ``process_length_group``; sequences of different lengths are never
    compared with each other.

    Parameters
    ----------
    sequences : sequence of str
        Sequences (list, tuple or NumPy array).
    ids : sequence
        IDs aligned with ``sequences``.
    threshold : float, optional
        Minimum normalized Hamming distance passed on to
        ``process_length_group``. Defaults to ``0.01/2``.

    Returns
    -------
    (list, list)
        Retained sequences and their IDs, concatenated group by group in
        order of first appearance of each length.
    """
    # len() instead of truthiness: `not array` is ambiguous for NumPy arrays
    if len(sequences) == 0:
        return [], []

    # Group (sequence, id) pairs by sequence length
    length_groups = defaultdict(list)
    for seq, id_ in zip(sequences, ids):
        length_groups[len(seq)].append((seq, id_))

    # Process each group separately
    unique_sequences = []
    unique_ids = []

    for seq_group in length_groups.values():
        group_sequences, group_ids = zip(*seq_group)
        u_seqs, u_ids = process_length_group(group_sequences, group_ids, threshold)
        unique_sequences.extend(u_seqs)
        unique_ids.extend(u_ids)

    return unique_sequences, unique_ids


In [None]:
# FASTA inputs, one file per region
fileNamesOriginalDatas=["/kaggle/input/ncbi-viruses/africa.fa",
                "/kaggle/input/ncbi-viruses/europe.fa",
                "/kaggle/input/ncbi-viruses/Australia.fa",
                "/kaggle/input/ncbi-viruses/asia.fa",
                "/kaggle/input/ncbi-viruses/south.fa",
                "/kaggle/input/ncbi-viruses/north.fa"
             ]

# Load every record and split the frame into aligned sequence / ID arrays
df=readDataFromFile(fileNamesOriginalDatas)
datas = df['Sequence'].values  # Returns a NumPy array
ids = df['ID'].values  # Returns a NumPy array
print(ids[0])
print(len(datas))
print(len(ids))

# Cleaning pipeline; each step takes and returns aligned (sequences, ids).
# removeIncompleteSeq returns plain lists, which remove_similar_sequences expects.
list_data,list_id=removeDuplicateSeq(datas,ids)  # was misspelled `xremoveDuplicateSeq` (NameError)
list_data,list_id=removeDuplicateIds(list_data,list_id)
list_data,list_id=removeXSeq(list_data,list_id)
list_data,list_id=removeIncompleteSeq(list_data,list_id)
list_data,list_id=remove_similar_sequences(list_data,list_id)

print(len(list_data))
print(len(list_id))

# Persist the cleaned data set as a two-column CSV (ID, Data)
df = pd.DataFrame({"ID": list_id, "Data": list_data})
df.to_csv("ncbi_data.csv", index=False)

print(f"Saved {len(list_data)} seq")


In [None]:
def readDataFromfile(filename):
    """Read a two-column CSV and derive per-record fields from its ID column.

    The two CSV columns are renamed to ``ID`` and ``Sequence``. Each ID is
    split on ``"|"`` and the following columns are derived from the parts:

    * ``Virus_ID`` - all parts after the first, concatenated without separator
    * ``Seq_ID``   - the first part
    * ``Class``    - the last part
    * ``Length``   - number of characters of ``Sequence``

    An ID that contains no ``"|"`` gets an empty string for ``Virus_ID``,
    ``Seq_ID`` and ``Class``.

    Returns a DataFrame with the columns
    ``Sequence, Virus_ID, Seq_ID, Class, Length``.
    """
    def _id_part(extract):
        # Wrap a parts -> str extractor so IDs without "|" map to "".
        return lambda raw_id: extract(raw_id.split("|")) if "|" in raw_id else ""

    table = pd.read_csv(os.path.abspath(filename))
    table.columns = ["ID", "Sequence"]

    raw_ids = table["ID"]
    table["Virus_ID"] = raw_ids.map(_id_part(lambda parts: "".join(parts[1:])))
    table["Seq_ID"] = raw_ids.map(_id_part(lambda parts: parts[0]))
    table["Class"] = raw_ids.map(_id_part(lambda parts: parts[-1]))
    table["Length"] = table["Sequence"].map(len)

    return table[["Sequence", "Virus_ID", "Seq_ID", "Class", "Length"]]

# Load the CSV from the Kaggle working directory and derive the
# Virus_ID / Seq_ID / Class / Length columns from it.
df=readDataFromfile("/kaggle/working/ncbi_data.csv")

In [None]:

# Normalize class names so the comparison below is case-insensitive
df["Class"] = df["Class"].str.lower()

# Binary label: 0 for "human", 1 for every other class
is_non_human = df["Class"] != "human"
labels = np.array(is_non_human.astype(int))

# Encode the string identifiers as integer codes (position in the sorted
# array of unique values).
_, ids = np.unique(df["Virus_ID"], return_inverse=True)
_, seq_ids = np.unique(df["Seq_ID"] + " " + df["Virus_ID"], return_inverse=True)

In [None]:
# Count rows per class, comparing case-insensitively against "human"
is_human = df["Class"].str.lower() == "human"
num_human = is_human.sum()
num_non_human = (~is_human).sum()
print("Number of human samples:", num_human)
print("Number of NON human samples:", num_non_human)


In [None]:
# Distinct Virus_ID values and how many rows carry each of them;
# only the number of distinct values is printed.
uniq,count=np.unique(df["Virus_ID"],return_counts=True)
print(len(uniq))

In [None]:
# Collect the encoded virus ids per label: label 0 goes to `human`, any
# other label to `animal`. An id seen with both labels lands in both sets.
human, animal = set(), set()
for virus_id, label in zip(ids, labels):
    (human if label == 0 else animal).add(virus_id)
print(len(animal))
print(len(human))

In [None]:
# Dump every encoded virus id that was seen with a non-zero label
print(animal)