In [1]:
!pip install biopython
!pip install python-Levenshtein

Collecting biopython
  Downloading biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp3

In [2]:
from tqdm import tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from Bio import SeqIO
import os
from collections import Counter
import torch
from Levenshtein import distance


In [3]:
def readDataFromFile(filenames):
    """Load several FASTA files into one DataFrame.

    Args:
        filenames: Iterable of paths to FASTA files.

    Returns:
        A DataFrame with one row per FASTA record and two columns:
        ``ID`` (the record description with its first "|"-separated field
        dropped) and ``Sequence`` (the record sequence as a string). Rows of
        all files are concatenated with a fresh integer index. An empty
        DataFrame is returned when ``filenames`` is empty.
    """

    def _load_one(filename):
        # Resolve to an absolute path before handing it to the FASTA parser.
        fasta_path = os.path.abspath(filename)
        rows = []
        for record in SeqIO.parse(fasta_path, "fasta"):
            header_fields = record.description.split("|")
            rows.append(
                {
                    "ID": "|".join(header_fields[1:]),
                    "Sequence": str(record.seq),
                }
            )
        return pd.DataFrame.from_records(rows)

    # One DataFrame per input file, in the order the files were given.
    frames = [_load_one(name) for name in filenames]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [4]:
def removeDuplicateSeq(datas, ids):
    """Collapse identical sequences so each one appears exactly once.

    Args:
        datas: Iterable of sequences (must be hashable).
        ids: Iterable of IDs, parallel to ``datas``.

    Returns:
        ``(sequences, ids)`` as two lists. Sequences appear in order of their
        first occurrence; when a sequence occurs several times, the ID of its
        last occurrence is the one kept.
    """
    # dict() keeps first-insertion order for keys while later pairs overwrite
    # the stored value, which yields "first position, last ID".
    id_by_sequence = dict(zip(datas, ids))
    return list(id_by_sequence), list(id_by_sequence.values())

In [5]:
def removeDuplicateIds(datas,ids):
    """Make IDs unique, preferring sequences without 'X' among duplicates.

    Two passes are applied:
      1. Every entry whose ID occurs more than once AND whose sequence
         contains 'X' is dropped.
      2. Of the remaining entries, only the first occurrence of each ID is
         kept.

    Args:
        datas: Sequence of sequence strings.
        ids: Sequence of IDs, parallel to ``datas``.

    Returns:
        ``(filtered_data, filtered_id)`` as NumPy arrays. Because the final
        selection uses ``np.unique``, the result is ordered by sorted ID, not
        by input order. Empty input yields two empty arrays.
    """
    datas = np.array(datas)
    ids = np.array(ids)

    # IDs that occur more than once in the input.
    unique_ids, counts = np.unique(ids, return_counts=True)
    duplicate_ids = set(unique_ids[counts > 1])

    # Step 1: flag entries that have a duplicated ID and contain 'X'.
    # dtype=bool is explicit so the mask can still be inverted with `~` when
    # the input is empty (an empty list would otherwise become float64).
    indices_to_remove = np.array(
        [
            (id_val in duplicate_ids and 'X' in data_val)
            for id_val, data_val in tqdm(zip(ids, datas))
        ],
        dtype=bool,
    )

    keep_mask = ~indices_to_remove
    filtered_data = datas[keep_mask]
    filtered_id = ids[keep_mask]

    # Step 2: keep only the first occurrence of each remaining ID.
    _, unique_indices = np.unique(filtered_id, return_index=True)

    return filtered_data[unique_indices], filtered_id[unique_indices]

In [6]:
def removeXSeq( datas, ids):
    """Drop every sequence that contains at least one 'X' character.

    Args:
        datas: Sequence of sequence strings.
        ids: Sequence of IDs, parallel to ``datas``.

    Returns:
        ``(datas, ids)`` as NumPy arrays containing only the entries whose
        sequence has no 'X'. Empty sequences contain no 'X' and are kept.
    """
    # A plain membership test is equivalent to "X frequency == 0" and avoids
    # dividing by the sequence length, which fails for an empty sequence.
    keep_mask = np.array(["X" not in seq for seq in datas], dtype=bool)
    return (
        np.array(datas)[keep_mask],
        np.array(ids)[keep_mask],
    )

In [7]:
def removeIncompleteSeq(datas, ids, min_length=100):
    """Drop sequences that are shorter than ``min_length``.

    Args:
        datas: Iterable of sequences.
        ids: Iterable of IDs, parallel to ``datas``.
        min_length: Minimum length (inclusive) a sequence must have to be
            kept. Defaults to 100, the cutoff this function always used.

    Returns:
        ``(filtered_datas, filtered_ids)`` as two lists, in input order.
    """
    kept_pairs = [
        (data, id_) for data, id_ in zip(datas, ids) if len(data) >= min_length
    ]
    filtered_datas = [data for data, _ in kept_pairs]
    filtered_ids = [id_ for _, id_ in kept_pairs]
    return filtered_datas, filtered_ids
    

In [8]:
from tqdm import tqdm
import numpy as np
from collections import defaultdict

def hamming_distance_np(arr1, arr2):
    """Count the positions at which ``arr1`` and ``arr2`` differ, per row.

    The two arguments are compared element-wise (NumPy broadcasting applies)
    and the mismatches are summed along axis 1.
    """
    mismatches = arr1 != arr2
    return np.sum(mismatches, axis=1)

def process_length_group(sequences, ids, threshold):
    """Greedily drop near-duplicate sequences within one equal-length group.

    Sequences are visited in a stable order that puts entries whose ID
    contains "Human" (case-sensitive) first. A sequence is kept only if its
    normalized Hamming distance to every sequence kept so far is at least
    ``threshold``.

    Args:
        sequences: Sequences of identical length.
        ids: IDs parallel to ``sequences``.
        threshold: Minimum normalized Hamming distance (mismatches divided by
            sequence length) required against every kept sequence.

    Returns:
        ``(unique_sequences, unique_ids)`` as two lists; the sequences are
        plain strings. ``([], [])`` when ``sequences`` is empty.
    """
    if not sequences:
        return [], []

    # Stable sort: entries with "Human" in the ID come first, so they win
    # over similar entries that would otherwise be kept in their place.
    ordered = sorted(zip(sequences, ids), key=lambda pair: "Human" not in pair[1])
    sorted_sequences, sorted_ids = zip(*ordered)

    # One row per sequence, one column per character.
    np_sequences = np.array([list(seq) for seq in sorted_sequences], dtype="<U1")

    # Kept rows live in a preallocated buffer; kept_rows[:kept_count] is a
    # view, so the kept set is never re-converted from a Python list.
    kept_rows = np.empty_like(np_sequences)
    kept_count = 0
    unique_ids = []

    for i in tqdm(range(len(np_sequences))):
        seq = np_sequences[i]

        if kept_count:
            # Distance of the candidate to every sequence kept so far.
            dists = hamming_distance_np(kept_rows[:kept_count], seq)
            normalized_dists = dists / len(seq)
            if not np.all(normalized_dists >= threshold):
                continue  # too close to an already-kept sequence

        kept_rows[kept_count] = seq
        kept_count += 1
        unique_ids.append(sorted_ids[i])

    # Convert the kept character rows back to strings.
    unique_sequences = ["".join(row) for row in kept_rows[:kept_count]]

    return unique_sequences, unique_ids

def remove_similar_sequences(sequences, ids, threshold=0.01/2):
    """Remove near-duplicate sequences, comparing only sequences of equal length.

    Args:
        sequences: Sequence strings.
        ids: IDs parallel to ``sequences``.
        threshold: Passed through to ``process_length_group`` as the minimum
            normalized Hamming distance between kept sequences.

    Returns:
        ``(unique_sequences, unique_ids)`` as two lists, with the length
        groups concatenated in the order each length first appeared.
    """
    if not sequences:
        return [], []

    # Bucket the (sequence, id) pairs by sequence length.
    by_length = defaultdict(list)
    for pair in zip(sequences, ids):
        by_length[len(pair[0])].append(pair)

    kept_sequences, kept_ids = [], []
    # De-duplicate every bucket independently and collect the survivors.
    for members in by_length.values():
        member_sequences, member_ids = zip(*members)
        group_sequences, group_ids = process_length_group(
            member_sequences, member_ids, threshold
        )
        kept_sequences.extend(group_sequences)
        kept_ids.extend(group_ids)

    return kept_sequences, kept_ids


In [9]:
# Input FASTA files to load (absolute paths inside the Kaggle input directory).
fileNamesOriginalDatas=["/kaggle/input/ncbi-viruses/africa.fa",
                "/kaggle/input/ncbi-viruses/europe.fa",
                "/kaggle/input/ncbi-viruses/Australia.fa",
                "/kaggle/input/ncbi-viruses/asia.fa",
                "/kaggle/input/ncbi-viruses/south.fa",
                "/kaggle/input/ncbi-viruses/north.fa"
             ]



# Load every record into one DataFrame and pull out the two columns as arrays.
df=readDataFromFile(fileNamesOriginalDatas)
datas = df['Sequence'].values  # Returns a NumPy array
ids = df['ID'].values  # Returns a NumPy array
# Sanity check: show one ID and confirm both arrays have the same length.
print(ids[0])
print(len(datas))
print(len(ids))

# Cleaning pipeline; each step consumes the previous step's output:
#   1. collapse identical sequences (the ID of the last occurrence is kept)
#   2. make IDs unique, dropping 'X'-containing entries of duplicated IDs first
#   3. drop every sequence that contains 'X'
#   4. drop sequences shorter than 100 characters
#   5. drop near-duplicates within each group of equal-length sequences
list_data,list_id=removeDuplicateSeq(datas,ids)
list_data,list_id=removeDuplicateIds(list_data,list_id)
list_data,list_id=removeXSeq(list_data,list_id)
list_data,list_id=removeIncompleteSeq(list_data,list_id)
list_data,list_id=remove_similar_sequences(list_data,list_id)

# Both lists must have the same length after filtering.
print(len(list_data))
print(len(list_id))

# Persist the cleaned data as a two-column CSV ("ID", "Data") in the working directory.
df = pd.DataFrame({"ID": list_id, "Data": list_data})
df.to_csv("ncbi_data.csv", index=False)

print(f"Saved {len(list_data)} seq")


PB2|Influenza A virus (A/Addis Ababa/1514A07305892N/2013(H3N2))|Ethiopia|2013|Human
203673
203673


195824it [00:00, 392813.43it/s]
100%|██████████| 22642/22642 [10:03<00:00, 37.52it/s]
100%|██████████| 422/422 [00:00<00:00, 1748.39it/s]
100%|██████████| 1783/1783 [00:03<00:00, 474.52it/s]
100%|██████████| 2565/2565 [00:09<00:00, 284.71it/s]
100%|██████████| 2424/2424 [00:05<00:00, 424.53it/s] 
100%|██████████| 457/457 [00:00<00:00, 1959.93it/s]
100%|██████████| 986/986 [00:01<00:00, 862.16it/s] 
100%|██████████| 969/969 [00:01<00:00, 867.99it/s] 
100%|██████████| 25/25 [00:00<00:00, 7455.22it/s]
100%|██████████| 80/80 [00:00<00:00, 7777.85it/s]
100%|██████████| 3/3 [00:00<00:00, 3021.11it/s]
100%|██████████| 4/4 [00:00<00:00, 11626.62it/s]
100%|██████████| 39/39 [00:00<00:00, 11105.08it/s]
100%|██████████| 26/26 [00:00<00:00, 13464.86it/s]
100%|██████████| 1/1 [00:00<00:00, 9404.27it/s]
100%|██████████| 7/7 [00:00<00:00, 10065.18it/s]
100%|██████████| 6/6 [00:00<00:00, 3591.01it/s]
100%|██████████| 1/1 [00:00<00:00, 7489.83it/s]
100%|██████████| 9/9 [00:00<00:00, 8192.00it/s]
100%|█

79012
79012
Saved 79012 seq


In [46]:
def readDataFromDF(filename):
    """Read a two-column CSV of (ID, sequence) and split the ID into fields.

    The CSV's two columns are renamed to ``ID`` and ``Sequence``. Each ID is
    split on "|" to derive:
      * ``Seq_ID``   - the field before the first "|"
      * ``Virus_ID`` - all fields after the first "|", joined with no separator
      * ``Class``    - the last field
    IDs that contain no "|" get an empty string for all three.
    ``Length`` is the length of the sequence.

    Args:
        filename: Path to the CSV file.

    Returns:
        A DataFrame with the columns ``Sequence``, ``Virus_ID``, ``Seq_ID``,
        ``Class`` and ``Length``.
    """

    def _virus_id(full_id):
        if "|" not in full_id:
            return ""
        return "".join(full_id.split("|")[1:])

    def _seq_id(full_id):
        if "|" not in full_id:
            return ""
        return full_id.split("|")[0]

    def _host_class(full_id):
        if "|" not in full_id:
            return ""
        return full_id.split("|")[-1]

    def _length(sequence):
        return len(sequence)

    # Resolve to an absolute path, then load the CSV.
    table = pd.read_csv(os.path.abspath(filename))
    table.columns = ["ID", "Sequence"]

    full_ids = table["ID"]
    table["Virus_ID"] = full_ids.apply(_virus_id)
    table["Seq_ID"] = full_ids.apply(_seq_id)
    table["Class"] = full_ids.apply(_host_class)
    table["Length"] = table["Sequence"].apply(_length)

    wanted_columns = ["Sequence", "Virus_ID", "Seq_ID", "Class", "Length"]
    return table[wanted_columns]

# Reload the cleaned CSV from the Kaggle working directory, split into ID fields.
# Note: this rebinds `df`, replacing the frame built in the cleaning cell.
df=readDataFromDF("/kaggle/working/ncbi_data.csv")

In [47]:
# Count distinct IDs per label: label 0 goes to `human`, anything else to `animal`.
# NOTE(review): `labels` is not defined in any visible cell, so this cell
# depends on kernel state from elsewhere and will raise NameError on a fresh
# Restart & Run All -- confirm where `labels` comes from.
# NOTE(review): `ids` was last assigned in the cleaning cell from the raw
# FASTA IDs; presumably a per-virus ID array was intended here -- verify.
human=set()
animal=set()
for virus_id,label in zip(ids,labels):
    if (label==0):
        human.add(virus_id)
    else:
        animal.add(virus_id)
print("number of aniaml viruses",len(animal))
print("number of human viruses",len(human))

number of aniaml viruses 29433
number of human viruses 14288


In [51]:
# Lower-case the host class so the comparison with "human" ignores case.
df["Class"]=df["Class"].str.lower()
non_human_df = df[df['Class'] != "human"]
human_df = df[df['Class'] == "human"]

# Randomly sample non-human virus IDs to balance the two classes.
# 14288 is the human-virus count printed by the counting cell.
# np.array(...) makes a copy so the in-place shuffle cannot touch pandas data.
# NOTE(review): no random seed is set, so the sample differs between runs.
non_human_virus_ids = np.array(non_human_df["Virus_ID"].unique())
np.random.shuffle(non_human_virus_ids)
non_human_virus_ids = non_human_virus_ids[:14288]

# Keep the singular spelling defined as well, pointing at the same sampled
# array, because other cells may refer to it under that name.
non_human_virus_id = non_human_virus_ids


In [52]:
# Keep only the rows whose virus is in the sampled non-human ID subset
# (`non_human_virus_ids`, the array truncated to 14288 entries).
nonhuman_df = df[df['Virus_ID'].isin(non_human_virus_ids)]


In [54]:
# Preview the first rows of the non-human subset.
nonhuman_df.head()

29399


Unnamed: 0,Sequence,Virus_ID,Seq_ID,Class,Length
4708,MKTIIVLSCFFCLAFSQNPSENNNNTATLCLGHHAVPNGTIVKTIT...,Influenza A virus (A/American Black Duck/Maine...,HA,avian,566
4709,MKTIIVLSCFFCLAFCQNPSENNNNTATLCLGHHAVPNGTIVKTIT...,Influenza A virus (A/American Black Duck/Ohio/...,HA,avian,566
4710,MKTIIVLSYLFCLALGQDYSENNNSTATLCLGHHAVPNGTIVKTIT...,Influenza A virus (A/American Black Duck/Virgi...,HA,avian,566
4711,MKTVIALSYIFCLAFGQNLLGNDNSTATLCLGHHAVPNGTVVKTIT...,Influenza A virus (A/American Green-Winged Tea...,HA,avian,566
4716,MKTIIVLSYLFCLALGQDYSENNNSTATLCLGHHAVPNGTIVKTIT...,Influenza A virus (A/American black duck/Maine...,HA,avian,566


In [55]:
# Stack the non-human subset on top of the human rows and renumber the index.
final_df = pd.concat([nonhuman_df, human_df], ignore_index=True)
# Total number of rows in the combined dataset.
print(len(final_df))
final_df.head(5)

47832


Unnamed: 0,Sequence,Virus_ID,Seq_ID,Class,Length
0,MKTIIVLSCFFCLAFSQNPSENNNNTATLCLGHHAVPNGTIVKTIT...,Influenza A virus (A/American Black Duck/Maine...,HA,avian,566
1,MKTIIVLSCFFCLAFCQNPSENNNNTATLCLGHHAVPNGTIVKTIT...,Influenza A virus (A/American Black Duck/Ohio/...,HA,avian,566
2,MKTIIVLSYLFCLALGQDYSENNNSTATLCLGHHAVPNGTIVKTIT...,Influenza A virus (A/American Black Duck/Virgi...,HA,avian,566
3,MKTVIALSYIFCLAFGQNLLGNDNSTATLCLGHHAVPNGTVVKTIT...,Influenza A virus (A/American Green-Winged Tea...,HA,avian,566
4,MKTIIVLSYLFCLALGQDYSENNNSTATLCLGHHAVPNGTIVKTIT...,Influenza A virus (A/American black duck/Maine...,HA,avian,566


In [62]:
def writeDataToFile(df, output_path="ncbi_data_final.csv"):
    """Write a two-column (``ID``, ``Data``) CSV built from ``df``.

    ``ID`` is ``Seq_ID``, ``Virus_ID`` and ``Class`` joined with "|";
    ``Data`` is the ``Sequence`` column. The first five rows of the written
    table are printed. The input frame is left unmodified.

    Args:
        df: DataFrame with the columns ``Seq_ID``, ``Virus_ID``, ``Class``
            and ``Sequence``.
        output_path: Destination CSV path. Defaults to
            ``"ncbi_data_final.csv"``.

    Returns:
        None.
    """
    # Build the output in a new frame instead of adding columns to the
    # caller's DataFrame.
    combined_id = df["Seq_ID"] + "|" + df["Virus_ID"] + "|" + df["Class"]
    final_df = pd.DataFrame({"ID": combined_id, "Data": df["Sequence"]})
    print(final_df.head(5))
    final_df.to_csv(output_path, index=False)
# Export the combined dataset to "ncbi_data_final.csv" in the current directory.
writeDataToFile(final_df)

                                                  ID  \
0  HA|Influenza A virus (A/American Black Duck/Ma...   
1  HA|Influenza A virus (A/American Black Duck/Oh...   
2  HA|Influenza A virus (A/American Black Duck/Vi...   
3  HA|Influenza A virus (A/American Green-Winged ...   
4  HA|Influenza A virus (A/American black duck/Ma...   

                                                Data  
0  MKTIIVLSCFFCLAFSQNPSENNNNTATLCLGHHAVPNGTIVKTIT...  
1  MKTIIVLSCFFCLAFCQNPSENNNNTATLCLGHHAVPNGTIVKTIT...  
2  MKTIIVLSYLFCLALGQDYSENNNSTATLCLGHHAVPNGTIVKTIT...  
3  MKTVIALSYIFCLAFGQNLLGNDNSTATLCLGHHAVPNGTVVKTIT...  
4  MKTIIVLSYLFCLALGQDYSENNNSTATLCLGHHAVPNGTIVKTIT...  
