In [1]:
!pip install biopython



In [2]:
import os
import torch
from Bio import SeqIO
import pandas as pd
import random

In [3]:
def reduce_data(fasta_file, csv_file, reduced_amount, reduced_fasta_file, reduced_csv_file):
    # Read the FASTA file and select a random subset of sequences
    fasta_sequences = {}
    selected_identifiers = []
    with open(fasta_file, "r") as fasta:
        records = list(SeqIO.parse(fasta, "fasta"))
        random.shuffle(records)
        selected_records = records[:reduced_amount]
        for record in selected_records:
            fasta_sequences[record.id] = str(record.seq)
            selected_identifiers.append(record.id)

    # Read the CSV file and store conservation scores for matching identifiers
    csv_data = pd.read_csv(csv_file, delimiter='\t', index_col=0)
    reduced_csv_data = pd.DataFrame(
        columns=csv_data.columns)  # Create an empty DataFrame
    for identifier in selected_identifiers:
        # Search for partial matches in the index
        matching_index = csv_data.index[csv_data.index.str.contains(
            identifier)]
        if len(matching_index) > 0:
            reduced_csv_data = pd.concat(
                [reduced_csv_data, csv_data.loc[matching_index]])

    # Write reduced data to new files
    with open(reduced_fasta_file, "w") as fasta_reduced:
        with open(reduced_csv_file, "w") as csv_reduced:
            for identifier, sequence in fasta_sequences.items():
                fasta_reduced.write(f">{identifier}\n{sequence}\n")
            reduced_csv_data.to_csv(csv_reduced, sep='\t')


# reduce_data("curated_dataset/sequences.fasta", "curated_dataset/conservation_scores_formated.csv", 1000,"curated_dataset/reduced_input.fasta", "curated_dataset/reduced_input.csv")
reduce_data("curated_dataset/sequences.fasta", "curated_dataset/conservation_scores_formated.csv", 20000 ,"curated_dataset/reduced_input_20000.fasta", "curated_dataset/reduced_input_20000.csv")

# esm-extract esm2_t6_8M_UR50D curated_dataset/reduced_input.fasta curated_dataset/example_embeddings_esm2_reduced_input --repr_layers 0 5 6 --include mean per_tok

def get_embeddings_vectors_sample_data(folder_path):

    # Get the list of files in the folder
    files = os.listdir(folder_path)

    # Filter the files to only include .pt files
    pt_files = [file for file in files if file.endswith('.pt')]

    # Initialize a list to store the vectors
    vectors = []

    # Load the vectors from each .pt file
    for file in pt_files:
        file_path = os.path.join(folder_path, file)
        embeddings = torch.load(file_path)["representations"][6]
        #embeddings = torch.load(file_path)["representations"][6]
        float_vector = embeddings.numpy().astype(float)
        vectors.append(float_vector)
    return vectors


vectors = get_embeddings_vectors_sample_data('sample_data/example_embeddings_esm2')
print("vectors", len(vectors))


vectors 3


In [4]:
import pandas as pd

# Lire le fichier fasta et extraire les clés
fasta_file = "curated_dataset/sequences.fasta"
with open(fasta_file, "r") as f:
    fasta_keys = [line.strip().split()[0][1:] for line in f if line.startswith(">")]

# Lire le fichier Excel
excel_file = "embeddings_dict.xlsx"
df = pd.read_excel(excel_file, header=None, usecols=[0])

# Convertir la colonne de l'Excel en une liste de clés
excel_keys = df[0].tolist()

# Trouver les clés dans l'Excel qui ne sont pas dans le fichier fasta
keys_in_excel_not_in_fasta = set(excel_keys) - set(fasta_keys)

# Trouver les clés dans le fichier fasta qui ne sont pas dans l'Excel
keys_in_fasta_not_in_excel = set(fasta_keys) - set(excel_keys)

print("Clés dans l'Excel mais pas dans le fichier fasta :")
for key in keys_in_excel_not_in_fasta:
    print(key)

print("\nClés dans le fichier fasta mais pas dans l'Excel :")
for key in keys_in_fasta_not_in_excel:
    print(key)


Keys en el Excel pero no en el archivo fasta:
A0A131Z4G5.1/12-165 Pfam=PF09782.12 type=family
A0A058ZM29.1/429-454 Pfam=PF18412.4 type=domain
A0A2R6C946.1/11-93 Pfam=PF00352.24 type=domain
A0A5N3XT57.1/3-85 Pfam=PF01099.20 type=domain
A0A7J7AN84.1/338-538 Pfam=PF19712.2 type=domain
A0A2S6ZPA0.1/55-307 Pfam=PF09572.13 type=family
A0A5P2C7J4.1/1-181 Pfam=PF16157.8 type=family
A0A420IRX1.1/313-328 Pfam=PF01422.20 type=family
B2A5X8.1/345-509 Pfam=PF19364.2 type=family
A0A7S1EV37.1/40-153 Pfam=PF08613.14 type=family
A0A419GL35.1/32-110 Pfam=PF14110.9 type=family
A0A7S1PN89.1/202-221 Pfam=PF18345.4 type=domain
A0A1E5CFJ9.1/4-44 Pfam=PF11346.11 type=family
A0A2G2JH97.1/306-402 Pfam=PF10675.12 type=domain
D6A9Y9.1/6-117 Pfam=PF05331.14 type=family
A0A183WD41.1/10-68 Pfam=PF00219.21 type=domain
A0A1R3JBA4.1/29-239 Pfam=PF00314.20 type=domain
A0A1W4UT15.1/70-172 Pfam=PF11467.11 type=domain
A0A3D2W3W3.1/30-230 Pfam=PF07751.14 type=family
A0A3L6Q2N1.1/2-41 Pfam=PF18511.4 type=domain
A0A4X2KKJ0.1/