In [14]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install necessary libraries
!pip install biopython

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
import pandas as pd
from Bio import SeqIO
import pandas as pd
import os
import shutil
import numpy as np
from Bio.SeqIO.FastaIO import SimpleFastaParser

In [45]:
# Define file paths
# Every location is absolute (rooted at the Drive mount point /content/drive),
# so the notebook keeps working even if the working directory is not /content.
LB2_DIR = '/content/drive/MyDrive/LB2'
TRAINING_DIR = f'{LB2_DIR}/second_try/SVM'   # training inputs and outputs
BENCHMARK_DIR = f'{LB2_DIR}/SVM'             # benchmarking inputs and outputs

# Training inputs
positive_fasta_path = f'{TRAINING_DIR}/Positive_training.fasta'
negative_fasta_path = f'{TRAINING_DIR}/Negative_training.fasta'
positive_txt_path = f'{TRAINING_DIR}/Positive_training.txt'

# Training / test outputs
output_training_csv = f'{TRAINING_DIR}/training_data.csv'
output_test_csv = f'{TRAINING_DIR}/test_set.csv'
output_test_fasta = f'{TRAINING_DIR}/test_set.fasta'

# Benchmarking inputs and outputs
tsv_file_path = f'{BENCHMARK_DIR}/benchmarking_posneg.tsv'
fasta_file_path = f'{BENCHMARK_DIR}/benchmarking_pos.fasta'
output_benchmarking_csv = f'{BENCHMARK_DIR}/shuffled_filtered_benchmarking_posneg.csv'
entry_file_path = f'{BENCHMARK_DIR}/benchmarking_all_entries.txt'
tsv_path = f'{LB2_DIR}/benchmarking.tsv'

# Function to extract entries from FASTA file
def extract_entries_from_fasta(file_path):
    """Return one entry per record of a FASTA file, in file order.

    Each record identifier is split on '|' and the second field is used as
    the entry (e.g. 'sp|P12345|NAME_ORG' -> 'P12345'). An identifier that
    contains no '|' is returned unchanged instead of raising IndexError.

    Parameters
    ----------
    file_path : str or path-like
        FASTA file to read; handed to ``SeqIO.parse`` with format "fasta".

    Returns
    -------
    list of str
        One entry per record; duplicates are kept.
    """
    entries = []
    for record in SeqIO.parse(file_path, "fasta"):
        fields = record.id.split('|')
        # Only identifiers with at least two '|'-separated fields carry a second field.
        entries.append(fields[1] if len(fields) > 1 else record.id)
    return entries

# Extract entries from FASTA files
positive_entries = extract_entries_from_fasta(positive_fasta_path)
negative_entries = extract_entries_from_fasta(negative_fasta_path)

# Read entries from the txt file (one entry per line).
# Whitespace is stripped and blank lines are skipped so that a trailing
# newline or stray space cannot produce an empty or padded 'Entry' value.
with open(positive_txt_path, 'r') as txt_handle:
    positive_txt_entries = [line.strip() for line in txt_handle if line.strip()]

# Create DataFrames for positive (Label 1) and negative (Label 0) entries
positive_df = pd.DataFrame(positive_entries + positive_txt_entries, columns=['Entry']).assign(Label=1)
negative_df = pd.DataFrame(negative_entries, columns=['Entry']).assign(Label=0)

# Combine and shuffle the DataFrames.
# A fixed seed makes the shuffle, and therefore the train/test split, reproducible
# across kernel restarts.
TRAINING_SHUFFLE_SEED = 42
combined_df = (
    pd.concat([positive_df, negative_df])
    .sample(frac=1, random_state=TRAINING_SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Split into 5 subsets.
# The row positions are split (not the DataFrame itself) because passing a
# DataFrame to np.array_split is deprecated; the subset sizes are the same.
subset_positions = np.array_split(np.arange(len(combined_df)), 5)
subsets = [combined_df.iloc[positions] for positions in subset_positions]

# Save the combined DataFrame to CSV
# NOTE(review): this file is written from the full combined_df, so it still
# contains the rows saved as the test set below; confirm that downstream
# training code excludes them.
combined_df.to_csv(output_training_csv, index=False)
print("Training DataFrame created and saved to CSV successfully!")

# Save one subset as the test set
test_set = subsets[0]
test_set.to_csv(output_test_csv, index=False)

# Extract sequences from the original FASTA files based on the test set entries.
# The test set is drawn from the combined positive + negative table, so both
# training FASTA files are scanned; reading only the positive file would
# silently drop every negative sequence from the output.
# NOTE(review): entries that came only from positive_txt_path are written only
# if they also occur in one of these two FASTA files; confirm that is intended.
test_entries = set(test_set['Entry'])

with open(output_test_fasta, 'w') as output_handle:
    for source_fasta_path in (positive_fasta_path, negative_fasta_path):
        with open(source_fasta_path, 'r') as fasta_file:
            for title, sequence in SimpleFastaParser(fasta_file):
                header_fields = title.split('|')
                if len(header_fields) > 1:
                    entry = header_fields[1]
                else:
                    # No '|' in the header: match on its first whitespace-delimited
                    # token instead of raising IndexError.
                    header_tokens = title.split()
                    entry = header_tokens[0] if header_tokens else title
                if entry in test_entries:
                    output_handle.write(f">{title}\n{sequence}\n")

print("Test set created and saved to CSV and FASTA successfully!")

# Read the tab-separated benchmarking table and show its first rows
tsv_data = pd.read_csv(tsv_file_path, sep='\t')
print("Head of benchmarking_posneg.tsv:", tsv_data.head(), sep="\n")

# Collect the first ten lines of the benchmarking FASTA file, stripped of
# surrounding whitespace. readline() returns '' once the file is exhausted,
# so the list always ends up with exactly ten items.
fasta_head = []
with open(fasta_file_path, 'r') as fasta_file:
    while len(fasta_head) < 10:
        fasta_head.append(fasta_file.readline().strip())

print("First 10 lines of benchmarking_pos.fasta:")
for line in fasta_head:
    print(line)

# Read the FASTA file entries.
# The shared helper is reused so benchmarking headers are parsed exactly like
# the training ones. Splitting the raw header line instead would keep a trailing
# newline when the entry is the last '|' field.
fasta_entries = set(extract_entries_from_fasta(fasta_file_path))

# Label TSV entries based on presence in FASTA entries (1 = present, 0 = absent)
tsv_data['label'] = tsv_data['Entry'].isin(fasta_entries).astype(int)

# Filter, shuffle, and save the DataFrame.
# A fixed seed keeps the row order reproducible across kernel restarts.
BENCHMARK_SHUFFLE_SEED = 42
filtered_df = (
    tsv_data[['Entry', 'Length', 'label']]
    .sample(frac=1, random_state=BENCHMARK_SHUFFLE_SEED)
    .reset_index(drop=True)
)
filtered_df.to_csv(output_benchmarking_csv, index=False)
print("Benchmarking data processed and saved to CSV successfully!")

# Extract and save the 'Entry' column to a text file (one entry per line, no header)
filtered_df['Entry'].to_csv(entry_file_path, index=False, header=False)
print("Entries saved to text file successfully!")

# Load benchmarking.tsv and show its first rows.
# The file is parsed with ',' as the delimiter even though its name ends in .tsv.
benchmarking_df = pd.read_csv(tsv_path, sep=',')
print("Head of benchmarking.tsv:", benchmarking_df.head(), sep="\n")


Training DataFrame created and saved to CSV successfully!
Test set created and saved to CSV and FASTA successfully!
Head of benchmarking_posneg.tsv:
     From   Entry   Entry Name  Length  \
0  Q27082  Q27082   CFGA_TACTR     673   
1  P54900  P54900    SCN2B_RAT     215   
2  O96910  O96910    ATT_APLCA      76   
3  P28335  P28335  5HT2C_HUMAN     458   
4  Q96PL1  Q96PL1  SG3A2_HUMAN      93   

                                      Signal peptide  \
0  SIGNAL 1..19; /evidence="ECO:0000269|PubMed:82...   
1  SIGNAL 1..29; /evidence="ECO:0000269|PubMed:85...   
2  SIGNAL 1..18; /evidence="ECO:0000269|PubMed:96...   
3  SIGNAL 1..32; /evidence="ECO:0000269|PubMed:22...   
4  SIGNAL 1..21; /evidence="ECO:0000269|PubMed:12...   

                                   Taxonomic lineage  
0  cellular organisms (no rank), Eukaryota (super...  
1  cellular organisms (no rank), Eukaryota (super...  
2  cellular organisms (no rank), Eukaryota (super...  
3  cellular organisms (no rank), Eukaryot

In [46]:
# Preview the first rows of the combined, shuffled Entry/Label table
combined_df.head()

Unnamed: 0,Entry,Label
0,Q94AI4,0
1,Q8VY16,0
2,P10896,0
3,P13385,1
4,O94626,0


In [47]:
# Preview the first rows of the table loaded from tsv_path
benchmarking_df.head()

Unnamed: 0,Entry,Length,label
0,Q9VCU5,323,0
1,Q8K2H6,185,0
2,Q9LKA0,259,0
3,Q501D2,572,0
4,Q06811,1365,0
