# TF–IDF Feature Extraction for Protein Sequences
## This script reads membrane protein (MP) and non-membrane protein (Non-MP) sequences from CSV files, computes k-mer-based TF–IDF features (using amino acid tokens), and outputs an L2-normalized TF–IDF feature matrix with protein names, sequences, and labels for downstream machine learning analysis.

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from math import log, sqrt
from itertools import product

def read_sequences_from_csv(csv_file_path, protein_name_col='protein_name', sequence_col='sequence'):
    """Load protein identifiers and their sequences from a CSV file.

    Args:
        csv_file_path: Path (or any other input accepted by
            ``pandas.read_csv``) of the CSV file to load.
        protein_name_col: Header of the column holding the protein names.
        sequence_col: Header of the column holding the sequences.

    Returns:
        A ``(sequence_ids, sequences)`` tuple of two plain Python lists,
        both in file row order.

    Raises:
        KeyError: If either requested column is absent from the file.
    """
    table = pd.read_csv(csv_file_path)

    # Select both columns first, then convert each one to a plain list.
    name_column = table[protein_name_col]
    residue_column = table[sequence_col]
    return name_column.tolist(), residue_column.tolist()

def extract_tfidf_features(sequences, kmer_length=2, predefined_aa=list('ACDEFGHIKLMNPQRSTUVWY')):
    """Compute L2-normalised k-mer TF-IDF features over a fixed vocabulary.

    The vocabulary is every string obtained by joining ``kmer_length``
    symbols of ``predefined_aa`` (in ``itertools.product`` order), so the
    number of feature columns depends only on the alphabet and on
    ``kmer_length``, never on the data. Overlapping k-mers are extracted
    from each sequence; k-mers outside the vocabulary are ignored.

    Args:
        sequences: Sized collection of sequences (strings). Each one is a
            "document"; sequences shorter than ``kmer_length`` contribute
            no k-mers.
        kmer_length: Length k of the k-mers; must be at least 1.
        predefined_aa: Alphabet symbols used to build the vocabulary. The
            argument is only read, never mutated. Repeated symbols are
            dropped (first occurrence wins) so every feature column is
            unique.

    Returns:
        A ``(tfidf_matrix, feature_names)`` tuple. ``tfidf_matrix`` is a
        float array of shape ``(len(sequences), n_features)`` whose rows are
        scaled to unit Euclidean length (rows with no weight stay all-zero).
        ``feature_names`` is a NumPy array with the k-mer of each column.

    Raises:
        ValueError: If ``kmer_length`` is smaller than 1.

    Note:
        The inverse document frequency is ``log((N + 1) / (df + 1))`` with
        no additive constant, so a k-mer that occurs in every sequence gets
        weight 0 and its column is all zeros. With ``kmer_length=1`` this
        can affect most residues.
    """
    if kmer_length < 1:
        # k = 0 would otherwise yield a single meaningless empty-string feature.
        raise ValueError(f"kmer_length must be a positive integer, got {kmer_length!r}")

    # 1. Fixed feature space: all k-mers over the (de-duplicated) alphabet.
    alphabet = list(dict.fromkeys(predefined_aa))
    feature_names = [''.join(km) for km in product(alphabet, repeat=kmer_length)]
    M = len(feature_names)  # e.g. 441 for k=2 and 21 symbols
    kmer_to_idx = {kmer: idx for idx, kmer in enumerate(feature_names)}
    print(f"Using a fixed feature space of {M} possible {kmer_length}-mers.")

    N = len(sequences)

    # 2. Term counts. Each sequence is tokenised exactly once and only the
    #    k-mers that actually occur are written, so the per-sequence cost is
    #    proportional to its length rather than to the vocabulary size.
    tfidf_matrix = np.zeros((N, M))
    for seq_idx, seq in enumerate(sequences):
        kmer_counts = Counter(
            seq[i:i + kmer_length] for i in range(len(seq) - kmer_length + 1)
        )
        for kmer, count in kmer_counts.items():
            col = kmer_to_idx.get(kmer)
            if col is not None:  # skip k-mers outside the vocabulary
                tfidf_matrix[seq_idx, col] = count

    # 3. Document frequency = number of sequences with a non-zero count,
    #    then the smoothed IDF.
    df_vector = np.count_nonzero(tfidf_matrix, axis=0)
    idf_vector = np.log((N + 1) / (df_vector + 1))

    # 4. TF-IDF, computed in place to avoid a second N x M allocation.
    tfidf_matrix *= idf_vector

    # 5. L2-normalise each row; all-zero rows are left untouched to avoid 0/0.
    norms = np.linalg.norm(tfidf_matrix, ord=2, axis=1)
    has_weight = norms > 0
    tfidf_matrix[has_weight] /= norms[has_weight, np.newaxis]

    return tfidf_matrix, np.array(feature_names)


# Alphabet for the k-mer vocabulary: the 20 standard residues plus 'U'.
my_amino_acids = list('ACDEFGHIKLMNPQRSTUVWY')

# Load the two classes from their respective CSV files.
sequenceMP_ids, sequencesMP = read_sequences_from_csv(
    "..../Shirafkan/MP_final_clean.csv"
)
sequenceNonMP_ids, sequencesNonMP = read_sequences_from_csv(
    "..../Shirafkan/Non_MP_final_clean.csv"
)

# Stack the classes with all MP entries first; the label column built below
# relies on this ordering.
all_sequence_ids = [*sequenceMP_ids, *sequenceNonMP_ids]
all_sequences = [*sequencesMP, *sequencesNonMP]

# Single-residue (k = 1) TF-IDF features over the combined corpus.
tfidf_matrix, feature_names = extract_tfidf_features(
    all_sequences, kmer_length=1, predefined_aa=my_amino_acids
)

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Number of features (k-mers): {len(feature_names)}")

# Assemble the output table: metadata columns first, then one column per k-mer.
tfidf_df = pd.DataFrame(tfidf_matrix, columns=feature_names)
metadata_columns = (
    ('protein_name', all_sequence_ids),
    ('sequence', all_sequences),
    ('label', ['MP'] * len(sequencesMP) + ['Non-MP'] * len(sequencesNonMP)),
)
for position, (column_name, column_values) in enumerate(metadata_columns):
    tfidf_df.insert(position, column_name, column_values)

tfidf_df.to_csv("..../Shirafkan/tfidf_kmer1_features_final.csv", index=False)