# TF–IDF Feature Extraction for Protein Sequences
## This script reads MP and Non-MP sequences from CSV files, computes k-mer-based TF–IDF features (using amino acid tokens), and outputs a normalized TF–IDF feature matrix with protein labels for downstream machine learning analysis.

In [None]:
# Download the iFeature repository; the extraction code later in this notebook
# runs the script at /content/iFeature/iFeature.py.
!git clone https://github.com/Superzchen/iFeature.git

# Change the notebook's working directory to the cloned repository.
%cd /content/iFeature

In [None]:
import subprocess
import os
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Input CSV; convert_csv_to_fasta() requires 'protein_name' and 'sequence' columns.
INPUT_CSV_FILE = '/content/drive/MyDrive/MP_Prediction_MB/Shirafkan/Non_MP_final_clean.csv'
# Intermediate FASTA file written from the CSV; main() deletes it when it finishes.
TEMP_FASTA_FILE = '/content/temp_sequences.fasta'
# Directory that receives one '<feature_type>_features.tsv' file per descriptor.
BASE_OUTPUT_DIR = '/content/features'
# Destination of the merged feature table written by main().
COMBINED_OUTPUT_FILE = '/content/drive/MyDrive/MP_Prediction_MB/Shirafkan/BF/BF_Non_MP1.csv'
# Path of the iFeature command-line script executed by run_ifeature().
# NOTE(review): the constant name is misspelled ("IFEAURE"); run_ifeature()
# references it by this exact name, so a rename must update both places.
IFEAURE_SCRIPT_PATH = '/content/iFeature/iFeature.py'

# Ensure the per-descriptor output directory exists before any extraction runs.
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)

def convert_csv_to_fasta(csv_path, fasta_path):
    """
    Read a CSV with 'protein_name' and 'sequence' columns and write a FASTA file.

    Rows whose name or sequence is missing or blank are skipped rather than
    being written as the literal text "nan", which would otherwise be handed to
    the feature extractor as if it were a real record. Surrounding whitespace is
    stripped from both fields.

    Args:
        csv_path: Path of the input CSV file.
        fasta_path: Path of the FASTA file to create (overwritten if it exists).

    Returns:
        True when the FASTA file was written, False when any error occurred
        (the error is printed, not raised).
    """
    print(f"üîÑ Converting {os.path.basename(csv_path)} to FASTA format...")
    try:
        df = pd.read_csv(csv_path)

        if 'protein_name' not in df.columns or 'sequence' not in df.columns:
            raise ValueError("CSV file must contain 'protein_name' and 'sequence' columns.")

        # Drop rows where either field is NaN before formatting anything.
        records = df[['protein_name', 'sequence']].dropna()
        written = 0
        with open(fasta_path, 'w') as f_out:
            # zip over the two columns avoids the per-row Series that iterrows builds.
            for protein_name, sequence in zip(records['protein_name'], records['sequence']):
                name = str(protein_name).strip()
                residues = str(sequence).strip()
                if not name or not residues:
                    continue
                f_out.write(f">{name}\n{residues}\n")
                written += 1

        skipped = len(df) - written
        if skipped:
            print(f" Skipped {skipped} row(s) with a missing protein name or sequence.")
        print(f" Conversion successful. FASTA file created at: {fasta_path}")
        return True
    except Exception as e:
        # Best-effort contract: report the problem and let the caller decide.
        print(f" Error during CSV to FASTA conversion: {e}")
        return False

def run_ifeature(feature_type, fasta_file, output_dir, timeout=900):
    """
    Run iFeature extraction for a single feature type.

    The iFeature script is executed as a subprocess and is expected to write
    '<feature_type>_features.tsv' inside output_dir. A run only counts as
    successful when that file exists and is non-empty afterwards, so a
    subprocess that exits with code 0 without producing output is reported as
    a failure.

    Args:
        feature_type: iFeature descriptor name passed to '--type'.
        fasta_file: Path of the FASTA file passed to '--file'.
        output_dir: Directory that receives the descriptor's TSV file.
        timeout: Maximum run time of the subprocess in seconds.

    Returns:
        A tuple (feature_type, success, message). No exception is raised;
        every failure is reported through the tuple.
    """
    import sys  # local import: only needed to locate the running interpreter

    output_file = os.path.join(output_dir, f'{feature_type}_features.tsv')
    try:
        # Remove output left by an earlier run so it cannot be mistaken for
        # the result of this one.
        if os.path.exists(output_file):
            os.remove(output_file)

        subprocess.run(
            # Use the interpreter running this code so the subprocess sees the
            # same installed packages; fall back to 'python' if it is unknown.
            [sys.executable or 'python', IFEAURE_SCRIPT_PATH, '--file', fasta_file, '--type', feature_type, '--out', output_file],
            capture_output=True, text=True, check=True, timeout=timeout
        )

        if not os.path.isfile(output_file) or os.path.getsize(output_file) == 0:
            return feature_type, False, f"Error: no output was written to {output_file}"
        return feature_type, True, "Success"
    except subprocess.CalledProcessError as e:
        # Prefer stderr, but fall back to stdout / the exit code so the message
        # is never empty.
        details = (e.stderr or '').strip() or (e.stdout or '').strip() or f"exit code {e.returncode}"
        return feature_type, False, f"Error: {details}"
    except subprocess.TimeoutExpired:
        return feature_type, False, f"Timeout after {timeout} seconds"
    except Exception as e:
        return feature_type, False, f"An unexpected exception occurred: {str(e)}"

def extract_features_parallel(feature_types, fasta_file, output_dir, max_workers=12):
    """
    Extract features in parallel using a thread pool.

    One run_ifeature() call is submitted per feature type. Progress lines are
    printed as the runs finish, but the returned mapping is ordered like
    feature_types rather than by completion time, so repeated runs yield the
    same key order.

    Args:
        feature_types: Sequence of iFeature descriptor names.
        fasta_file: FASTA file handed to every run.
        output_dir: Directory that receives the per-descriptor output files.
        max_workers: Upper bound on concurrently running extractions.

    Returns:
        dict mapping feature type -> (success, message).
    """
    feature_types = list(feature_types)
    print(f"üöÄ Starting feature extraction for {len(feature_types)} types using up to {max_workers} workers...")
    outcomes = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_feature = {
            executor.submit(run_ifeature, ft, fasta_file, output_dir): ft
            for ft in feature_types
        }
        for future in as_completed(future_to_feature):
            feature_type = future_to_feature[future]
            try:
                feature, success, message = future.result()
            except Exception as e:
                outcomes[feature_type] = (False, f"Future submission failed: {str(e)}")
                print(f"  ‚ùå {feature_type:<15} | Future submission failed")
            else:
                outcomes[feature] = (success, message)
                status = "‚úÖ" if success else "‚ùå"
                print(f"  {status} {feature:<15} | {message}")
    # Re-key in the caller's order: completion order depends on thread timing.
    return {ft: outcomes[ft] for ft in feature_types if ft in outcomes}

def read_feature_file(file_path, feature_type):
    """
    Load one tab-separated feature file and normalise its column names.

    The first column is renamed to 'base_id'; identifiers containing '|' are
    reduced to their second '|'-separated field. Every other column is
    prefixed with '<feature_type>_'.

    Returns:
        The processed DataFrame, or None when the file has no rows or could
        not be processed (in which case the error is printed).
    """
    def _second_field(raw_id):
        # 'a|b|c' -> 'b'; identifiers without '|' are returned unchanged.
        pieces = raw_id.split('|')
        return pieces[1] if len(pieces) > 1 else raw_id

    try:
        table = pd.read_csv(file_path, sep='\t')
        if table.empty:
            return None

        first_column = table.columns[0]
        table = table.rename(columns={first_column: 'base_id'})
        table['base_id'] = table['base_id'].astype(str).apply(_second_field)

        prefixed = {}
        for column in table.columns:
            if column != 'base_id':
                prefixed[column] = f"{feature_type}_{column}"
        return table.rename(columns=prefixed)
    except Exception as e:
        print(f"‚ùå Error processing {os.path.basename(file_path)}: {e}")
        return None

def combine_features_parallel(output_dir, success_results):
    """
    Read the feature files of every successful extraction.

    Files are read concurrently, but the returned list follows the iteration
    order of success_results instead of the order in which the reads finish,
    so the column order of a later merge does not depend on thread timing.

    Args:
        output_dir: Directory containing '<feature_type>_features.tsv' files.
        success_results: Mapping feature type -> (success, message); only
            entries whose success flag is truthy are read.

    Returns:
        List of DataFrames as produced by read_feature_file(); files that
        yield no data are skipped with a printed notice. Empty list when no
        extraction succeeded.
    """
    succeeded = [ft for ft, (success, _msg) in success_results.items() if success]
    if not succeeded:
        print("No feature files were successfully generated to combine.")
        return []
    print(f"\nüìö Reading and processing {len(succeeded)} feature files...")

    loaded = {}
    with ThreadPoolExecutor(max_workers=min(8, len(succeeded))) as executor:
        future_to_feature = {
            executor.submit(
                read_feature_file,
                os.path.join(output_dir, f"{ft}_features.tsv"),
                ft,
            ): ft
            for ft in succeeded
        }
        for future in as_completed(future_to_feature):
            feature_type = future_to_feature[future]
            filename = f"{feature_type}_features.tsv"
            try:
                df = future.result()
            except Exception as e:
                print(f"  ‚ùå Error processing future for {filename}: {e}")
                continue
            if df is None or df.empty:
                # read_feature_file() returns None for empty or unreadable
                # files; report the skip so a missing descriptor is visible.
                print(f"  ‚ö†Ô∏è Skipped {filename}: No valid data found.")
                continue
            loaded[feature_type] = df
            print(f"  ‚úÖ Processed {filename}")

    # Emit in input order, not completion order.
    return [loaded[ft] for ft in succeeded if ft in loaded]

def merge_dataframes(dataframes):
    """
    Outer-join a list of dataframes on their shared 'base_id' column.

    The frames are folded left to right; an empty list yields an empty
    DataFrame and a single-element list yields that element unchanged.
    """
    if not dataframes:
        return pd.DataFrame()

    remaining = iter(dataframes)
    combined = next(remaining)
    for frame in remaining:
        combined = combined.merge(frame, on='base_id', how='outer')
    return combined

def main():
    """
    Run the full pipeline: CSV -> FASTA -> iFeature descriptors -> merged CSV.

    Steps:
      1. Convert INPUT_CSV_FILE to TEMP_FASTA_FILE.
      2. Run every listed iFeature descriptor in parallel into BASE_OUTPUT_DIR.
      3. Read the produced files, outer-merge them on 'base_id', fill missing
         values with 0 and write the table to COMBINED_OUTPUT_FILE.

    The temporary FASTA file is removed on every exit path.
    """
    try:
        if not convert_csv_to_fasta(INPUT_CSV_FILE, TEMP_FASTA_FILE):
            print("\nExiting script due to conversion failure.")
            return

        feature_types = [
           'AAC', 'DPC', 'TPC', 'CTDC', 'CTDT', 'CTDD',
           'CTriad', 'GAAC', 'GDPC', 'GTPC', 'PAAC', 'APAAC',
           'SOCNumber', 'QSOrder', 'NMBroto', 'Moran', 'Geary'
        ]
        extraction_results = extract_features_parallel(feature_types, TEMP_FASTA_FILE, BASE_OUTPUT_DIR)

        # Load the per-descriptor tables that were produced successfully.
        list_of_dfs = combine_features_parallel(BASE_OUTPUT_DIR, extraction_results)
        if not list_of_dfs:
            print("\nNo dataframes to merge. Exiting.")
            return

        print("\n Merging all features into a single dataframe...")
        combined_data = merge_dataframes(list_of_dfs)
        if combined_data.empty:
            print("Merging resulted in an empty dataframe. Check input files.")
            return

        # The outer merge leaves NaN where a descriptor has no row for a protein.
        combined_data = combined_data.fillna(0)

        # Create the destination folder first so the save cannot fail on a
        # missing directory after the extraction work has already been done.
        output_parent = os.path.dirname(COMBINED_OUTPUT_FILE)
        if output_parent:
            os.makedirs(output_parent, exist_ok=True)
        combined_data.to_csv(COMBINED_OUTPUT_FILE, index=False)
        print("\n--- All tasks complete! ---")
        print(f"Final combined dataset shape: {combined_data.shape[0]} sequences, {combined_data.shape[1]} columns (including base_id)")
        print(f"Results saved to: {COMBINED_OUTPUT_FILE}")

    finally:
        if os.path.exists(TEMP_FASTA_FILE):
            os.remove(TEMP_FASTA_FILE)
            print(f"\nüßπ Cleaned up temporary file: {TEMP_FASTA_FILE}")

if __name__ == "__main__":
    main()

In [None]:
# Make headers.txt based on chosen features
# Writes every entry of headers_list to /content/headers.txt, one per line.
# NOTE(review): headers_list is not defined in the visible cells — it must
# already exist in the notebook session before this cell is run.

with open('/content/headers.txt', 'w') as file:
    for header in headers_list:
        file.write(header + '\n')

In [None]:
# Positional index ranges, one per descriptor group (range end is exclusive):
# 60 SOCNumber, 100 QSOrder and 240 each for Moran, NMBroto and Geary.
# NOTE(review): which table these positions refer to is not visible here — confirm.
socnumber_indices = list(range(305, 365))
qsorder_indices = list(range(445, 545))
moran_indices = list(range(1483, 1723))
nmbroto_indices = list(range(1723, 1963))
geary_indices = list(range(1963, 2203))

# Concatenate the groups in a fixed order; this is the final column order.
selected_indices = socnumber_indices + qsorder_indices + moran_indices + nmbroto_indices + geary_indices

# Translate each index into a feature name by looking up position (i - 2) in
# feature_names['cleaned_headers'].
# NOTE(review): feature_names is not defined in the visible cells, and the
# reason for the -2 offset is not shown — presumably the indices are 1-based
# and count an extra leading column; verify against the source of the indices.
selected_features = [feature_names['cleaned_headers'][i-2] for i in selected_indices]

# adjusted_indices = [i - 2 for i in selected_indices]

# Select the named columns from df. Selecting a list of labels fails with a
# KeyError if any of them is not a column of df.
# NOTE(review): df is not defined in the visible cells.
selected_df_names = df[selected_features]

# names_from_indices = df.columns[[i - 2 for i in selected_indices]].tolist()

# match = names_from_indices == selected_features
# print("Do the indices correspond to the features? ", match)

# if not match:
#     mismatches = [(idx, name_idx, name_feat) for idx, name_idx, name_feat in zip(selected_indices, names_from_indices, selected_features) if name_idx != name_feat]
#     print("Mismatches (1-based index, name from index, name from features):", mismatches)


In [None]:
# Split the requested feature names into those present in df and those absent,
# so that absent ones can be added as constant columns instead of failing.
# NOTE(review): df and selected_features must already exist in the session.
existing_features = [col for col in selected_features if col in df.columns]
missing_features = [col for col in selected_features if col not in df.columns]

print(f"Found {len(existing_features)} existing features")
print(f"Missing {len(missing_features)} features: {missing_features}")

# Get existing columns
# (.copy() so the column assignments below act on an independent frame)
selected_df_names = df[existing_features].copy()

# Add missing columns filled with zeros
for feature in missing_features:
    selected_df_names[feature] = 0

# Reorder columns to match the original selected_features order
selected_df_names = selected_df_names[selected_features]

In [None]:
# Save the selected feature columns as CSV without the row index.
# NOTE(review): this writes under 'MPFit/BF', whereas the combined table above
# is written under 'Shirafkan/BF' — confirm the destination folder is intended.
selected_df_names.to_csv(r'/content/drive/MyDrive/MP_Prediction_MB/MPFit/BF/BF_Non_MP_selected.csv',index=False)