# FASTA Processing
## Cross-Dataset Cleaning and De-duplication Pipeline for the MP / Non-MP Datasets

In [None]:
# Cell overview: clean the Shirafkan MP / Non-MP sequence tables.
#   1. Report total and unique sequence counts for both CSVs.
#   2. Find the sequences that occur in BOTH tables and drop them from both.
#   3. Drop repeated sequences inside each table (first occurrence is kept).
#   4. Write MP_clean.csv / Non_MP_clean.csv and print a summary table.
# Both input CSVs must provide a 'sequence' column.
import pandas as pd  # local import so this cell also runs on a fresh kernel

shirafkan_dir = '/content/drive/MyDrive/MP_Prediction_MB/Shirafkan'

df_MP = pd.read_csv(f'{shirafkan_dir}/MP.csv', encoding='latin1')
df_nonMP = pd.read_csv(f'{shirafkan_dir}/Non_MP.csv', encoding='latin1')

print("=" * 50)
print("DATASET ANALYSIS REPORT")
print("=" * 50)

print("\n1. INITIAL DATASET STATS:")
print(f"MP Dataset: {len(df_MP)} total sequences, {len(df_MP['sequence'].unique())} unique sequences.")
print(f"Non-MP Dataset: {len(df_nonMP)} total sequences, {len(df_nonMP['sequence'].unique())} unique sequences.")

set_MP_seqs = set(df_MP['sequence'])
set_nonMP_seqs = set(df_nonMP['sequence'])

# Sequences that are present in both tables.
common_seqs = set_MP_seqs & set_nonMP_seqs
num_common = len(common_seqs)

print("\n2. CROSS-DATASET DUPLICATES:")
print(f"Number of common sequences between MP and Non-MP: {num_common}")

if num_common > 0:
    # Shared sequences are dropped from both tables, not only from Non-MP.
    print(f"\n3. REMOVING {num_common} COMMON SEQUENCES FROM BOTH DATASETS:")

    df_MP_clean = df_MP[~df_MP['sequence'].isin(common_seqs)].copy()
    df_nonMP_clean = df_nonMP[~df_nonMP['sequence'].isin(common_seqs)].copy()

else:
    print("No common sequences found. Your datasets are clean in this regard.")
    df_MP_clean = df_MP.copy()
    df_nonMP_clean = df_nonMP.copy()

print("\n4. FINAL UNIQUE SEQUENCE COUNTS:")
print(f"MP unique sequences: {len(df_MP_clean['sequence'].unique())}")
print(f"Non-MP unique sequences: {len(df_nonMP_clean['sequence'].unique())}")
combined_clean_seqs = pd.concat([df_MP_clean['sequence'], df_nonMP_clean['sequence']])
print(f"Total unique sequences in combined dataset: {len(combined_clean_seqs.unique())}")

print("\n5. REMOVING DUPLICATES WITHIN DATASETS AND SAVING CLEANED DATASETS...")

# Keep only the first row of every repeated sequence inside each table.
df_MP_final_unique = df_MP_clean.drop_duplicates(subset=['sequence']).copy()
df_nonMP_final_unique = df_nonMP_clean.drop_duplicates(subset=['sequence']).copy()

print(f"MP Dataset after removing internal duplicates: {len(df_MP_final_unique)} sequences")
print(f"Non-MP Dataset after removing internal duplicates: {len(df_nonMP_final_unique)} sequences")

# Persist the de-duplicated tables next to the inputs.
df_MP_final_unique.to_csv(f'{shirafkan_dir}/MP_clean.csv', index=False)
df_nonMP_final_unique.to_csv(f'{shirafkan_dir}/Non_MP_clean.csv', index=False)
print("Cleaned datasets with unique sequences saved as 'MP_clean.csv' and 'Non_MP_clean.csv'")

print("\n6. FINAL SUMMARY:")
# 'Final Total' counts rows after the cross-dataset removal but BEFORE the
# internal de-duplication; 'Final Unique' counts the rows that were saved.
mp_rows_after_cross = len(df_MP_clean)
nonmp_rows_after_cross = len(df_nonMP_clean)
summary_data = {
    'Dataset': ['MP', 'Non-MP', 'Combined'],
    'Final Total': [
        mp_rows_after_cross,
        nonmp_rows_after_cross,
        mp_rows_after_cross + nonmp_rows_after_cross,
    ],
    'Final Unique': [
        len(df_MP_final_unique['sequence'].unique()),
        len(df_nonMP_final_unique['sequence'].unique()),
        len(set(df_MP_final_unique['sequence']) | set(df_nonMP_final_unique['sequence'])),
    ],
}

summary_df = pd.DataFrame(summary_data)
print(summary_df)

print("\n" + "=" * 50)
print("ANALYSIS COMPLETE")
print("=" * 50)

In [None]:
# Cell overview: remove from the cleaned MP / Non-MP tables every sequence
# that also occurs in either MPFit table, save the filtered tables, print a
# summary, and write a CSV listing each removed sequence together with the
# MPFit table(s) it was found in.
# All four input CSVs must provide a 'sequence' column.
import pandas as pd  # local import so this cell also runs on a fresh kernel

project_dir = '/content/drive/MyDrive/MP_Prediction_MB'

# Load the cleaned datasets.
# NOTE(review): these are read from DnaBinding/ although every message below
# calls them "Shirafkan" — confirm the intended directory.
df_MP_clean = pd.read_csv(f'{project_dir}/DnaBinding/MP_clean.csv')
df_nonMP_clean = pd.read_csv(f'{project_dir}/DnaBinding/Non_MP_clean.csv')

# Load the MPFit datasets
df_MP_MPFit = pd.read_csv(f'{project_dir}/MPFit/MP_clean.csv', encoding='latin1')
df_NonMP_MPFit = pd.read_csv(f'{project_dir}/MPFit/Non_MP_clean.csv', encoding='latin1')

print("=" * 50)
print("CROSS-DATASET DUPLICATE ANALYSIS WITH MPFit")
print("=" * 50)

# Distinct sequences of each MPFit table and of their union.
mpfit_mp_sequences = set(df_MP_MPFit['sequence'].unique())
mpfit_nonmp_sequences = set(df_NonMP_MPFit['sequence'].unique())
mpfit_all_sequences = mpfit_mp_sequences | mpfit_nonmp_sequences

print(f"MPFit MP Dataset: {len(mpfit_mp_sequences)} unique sequences")
print(f"MPFit Non-MP Dataset: {len(mpfit_nonmp_sequences)} unique sequences")
print(f"MPFit Combined: {len(mpfit_all_sequences)} unique sequences")

# Build each local sequence set once and reuse it for every intersection.
local_mp_sequences = set(df_MP_clean['sequence'])
local_nonmp_sequences = set(df_nonMP_clean['sequence'])

# Overlap with MPFit as a whole ...
common_with_MPFit_MP = local_mp_sequences & mpfit_all_sequences
common_with_MPFit_nonMP = local_nonmp_sequences & mpfit_all_sequences

# ... and broken down by the MPFit table the sequence comes from.
common_with_MPFit_MP_from_MP = local_mp_sequences & mpfit_mp_sequences
common_with_MPFit_MP_from_NonMP = local_mp_sequences & mpfit_nonmp_sequences
common_with_MPFit_nonMP_from_MP = local_nonmp_sequences & mpfit_mp_sequences
common_with_MPFit_nonMP_from_NonMP = local_nonmp_sequences & mpfit_nonmp_sequences

print("\n7. CROSS-DATASET DUPLICATES WITH MPFit:")
print(f"Common sequences between Shirafkan MP and MPFit: {len(common_with_MPFit_MP)}")
print(f"  - From MPFit MP: {len(common_with_MPFit_MP_from_MP)}")
print(f"  - From MPFit Non-MP: {len(common_with_MPFit_MP_from_NonMP)}")

print(f"Common sequences between Shirafkan Non-MP and MPFit: {len(common_with_MPFit_nonMP)}")
print(f"  - From MPFit MP: {len(common_with_MPFit_nonMP_from_MP)}")
print(f"  - From MPFit Non-MP: {len(common_with_MPFit_nonMP_from_NonMP)}")

# Row-level masks: True for every row whose sequence is shared with MPFit.
# Counting rows (not distinct sequences) keeps the "removed" figures equal to
# the before/after difference even if a table still holds repeated sequences.
mp_overlap_mask = df_MP_clean['sequence'].isin(common_with_MPFit_MP)
nonmp_overlap_mask = df_nonMP_clean['sequence'].isin(common_with_MPFit_nonMP)
rows_removed_MP = int(mp_overlap_mask.sum())
rows_removed_nonMP = int(nonmp_overlap_mask.sum())

has_overlap = bool(common_with_MPFit_MP or common_with_MPFit_nonMP)

if has_overlap:
    print("\n8. REMOVING COMMON SEQUENCES FROM SHIRAFKAN DATASETS:")

    df_MP_final = df_MP_clean[~mp_overlap_mask].copy()
    print(f"Removed {rows_removed_MP} sequences from Shirafkan MP dataset")

    df_nonMP_final = df_nonMP_clean[~nonmp_overlap_mask].copy()
    print(f"Removed {rows_removed_nonMP} sequences from Shirafkan Non-MP dataset")

else:
    print("No common sequences found with MPFit dataset.")
    df_MP_final = df_MP_clean.copy()
    df_nonMP_final = df_nonMP_clean.copy()

print("\n9. FINAL DATASET COUNTS AFTER MPFit CLEANING:")
print(f"Shirafkan MP unique sequences: {len(df_MP_final['sequence'].unique())}")
print(f"Shirafkan Non-MP unique sequences: {len(df_nonMP_final['sequence'].unique())}")
combined_final_seqs = pd.concat([df_MP_final['sequence'], df_nonMP_final['sequence']])
print(f"Total unique sequences in combined Shirafkan dataset: {len(combined_final_seqs.unique())}")

# Save the final cleaned datasets
df_MP_final.to_csv(f'{project_dir}/DnaBinding/MP_final_clean.csv', index=False)
df_nonMP_final.to_csv(f'{project_dir}/DnaBinding/Non_MP_final_clean.csv', index=False)
print("\nFinal cleaned datasets saved as 'MP_final_clean.csv' and 'Non_MP_final_clean.csv'")

print("\n10. FINAL SUMMARY:")
final_summary_data = {
    'Dataset': ['Shirafkan MP', 'Shirafkan Non-MP', 'Combined Shirafkan'],
    'After Internal Cleaning': [
        len(df_MP_clean),
        len(df_nonMP_clean),
        len(df_MP_clean) + len(df_nonMP_clean),
    ],
    'After MPFit Cleaning': [
        len(df_MP_final),
        len(df_nonMP_final),
        len(df_MP_final) + len(df_nonMP_final),
    ],
    'Removed due to MPFit': [
        rows_removed_MP,
        rows_removed_nonMP,
        rows_removed_MP + rows_removed_nonMP,
    ],
}

final_summary_df = pd.DataFrame(final_summary_data)
print(final_summary_df)

# Optional: save one record per removed sequence with its MPFit origin(s).
if has_overlap:
    mpfit_tables = (
        ('MPFit_MP', mpfit_mp_sequences),
        ('MPFit_NonMP', mpfit_nonmp_sequences),
    )
    removed_details = []

    for local_label, local_table, overlap_mask in (
        ('MP', df_MP_clean, mp_overlap_mask),
        ('Non-MP', df_nonMP_clean, nonmp_overlap_mask),
    ):
        # Walk the removed sequences in table order (each one once) so the
        # report is identical from run to run, unlike iterating a set.
        for seq in local_table.loc[overlap_mask, 'sequence'].drop_duplicates():
            found_in = [name for name, members in mpfit_tables if seq in members]
            removed_details.append({
                'sequence': seq,
                'shirafkan_source': local_label,
                'mpfit_sources': ', '.join(found_in),
            })

    removed_df = pd.DataFrame(removed_details)
    # NOTE(review): this report goes to Plant/ while the cleaned tables above
    # go to DnaBinding/ — confirm the destination is intended.
    removed_df.to_csv(f'{project_dir}/Plant/removed_sequences_due_to_MPFit_detailed.csv', index=False)
    print(f"\nDetailed list of {len(removed_details)} removed sequences saved to 'removed_sequences_due_to_MPFit_detailed.csv'")

print("\n" + "=" * 50)
print("CROSS-DATASET CLEANING COMPLETE")
print("=" * 50)

In [None]:
# Cell overview: find the rows of the cleaned Shirafkan MP / Non-MP tables
# whose sequence also occurs in an MPFit table, then drop the rows with the
# same index labels from the BF feature matrices and save those matrices.
# The sequence tables themselves are filtered in memory but not written here.
import pandas as pd  # local import so this cell also runs on a fresh kernel

project_dir = '/content/drive/MyDrive/MP_Prediction_MB'
shirafkan_dir = f'{project_dir}/Shirafkan'

# Load your cleaned Shirafkan datasets
df_MP_clean = pd.read_csv(f'{shirafkan_dir}/MP_clean.csv')
df_nonMP_clean = pd.read_csv(f'{shirafkan_dir}/Non_MP_clean.csv')

# Load the MPFit datasets
df_MP_MPFit = pd.read_csv(f'{project_dir}/MPFit/MP_clean.csv', encoding='latin1')
df_NonMP_MPFit = pd.read_csv(f'{project_dir}/MPFit/Non_MP_clean.csv', encoding='latin1')

# Load your feature matrices
my_MP_feature = pd.read_csv(f'{shirafkan_dir}/BF/BF_MP_selected.csv')
my_nonMP_feature = pd.read_csv(f'{shirafkan_dir}/BF/BF_Non_MP_selected.csv')

# Row labels found in the sequence tables are reused below to drop rows from
# the feature matrices, which is only meaningful when row i of a feature
# matrix belongs to row i of its sequence table. A differing row count proves
# the two are not aligned, so stop instead of dropping the wrong rows.
for table_label, sequence_table, feature_table in (
    ('MP', df_MP_clean, my_MP_feature),
    ('Non-MP', df_nonMP_clean, my_nonMP_feature),
):
    if len(sequence_table) != len(feature_table):
        raise ValueError(
            f"{table_label} feature matrix has {len(feature_table)} rows but the "
            f"{table_label} sequence table has {len(sequence_table)}; the tables "
            "must be row-aligned before indices can be removed from the features."
        )

print("=" * 50)
print("CROSS-DATASET DUPLICATE ANALYSIS WITH MPFit")
print("=" * 50)

# Distinct sequences of each MPFit table and of their union.
mpfit_mp_sequences = set(df_MP_MPFit['sequence'].unique())
mpfit_nonmp_sequences = set(df_NonMP_MPFit['sequence'].unique())
mpfit_all_sequences = mpfit_mp_sequences | mpfit_nonmp_sequences

print(f"MPFit MP Dataset: {len(mpfit_mp_sequences)} unique sequences")
print(f"MPFit Non-MP Dataset: {len(mpfit_nonmp_sequences)} unique sequences")
print(f"MPFit Combined: {len(mpfit_all_sequences)} unique sequences")

# Find common sequences between Shirafkan and MPFit
common_with_MPFit_MP = set(df_MP_clean['sequence']) & mpfit_all_sequences
common_with_MPFit_nonMP = set(df_nonMP_clean['sequence']) & mpfit_all_sequences

# Index labels of the rows that carry a shared sequence.
mp_overlap_mask = df_MP_clean['sequence'].isin(common_with_MPFit_MP)
nonmp_overlap_mask = df_nonMP_clean['sequence'].isin(common_with_MPFit_nonMP)
indices_to_remove_MP = df_MP_clean[mp_overlap_mask].index.tolist()
indices_to_remove_nonMP = df_nonMP_clean[nonmp_overlap_mask].index.tolist()

print("\n7. CROSS-DATASET DUPLICATES WITH MPFit:")
print(f"Common sequences between Shirafkan MP and MPFit: {len(common_with_MPFit_MP)}")
print(f"Indices to remove from MP_clean: {len(indices_to_remove_MP)}")
print(f"Common sequences between Shirafkan Non-MP and MPFit: {len(common_with_MPFit_nonMP)}")
print(f"Indices to remove from Non_MP_clean: {len(indices_to_remove_nonMP)}")

# Remove sequences from Shirafkan datasets using the indices
if indices_to_remove_MP or indices_to_remove_nonMP:
    print("\n8. REMOVING COMMON SEQUENCES FROM SHIRAFKAN DATASETS:")

    df_MP_final = df_MP_clean.drop(indices_to_remove_MP).copy()
    print(f"Removed {len(indices_to_remove_MP)} sequences from Shirafkan MP dataset")

    df_nonMP_final = df_nonMP_clean.drop(indices_to_remove_nonMP).copy()
    print(f"Removed {len(indices_to_remove_nonMP)} sequences from Shirafkan Non-MP dataset")

else:
    print("No common sequences found with MPFit dataset.")
    df_MP_final = df_MP_clean.copy()
    df_nonMP_final = df_nonMP_clean.copy()

print("\n9. FINAL DATASET COUNTS AFTER MPFit CLEANING:")
print(f"Shirafkan MP unique sequences: {len(df_MP_final['sequence'].unique())}")
print(f"Shirafkan Non-MP unique sequences: {len(df_nonMP_final['sequence'].unique())}")
combined_final_seqs = pd.concat([df_MP_final['sequence'], df_nonMP_final['sequence']])
print(f"Total unique sequences in combined Shirafkan dataset: {len(combined_final_seqs.unique())}")

# Apply the same removal to your feature matrices
print("\n10. APPLYING REMOVAL TO FEATURE MATRICES:")
print(f"Original MP feature matrix shape: {my_MP_feature.shape}")
print(f"Original Non-MP feature matrix shape: {my_nonMP_feature.shape}")

# Remove indices from MP feature matrix
if indices_to_remove_MP:
    my_MP_feature_cleaned = my_MP_feature.drop(indices_to_remove_MP).copy()
    print(f"Removed {len(indices_to_remove_MP)} rows from MP feature matrix")
else:
    my_MP_feature_cleaned = my_MP_feature.copy()
    print("No rows removed from MP feature matrix")

# Remove indices from Non-MP feature matrix
if indices_to_remove_nonMP:
    my_nonMP_feature_cleaned = my_nonMP_feature.drop(indices_to_remove_nonMP).copy()
    print(f"Removed {len(indices_to_remove_nonMP)} rows from Non-MP feature matrix")
else:
    my_nonMP_feature_cleaned = my_nonMP_feature.copy()
    print("No rows removed from Non-MP feature matrix")

print(f"Cleaned MP feature matrix shape: {my_MP_feature_cleaned.shape}")
print(f"Cleaned Non-MP feature matrix shape: {my_nonMP_feature_cleaned.shape}")

# Only the feature matrices are written; df_MP_final / df_nonMP_final stay in
# memory, so the closing message lists just the two files produced here.
my_MP_feature_cleaned.to_csv(f'{shirafkan_dir}/BF/BF_MP_selected_final.csv', index=False)
my_nonMP_feature_cleaned.to_csv(f'{shirafkan_dir}/BF/BF_Non_MP_selected_final.csv', index=False)

print("\nFinal cleaned feature matrices saved:")
print("- 'BF_MP_selected_final.csv'")
print("- 'BF_Non_MP_selected_final.csv'")

print("\n11. FINAL SUMMARY:")
final_summary_data = {
    'Dataset': ['Shirafkan MP', 'Shirafkan Non-MP', 'MP Features', 'Non-MP Features'],
    'Original Count': [
        len(df_MP_clean),
        len(df_nonMP_clean),
        len(my_MP_feature),
        len(my_nonMP_feature),
    ],
    'After MPFit Cleaning': [
        len(df_MP_final),
        len(df_nonMP_final),
        len(my_MP_feature_cleaned),
        len(my_nonMP_feature_cleaned),
    ],
    'Removed': [
        len(indices_to_remove_MP),
        len(indices_to_remove_nonMP),
        len(my_MP_feature) - len(my_MP_feature_cleaned),
        len(my_nonMP_feature) - len(my_nonMP_feature_cleaned),
    ],
}

final_summary_df = pd.DataFrame(final_summary_data)
print(final_summary_df)

print("\n" + "=" * 50)
print("CROSS-DATASET CLEANING COMPLETE")
print("=" * 50)

In [None]:
import pandas as pd
import numpy as np

# Cell overview: compare the "main" MP / Non-MP tables with the "external"
# MP / Non-MP tables, report every sequence that occurs in a main table AND
# in an external table, remove those sequences from all four tables, save the
# cleaned tables (only when something was removed) and print a summary.
# Every CSV must provide a 'sequence' column; the detailed report also reads
# 'protein_name'.

# Load the main datasets
df_MP = pd.read_csv(r'/content/MP_mpfit.csv', encoding='latin1')
df_nonMP = pd.read_csv(r'/content/Non_MP_mpfit.csv', encoding='latin1')

# Load the external datasets
df_ext_MP = pd.read_csv(r'/content/MP.csv', encoding='latin1')
df_ext_nonMP = pd.read_csv(r'/content/Non_MP.csv', encoding='latin1')

print("=" * 60)
print("COMPREHENSIVE DATASET ANALYSIS REPORT (WITH EXTERNAL SET)")
print("=" * 60)


def _sequence_counts(frames):
    """Return (row counts, unique-sequence counts) per frame, each list ending with the combined figure."""
    totals = [len(frame) for frame in frames]
    uniques = [len(frame['sequence'].unique()) for frame in frames]
    combined_unique = len(set().union(*(set(frame['sequence']) for frame in frames)))
    return totals + [sum(totals)], uniques + [combined_unique]


# 1. Print initial stats for all datasets
print("\n1. INITIAL DATASET STATS:")
for display_name, frame in (
    ('Main MP', df_MP),
    ('Main Non-MP', df_nonMP),
    ('External MP', df_ext_MP),
    ('External Non-MP', df_ext_nonMP),
):
    print(f"{display_name} Dataset: {len(frame)} total sequences, {len(frame['sequence'].unique())} unique sequences.")

# 2. Pool every sequence column to get global totals
all_sequences = pd.concat([
    df_MP['sequence'],
    df_nonMP['sequence'],
    df_ext_MP['sequence'],
    df_ext_nonMP['sequence'],
])

print(f"\nTotal sequences across all datasets: {len(all_sequences)}")
print(f"Total unique sequences across all datasets: {len(all_sequences.unique())}")

# 3. Sequence sets per table
set_main_MP = set(df_MP['sequence'])
set_main_nonMP = set(df_nonMP['sequence'])
set_ext_MP = set(df_ext_MP['sequence'])
set_ext_nonMP = set(df_ext_nonMP['sequence'])

# Only main-vs-external overlaps are collected; overlaps between the two main
# tables or between the two external tables are not part of these sets.
common_main_ext_MP = set_main_MP & set_ext_MP
common_main_ext_nonMP = set_main_nonMP & set_ext_nonMP
common_mainMP_extNonMP = set_main_MP & set_ext_nonMP
common_mainNonMP_extMP = set_main_nonMP & set_ext_MP

# Combine all common sequences
all_common_seqs = common_main_ext_MP | common_main_ext_nonMP | common_mainMP_extNonMP | common_mainNonMP_extMP
num_all_common = len(all_common_seqs)

print("\n2. GLOBAL CROSS-DATASET DUPLICATES:")
print(f"Number of common sequences across all datasets: {num_all_common}")

if num_all_common > 0:
    print("\nCommon sequences found across datasets:")
    for seq in list(all_common_seqs)[:10]:  # Show first 10 to avoid too much output
        print(f"  '{seq}'")
    if num_all_common > 10:
        print(f"  ... and {num_all_common - 10} more")

    # 4. Locate the affected rows in every table
    print("\n3. FINDING INDICES IN ALL DATASETS:")

    mp_common_mask = df_MP['sequence'].isin(all_common_seqs)
    nonmp_common_mask = df_nonMP['sequence'].isin(all_common_seqs)
    ext_mp_common_mask = df_ext_MP['sequence'].isin(all_common_seqs)
    ext_nonmp_common_mask = df_ext_nonMP['sequence'].isin(all_common_seqs)

    mp_common_indices = df_MP[mp_common_mask].index.tolist()
    nonmp_common_indices = df_nonMP[nonmp_common_mask].index.tolist()
    ext_mp_common_indices = df_ext_MP[ext_mp_common_mask].index.tolist()
    ext_nonmp_common_indices = df_ext_nonMP[ext_nonmp_common_mask].index.tolist()

    # The messages name the files that were actually loaded above.
    for source_file, hit_indices in (
        ('main MP_mpfit.csv', mp_common_indices),
        ('main Non_MP_mpfit.csv', nonmp_common_indices),
        ('external MP.csv', ext_mp_common_indices),
        ('external Non_MP.csv', ext_nonmp_common_indices),
    ):
        print(f"Indices in {source_file}: {hit_indices} ({len(hit_indices)} sequences)")

    # 5. Detailed report: one line per affected row, built with one pass per
    # table (using the masks above) instead of one scan per shared sequence.
    print("\n4. DETAILED REPORT OF COMMON ENTRIES:")
    report_parts = []
    for dataset_label, frame, hit_mask in (
        ('Main_MP', df_MP, mp_common_mask),
        ('Main_NonMP', df_nonMP, nonmp_common_mask),
        ('External_MP', df_ext_MP, ext_mp_common_mask),
        ('External_NonMP', df_ext_nonMP, ext_nonmp_common_mask),
    ):
        hits = frame.loc[hit_mask, ['sequence', 'protein_name']].rename(
            columns={'sequence': 'Sequence', 'protein_name': 'Protein_Name'}
        )
        if hits.empty:
            continue
        hits.insert(1, 'Dataset', dataset_label)
        hits.insert(2, 'Index', hits.index)
        report_parts.append(hits)

    # A stable sort groups the rows of each sequence together while keeping
    # the table order (and row order) inside every group.
    common_df = (
        pd.concat(report_parts, ignore_index=True)
        .sort_values('Sequence', kind='stable')
        .reset_index(drop=True)
    )
    common_details = common_df.to_dict('records')
    pd.set_option('display.max_colwidth', None)
    print(common_df)

    # 6. Remove the shared sequences from every table
    print("\n5. REMOVING COMMON SEQUENCES FROM ALL DATASETS:")

    df_MP_clean = df_MP[~mp_common_mask].copy()
    df_nonMP_clean = df_nonMP[~nonmp_common_mask].copy()
    df_ext_MP_clean = df_ext_MP[~ext_mp_common_mask].copy()
    df_ext_nonMP_clean = df_ext_nonMP[~ext_nonmp_common_mask].copy()

    print(f"Main MP after cleaning: {len(df_MP_clean)} sequences")
    print(f"Main Non-MP after cleaning: {len(df_nonMP_clean)} sequences")
    print(f"External MP after cleaning: {len(df_ext_MP_clean)} sequences")
    print(f"External Non-MP after cleaning: {len(df_ext_nonMP_clean)} sequences")

    # Each shared sequence sits in at least two tables, so the number of rows
    # removed is larger than the number of distinct shared sequences.
    total_rows_removed = (
        len(mp_common_indices)
        + len(nonmp_common_indices)
        + len(ext_mp_common_indices)
        + len(ext_nonmp_common_indices)
    )
    print(f"Total sequences removed from all datasets: {total_rows_removed} rows ({num_all_common} distinct sequences)")

    # 7. Unique sequences left after cleaning
    print("\n6. FINAL UNIQUE SEQUENCE COUNTS:")
    all_clean_sequences = pd.concat([
        df_MP_clean['sequence'],
        df_nonMP_clean['sequence'],
        df_ext_MP_clean['sequence'],
        df_ext_nonMP_clean['sequence'],
    ])

    print(f"Total unique sequences across all cleaned datasets: {len(all_clean_sequences.unique())}")

    # 8. Save the cleaned tables
    print("\n7. SAVING CLEANED DATASETS...")
    df_MP_clean.to_csv('/content/MP_clean.csv', index=False)
    df_nonMP_clean.to_csv('/content/Non_MP_clean.csv', index=False)
    df_ext_MP_clean.to_csv('/content/MP1_clean.csv', index=False)
    df_ext_nonMP_clean.to_csv('/content/Non_MP1_clean.csv', index=False)
    print("All cleaned datasets saved!")

else:
    print("No common sequences found across datasets. All datasets are clean.")
    # Nothing to remove: the "clean" tables are plain copies and no file is written.
    df_MP_clean = df_MP.copy()
    df_nonMP_clean = df_nonMP.copy()
    df_ext_MP_clean = df_ext_MP.copy()
    df_ext_nonMP_clean = df_ext_nonMP.copy()

# 9. Summary table: row and unique-sequence counts before and after cleaning
print("\n8. COMPREHENSIVE SUMMARY:")
initial_totals, initial_uniques = _sequence_counts(
    [df_MP, df_nonMP, df_ext_MP, df_ext_nonMP]
)
final_totals, final_uniques = _sequence_counts(
    [df_MP_clean, df_nonMP_clean, df_ext_MP_clean, df_ext_nonMP_clean]
)
summary_data = {
    'Dataset': ['Main_MP', 'Main_NonMP', 'External_MP', 'External_NonMP', 'Combined'],
    'Initial_Total': initial_totals,
    'Initial_Unique': initial_uniques,
    'Final_Total': final_totals,
    'Final_Unique': final_uniques,
}

summary_df = pd.DataFrame(summary_data)
print(summary_df)

print("\n" + "=" * 60)
print("COMPREHENSIVE ANALYSIS COMPLETE")
print("=" * 60)