In [1]:
from src.tools import MMseqs
import pandas as pd
import numpy as np
from src.files import XMLFile, InterProScanFile, FASTAFile
from utils import * 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

%load_ext autoreload 
%autoreload 2

In [4]:
def get_filter_datasets(max_length:int=800, sequence_identity:float=0.95, test_size:float=0.2, random_state:int=42):
    '''Build the training and testing datasets for the filter and write them to CSV.

    The pipeline is:
      1. Load the proteins with load_ref_out() and pass them through remove_partial().
      2. Left-join the InterProScan annotations (hits with E-value at most 1e-5, duplicates dropped)
         on the index; the annotation columns are prefixed with 'interpro_'.
      3. Drop sequences longer than max_length.
      4. Assign a binary label from is_prodigal_error (1 when it returns True for the row).
      5. Drop rows with label 0 for which is_ncbi_error returns True.
      6. Cluster with MMseqs and keep only the cluster representatives.
      7. Split the remaining rows into training and testing sets and save both to ../data.

    :param max_length: Sequences with more than this many characters are removed.
    :param sequence_identity: Sequence identity threshold passed to MMseqs.cluster.
    :param test_size: Fraction of the clustered dataset assigned to the testing set.
    :param random_state: Seed passed to train_test_split so the split is reproducible.
    :return: A tuple (train_df, test_df) of DataFrames.
    '''
    df = load_ref_out()
    print(f'get_filter_datasets: Loaded {len(df)} proteins from ./ref.out')
    df = remove_partial(df)

    interpro_df = InterProScanFile('../data/putative_protein.interpro.tsv').to_df(max_e_value=1e-5, drop_duplicates=True)
    # Prefix the annotation columns so they cannot collide with columns already present in df.
    interpro_df.columns = ['interpro_' + col for col in interpro_df.columns]
    # validate='one_to_one' makes the merge raise if either index contains duplicate keys.
    df = df.merge(interpro_df, left_index=True, right_index=True, how='left', validate='one_to_one')

    df['length'] = df.seq.apply(len)
    mask = df.length > max_length
    print(f'get_filter_datasets: Removing {mask.sum()} proteins which exceed the maximum specified length of {max_length}.')
    df = df[~mask].copy() # Copy so the column assignment below does not write to a slice of the original frame.

    df['label'] = df.apply(is_prodigal_error, axis=1).astype(int)
    print(f'get_filter_datasets: {df.label.sum()} out of {len(df)} proteins labeled as Prodigal errors.')

    # Only rows currently labeled 0 are candidates for removal; rows labeled 1 are always kept.
    mask = df.apply(is_ncbi_error, axis=1) & (df.label == 0)
    print(f'get_filter_datasets: Removing {mask.sum()} spurious NCBI proteins from the set of "negative" test instances.')
    df = df[~mask]

    mmseqs = MMseqs()
    try:
        df = mmseqs.cluster(df, job_name='filter', sequence_identity=sequence_identity, reps_only=True, overwrite=True)
    finally:
        # Run cleanup even when clustering raises, so a failed run does not skip it.
        # NOTE(review): assumes cleanup() is safe to call after a failed cluster() -- confirm against MMseqs.
        mmseqs.cleanup()

    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    print(f'get_filter_datasets: Training dataset contains {len(train_df)} proteins, testing dataset contains {len(test_df)} proteins.')
    train_df.to_csv('../data/filter_dataset_train.csv')
    test_df.to_csv('../data/filter_dataset_test.csv')

    return train_df, test_df


# Build the datasets with the default arguments. Besides returning the two frames, the call
# also writes them to ../data/filter_dataset_train.csv and ../data/filter_dataset_test.csv.
train_df, test_df = get_filter_datasets()


get_filter_datasets: Loaded 464706 proteins from ./ref.out
remove_partial: Removing 4309 sequences marked as partial by both Prodigal and the reference.
get_filter_datasets: Removing 16946 proteins which exceed the maximum specified length of 800.
get_filter_datasets: 42823 out of 443451 proteins labeled as Prodigal errors.
get_filter_datasets Removing 0 spurious NCBI proteins from the set of "negative" test instances.
MMseqs.load: Removing 3210 non-cluster representatives.
get_filter_datasets: Training dataset contains 352192 proteins, testing dataset contains 88049 proteins.
