In [None]:
import numpy as np
import pandas as pd 
from tqdm import tqdm
from collections import Counter
from itertools import product
import os
from utils import *
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, ConfusionMatrixDisplay
import gc
import joblib
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt


In [None]:
# Training pool: the two NCBI files stacked into a single frame. The file named
# "test" goes first, and the index is rebuilt as 0..n-1.
df_ncbi = read_data_from_file(file_name_ncbi_datas)
df_test_ncbi = read_data_from_file(ncbi_test_file_name)
df_train = pd.concat(
    objs=[df_test_ncbi, df_ncbi],
    ignore_index=True,
)

# Evaluation pool: the two GISAID files stacked the same way.
df_gasaid = read_data_from_file(file_name_gasaid_datas)
df_test_gasaid = read_data_from_file(gasaid_test_file_name)
df_test = pd.concat(
    objs=[df_test_gasaid, df_gasaid],
    ignore_index=True,
)

In [None]:
# Row counts of the two pools, training first, one number per line.
print(len(df_train), len(df_test), sep="\n")

In [None]:
# Every mer that `get_all_combinations` enumerates for `alphabet` at length `k`.
# NOTE(review): an earlier comment called these "m-mers (not k-mers!)" even though
# the length passed here is `k` — confirm the intended terminology.
combos = get_all_combinations(alphabet, k)

# mer -> position in `combos`, so lookups are a dict hit rather than a list scan.
combo_index = dict(zip(combos, range(len(combos))))

# Zero-initialised float64 vector with one slot per entry of `combos`.
v = np.zeros(len(combos), dtype=np.float64)

In [None]:
def MFV(S, alphabet, k):
    """Build a k-mer count matrix with one row per sequence.

    The column layout is derived from ``alphabet`` and ``k`` on every call, so
    the function does not depend on notebook-level variables and gives a
    correctly sized result for any ``(alphabet, k)`` pair.

    Parameters
    ----------
    S : sized iterable of sequences
        ``len(S)`` fixes the number of rows; each item is handed to
        ``get_kmers(seq, k)``. A pandas Series of strings works.
    alphabet :
        Forwarded to ``get_all_combinations(alphabet, k)`` to enumerate the
        feature columns.
    k : int
        Mer length, forwarded to both helpers.

    Returns
    -------
    numpy.ndarray, shape ``(len(S), n_mers)``, dtype ``float32``
        Entry ``[j, i]`` is the number of times the i-th enumerated mer occurs
        among the k-mers of the j-th sequence.

    Raises
    ------
    KeyError
        If a sequence yields a k-mer that is not among the enumerated mers
        (for example, a symbol that is not part of ``alphabet``).
    """
    # Column layout comes from the arguments, not from module-level state.
    mers = get_all_combinations(alphabet, k)
    column_of = {mer: col for col, mer in enumerate(mers)}

    V = np.zeros((len(S), len(mers)), dtype=np.float32)
    for j, seq in enumerate(tqdm(S, desc="Processing sequences")):
        # Tally occurrences as exact integers; float32 is only the storage type.
        for kmer, count in Counter(get_kmers(seq, k)).items():
            V[j, column_of[kmer]] = count
    return V

In [None]:
# Featurise the training sequences; the bare expression echoes the matrix shape.
vectors = MFV(S=df_train["Sequence"], alphabet=alphabet, k=k)
vectors.shape

In [None]:
# Binary target: 0 where the lower-cased "Class" equals "human", 1 otherwise.
labels = np.array(
    df_train["Class"].str.lower().ne("human").astype(int)
)
labels.size

In [None]:
# 80/20 train/validation split, stratified on the labels, fixed seed 42.
X_train, X_val, y_train, y_val = train_test_split(
    vectors,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Drop the notebook's references to the full feature matrix and label vector
# (the split arrays stay bound), then run an explicit collection pass.
# Re-running the split cell after this requires recomputing both names.
del vectors, labels
gc.collect()

In [None]:
# Features and binary target for the evaluation frame, built the same way as
# for the training frame (0 = class "human", 1 = anything else).
X_test = MFV(S=df_test["Sequence"], alphabet=alphabet, k=k)
y_test = np.array(
    df_test["Class"].str.lower().ne("human").astype(int)
)

In [None]:
# Random forest: 100 trees, seed 42, fitted on the training split.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rf.fit(X=X_train, y=y_train)

In [None]:
# Run the fitted random forest against X_test / y_test and keep the result as `preds`.
# NOTE(review): `RandomForest` is not defined in the visible cells — presumably it is
# provided by `from utils import *`; confirm there what it prints and returns.
preds = RandomForest(X_test,y_test,rf)

In [None]:
# In top-to-bottom order `preds` holds the result of the RandomForest(...) call above.
# The name is reused by the later model cells, so running cells out of order would
# pair the wrong predictions with this call.
# NOTE(review): `con_matrix` is presumably a confusion-matrix helper from `utils` — confirm.
con_matrix(y_test,preds)

In [None]:
# Persist the fitted random forest next to the notebook (relative path).
joblib.dump(value=rf, filename='RF_All_train_MFV.pkl')

In [None]:
# Logistic regression: up to 1000 solver iterations, seed 42, fitted on the training split.
lr = LogisticRegression(
    random_state=42,
    max_iter=1000,
)
lr.fit(X=X_train, y=y_train)

In [None]:
# Run the fitted logistic regression against X_test / y_test; this rebinds `preds`,
# replacing the value left by the earlier random-forest cell.
# NOTE(review): `LogisticReg` is not defined in the visible cells — presumably it is
# provided by `from utils import *`; confirm there what it prints and returns.
preds = LogisticReg(X_test,y_test, lr)

In [None]:
# In top-to-bottom order `preds` holds the result of the LogisticReg(...) call above.
# NOTE(review): `con_matrix` is presumably a confusion-matrix helper from `utils` — confirm.
con_matrix(y_test,preds)

In [None]:
# Persist the fitted logistic regression next to the notebook (relative path).
joblib.dump(value=lr, filename='LR_All_train_MFV.pkl')

In [None]:
# Support-vector classifier with an RBF kernel, seed 42, fitted on the training split.
svm = SVC(
    random_state=42,
    kernel='rbf',
)
svm.fit(X=X_train, y=y_train)

In [None]:
# Run the fitted SVC against X_test / y_test; this rebinds `preds`, replacing the
# value left by the earlier logistic-regression cell.
# NOTE(review): `SVM` is not defined in the visible cells — presumably it is
# provided by `from utils import *`; confirm there what it prints and returns.
preds = SVM(X_test,y_test, svm)

In [None]:
# In top-to-bottom order `preds` holds the result of the SVM(...) call above.
# NOTE(review): `con_matrix` is presumably a confusion-matrix helper from `utils` — confirm.
con_matrix(y_test,preds)

In [None]:
# Persist the fitted SVC next to the notebook (relative path).
joblib.dump(value=svm, filename='SVM_All_train_MFV.pkl')