In [None]:
import numpy as np
import pandas as pd 
from tqdm import tqdm
from collections import Counter
from itertools import product
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import gc
import joblib
from utils import *

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression


In [None]:
# Frame used for training further down in the notebook.
df_train = read_data_from_file(file_name_ncbi_datas)

# NOTE(review): the validation frame is read from the same path as
# df_test_gasaid below, so validation and test data overlap — confirm that
# this is intended and not a wrong file-name constant.
df_val = read_data_from_file(gasaid_test_file_name)

# The two test sources are kept individually and also merged into one frame
# with a fresh 0..n-1 index.
df_test_gasaid = read_data_from_file(gasaid_test_file_name)
df_test_ncbi = read_data_from_file(ncbi_test_file_name)
df_test = pd.concat(objs=[df_test_gasaid, df_test_ncbi], ignore_index=True)

In [None]:
# Additional evaluation sets, read in the order listed; one frame per source.
df_Asian_flu, df_hong_kong_flu, df_pdmh1n1_flu, df_covid, df_cows = [
    read_data_from_file(source)
    for source in (
        Asian_flu_test,
        hong_kong_flu_test,
        pdmh1n1_flu_test,
        covid_test,
        cows_test,
    )
]

In [None]:
# Row counts of the raw training and test frames, one per line.
print(len(df_train), len(df_test), sep="\n")

In [None]:
# De-duplicate the training and validation frames (remove_duplicates comes
# from utils), then report how many rows remain in each, one per line.
df_train, df_val = remove_duplicates(df_train), remove_duplicates(df_val)

print(len(df_train), len(df_val), sep="\n")

In [None]:
# Filter the training frame against every validation / test / evaluation
# frame loaded above so that no evaluated sequence is also trained on.
# remove_common_sequences comes from utils; presumably it drops the rows of
# its first argument whose sequence also occurs in the second — confirm there.
#
# df_pdmh1n1_flu is included here: it is loaded together with the other
# evaluation sets but was previously the only one not filtered out of the
# training data, which risks train/test leakage for that set.
held_out_frames = [
    df_test,
    df_val,
    df_Asian_flu,
    df_hong_kong_flu,
    df_pdmh1n1_flu,
    df_covid,
    df_cows,
]
for held_out in held_out_frames:
    df_train = remove_common_sequences(df_train, held_out)

# df_val is not modified in this cell; its size is printed for reference only.
print(len(df_train))
print(len(df_val))

In [None]:
# Every possible m-mer over the alphabet (m-mers, not k-mers).
combos = get_all_combinations(alphabet, m)

# Zero-initialised matrix with alphabet_size rows and m columns.
pfm = np.zeros(shape=(alphabet_size, m))

# Zero vector holding one entry per m-mer combination.
v = np.zeros(shape=len(combos))

In [None]:
def Pwm2Vec(S, alphabet, k, m):
    """Embed each sequence as a flattened position matrix of its minimizers.

    For every sequence in ``S`` the minimizers returned by
    ``comp_minimizers(seq, k, m)`` are collected. A position frequency matrix
    (PFM) is then filled column by column with ``get_alphabet_count``,
    converted with ``comp_ppm`` and flattened into one output row.

    Parameters
    ----------
    S : sized iterable of sequences
        Must support ``len()`` and iteration (e.g. a pandas Series).
    alphabet :
        Forwarded to ``get_alphabet_count``.
    k :
        Forwarded to ``comp_minimizers`` (presumably the k-mer length —
        confirm in utils).
    m : int
        Minimizer length; also the number of PFM columns.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(S), alphabet_size * m)``.
    """
    # alphabet_size is a module-level constant; assumes it equals the number
    # of symbols in `alphabet` — TODO confirm.
    V = np.zeros((len(S), alphabet_size * m), dtype=np.float32)

    # Scratch buffer owned by this call and sized from the `m` argument.
    # Previously a module-level `pfm` created in another cell was reused,
    # which broke whenever `m` differed from the global value or that cell
    # had not been executed.
    pfm = np.zeros((alphabet_size, m))

    for j, seq in enumerate(tqdm(S, desc="Processing sequences")):
        A = comp_minimizers(seq, k, m)  # List of minimizers (m-mers)

        pfm.fill(0)
        for i in range(m):
            # Characters found at position i across all minimizers.
            col = [a[i] for a in A]
            # PFM[c][i] = count of character c at position i across all m-mers
            pfm[:, i] = get_alphabet_count(col, alphabet)

        # NOTE(review): if A is empty the PFM stays all-zero; how comp_ppm
        # handles that case is not visible here — verify.
        ppm = comp_ppm(pfm)

        V[j, :] = ppm.flatten()  # Row j holds the flattened matrix
    return V

In [None]:
# Feature matrix for the training sequences; the shape is shown as cell output.
X_train = Pwm2Vec(S=df_train['Sequence'], alphabet=alphabet, k=k, m=m)
X_train.shape

In [None]:
# Binary target: 0 when the lower-cased Class is "human", 1 otherwise.
train_is_non_human = df_train["Class"].str.lower().ne("human")
y_train = np.array(train_is_non_human.astype(int))
y_train.size

In [None]:
# Validation features and labels, built the same way as the training ones
# (label 0 when the lower-cased Class is "human", 1 otherwise).
X_val = Pwm2Vec(S=df_val['Sequence'], alphabet=alphabet, k=k, m=m)
val_is_non_human = df_val["Class"].str.lower().ne("human")
y_val = np.array(val_is_non_human.astype(int))

In [None]:
# Random forest with 100 trees and a fixed seed, fitted on the training features.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the fitted random forest on the validation split.
# RandomForest and con_matrix are helpers from utils; presumably the first
# returns predictions and the second shows a confusion matrix — confirm there.
preds = RandomForest(X_val, y_val, rf)
con_matrix(y_val, preds)

In [None]:
# Persist the fitted random forest to disk.
joblib.dump(value=rf, filename='RF_PWM2Vec.pkl')

In [None]:
# Logistic regression (C=10, up to 1000 iterations, fixed seed) fitted on the
# training features.
lr = LogisticRegression(C=10, max_iter=1000, random_state=42)
lr.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the fitted logistic regression on the validation split.
# LogisticReg and con_matrix are helpers from utils; presumably the first
# returns predictions and the second shows a confusion matrix — confirm there.
preds = LogisticReg(X_val, y_val, lr)
con_matrix(y_val, preds)

In [None]:
# Persist the fitted logistic regression to disk.
joblib.dump(value=lr, filename='LR_PWM2Vec.pkl')

In [None]:
# RBF-kernel SVM (C=5, probability estimates enabled, fixed seed) fitted on
# the training features.
svm = SVC(C=5, kernel='rbf', probability=True, random_state=42)
svm.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the fitted SVM on the validation split.
# SVM and con_matrix are helpers from utils; presumably the first returns
# predictions and the second shows a confusion matrix — confirm there.
preds = SVM(X_val, y_val, svm)
con_matrix(y_val, preds)

In [None]:
# Persist the fitted SVM to disk.
joblib.dump(value=svm, filename='SVM_PWM2Vec.pkl')