In [2]:
import config

from tqdm import tqdm
import warnings
import os
from hyena_dna.standalone_hyenadna import HyenaDNAModel
import torch
import pickle
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score


# Suppress every warning in this kernel. PYTHONWARNINGS is also exported
# because environment variables are inherited by child processes started
# from here.
warnings.simplefilter("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"

# Use config.DIR_ROOT as the working directory for the rest of the notebook.
os.chdir(config.DIR_ROOT)

In [3]:
# Values of k to evaluate; each one selects the input table "<k>.csv"
# (presumably the k-mer size used to build the embeddings — TODO confirm).
ks = list(range(4, 8))

# Logistic Regression

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd
import os, re

# Per-k results of the logistic-regression experiment.
accuracy_scores_logreg = {}  # k -> accuracy on the held-out 30% test split
cv_scores_logreg = {}        # k -> best mean cross-validated accuracy from the grid search
best_params_logreg = {}      # k -> hyperparameters of the best grid-search candidate

# Shuffled, seeded 3-fold stratified CV used for the hyperparameter search.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

# Hyperparameter grid for the logistic-regression step ("clf") of the pipeline.
param_grid = {
    "clf__C": [0.01, 0.1, 1, 10],
    "clf__penalty": ["l2"],
    "clf__solver": ["lbfgs", "saga"],
}

# The annotation table does not depend on k, so it is read once here
# instead of being re-read from disk on every loop iteration.
path_to_df_info = os.path.join(config.DIR_INCEST_MANY, "repbase_orf_type.txt")
df_info = pd.read_csv(path_to_df_info, sep="\t")

for k in tqdm(ks):
    # Table for this k; columns named emb_<number> are used as features below.
    path_to_df = os.path.join(config.DIR_INCEST_MANY, f"{k}.csv")
    df = pd.read_csv(path_to_df)

    # Attach the annotations by "name" and keep only rows flagged Good == 1.
    df_merge = pd.merge(df, df_info, how="left", on="name")
    df_merge = df_merge[df_merge["Good"] == 1].copy()

    # Features: every emb_<number> column; target: the "MainType" label.
    emb_cols = [c for c in df.columns if re.fullmatch(r"emb_\d+", c)]
    X = df_merge[emb_cols].values
    y = df_merge["MainType"].values

    # Stratified 70/30 split; the test part is only used for the final score.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y
    )

    # Scaling and PCA live inside the pipeline, so they are re-fitted on the
    # training part of every CV fold rather than on the full training set.
    pipe = Pipeline(
        steps=[
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("pca", PCA(n_components=10, random_state=1)),   # PCA before logistic regression
            # NOTE(review): `multi_class` is deprecated in scikit-learn >= 1.5.
            # It is kept so that results do not change; revisit when upgrading.
            ("clf", LogisticRegression(
                max_iter=1000,
                multi_class="multinomial",
                class_weight="balanced",
                random_state=1
            )),
        ]
    )

    # Hyperparameter selection via GridSearchCV (fitted on the training split only).
    grid = GridSearchCV(pipe, param_grid, cv=cv, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_

    # Mean CV accuracy of the best candidate.
    cv_score = grid.best_score_

    # Accuracy of the refitted best model on the held-out test split.
    y_pred = best_model.predict(X_test)
    accuracy_scores_logreg[k] = accuracy_score(y_test, y_pred)
    cv_scores_logreg[k] = cv_score
    best_params_logreg[k] = grid.best_params_

print("Logistic Regression (с PCA=10) — Test Accuracy per k:")
for k in ks:
    print(f"{k}: Test Accuracy = {accuracy_scores_logreg[k]:.3f} | CV = {cv_scores_logreg[k]:.3f}")

  0%|          | 0/4 [00:00<?, ?it/s]

# SVM

In [None]:
# Per-k results of the SVM experiment.
accuracy_scores_svm = {}  # k -> accuracy on the held-out 30% test split
cv_scores_svm = {}        # k -> best mean cross-validated accuracy from the grid search
best_params_svm = {}      # k -> hyperparameters of the best grid-search candidate

# Hyperparameter grid: linear and RBF kernels are searched as two separate
# sub-grids, so gamma is only varied for the RBF kernel.
param_grid = [
    {
        "clf__kernel": ["linear"],
        "clf__C": [0.1, 1, 3, 10, 30, 100],
    },
    {
        "clf__kernel": ["rbf"],
        "clf__C": [0.1, 1, 3, 10, 30, 100],
        "clf__gamma": ["scale", 0.01, 0.03, 0.1, 0.3, 1.0],
    },
]

# Shuffled, seeded 5-fold stratified CV used for the hyperparameter search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# The annotation table does not depend on k, so it is read once here
# instead of being re-read from disk on every loop iteration.
path_to_df_info = os.path.join(config.DIR_INCEST_MANY, 'repbase_orf_type.txt')
df_info = pd.read_csv(path_to_df_info, sep='\t')

for k in ks:
    # Table for this k; columns named emb_<number> are used as features below.
    path_to_df = os.path.join(config.DIR_INCEST_MANY, f'{k}.csv')
    df = pd.read_csv(path_to_df)

    # Attach the annotations by "name" and keep only rows flagged Good == 1.
    df_merge = pd.merge(df, df_info, how='left', on='name')
    df_merge = df_merge[df_merge['Good'] == 1].copy()

    # Features: every emb_<number> column; target: the "MainType" label.
    emb_cols = [c for c in df.columns if re.fullmatch(r"emb_\d+", c)]
    X = df_merge[emb_cols].values
    y = df_merge["MainType"].values

    # --- train/test split (stratified 70/30) ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=1, stratify=y
    )

    # The scaler is part of the pipeline, so it is re-fitted on the training
    # part of every CV fold rather than on the full training set.
    pipe = Pipeline(
        steps=[
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("clf", SVC(class_weight="balanced", probability=False, random_state=1)),
        ]
    )

    # refit=True retrains the best candidate on the whole training split,
    # which is what makes grid.best_estimator_ usable below.
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring="accuracy",
        n_jobs=-1,
        cv=cv,
        refit=True,
        verbose=0,
        return_train_score=False,
    )
    grid.fit(X_train, y_train)

    # --- Evaluation on the test split ---
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)

    # --- Store the results ---
    accuracy_scores_svm[k] = test_acc
    cv_scores_svm[k] = grid.best_score_
    best_params_svm[k] = grid.best_params_

# --- Print the results ---
print("SVM (GridSearchCV) — Test Accuracy per k:")
for k in ks:
    print(f"{k}: Test Accuracy = {accuracy_scores_svm[k]:.3f} | Best CV = {cv_scores_svm[k]:.3f} | Params = {best_params_svm[k]}")
