In [4]:
import config

from tqdm import tqdm
import warnings
import os
from hyena_dna.standalone_hyenadna import HyenaDNAModel
import torch
import pickle
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, make_scorer, f1_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA


os.environ['PYTHONWARNINGS'] = 'ignore'
warnings.filterwarnings('ignore')
os.chdir(config.DIR_ROOT)

In [2]:
ks = [4, 5, 6, 7]

# Logistic Regression

In [None]:
for k in tqdm(ks):
    path_to_df = os.path.join(config.DIR_INCEST_MANY, f"{k}.csv")
    df = pd.read_csv(path_to_df)

    path_to_df_info = os.path.join(config.DIR_INCEST_MANY, "repbase_orf_type.txt")
    df_info = pd.read_csv(path_to_df_info, sep="\t")

    df_merge = pd.merge(df, df_info, how="left", on="name")
    df_merge = df_merge[df_merge["Good"] == 1].copy()

    emb_cols = [c for c in df.columns if re.fullmatch(r"emb_\d+", c)]
    X = df_merge[emb_cols].values
    y = df_merge["MainType"].values


    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=100, random_state=1)
    X_pca = pca.fit_transform(X_scaled)
    print('Суммарная доля дисперсии:', pca.explained_variance_ratio_.sum())
    scaler_pca = StandardScaler()
    X_pca_scaled = scaler_pca.fit_transform(X_pca)

    X_train, X_test, y_train, y_test = train_test_split(
        X_pca_scaled, y,
        test_size=0.2,     # 20% данных уйдет в тест
        random_state=42,   # фиксируем случайность для воспроизводимости
        stratify=y         # если нужно сохранить пропорции классов
    )

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

    param_grid = {
        "C": [0.01, 0.1, 1.0, 10.0],
        "solver": ["lbfgs"],
        "penalty": ["l2"],
        "multi_class": ["auto"],
        "class_weight": ["balanced"]
    }

    scoring = {
        "accuracy": "accuracy",
        "f1_macro": make_scorer(f1_score, average="macro")
    }

    gs = GridSearchCV(
        estimator=LogisticRegression(max_iter=2000),
        param_grid=param_grid,
        scoring=scoring,
        refit="f1_macro",
        cv=cv,
        n_jobs=-1,
        return_train_score=False,
        verbose=2
    )

    gs.fit(X_train, y_train)

    print("Лучшие параметры:", gs.best_params_)
    print("Лучшая CV F1-macro:", gs.best_score_)

    best_model = gs.best_estimator_

    y_pred = best_model.predict(X_test)
    print(f'-------Classification Report for k={k}-------')
    print(classification_report(y_test, y_pred))

  0%|          | 0/4 [00:00<?, ?it/s]

Суммарная доля дисперсии: 0.8862331773939391
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END C=0.1, class_weight=balanced, multi_class=auto, penalty=l2, solver=lbfgs; total time=   1.4s
[CV] END C=0.01, class_weight=balanced, multi_class=auto, penalty=l2, solver=lbfgs; total time=   1.5s
[CV] END C=0.01, class_weight=balanced, multi_class=auto, penalty=l2, solver=lbfgs; total time=   1.6s
[CV] END C=0.1, class_weight=balanced, multi_class=auto, penalty=l2, solver=lbfgs; total time=   1.4s
[CV] END C=0.01, class_weight=balanced, multi_class=auto, penalty=l2, solver=lbfgs; total time=   1.5s
[CV] END C=0.01, class_weight=balanced, multi_class=auto, penalty=l2, solver=lbfgs; total time=   1.4s
[CV] END C=0.01, class_weight=balanced, multi_class=auto, penalty=l2, solver=lbfgs; total time=   1.8s
[CV] END C=0.1, class_weight=balanced, multi_class=auto, penalty=l2, solver=lbfgs; total time=   2.0s
[CV] END C=1.0, class_weight=balanced, multi_class=auto, penalty=l2, solve

 25%|██▌       | 1/4 [01:15<03:46, 75.61s/it]

Лучшие параметры: {'C': 10.0, 'class_weight': 'balanced', 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'lbfgs'}
Лучшая CV F1-macro: 0.41573650332941864
-------Classification Report for k=4-------
                            precision    recall  f1-score   support

            DNA transposon       0.72      0.56      0.63      3121
       LTR retrotransposon       0.86      0.65      0.74      5728
   Non-LTR retrotransposon       0.57      0.65      0.60      1639
                     Other       0.33      0.44      0.37      1061
Penelope-like retroelement       0.07      0.63      0.12       105
          Satellite repeat       0.01      0.33      0.02        21

                  accuracy                           0.60     11675
                 macro avg       0.43      0.54      0.42     11675
              weighted avg       0.72      0.60      0.65     11675



# SVM

In [None]:
for k in tqdm(ks):
    path_to_df = os.path.join(config.DIR_INCEST_MANY, f"{k}.csv")
    df = pd.read_csv(path_to_df)

    path_to_df_info = os.path.join(config.DIR_INCEST_MANY, "repbase_orf_type.txt")
    df_info = pd.read_csv(path_to_df_info, sep="\t")

    df_merge = pd.merge(df, df_info, how="left", on="name")
    df_merge = df_merge[df_merge["Good"] == 1].copy()

    emb_cols = [c for c in df.columns if re.fullmatch(r"emb_\d+", c)]
    X = df_merge[emb_cols].values
    y = df_merge["MainType"].values

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=100, random_state=1)
    X_pca = pca.fit_transform(X_scaled)
    print('Суммарная доля дисперсии:', pca.explained_variance_ratio_.sum())
    scaler_pca = StandardScaler()
    X_pca_scaled = scaler_pca.fit_transform(X_pca)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

    param_grid = {
        "C": [0.01, 0.1, 1.0, 10.0],
        "kernel": ["linear", "rbf"],
        "class_weight": ["balanced"]
    }

    scoring = {
        "accuracy": "accuracy",
        "f1_macro": make_scorer(f1_score, average="macro")
    }

    gs = GridSearchCV(
        estimator=SVC(max_iter=1000),
        param_grid=param_grid,
        scoring=scoring,
        refit="f1_macro",
        cv=cv,
        n_jobs=-1,
        return_train_score=False,
        verbose=1
    )

    gs.fit(X_pca_scaled, y)

    print("Лучшие параметры:", gs.best_params_)
    print("Лучшая CV F1-macro:", gs.best_score_)

    best_model = gs.best_estimator_