In [2]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import os
import pickle

# Загрузка данных

In [3]:
os.chdir('..')

In [4]:
data_filename = os.path.join('data', f'X_y_characteristics.pkl')

X = None
y = None

if os.path.exists(data_filename):
    with open(data_filename, 'rb') as f:
        data = pickle.load(f)
    
    X = data['X']
    y = data['y']

X = np.array(X)
y = np.array(y)

In [5]:
from collections import Counter
counter = Counter(y)
counter

Counter({'LTR': 212,
         'nan': 178,
         'Helitron': 170,
         'DNA/MuDR': 130,
         'LINE': 113,
         'DNA+': 87,
         'TEG': 35,
         'DNA/HAT': 27,
         'Mix': 22,
         'Mix with Helitron': 18,
         'Unassigned': 16,
         'RathE1/2/3_cons': 7,
         'SINE': 7})

Фильтруем, берем только те, которых много.

In [6]:
families_to_filter = ['LTR', 'Helitron', 'DNA/MuDR', 'LINE']

indices = np.isin(y, families_to_filter)

X_filtered = X[indices]
y_filtered = y[indices]

Делаем равномерные классы

In [7]:
def balance_classes(X, y):
    X = np.array(X)
    y = np.array(y)
    # Определяем количество элементов в каждом классе
    class_counts = Counter(y)
    min_count = min(class_counts.values())

    # Собираем индексы для каждого класса
    indices_by_class = {cls: np.where(y == cls)[0] for cls in class_counts}

    # Оставляем только min_count элементов для каждого класса
    balanced_indices = []
    for cls, indices in indices_by_class.items():
        balanced_indices.extend(indices[:min_count])

    np.random.shuffle(balanced_indices)

    X_balanced = X[balanced_indices]
    y_balanced = y[balanced_indices]

    return X_balanced, y_balanced

In [8]:
X_balanced, y_balanced = balance_classes(X_filtered, y_filtered)

In [9]:
X_balanced.shape

(452, 109)

In [10]:
y_balanced.shape

(452,)

In [14]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Модель
clf = CatBoostClassifier(random_state=42, verbose=False)

# Параметры для подбора
param_grid = {
    'iterations': [500, 1000],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

# GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Лучшие параметры
print("Лучшие параметры:", grid_search.best_params_)

Лучшие параметры: {'depth': 4, 'iterations': 1000, 'l2_leaf_reg': 1, 'learning_rate': 0.1}


In [15]:
best_catboost_model = CatBoostClassifier(**grid_search.best_params_, verbose=False)
# Кросс-валидация
y_pred_cv = cross_val_predict(best_catboost_model, X_balanced, y_balanced, cv=5)

# Результат
print(classification_report(y_balanced, y_pred_cv))

              precision    recall  f1-score   support

    DNA/MuDR       0.62      0.65      0.63       113
    Helitron       0.64      0.64      0.64       113
        LINE       0.78      0.84      0.81       113
         LTR       0.89      0.79      0.84       113

    accuracy                           0.73       452
   macro avg       0.73      0.73      0.73       452
weighted avg       0.73      0.73      0.73       452



In [16]:
file_path = 'models_files/catboost_01.pkl'
if not os.path.exists(file_path):
    with open(file_path, 'wb') as f:  
        pickle.dump(best_catboost_model, f)
    print("Завершено")

Завершено


In [17]:
def load(file_path):  
    data_filename = os.path.join('data', file_path)
    
    X = None
    y = None
    
    if os.path.exists(data_filename):
        with open(data_filename, 'rb') as f:
            data = pickle.load(f)
    
        X = data['X']
        y = data['y']
    
    X = np.array(X)
    y = np.array(y)
    
    indices = np.isin(y, families_to_filter)
    
    X_filtered = X[indices]
    y_filtered = y[indices]
    
    X_balanced, y_balanced = balance_classes(X_filtered, y_filtered)
    return X_balanced, y_balanced

In [18]:
X_balanced, y_balanced = load('X_y_node2vec.pkl')

In [19]:
best_catboost_model = CatBoostClassifier(**grid_search.best_params_, verbose=False)
# Кросс-валидация
y_pred_cv = cross_val_predict(best_catboost_model, X_balanced, y_balanced, cv=5)

# Результат
print(classification_report(y_balanced, y_pred_cv))

              precision    recall  f1-score   support

    DNA/MuDR       0.22      0.20      0.21       113
    Helitron       0.31      0.32      0.31       113
        LINE       0.21      0.20      0.20       113
         LTR       0.24      0.25      0.24       113

    accuracy                           0.24       452
   macro avg       0.24      0.24      0.24       452
weighted avg       0.24      0.24      0.24       452



In [20]:
X_balanced, y_balanced = load('X_y_arope.pkl')

In [21]:
best_catboost_model = CatBoostClassifier(**grid_search.best_params_, verbose=False)
# Кросс-валидация
y_pred_cv = cross_val_predict(best_catboost_model, X_balanced, y_balanced, cv=5)

# Результат
print(classification_report(y_balanced, y_pred_cv))

              precision    recall  f1-score   support

    DNA/MuDR       0.44      0.27      0.33       113
    Helitron       0.31      0.62      0.42       113
        LINE       0.44      0.31      0.36       113
         LTR       0.28      0.20      0.24       113

    accuracy                           0.35       452
   macro avg       0.37      0.35      0.34       452
weighted avg       0.37      0.35      0.34       452



In [22]:
X_balanced, y_balanced = load('X_y_laplacian_eigenmaps.pkl')

In [23]:
best_catboost_model = CatBoostClassifier(**grid_search.best_params_, verbose=False)
# Кросс-валидация
y_pred_cv = cross_val_predict(best_catboost_model, X_balanced, y_balanced, cv=5)

# Результат
print(classification_report(y_balanced, y_pred_cv))

              precision    recall  f1-score   support

    DNA/MuDR       0.25      0.23      0.24       113
    Helitron       0.31      0.31      0.31       113
        LINE       0.29      0.30      0.29       113
         LTR       0.18      0.19      0.18       113

    accuracy                           0.26       452
   macro avg       0.26      0.26      0.26       452
weighted avg       0.26      0.26      0.26       452



In [24]:
X_balanced, y_balanced = load('X_y_node2vec_char.pkl')

In [25]:
best_catboost_model = CatBoostClassifier(**grid_search.best_params_, verbose=False)
# Кросс-валидация
y_pred_cv = cross_val_predict(best_catboost_model, X_balanced, y_balanced, cv=5)

# Результат
print(classification_report(y_balanced, y_pred_cv))

              precision    recall  f1-score   support

    DNA/MuDR       0.69      0.65      0.67       113
    Helitron       0.67      0.73      0.70       113
        LINE       0.76      0.81      0.78       113
         LTR       0.83      0.74      0.79       113

    accuracy                           0.73       452
   macro avg       0.74      0.73      0.73       452
weighted avg       0.74      0.73      0.73       452

