In [5]:
import os
import pickle
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Загрузка данных

In [6]:
# Step one directory up so that the relative 'data' and 'models_files' paths
# used below resolve against the parent directory.
# NOTE(review): this is relative to the current directory, so re-running the
# cell climbs one more level each time -- run it once per kernel session.
os.chdir(os.pardir)

In [7]:
# Pickled dataset with the feature matrix ('X') and the labels ('y').
data_filename = os.path.join('data', 'X_y_characteristics.pkl')

# Fail right here when the file is missing: otherwise X and y would become
# np.array(None) and the notebook would break later with an unrelated error.
if not os.path.exists(data_filename):
    raise FileNotFoundError(
        f"Dataset not found: {os.path.abspath(data_filename)}"
    )

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted source.
with open(data_filename, 'rb') as f:
    data = pickle.load(f)

X = np.array(data['X'])
y = np.array(data['y'])

# Every sample needs exactly one label, otherwise the masks below misalign.
assert X.shape[0] == y.shape[0], "X and y must contain the same number of samples"

In [8]:
# Number of samples per label in the raw dataset (displayed as the cell output).
# Counter is imported in the import cell at the top of the notebook.
counter = Counter(y)
counter

Counter({'LTR': 212,
         'nan': 178,
         'Helitron': 170,
         'DNA/MuDR': 130,
         'LINE': 113,
         'DNA+': 87,
         'TEG': 35,
         'DNA/HAT': 27,
         'Mix': 22,
         'Mix with Helitron': 18,
         'Unassigned': 16,
         'RathE1/2/3_cons': 7,
         'SINE': 7})

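Фильтруем: оставляем только самые многочисленные семейства (LTR, Helitron, DNA/MuDR, LINE). Класс 'nan' в выборку не включаем, хотя он тоже многочисленный.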

In [9]:
# Labels kept for classification; all other labels are dropped.
families_to_filter = [
    'LTR',
    'Helitron',
    'DNA/MuDR',
    'LINE',
]

# Boolean mask: True where the label is one of the selected families.
indices = np.isin(y, families_to_filter)

# Apply the same mask to features and labels so they stay aligned.
X_filtered, y_filtered = X[indices], y[indices]

Балансируем классы: в каждом классе оставляем одинаковое число примеров (по размеру самого малочисленного класса).

In [10]:
def balance_classes(X, y, random_state=None):
    """Undersample every class to the size of the smallest one and shuffle.

    For each distinct label the first ``min_count`` samples (in their original
    order) are kept, where ``min_count`` is the size of the rarest class. The
    kept samples are then shuffled.

    Parameters
    ----------
    X : array-like
        Samples; the first axis indexes samples.
    y : array-like, 1-D
        One label per sample.
    random_state : int, optional
        Seed for the shuffle. When ``None`` (default) NumPy's global RNG is
        used, exactly as before, so the order is reproducible only if the
        caller seeded ``np.random`` beforehand.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_balanced, y_balanced)`` with equally sized classes.

    Raises
    ------
    ValueError
        If the input is empty, ``y`` is not 1-D, or ``X`` and ``y`` contain a
        different number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    if y.ndim != 1 or X.ndim < 1 or X.shape[0] != y.shape[0]:
        raise ValueError(
            "balance_classes() expects a 1-D y with one label per sample of X; "
            f"got X.shape={X.shape}, y.shape={y.shape}"
        )
    if y.size == 0:
        raise ValueError("balance_classes() needs at least one sample")

    # Size of the rarest class: every class is cut down to this many samples.
    class_counts = Counter(y)
    min_count = min(class_counts.values())

    # Keep the first min_count sample positions of every class.
    balanced_indices = []
    for cls in class_counts:
        balanced_indices.extend(np.where(y == cls)[0][:min_count])

    # Shuffle so the classes are interleaved instead of stored in blocks.
    if random_state is None:
        np.random.shuffle(balanced_indices)
    else:
        np.random.RandomState(random_state).shuffle(balanced_indices)

    return X[balanced_indices], y[balanced_indices]

In [11]:
# balance_classes() shuffles the kept samples with NumPy's global RNG; seed it
# first so the balanced sample order is the same on every run.
np.random.seed(42)
X_balanced, y_balanced = balance_classes(X_filtered, y_filtered)

In [12]:
X_balanced.shape

(452, 109)

In [13]:
y_balanced.shape

(452,)

In [14]:
# Random forest with a fixed seed, evaluated on the balanced dataset.
clf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated predictions
y_pred_cv = cross_val_predict(estimator=clf, X=X_balanced, y=y_balanced, cv=5)

# Per-class precision / recall / F1
print(classification_report(y_true=y_balanced, y_pred=y_pred_cv))

# Contingency table: rows ('1') are the true labels, columns ('2') the predictions
data_cv = pd.DataFrame({'1': y_balanced, '2': y_pred_cv})
contingency_table_cv = pd.crosstab(index=data_cv['1'], columns=data_cv['2'])

print(contingency_table_cv)

              precision    recall  f1-score   support

    DNA/MuDR       0.64      0.66      0.65       113
    Helitron       0.70      0.67      0.68       113
        LINE       0.71      0.79      0.75       113
         LTR       0.84      0.74      0.79       113

    accuracy                           0.72       452
   macro avg       0.72      0.72      0.72       452
weighted avg       0.72      0.72      0.72       452

2         DNA/MuDR  Helitron  LINE  LTR
1                                      
DNA/MuDR        75        24     8    6
Helitron        20        76    13    4
LINE            14         4    89    6
LTR              9         5    15   84


In [15]:
# Fit the final model on the whole balanced dataset.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_balanced, y_balanced)

# Save the model
file_path = 'models_files/random_forest_balanced_01.pkl'

# Make sure the target directory exists; open() would fail without it.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

if not os.path.exists(file_path):
    with open(file_path, 'wb') as f:
        pickle.dump(clf, f)
    print("Завершено")
else:
    # An existing file is deliberately never overwritten; say so explicitly so
    # a stale model on disk does not go unnoticed.
    print(f"Файл {file_path} уже существует, модель не сохранена")

In [16]:
def load(file_path, families=None):
    """Load a pickled dataset from ``data/``, filter it by label and balance it.

    Parameters
    ----------
    file_path : str
        File name relative to the ``data`` directory. The pickle must contain
        a mapping with the keys ``'X'`` and ``'y'``.
    families : sequence of str, optional
        Labels to keep. Defaults to the notebook-level ``families_to_filter``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_balanced, y_balanced)`` as returned by ``balance_classes``.

    Raises
    ------
    FileNotFoundError
        If the dataset file does not exist.
    """
    data_filename = os.path.join('data', file_path)

    # Fail with a clear message instead of continuing with X = y = None.
    if not os.path.exists(data_filename):
        raise FileNotFoundError(
            f"Dataset not found: {os.path.abspath(data_filename)}"
        )

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only load pickles produced by a trusted source.
    with open(data_filename, 'rb') as f:
        data = pickle.load(f)

    X = np.array(data['X'])
    y = np.array(data['y'])

    if families is None:
        families = families_to_filter

    # Keep only the samples whose label is one of the requested families.
    indices = np.isin(y, families)

    return balance_classes(X[indices], y[indices])

In [17]:
# Same evaluation, this time on the dataset stored in 'X_y_node2vec.pkl'.
X_balanced, y_balanced = load('X_y_node2vec.pkl')
clf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated predictions
y_pred_cv = cross_val_predict(estimator=clf, X=X_balanced, y=y_balanced, cv=5)

# Per-class precision / recall / F1
print(classification_report(y_true=y_balanced, y_pred=y_pred_cv))

# Contingency table: rows ('1') are the true labels, columns ('2') the predictions
data_cv = pd.DataFrame({'1': y_balanced, '2': y_pred_cv})
contingency_table_cv = pd.crosstab(index=data_cv['1'], columns=data_cv['2'])

print(contingency_table_cv)

              precision    recall  f1-score   support

    DNA/MuDR       0.30      0.29      0.30       113
    Helitron       0.32      0.32      0.32       113
        LINE       0.21      0.21      0.21       113
         LTR       0.27      0.28      0.28       113

    accuracy                           0.28       452
   macro avg       0.28      0.28      0.28       452
weighted avg       0.28      0.28      0.28       452

2         DNA/MuDR  Helitron  LINE  LTR
1                                      
DNA/MuDR        33        20    33   27
Helitron        16        36    31   30
LINE            37        24    24   28
LTR             23        33    25   32


In [18]:
# Same evaluation, this time on the dataset stored in 'X_y_arope.pkl'.
X_balanced, y_balanced = load('X_y_arope.pkl')
clf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated predictions
y_pred_cv = cross_val_predict(estimator=clf, X=X_balanced, y=y_balanced, cv=5)

# Per-class precision / recall / F1
print(classification_report(y_true=y_balanced, y_pred=y_pred_cv))

# Contingency table: rows ('1') are the true labels, columns ('2') the predictions
data_cv = pd.DataFrame({'1': y_balanced, '2': y_pred_cv})
contingency_table_cv = pd.crosstab(index=data_cv['1'], columns=data_cv['2'])

print(contingency_table_cv)

              precision    recall  f1-score   support

    DNA/MuDR       0.38      0.23      0.29       113
    Helitron       0.31      0.65      0.42       113
        LINE       0.37      0.28      0.32       113
         LTR       0.26      0.13      0.18       113

    accuracy                           0.33       452
   macro avg       0.33      0.33      0.30       452
weighted avg       0.33      0.33      0.30       452

2         DNA/MuDR  Helitron  LINE  LTR
1                                      
DNA/MuDR        26        47    26   14
Helitron        10        74    12   17
LINE            16        53    32   12
LTR             17        65    16   15


In [19]:
# Same evaluation, this time on the dataset stored in 'X_y_laplacian_eigenmaps.pkl'.
X_balanced, y_balanced = load('X_y_laplacian_eigenmaps.pkl')
clf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated predictions
y_pred_cv = cross_val_predict(estimator=clf, X=X_balanced, y=y_balanced, cv=5)

# Per-class precision / recall / F1
print(classification_report(y_true=y_balanced, y_pred=y_pred_cv))

# Contingency table: rows ('1') are the true labels, columns ('2') the predictions
data_cv = pd.DataFrame({'1': y_balanced, '2': y_pred_cv})
contingency_table_cv = pd.crosstab(index=data_cv['1'], columns=data_cv['2'])

print(contingency_table_cv)

              precision    recall  f1-score   support

    DNA/MuDR       0.27      0.16      0.20       113
    Helitron       0.31      0.79      0.45       113
        LINE       0.47      0.24      0.32       113
         LTR       0.22      0.09      0.13       113

    accuracy                           0.32       452
   macro avg       0.32      0.32      0.27       452
weighted avg       0.32      0.32      0.27       452

2         DNA/MuDR  Helitron  LINE  LTR
1                                      
DNA/MuDR        18        58    21   16
Helitron         7        89     4   13
LINE            20        60    27    6
LTR             21        76     6   10


In [20]:
# Same evaluation, this time on the dataset stored in 'X_y_node2vec_char.pkl'.
X_balanced, y_balanced = load('X_y_node2vec_char.pkl')
clf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated predictions
y_pred_cv = cross_val_predict(estimator=clf, X=X_balanced, y=y_balanced, cv=5)

# Per-class precision / recall / F1
print(classification_report(y_true=y_balanced, y_pred=y_pred_cv))

# Contingency table: rows ('1') are the true labels, columns ('2') the predictions
data_cv = pd.DataFrame({'1': y_balanced, '2': y_pred_cv})
contingency_table_cv = pd.crosstab(index=data_cv['1'], columns=data_cv['2'])

print(contingency_table_cv)

              precision    recall  f1-score   support

    DNA/MuDR       0.60      0.57      0.58       113
    Helitron       0.65      0.71      0.68       113
        LINE       0.76      0.80      0.78       113
         LTR       0.81      0.74      0.77       113

    accuracy                           0.70       452
   macro avg       0.70      0.70      0.70       452
weighted avg       0.70      0.70      0.70       452

2         DNA/MuDR  Helitron  LINE  LTR
1                                      
DNA/MuDR        64        30     9   10
Helitron        21        80     8    4
LINE            10         7    90    6
LTR             11         7    11   84
