# Импорт библиотек и загрузка файлов

In [6]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score,
                             confusion_matrix, precision_score, recall_score, f1_score, fbeta_score)
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Conv1D, Flatten, Dense, Dropout, BatchNormalization, Input,
                                     Concatenate, Reshape, LSTM, MaxPooling1D)
from tensorflow.keras.optimizers import Adam

from modules.data_transformation import *

In [2]:
pio.renderers.default = "notebook"

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

### Сейчас не нужно

In [None]:
# # Объединяем данные из эксель файла в один датафрейм, добавляя столбец cell_line (название листа в файле)
# file_path="data/4_cell_lines.xlsx"
# sheets = pd.ExcelFile(file_path).sheet_names

# df_list = []

# for sheet in sheets:
#     df = pd.read_excel(file_path, sheet_name=sheet)
#     df["cell_line"] = sheet
#     df_list.append(df)

# final_df = pd.concat(df_list, ignore_index=True)

# print(final_df.head())

# final_df.to_csv("data/4_cell_lines.csv", index=False, header=True)

In [None]:
# df = pd.read_csv("data/HEK293t.txt", sep=",", engine="python")

# df4 = pd.read_csv(r"data/II4.txt", sep=",", engine="python", names=['target', 'potential_off_target', 'is_off_target']) 

# K562_with_extra_features

In [None]:
# df = pd.read_csv("data/K562.txt", sep=",", engine="python",
#                  names=['target', 'potential_off_target', 'is_off_target']
#                  )

# df['encoded_7channels'] = df.apply(
#     lambda row: encode_7channels(
#         row['target'],
#         row['potential_off_target'],
#         pam_location="last",
#         pam_length=3
#     ).flatten(),
#     axis=1
# )

# df['mismatch_count'] = df.apply(
#     lambda row: count_mismatches(row['target'], row['potential_off_target']),
#     axis=1
# )

# df['gc_target'] = df['target'].apply(calc_gc_content)
# df['gc_off_target'] = df['potential_off_target'].apply(calc_gc_content)

In [37]:
df = pd.read_csv(r"data/K562_with_extra_features.csv", sep=",", engine="python")
df.drop(columns=df.columns[0], axis=1, inplace=True)
df['encoded_7channels'] = df['encoded_7channels'].apply(lambda x: np.fromstring(x.strip("[]"), sep=' '))

In [19]:
df.mismatch_count.value_counts()

mismatch_count
6    16070
5     3547
4      608
3       76
2       13
1        4
7        1
Name: count, dtype: int64

In [11]:
df[df.is_off_target == 1].mismatch_count.value_counts()

mismatch_count
4    71
3    32
2    12
1     4
7     1
Name: count, dtype: int64

In [None]:
# # Генерируем эмбеддинги для target
# target_embeddings = generate_embeddings(df, sequence_column='target', polymer_type='DNA', encoding_strategy='aptamer')
# target_embeddings.to_csv(r'data/target_embeddings.csv', header=True)

# # Генерируем эмбеддинги для potential_off_target
# off_target_embeddings = generate_embeddings(df, sequence_column='potential_off_target',
#                                                polymer_type='DNA', encoding_strategy='aptamer')
# off_target_embeddings.to_csv(r'data/off_target_embeddings.csv', header=True)

# # Объединяем с исходным DataFrame
# df = df.join(target_embeddings.add_prefix('target_'), how='left')
# df = df.join(off_target_embeddings.add_prefix('off_target_'), how='left')

In [None]:
target_embeddings = pd.read_csv("data/target_embeddings.csv", index_col=0)
off_target_embeddings = pd.read_csv("data/off_target_embeddings.csv", index_col=0)

df = df.join(target_embeddings.add_prefix('target_'), how='left')
df = df.join(off_target_embeddings.add_prefix('off_target_'), how='left')

# Table_S8_machine_learning_input

In [4]:
df = pd.read_csv(r"data/Table_S8_machine_learning_input.csv", engine="python", sep='\t')

In [None]:
df

In [10]:
df['encoded_7channels'] = df.apply(
    lambda row: encode_7channels(
        row['genome input'],
        row['sgRNA input'],
        pam_location="last",
        pam_length=3
    ).flatten(),
    axis=1
)

df['mismatch_count'] = df.apply(
    lambda row: count_mismatches(row['genome input'], row['sgRNA input']),
    axis=1
)

df['gc_genome'] = df['genome input'].apply(calc_gc_content)
df['gc_sgrna'] = df['sgRNA input'].apply(calc_gc_content)

df['pam'] = df['sgRNA input'].apply(reverse_last_3)

In [None]:
# # Генерируем эмбеддинги для genome input
# genome_embeddings = generate_embeddings(df, sequence_column='genome input', polymer_type='DNA', encoding_strategy='aptamer')
# genome_embeddings.to_csv(r'data/genome_embeddings.csv', header=True)

# # Генерируем эмбеддинги для sgRNA input
# sgRNA_embeddings = generate_embeddings(df, sequence_column='sgRNA input',
#                                                polymer_type='DNA', encoding_strategy='aptamer')
# sgRNA_embeddings.to_csv(r'data/sgRNA_embeddings.csv', header=True)

# # Объединяем с исходным DataFrame
# df = df.join(genome_embeddings.add_prefix('genome_'), how='left')
# df = df.join(sgRNA_embeddings.add_prefix('sgrna_'), how='left')

In [61]:
genome_embeddings = pd.read_csv("data/genome_embeddings.csv", index_col=0)
sgRNA_embeddings = pd.read_csv("data/sgRNA_embeddings.csv", index_col=0)

df = df.join(genome_embeddings.add_prefix('genome_'), how='left')
df = df.join(sgRNA_embeddings.add_prefix('sgrna_'), how='left')

# Regression

### CNN, SeQuant embeddings

In [68]:
# 📌 Определяем названия колонок
genome_features = [f"genome_feature_{i}" for i in range(43)]
sgrna_features = [f"sgrna_feature_{i}" for i in range(43)]
target = "mean relative gamma"

# 📌 Извлекаем данные
X_genome = df[genome_features].to_numpy()  # (samples, 43)
X_sgrna = df[sgrna_features].to_numpy()  # (samples, 43)
y = df[target].to_numpy()  # Целевая переменная

# 📌 Разделяем данные на train/test
X_genome_train, X_genome_test, X_sgrna_train, X_sgrna_test, y_train, y_test = train_test_split(
    X_genome, X_sgrna, y, test_size=0.2, random_state=42
)

# 📌 Проверяем размерности
print(f"X_genome_train shape: {X_genome_train.shape}")  # (samples, 43)
print(f"X_sgrna_train shape: {X_sgrna_train.shape}")  # (samples, 43)
print(f"y_train shape: {y_train.shape}")  # (samples,)

X_genome_train shape: (20998, 43)
X_sgrna_train shape: (20998, 43)
y_train shape: (20998,)


In [69]:
# 📌 Входной слой для genome эмбеддингов
input_genome = Input(shape=(43,), name="genome_input")
x1 = Dense(64, activation='relu')(input_genome)
x1 = Dropout(0.3)(x1)
x1 = Dense(32, activation='relu')(x1)
x1 = Dropout(0.3)(x1)

# 📌 Входной слой для sgrna эмбеддингов
input_sgrna = Input(shape=(43,), name="sgrna_input")
x2 = Dense(64, activation='relu')(input_sgrna)
x2 = Dropout(0.3)(x2)
x2 = Dense(32, activation='relu')(x2)
x2 = Dropout(0.3)(x2)

# 📌 Объединяем два потока
merged = Concatenate()([x1, x2])
x = Dense(64, activation='relu')(merged)
x = Dropout(0.3)(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.3)(x)

# 📌 Выходной слой (регрессия)
output = Dense(1, activation='linear', name="output")(x)

# 📌 Создаем модель
model = Model(inputs=[input_genome, input_sgrna], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# 📌 Вывод структуры модели
model.summary()

In [70]:
history = model.fit(
    [X_genome_train, X_sgrna_train], y_train,
    epochs=20, batch_size=32, validation_split=0.2, verbose=1
)

Epoch 1/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 4.6866 - mae: 1.1375 - val_loss: 0.1799 - val_mae: 0.2822
Epoch 2/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.1855 - mae: 0.3436 - val_loss: 0.1566 - val_mae: 0.2973
Epoch 3/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.1612 - mae: 0.3308 - val_loss: 0.1503 - val_mae: 0.3072
Epoch 4/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.1577 - mae: 0.3333 - val_loss: 0.1484 - val_mae: 0.3129
Epoch 5/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.1528 - mae: 0.3312 - val_loss: 0.1473 - val_mae: 0.3192
Epoch 6/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.1557 - mae: 0.3350 - val_loss: 0.1473 - val_mae: 0.3201
Epoch 7/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - 

In [71]:
results_df, mse, mae, r2 = evaluate_regression_model(model, [X_genome_test, X_sgrna_test], y_test)

[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1550 - mae: 0.3364
Test Loss (MSE): 0.15257
Test MAE: 0.33360
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

Regression Metrics:
Mean Squared Error (MSE): 0.15257
Mean Absolute Error (MAE): 0.33360
R² Score: -0.00015


### CNN, Only encoded_7

In [None]:
# Преобразуем encoded_7channels обратно в матрицу 7×N (для CNN можно оставить 2D)
X = np.array([np.reshape(ch, (7, -1)).T for ch in df['encoded_7channels']])

# Целевая переменная
y = df['mean relative gamma'].to_numpy()

# Разбиваем на train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")  # (samples, N, 7)
print(f"y_train shape: {y_train.shape}")  # (samples,)

X_train shape: (20998, 26, 7)
y_train shape: (20998,)


In [17]:
# Входной слой
input_layer = Input(shape=(X_train.shape[1], X_train.shape[2]))

# Сверточный слой для извлечения признаков
x = Conv1D(32, kernel_size=3, activation='relu', padding='same')(input_layer)
x = Flatten()(x)  # Преобразуем в одномерный вектор
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.3)(x)

# Выходной слой (предсказание числового значения)
output_layer = Dense(1, activation='linear')(x)

# Определяем модель
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])  # MSE + MAE

# Вывод структуры модели
model.summary()

In [18]:
# Обучение модели
history = model.fit(
    X_train, y_train, 
    epochs=20, batch_size=32, validation_split=0.2, verbose=1
)

Epoch 1/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - loss: 0.1334 - mae: 0.2809 - val_loss: 0.0819 - val_mae: 0.2038
Epoch 2/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.0843 - mae: 0.2127 - val_loss: 0.0772 - val_mae: 0.2007
Epoch 3/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.0771 - mae: 0.2013 - val_loss: 0.0691 - val_mae: 0.1833
Epoch 4/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.0725 - mae: 0.1929 - val_loss: 0.0641 - val_mae: 0.1847
Epoch 5/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.0659 - mae: 0.1819 - val_loss: 0.0650 - val_mae: 0.1729
Epoch 6/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 0.0609 - mae: 0.1738 - val_loss: 0.0595 - val_mae: 0.1689
Epoch 7/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - 

In [26]:
results_df, mse, mae, r2 = evaluate_regression_model(model, X_test, y_test)

[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0513 - mae: 0.1524
Test Loss (MSE): 0.05250
Test MAE: 0.15502
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Regression Metrics:
Mean Squared Error (MSE): 0.05250
Mean Absolute Error (MAE): 0.15502
R² Score: 0.65584


### RNN, encoded_7 + extra

In [6]:
# 📌 Признаки
sequence_features = 'encoded_7channels'  # Матрица 7×N
numerical_features = ['mismatch position', 'mismatch_count', 'gc_genome', 'gc_sgrna']  # Числовые признаки
target = 'mean relative gamma'  # Целевая переменная

# Преобразуем `encoded_7channels` обратно в матрицу 7×N
X_seq = np.array([np.reshape(ch, (7, -1)).T for ch in df[sequence_features]])

# Извлекаем числовые признаки
X_num = df[numerical_features].to_numpy()

# Целевая переменная
y = df[target].to_numpy()

# Разбиваем на train/test
X_seq_train, X_seq_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    X_seq, X_num, y, test_size=0.2, random_state=42
)

# Проверяем размерности
print(f"X_seq_train shape: {X_seq_train.shape}")  # (samples, N, 7)
print(f"X_num_train shape: {X_num_train.shape}")  # (samples, 4)
print(f"y_train shape: {y_train.shape}")  # (samples,)

X_seq_train shape: (20998, 26, 7)
X_num_train shape: (20998, 4)
y_train shape: (20998,)


In [8]:
# Вход для последовательности (encoded_7channels)
input_seq = Input(shape=(X_seq_train.shape[1], X_seq_train.shape[2]), name="sequence_input")

# Вход для числовых признаков
input_num = Input(shape=(X_num_train.shape[1],), name="numerical_input")

# 📌 RNN часть (LSTM)
x1 = LSTM(64, return_sequences=True)(input_seq)
x1 = LSTM(32)(x1)

# 📌 Полносвязные слои для числовых признаков
x2 = Dense(32, activation='relu')(input_num)
x2 = Dropout(0.3)(x2)

# 📌 Объединяем два потока
merged = Concatenate()([x1, x2])
x = Dense(64, activation='relu')(merged)
x = Dropout(0.3)(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.3)(x)

# 📌 Выходной слой (регрессия)
output = Dense(1, activation='linear', name="output")(x)

# Создаем модель
model = Model(inputs=[input_seq, input_num], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Вывод структуры модели
model.summary()

In [9]:
history = model.fit(
    [X_seq_train, X_num_train], y_train,
    epochs=20, batch_size=32, validation_split=0.2, verbose=1
)

Epoch 1/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 18ms/step - loss: 1.0169 - mae: 0.5456 - val_loss: 0.1339 - val_mae: 0.2455
Epoch 2/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - loss: 0.1178 - mae: 0.2535 - val_loss: 0.1193 - val_mae: 0.2487
Epoch 3/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - loss: 0.1121 - mae: 0.2537 - val_loss: 0.1180 - val_mae: 0.2471
Epoch 4/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - loss: 0.1060 - mae: 0.2445 - val_loss: 0.1084 - val_mae: 0.2476
Epoch 5/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - loss: 0.1029 - mae: 0.2439 - val_loss: 0.0991 - val_mae: 0.2425
Epoch 6/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - loss: 0.1059 - mae: 0.2480 - val_loss: 0.1071 - val_mae: 0.2440
Epoch 7/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms

In [14]:
results_df, mse, mae, r2 = evaluate_regression_model(model, [X_seq_test, X_num_test], y_test)

[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.1095 - mae: 0.2374
Test Loss (MSE): 0.10911
Test MAE: 0.23721
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step

Regression Metrics:
Mean Squared Error (MSE): 0.10911
Mean Absolute Error (MAE): 0.23721
R² Score: 0.28473


### CNN, encoded_7channels + extra

In [15]:
# 📌 Признаки
sequence_features = 'encoded_7channels'  # Матрица 7×N
numerical_features = ['mismatch position', 'mismatch_count', 'gc_genome', 'gc_sgrna']  # Числовые признаки
target = 'mean relative gamma'  # Целевая переменная

# Преобразуем `encoded_7channels` обратно в матрицу 7×N
X_seq = np.array([np.reshape(ch, (7, -1)).T for ch in df[sequence_features]])

# Извлекаем числовые признаки
X_num = df[numerical_features].to_numpy()

# Целевая переменная
y = df[target].to_numpy()

# Разбиваем на train/test
X_seq_train, X_seq_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    X_seq, X_num, y, test_size=0.2, random_state=42
)

# Проверяем размерности
print(f"X_seq_train shape: {X_seq_train.shape}")  # (samples, N, 7)
print(f"X_num_train shape: {X_num_train.shape}")  # (samples, 4)
print(f"y_train shape: {y_train.shape}")  # (samples,)

X_seq_train shape: (20998, 26, 7)
X_num_train shape: (20998, 4)
y_train shape: (20998,)


In [19]:
# Вход для последовательности (encoded_7channels)
input_seq = Input(shape=(X_seq_train.shape[1], X_seq_train.shape[2]), name="sequence_input")

# Вход для числовых признаков
input_num = Input(shape=(X_num_train.shape[1],), name="numerical_input")

# 📌 CNN часть (Conv1D + MaxPooling)
x1 = Conv1D(64, kernel_size=3, activation='relu', padding='same')(input_seq)
x1 = MaxPooling1D(pool_size=2)(x1)
x1 = Conv1D(32, kernel_size=3, activation='relu', padding='same')(x1)
x1 = Flatten()(x1)  # Преобразуем в одномерный вектор

# 📌 Полносвязные слои для числовых признаков
x2 = Dense(32, activation='relu')(input_num)
x2 = Dropout(0.3)(x2)

# 📌 Объединяем два потока
merged = Concatenate()([x1, x2])
x = Dense(64, activation='relu')(merged)
x = Dropout(0.3)(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.3)(x)

# 📌 Выходной слой (регрессия)
output = Dense(1, activation='linear', name="output")(x)

# Создаем модель
model = Model(inputs=[input_seq, input_num], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Вывод структуры модели
model.summary()

In [None]:
history = model.fit(
    [X_seq_train, X_num_train], y_train,
    epochs=20, batch_size=32, validation_split=0.2, verbose=1
)

Epoch 1/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 0.1385 - mae: 0.2696 - val_loss: 0.1049 - val_mae: 0.2342
Epoch 2/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 0.0954 - mae: 0.2302 - val_loss: 0.0998 - val_mae: 0.2224
Epoch 3/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.0853 - mae: 0.2150 - val_loss: 0.0777 - val_mae: 0.2034
Epoch 4/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 0.0759 - mae: 0.1993 - val_loss: 0.0746 - val_mae: 0.1906
Epoch 5/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.0731 - mae: 0.1922 - val_loss: 0.0707 - val_mae: 0.1890
Epoch 6/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.0670 - mae: 0.1829 - val_loss: 0.0684 - val_mae: 0.1859
Epoch 7/20
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - 

In [23]:
results_df, mse, mae, r2 = evaluate_regression_model(model, [X_seq_test, X_num_test], y_test)

[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0596 - mae: 0.1665
Test Loss (MSE): 0.05947
Test MAE: 0.16765
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Regression Metrics:
Mean Squared Error (MSE): 0.05947
Mean Absolute Error (MAE): 0.16765
R² Score: 0.61017


# Classification

## CNN

### embeddings

In [69]:
# Удаляем строки, где нет хотя бы одного эмбеддинга
df_cleaned = df.dropna(subset=[f'target_feature_{i}' for i in range(43)] + 
                               [f'off_target_feature_{i}' for i in range(43)]).reset_index(drop=True)

print(f"Размер после очистки: {df_cleaned.shape}")  # Должно быть (20319, 93), если всё норм

target_features = [f'target_feature_{i}' for i in range(43)]
off_target_features = [f'off_target_feature_{i}' for i in range(43)]

X_target = df_cleaned[target_features].to_numpy()
X_off_target = df_cleaned[off_target_features].to_numpy()
y = df_cleaned['is_off_target'].fillna(0).to_numpy()

# Разделяем данные
X_target_train, X_target_test, X_off_train, X_off_test, y_train, y_test = train_test_split(
    X_target, X_off_target, y, test_size=0.2, random_state=42, stratify=y
)

print(f"X_target_train shape: {X_target_train.shape}")
print(f"X_off_train shape: {X_off_train.shape}")
print(f"y_train shape: {y_train.shape}")

Размер после очистки: (20319, 93)
X_target_train shape: (16255, 43)
X_off_train shape: (16255, 43)
y_train shape: (16255,)


In [70]:
# Вход для target
input_target = Input(shape=(43,), name='target_input')

# Вход для potential_off_target
input_off_target = Input(shape=(43,), name='off_target_input')

# Полносвязные слои для target
x1 = Dense(64, activation='relu')(input_target)
x1 = Dropout(0.3)(x1)

# Полносвязные слои для potential_off_target
x2 = Dense(64, activation='relu')(input_off_target)
x2 = Dropout(0.3)(x2)

# Объединяем два входа
merged = Concatenate()([x1, x2])
x = Dense(128, activation='relu')(merged)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)

# Выходной слой (теперь он правильно связан с моделью)
output = Dense(1, activation='sigmoid', name='output')(x)

# Определяем модель
model = Model(inputs=[input_target, input_off_target], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Вывод структуры модели
model.summary()

In [74]:
# Преобразуем список классов в numpy массив
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=y_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

print(f"Class Weights: {class_weight_dict}")  # Посмотрим, какие веса

# Обучаем с учетом весов
history = model.fit(
    [X_target_train, X_off_train], y_train, 
    epochs=20, batch_size=32, validation_split=0.2, 
    class_weight=class_weight_dict, verbose=1
)

Class Weights: {0: np.float64(0.502970480846587), 1: np.float64(84.66145833333333)}
Epoch 1/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.7464 - loss: 5.6945 - val_accuracy: 0.0920 - val_loss: 0.8014
Epoch 2/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.5471 - loss: 1.1978 - val_accuracy: 0.9945 - val_loss: 0.3656
Epoch 3/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.5855 - loss: 1.1828 - val_accuracy: 0.0584 - val_loss: 0.8825
Epoch 4/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.2962 - loss: 0.7089 - val_accuracy: 0.0821 - val_loss: 0.7433
Epoch 5/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.3202 - loss: 0.8664 - val_accuracy: 0.9923 - val_loss: 0.6414
Epoch 6/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.3

In [75]:
results_df, annotated_cm = evaluate_model(model, [X_target_test, X_off_test], y_test, beta=1, threshold=0.4)

[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0813 - loss: 0.7434
Test Loss: 0.74396
Test Accuracy: 0.07899
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step

Metrics:
Precision: 0.00635
Recall: 1.00000
F1-score: 0.01263
F-beta (1): 0.01263
Threshold used: 0.4

Results DataFrame (first 5 rows):
   y_test  y_pred_proba  y_pred prediction_is_true
0       0      0.551689       1                 No
1       0      0.551689       1                 No
2       0      0.551689       1                 No
3       0      0.551689       1                 No
4       0      0.551689       1                 No

Annotated Confusion Matrix:
                       Predicted No (0)          Predicted Yes (1)
Actual No (0)   TN (True Negative): 287  FP (False Positive): 3753
Actual Yes (1)   FN (False Negative): 0     TP (True Positive): 24


### encoded_7channels + 3 extra

In [38]:
df['encoded_7channels'] = df['encoded_7channels'].apply(lambda x: x.reshape(23, 7))

In [39]:
# Входы
sequence_input = Input(shape=(23, 7), name='sequence_input')  # N = длина последовательности
additional_input = Input(shape=(3,), name='additional_input')  # gc_content, mismatch_count,

# CNN для последовательностей
x = Conv1D(32, kernel_size=3, activation='relu')(sequence_input)
x = Flatten()(x)

# Полносвязный слой для дополнительных признаков
w = Dense(16, activation='relu')(additional_input)

# Объединяем
combined = Concatenate()([x, w])
z = Dense(64, activation='relu')(combined)
output = Dense(1, activation='sigmoid')(z)

# Финальная модель
model = tf.keras.Model(inputs=[sequence_input, additional_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [41]:
# X_sequences = np.array([encoded.flatten() for encoded in df['encoded_7channels']])  # Вход для CNN
X_sequences = np.array(df['encoded_7channels'].tolist())  # Вход для CNN
X_additional = df[['gc_target', 'gc_off_target', 'mismatch_count']].to_numpy()  # Доп. признаки
y = df['is_off_target'].to_numpy()

In [42]:
X_seq_train, X_seq_test, X_add_train, X_add_test, y_train, y_test = train_test_split(
    X_sequences, X_additional, y, test_size=0.2, random_state=42
)

# Обучение модели на тренировочных данных
model.fit(
    [X_seq_train, X_add_train],
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

Epoch 1/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9996 - loss: 0.0014 - val_accuracy: 0.9982 - val_loss: 0.0052
Epoch 2/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9994 - loss: 0.0016 - val_accuracy: 0.9963 - val_loss: 0.0133
Epoch 3/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9998 - loss: 9.8313e-04 - val_accuracy: 0.9982 - val_loss: 0.0063
Epoch 4/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 1.0000 - loss: 9.2333e-05 - val_accuracy: 0.9982 - val_loss: 0.0064
Epoch 5/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 1.0000 - loss: 4.8750e-05 - val_accuracy: 0.9982 - val_loss: 0.0079
Epoch 6/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 1.0000 - loss: 2.2468e-05 - val_accuracy: 0.9982 - val_loss: 0.0068
Epoch 7/20


<keras.src.callbacks.history.History at 0x11c5b915c90>

In [43]:
# Оценка модели на тестовых данных
results_df, annotated_cm = evaluate_classification_model(model, [X_seq_test, X_add_test], y_test)

[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9987 - loss: 0.0028
Test Loss: 0.00198
Test Accuracy: 0.99902
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step

Metrics:
Precision: 0.95455
Recall: 0.87500
F1-score: 0.91304
F-beta (2): 0.88983
Threshold used: 0.5

Results DataFrame (first 5 rows):
   y_test  y_pred_proba  y_pred prediction_is_true
0       0  9.797002e-33       0                Yes
1       0  6.778435e-22       0                Yes
2       0  4.551552e-31       0                Yes
3       0  1.131322e-29       0                Yes
4       0  1.170316e-27       0                Yes

Annotated Confusion Matrix:
                        Predicted No (0)       Predicted Yes (1)
Actual No (0)   TN (True Negative): 4039  FP (False Positive): 1
Actual Yes (1)    FN (False Negative): 3  TP (True Positive): 21


### Only encoded_7channels

In [44]:
# Преобразуем данные
X = np.array([np.reshape(ch, (7, -1)).T for ch in df['encoded_7channels']])  # Преобразуем flattened массивы обратно в матрицы 7xN
y = np.array(df['is_off_target'])  # Бинарная целевая переменная

# Разделение на тренировочный и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Определяем параметры входа
input_shape = X_train.shape[1:]  # (длина последовательности, 7 каналов)

In [14]:
unique, frequency = np.unique(y_test, return_counts = True)

print("Unique Values:", unique)
print("Frequency Values:", frequency)

Unique Values: [0 1]
Frequency Values: [4040   24]


In [45]:
# Создаем модель CNN
model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=input_shape),
    BatchNormalization(),
    Dropout(0.3),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Для бинарной классификации
])

# Компилируем модель
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Обучаем модель
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.9747 - loss: 0.0858 - val_accuracy: 0.9908 - val_loss: 0.0670
Epoch 2/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9948 - loss: 0.0200 - val_accuracy: 0.9945 - val_loss: 0.0185
Epoch 3/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9961 - loss: 0.0113 - val_accuracy: 0.9945 - val_loss: 0.0161
Epoch 4/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9969 - loss: 0.0085 - val_accuracy: 0.9942 - val_loss: 0.0171
Epoch 5/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9970 - loss: 0.0073 - val_accuracy: 0.9942 - val_loss: 0.0142
Epoch 6/20
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9977 - loss: 0.0072 - val_accuracy: 0.9938 - val_loss: 0.0175
Epoch 7/20
[1m407/407[0m [32m━━━━━━━

In [46]:
results_df, annotated_cm = evaluate_classification_model(model, [X_seq_test, X_add_test], y_test)

Expected: keras_tensor_18
Received: inputs=('Tensor(shape=(32, 23, 7))', 'Tensor(shape=(32, 3))')


[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9791 - loss: 0.0533
Test Loss: 0.05452
Test Accuracy: 0.98007
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Metrics:
Precision: 0.13924
Recall: 0.45833
F1-score: 0.21359
F-beta (2): 0.31429
Threshold used: 0.5

Results DataFrame (first 5 rows):
   y_test  y_pred_proba  y_pred prediction_is_true
0       0  2.009036e-15       0                Yes
1       0  1.195813e-10       0                Yes
2       0  6.660504e-10       0                Yes
3       0  4.172120e-16       0                Yes
4       0  2.814111e-09       0                Yes

Annotated Confusion Matrix:
                        Predicted No (0)        Predicted Yes (1)
Actual No (0)   TN (True Negative): 3972  FP (False Positive): 68
Actual Yes (1)   FN (False Negative): 13   TP (True Positive): 11


Нужно свести к минимуму FN

# Функции

In [None]:
# def train_and_evaluate_model(df: pd.DataFrame, encoding_function, model) -> None:
#     """
#     Обучает модель с использованием заданной функции кодирования и выводит R^2, MSE, предсказания и тестовые значения.

#     :param df: Датафрейм с колонками genome input, sgRNA input и mean relative gamma
#     :param encoding_function: Функция кодирования для использования
#     """
#     # Кодирование данных
#     df['encoded'] = df.apply(
#         lambda row: encoding_function(row['genome input'], row['sgRNA input']).flatten(),
#         axis=1
#     )

#     # Подготовка данных для обучения
#     X = np.vstack(df['encoded'].values)  # Преобразование списка массивов в 2D массив
#     y = df['mean relative gamma']

#     # Разделяем данные на обучающую и тестовую выборки
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#     # Обучение модели
#     model.fit(X_train, y_train)

#     # Предсказания
#     y_pred = model.predict(X_test)

#     # Вычисление метрик
#     r2_score = model.score(X_test, y_test)
#     mse = mean_squared_error(y_test, y_pred)
#     mae = mean_absolute_error(y_test, y_pred)

#     # Вывод результатов
#     print(f"R^2 Score: {r2_score:.4f}")
#     print(f"Mean Squared Error: {mse:.4f}")
#     print(f"Mean Absolute Error: {mae:.4f}")
#     print("Predictions vs Actual:")
#     for pred, actual in zip(y_pred, y_test):
#         print(f"Predicted: {pred:.4f}, Actual: {actual:.4f}")