In [3]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score

# Switch the working directory to the project checkout and then import the
# project-local helpers from the `modules` package.
# NOTE(review): the imports below presumably rely on this chdir to make
# `modules` resolvable — confirm. The path is absolute and machine-specific.
import os
os.chdir('C:\\Users\\mathi\\SimpleSequenceClassif')
from modules.preprocessing import categories_fit_one_hot, categories_transform_one_hot
from modules.preprocessing import df_seq_onehot_encode
from modules.data_specific import cleaning

In [4]:
# Directory holding the pre-split CSV files (absolute, machine-specific path).
base_path = 'C:\\Users\\mathi\\Documents\\sequence_data\\'

# Read the five fold files and test.csv, in that order, into separate frames.
df0, df1, df2, df3, df4, test = (
    pd.read_csv(base_path + csv_name)
    for csv_name in (
        'fold_0.csv', 'fold_1.csv', 'fold_2.csv',
        'fold_3.csv', 'fold_4.csv', 'test.csv',
    )
)

# `cleaning` is called once per frame and its return value is ignored
# (presumably it modifies the frame in place — confirm in modules.data_specific).
for df in (df0, df1, df2, df3, df4, test):
    cleaning(df)

# All five folds stacked into one frame; test.csv is not included.
full_data = pd.concat([df0, df1, df2, df3, df4])

In [None]:
# Encoding configuration shared by the cells below.

# Passed to df_seq_onehot_encode as its size argument
# (presumably the maximum encoded sequence length — confirm in modules.preprocessing).
max_size = 15

# Symbols handed to the sequence encoder: 20 single-letter codes
# (presumably the standard amino-acid alphabet, given the 'peptide' column).
alphabet = [
    'A', 'R', 'N', 'D', 'C',
    'E', 'Q', 'G', 'H', 'I',
    'L', 'K', 'M', 'F', 'P',
    'S', 'T', 'W', 'Y', 'V',
]

# Passed to the sequence encoder (presumably the name of the sequence column).
sequence = 'peptide'

# Passed to the categorical one-hot helpers (presumably column names).
categories = ['class', 'gene', 'variant']

In [None]:
# Quick check of the categorical one-hot helpers: fit the encoders on the
# first fold, re-apply them, and show both resulting shapes.
# NOTE(review): `test_encoding` is produced from df0 — the same frame the
# encoders were fitted on — not from `test`; confirm this is intentional.
train_encoding, encoders = categories_fit_one_hot(df0, categories)
test_encoding = categories_transform_one_hot(df0, categories, encoders)
print(train_encoding.shape, test_encoding.shape, sep='\n')

In [None]:
# Quick check of the sequence encoder: show the shape of the first fold's encoding.
seq_encoding_shape = df_seq_onehot_encode(df0, sequence, alphabet, max_size).shape
print(seq_encoding_shape)

In [None]:
# Cross-validation over the five pre-split folds: each fold is held out once
# for validation while a SMOTE + random-forest pipeline is trained on the
# other four. Per-fold F1 and accuracy are printed, then averaged at the end.

# List of DataFrames
dfs = [df0, df1, df2, df3, df4]

# One (f1, accuracy) pair per fold, kept so the folds can be summarised
# after the loop instead of being overwritten on every iteration.
fold_scores = []

# Iterate through each DataFrame as a validation set
for i, val_df in enumerate(dfs):
    print(f"\nValidation on df{i}:")

    # Create training set by excluding the validation set
    train_dfs = [df for j, df in enumerate(dfs) if j != i]
    train_df = pd.concat(train_dfs, ignore_index=True)

    # Encodings for the training set. The categorical encoders are fitted on
    # the training folds only and reused, unchanged, for the held-out fold.
    y_train = train_df['hit'].values
    cat_train, encoders = categories_fit_one_hot(train_df, categories)
    seq_train = df_seq_onehot_encode(train_df, sequence, alphabet, max_size)
    X_train = np.hstack((cat_train, seq_train))

    # Encodings for the validation set
    y_val = val_df['hit'].values
    cat_val = categories_transform_one_hot(val_df, categories, encoders)
    seq_val = df_seq_onehot_encode(val_df, sequence, alphabet, max_size)
    X_val = np.hstack((cat_val, seq_val))

    # Random Forest behind SMOTE. A new pipeline is built for every fold.
    # With imblearn's Pipeline the sampler step is applied during fit only,
    # so the validation fold is predicted on as-is.
    imba_pipeline_cv = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('rf', RandomForestClassifier(n_estimators=20, max_depth=5, random_state=1))
    ])

    # Fit on the training set and validate on the validation set
    imba_pipeline_cv.fit(X_train, y_train)
    y_pred_val = imba_pipeline_cv.predict(X_val)

    # Evaluate performance on the validation set
    f1 = f1_score(y_val, y_pred_val)
    accuracy = accuracy_score(y_val, y_pred_val)
    fold_scores.append((f1, accuracy))

    print(f"F1 Score: {f1}, Accuracy: {accuracy}")

# Cross-validated estimate: column-wise mean of the per-fold (f1, accuracy) pairs.
mean_f1, mean_accuracy = np.mean(fold_scores, axis=0)
print(f"\nMean F1 Score: {mean_f1}, Mean Accuracy: {mean_accuracy}")