In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

import sequence_features
from importlib import reload
reload(sequence_features)

import itertools

In [None]:
import sys, time
def progress_bar(current, total, bar_length=40):
    """Draw a single-line text progress bar on standard output.

    The bar is redrawn in place (carriage return) until the work is complete,
    at which point a newline is written so later output starts on a fresh line.

    Parameters
    ----------
    current : int or float
        Amount of work finished so far. Values outside ``[0, total]`` are
        clamped so the bar never under- or overflows.
    total : int or float
        Total amount of work. A non-positive total is treated as "nothing to
        do" and rendered as 100% instead of raising ``ZeroDivisionError``.
    bar_length : int, optional
        Width of the bar between the brackets, in characters.
    """
    # Guard against an empty workload and keep the fraction within [0, 1].
    if total <= 0:
        fraction = 1.0
    else:
        fraction = min(max(current / total, 0.0), 1.0)

    # One cell of the bar is reserved for the '>' head, hence the "- 1".
    filled = max(int(fraction * bar_length - 1), 0)
    arrow = '=' * filled + '>'
    padding = ' ' * (bar_length - len(arrow))

    # Finish the line only once the work is complete; otherwise overwrite it.
    ending = '\n' if fraction >= 1.0 else '\r'
    sys.stdout.write(f'[{arrow}{padding}] {int(fraction*100)}%{ending}')
    sys.stdout.flush()

In [None]:
# Estimator classes (not instances) to benchmark; a fresh model is constructed
# from each class for every encoding combination.
models = [RandomForestRegressor, LinearRegression]

# Names of the sequence encodings to evaluate, alone and in combination.
encodings = ['hydro_features', 'one_hot_encode', 'aa_composition']

In [None]:
# Every non-empty subset of the encodings, smallest subsets first
# (singles, then pairs, ... up to all encodings together).
encoding_combos = list(
    itertools.chain.from_iterable(
        itertools.combinations(encodings, size)
        for size in range(1, len(encodings) + 1)
    )
)
encoding_combos

In [None]:
# load data
def load_data(filename):
    """Read a CSV file into a DataFrame, using its first row as column names."""
    frame = pd.read_csv(filename, header=0)
    return frame

# Read the training table, take the retention times as the regression target,
# and hand the peptide sequences to the feature extractor.
df = load_data('train.csv')
y = df['RetentionTime'].tolist()
peptide_sequences = df['PeptideSequence'].tolist()
sf = sequence_features.SeqFeatures(peptide_sequences)

### Test feature combinations

In [None]:
# Benchmark every model class on every encoding combination.
#
# Produces:
#   results              - DataFrame with one column of held-out predictions per
#                          "<Model>_<encodings>" run, plus a final 'y_test' column.
#   mean_absolute_errors - list of (mae, "<Model>_<encodings>") tuples.

# One seed for both the split and any stochastic estimator, so reruns agree.
RANDOM_STATE = 42

predictions = {}
mean_absolute_errors = []
y_test = None  # stays None only when there are no encoding combinations

for combo in encoding_combos:
    combo_name = "_".join(combo)
    X = sf.get_feature_combination(combo)

    # The split depends only on the encoding, so do it once per combination
    # rather than once per model.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )
    # y is a plain list; convert explicitly so the arithmetic below does not
    # rely on implicit list-to-array coercion.
    y_test = np.asarray(y_test)

    for model_cls in models:
        model_encoding = f"{model_cls.__name__}_{combo_name}"

        print(f"Training {model_encoding}...")

        model = model_cls()
        # Seed estimators that expose a random_state (e.g. the random forest);
        # estimators without one (e.g. linear regression) are left untouched.
        if 'random_state' in model.get_params():
            model.set_params(random_state=RANDOM_STATE)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        print(f"{model_encoding} R²:", r2_score(y_test, y_pred))

        # calculate mean absolute error
        mae = np.mean(np.abs(y_test - y_pred))
        print(f"{model_encoding} Mean Absolute Error:", mae)

        predictions[model_encoding] = y_pred
        mean_absolute_errors.append((mae, model_encoding))

        print("")

# Build the frame in one go instead of inserting columns one by one.
results = pd.DataFrame(predictions)
# Every split uses the same seed and the same y, so the last y_test matches the
# rows of every prediction column.
if y_test is not None:
    results['y_test'] = y_test

In [None]:
# Order the (MAE, name) pairs from largest to smallest error, so the best
# performers end up at the tail of the ranking.
sorted_mae = sorted(mean_absolute_errors, reverse=True)
model_encoding_ranking = [name for _mae, name in sorted_mae]

In [None]:
# The ranking is ordered worst-first (descending MAE), so its last five entries
# are the lowest-error model/encoding pairs (all of them if there are fewer).
top5 = model_encoding_ranking[-5:]

In [None]:
# Show the selected model/encoding names.
top5

In [None]:
# Predicted vs. held-out retention time for the selected model/encoding pairs.
y_lo, y_hi = results['y_test'].min(), results['y_test'].max()

ax = results.plot(x='y_test', y=top5, marker='o', linestyle='')

# Identity line: points lying on it are perfect predictions.
ax.plot([y_lo, y_hi], [y_lo, y_hi], color='red', linestyle='-', label='y=x')

ax.set(
    xlabel='Held-out retention time (y_test)',
    ylabel='Predicted retention time',
    title='Predicted vs. held-out retention time (lowest-MAE models)',
)
# pandas builds the legend before the identity line is added; rebuild it so the
# 'y=x' entry is actually listed.
ax.legend();