In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt


In [None]:
# Import the project-local feature module and immediately reload it, so that
# changes to the module's source are picked up without restarting the kernel.
from importlib import reload

import sequence_features

reload(sequence_features);


In [None]:
# Estimator classes (not instances) to benchmark; the training cell creates a
# fresh instance of each class for every encoding.
models = [RandomForestRegressor, LinearRegression]

# Attribute names that the training cell looks up on the SeqFeatures object
# with getattr() to obtain a feature matrix.
encodings = ['one_hot_encode', 'aa_composition']

In [None]:
# load data
def load_data(filename):
    """Read a CSV file into a DataFrame, using its first row as column names.

    Parameters
    ----------
    filename : str, path-like or file-like
        Anything accepted by ``pandas.read_csv`` as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    frame = pd.read_csv(filename, header=0)
    return frame

# Read the training table and pull out the two columns used downstream.
df = load_data('train.csv')

# Regression target as a plain Python list.
y = df['RetentionTime'].tolist()

# Feature builder constructed from the list of peptide sequences; the training
# cell reads its encodings by attribute name.
peptide_sequences = df['PeptideSequence'].tolist()
sf = sequence_features.SeqFeatures(peptide_sequences)

In [None]:
# Benchmark every (model, encoding) combination on one held-out split.
#   results              : one column of test-set predictions per combination,
#                          plus a final 'y_test' column with the true targets.
#   mean_absolute_errors : list of (mae, "<Model>_<encoding>") tuples.
results = pd.DataFrame()
mean_absolute_errors = []

for encoding in encodings:

    # NOTE(review): the attribute is used directly as the feature matrix, so
    # this assumes each encoding is exposed as data (attribute/property) with
    # one row per peptide rather than as a method to call -- confirm against
    # sequence_features.SeqFeatures.
    X = getattr(sf, encoding)

    # The split depends only on X and y, not on the model, so do it once per
    # encoding instead of once per model. The fixed random_state makes the
    # shuffle reproducible, so every model is scored on the same test rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    for model_cls in models:

        model_encoding = f"{model_cls.__name__}_{encoding}"

        print(f"Training {model_encoding}...")

        model = model_cls()
        # Seed stochastic estimators (e.g. random forests) so the reported
        # metrics are reproducible across runs; deterministic estimators that
        # expose no 'random_state' parameter are left untouched.
        if 'random_state' in model.get_params():
            model.set_params(random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        print(f"{model_encoding} R²:", r2_score(y_test, y_pred))

        # calculate mean absolute error (y_test is a plain list, so convert it
        # explicitly before the element-wise subtraction)
        mae = np.mean(np.abs(np.asarray(y_test) - y_pred))
        print(f"{model_encoding} Mean Absolute Error:", mae)

        results[model_encoding] = y_pred
        mean_absolute_errors.append((mae, model_encoding))

        print("")

# True targets of the held-out split, taken from the last split made above.
# NOTE(review): this lines up with every prediction column as long as all
# encodings have the same number of rows, since the seeded shuffle then picks
# the same test rows each time -- confirm.
results['y_test'] = y_test

In [None]:
# Order the (mae, name) tuples from largest to smallest error, i.e. worst
# combination first; equal MAEs fall back to comparing the names.
sorted_mae = list(mean_absolute_errors)
sorted_mae.sort(reverse=True)

In [None]:
# Keep only the name from each (mae, name) tuple, preserving the sorted order.
model_encoding_ranking = [label for _mae, label in sorted_mae]

In [None]:
# Display the model/encoding names in the order they were ranked.
model_encoding_ranking

In [None]:
# Predicted vs. true target on the held-out split: one marker-only series per
# model/encoding column, drawn in ranking order. Axis labels and a title are
# set so the figure can be read on its own; the trailing ';' suppresses the
# textual repr that would otherwise be echoed below the cell.
ax = results.plot(x='y_test', y=model_encoding_ranking, marker='o', linestyle='')
ax.set(
    xlabel='Measured RetentionTime (y_test)',
    ylabel='Predicted RetentionTime',
    title='Predicted vs. measured RetentionTime on the test split',
);