# Using an already existing model that has been trained

In [None]:
import pyteomics
import pandas as pd
import time
from pyteomics import mzml
import itertools
from pathlib import Path
from pprint import pprint
import os
import shutil
import random
import re

from matplotlib import pyplot as plt, cm
import numpy as np
from pandas_path import path
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import minmax_scale
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import make_scorer, log_loss
from sklearn.model_selection import StratifiedKFold, cross_val_score
from tqdm import tqdm

import joblib

from datetime import datetime
from openpyxl import load_workbook

# Seed Python's built-in `random` module so that anything drawing from it is
# repeatable between runs.
# NOTE(review): this seeds only the stdlib generator; NumPy / scikit-learn
# random states are not touched here.
RANDOM_SEED = 42  # For reproducibility
random.seed(RANDOM_SEED)
# Enable tqdm's pandas integration (progress-bar variants of `apply`).
tqdm.pandas()

# Prediction features should already be converted to Excel format; the Excel files should then be transferred to the 'pred_features' folder inside 'predictions data'

In [None]:
# Specify the path to the 'predictions data' folder, resolved against the
# current working directory, so the notebook must be run from the project root
predictions_data_folder = os.path.join(os.getcwd(), 'predictions data')
# Subfolder of 'predictions data' that holds the per-sample feature files
predictions_features_folder = os.path.join(predictions_data_folder, 'pred_features')

In [None]:
# Load all_labels; only its column layout is used, as the template for the report
data_directory = os.path.join(os.getcwd(), 'data')
file_path = os.path.join(data_directory, 'all_labels.csv')
all_labels = pd.read_csv(file_path)

# List the '.xlsx' files in the prediction features folder.
# Sorted because os.listdir() returns entries in arbitrary order.
prediction_data_files = sorted(
    file for file in os.listdir(predictions_features_folder) if file.endswith('.xlsx')
)

# Map each sample ID (file name without its extension) to the path of its
# feature file relative to predictions_data_folder. The mapping is built from
# the file names (not from the folder path string) and uses os.path.join so
# that it works on every operating system.
pred_dict = {
    os.path.splitext(pred_name)[0]: os.path.join('pred_features', pred_name)
    for pred_name in prediction_data_files
}

# From here on prediction_data_files holds the sample IDs (extension removed),
# derived exactly like the pred_dict keys so the two always agree
prediction_data_files = list(pred_dict)

# Create the report_format DataFrame: one (empty) row per sample ID and the
# same columns as all_labels, minus its first column
report_format = pd.DataFrame(columns=all_labels.columns[1:], index=prediction_data_files)

# Set the index header
report_format.index.name = "sample id"

# Define the file path of the report template
report_format_path = os.path.join(predictions_data_folder, 'report_format.csv')

# Save report_format in the predictions data folder
report_format.to_csv(report_format_path)

# Print the report_format DataFrame with the sample IDs as indexes
print(report_format)

In [None]:
# Read the report template back from disk; its columns and its index define
# the order of compounds and of samples used for the predictions
submission_template_df = pd.read_csv(
    os.path.join(predictions_data_folder, "report_format.csv"),
    index_col="sample id",
)

compounds_order, sample_order = (
    submission_template_df.columns,
    submission_template_df.index,
)

In [None]:
# Time bins: consecutive intervals of width 0.25 covering the range 0 to 35
timerange = pd.interval_range(start=0, end=35, freq=0.25)

# Pair every time bin with every integer m/z value from 1 to 300
allcombs = [
    (time_bin, mz_value)
    for time_bin in timerange
    for mz_value in range(1, 301)
]

# One row per (time bin, m/z) pairing
allcombs_df = pd.DataFrame(allcombs, columns=["time bin", "rounded m/z"])
print(allcombs_df)

In [None]:
def int_per_timebin(df):

    """
    Transforms dataset to take the preprocessed max abundance for each
    time range for each m/z value

    Uses the module-level ``timerange`` (the time bins) and ``allcombs_df``
    (all time bin / m/z combinations). The input dataframe is not modified.

    Args:
        df: dataframe to transform; must contain the columns "scan time",
            "rounded m/z" and "normalised intensity"

    Returns:
        transformed dataframe whose columns are ("rounded m/z", "time bin")
        pairs holding the maximum "normalised intensity" of each pair, with
        0 where the sample has no measurement
    """

    # Bin times on a copy, so the caller's dataframe does not gain a column
    binned = df.assign(**{"time bin": pd.cut(df["scan time"], bins=timerange)})

    # Combine with a list of all time bin-m/z value combinations, so that
    # combinations absent from the sample are still represented (as NaN)
    merged = pd.merge(allcombs_df, binned, on=["time bin", "rounded m/z"], how="left")

    # Aggregate to time bin level to find max. The first positional argument
    # of GroupBy.max is `numeric_only`; it is spelled out here instead of
    # passing a (truthy) column name in that position.
    maxima = (
        merged.groupby(["time bin", "rounded m/z"])
        .max(numeric_only=True)
        .reset_index()
    )

    # Fill in 0 for intensity values without information
    filled = maxima.replace(np.nan, 0)

    # Reshape so each row is a single sample
    return filled.pivot_table(
        columns=["rounded m/z", "time bin"], values=["normalised intensity"]
    )

# The 'models' folder contains the previously trained models, one per compound

In [None]:
# Define the models folder path ('models' under the current working directory);
# it is passed to predict_for_sample as the location of the saved models
MODELS_PATH = os.path.join(os.getcwd(), 'models')

In [None]:
def predict_for_sample(sample_id, models_folder_path):
    """
    Predicts, for a single sample, one probability per compound using the
    previously saved models

    Reads the module-level ``predictions_data_folder``, ``pred_dict`` and
    ``compounds_order``.

    Args:
        sample_id: key of the sample in ``pred_dict``
        models_folder_path: folder holding one
            "logreg_model_<compound>.joblib" file per compound

    Returns:
        dict mapping each compound of ``compounds_order`` to the value in
        column 1 of the model's ``predict_proba`` output for the sample
        (presumably the probability of the positive class)
    """

    # Load the sample and turn it into the feature matrix the models expect
    sample_path = os.path.join(predictions_data_folder, pred_dict[sample_id])
    features = int_per_timebin(pd.read_excel(sample_path)).values

    def _second_column_probability(compound):
        # Load this compound's trained model and score the sample with it
        model_path = os.path.join(
            models_folder_path, f"logreg_model_{compound}.joblib"
        )
        return joblib.load(model_path).predict_proba(features)[:, 1][0]

    return {
        compound: _second_column_probability(compound)
        for compound in compounds_order
    }


In [None]:
# Run the saved models on every sample (with a progress bar), collecting one
# dict of per-compound predictions per sample
sample_predictions = [
    predict_for_sample(sample_id, MODELS_PATH) for sample_id in tqdm(sample_order)
]

# Dataframe of predictions: one row per sample, one column per compound
final_submission_df = pd.DataFrame(sample_predictions, index=sample_order)

In [None]:
# Check that columns and rows are the same between final submission and submission format
# (same labels in the same order); an AssertionError here means the predictions
# do not line up with the report template
assert final_submission_df.index.equals(submission_template_df.index)
assert final_submission_df.columns.equals(submission_template_df.columns)

In [None]:
# Timestamped file names so that successive runs do not overwrite earlier reports
current_datetime = datetime.now().strftime("%d%m%Y_%H%M")  # Get current date and time as a string
file_name = f"report_{current_datetime}.csv"  # Create the file name with current date and time

# Save the predictions as CSV
# NOTE(review): index=False leaves the "sample id" index out of the report - confirm this is intended
final_submission_df.to_csv(file_name, index=False)

# openpyxl cannot open CSV files, so the same table is also written as an
# Excel workbook, and the column formatting is applied to that workbook
excel_file_name = f"report_{current_datetime}.xlsx"
final_submission_df.to_excel(excel_file_name, index=False, engine="openpyxl")

# Load the workbook
wb = load_workbook(excel_file_name)

# Get the first sheet (the workbook written above has a single sheet)
sheet = wb.active

# Autofit column widths for all columns
for column_cells in sheet.columns:
    # Longest rendered cell value in this column, header included
    length = max(len(str(cell.value)) for cell in column_cells)
    adjusted_width = length + 2  # Add a little padding so the text is not clipped
    sheet.column_dimensions[column_cells[0].column_letter].width = adjusted_width

# Save the updated workbook
wb.save(excel_file_name)

# End of prediction