In [None]:
# Download the dataset from NASA Earthdata Search. Collection name:
# Global Mean Sea Level Trend from Integrated Multi-Mission Ocean Altimeters TOPEX/Poseidon, Jason-1, OSTM/Jason-2, and Jason-3 Version 5.1

In [None]:
# GSFC. 2021. Global Mean Sea Level Trend from Integrated Multi-Mission Ocean Altimeters TOPEX/Poseidon,
# Jason-1, OSTM/Jason-2, and Jason-3 Version 5.1. Ver. 5.1 PO.DAAC, CA, USA.
# Dataset accessed [2023-04-20] at https://doi.org/10.5067/GMSLM-TJ151.

In [None]:
# Download the dataset and save it in this directory as data.txt.
# Read the header information in the file, then remove the header and run this script.
# This script will turn the data into CSV and generate your sample, test, and train data
# using K-fold cross-validation. These files will be saved as properly formatted JSON Lines
# (.jsonl) files for use with the OpenAI evals framework.
# NOTE: the cells below read the data from Data/data.csv.

In [1]:
import openai
import os
import random
import pandas as pd
from sklearn.model_selection import KFold
from typing import List
import numpy as np
from math import sqrt
from sklearn.metrics import r2_score
import json

In [2]:
class GMSLPredictor:
    """Few-shot predictor of GMSL-with-GIA values backed by an OpenAI chat model.

    For each test sample, a number of training samples are drawn at random,
    shown to the model as worked input/answer pairs, and the model is then
    asked for the answer to the test sample.
    """

    # Prefix given to each message when the chat-style prompt is flattened into
    # a single string. System text is emitted as-is. Roles missing from this
    # table fall back to "User: ", so only the "assistant" label is new.
    _ROLE_PREFIXES = {"system": "", "user": "User: ", "assistant": "Assistant: "}

    def __init__(self, api_key, train_samples, train_samples_per_prompt=15):
        """Store the configuration and register the API key with `openai`.

        Args:
            api_key: OpenAI API key.
            train_samples: Sequence of dicts with "input" and "ideal" entries;
                both are inserted into the prompt as message content.
            train_samples_per_prompt: How many training samples are drawn for
                each prompt.
        """
        self.api_key = api_key
        self.train_samples = train_samples
        self.train_samples_per_prompt = train_samples_per_prompt
        # The key is set on the openai module itself, not on this instance.
        openai.api_key = self.api_key

    @staticmethod
    def _flatten_prompt(prompt):
        """Render a list of chat messages as one newline-separated string."""
        return "".join(
            GMSLPredictor._ROLE_PREFIXES.get(item["role"], "User: ")
            + item["content"]
            + "\n"
            for item in prompt
        )

    def query_model(self, prompt):
        """Send `prompt` to gpt-3.5-turbo and return the text of its reply.

        The messages are flattened into one string (each turn labelled with its
        speaker) and sent as a single system message. The flattened prompt and
        the raw response are appended to Logs/logfile.txt.

        Args:
            prompt: List of {"role": ..., "content": ...} message dicts.

        Returns:
            The content of the first choice in the response.
        """
        prompt_str = self._flatten_prompt(prompt)

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": prompt_str}]
        )
        # Create the log directory on demand so a missing folder cannot raise
        # after the API call has already been made.
        os.makedirs("Logs", exist_ok=True)
        with open("Logs/logfile.txt", "a", encoding="utf-8") as file:
            # One entry per line so successive calls do not run together.
            file.write(f"Prompt_str: {prompt_str} Response = {response}\n")
        return response.choices[0].message.content

    def eval_sample(self, test_sample, rng: random.Random):
        """Query the model for one test sample using random few-shot examples.

        Args:
            test_sample: Dict with "input" (message text) and "ideal" (a value
                accepted by `float`).
            rng: Random source used to draw the few-shot examples.

        Returns:
            Tuple `(prediction, ideal)`, both as floats.

        Raises:
            ValueError: If the model reply or the ideal value cannot be parsed
                as a float, or if more examples are requested than
                `train_samples` contains.
        """
        stuffing = rng.sample(self.train_samples, self.train_samples_per_prompt)

        # NOTE(review): the two fixed examples below carry no measurement date
        # although the instructions mention one - confirm whether they should.
        prompt = [
            {"role": "system", "content": "You will be given Global Mean Sea Level (GMSL) variation WITHOUT"
             " Global Isostatic Adjustment in mm with respect to TOPEX/Jason collinear mean reference."
             " You will also be given the year and fraction of year in decimal form of when the measurement happened."
             " Predict the global mean sea level variation WITH global isostatic adjustment (GMSL with GIA)"
            ", in mm, with respect to TOPEX/Jason collinear mean reference."
             " Your response should only include the predicted GMSL with GIA in mm."},
            {"role": "user", "content": "GMSL without GIA: -42.71"},
            {"role": "assistant", "content": "-42.69"},
            {"role": "user", "content": "GMSL without GIA: -31.15"},
            {"role": "assistant", "content": "-30.91"}
        ]

        # Each drawn example becomes a user question followed by the assistant
        # answer, matching the fixed examples above, so the model can tell
        # inputs from expected outputs.
        for sample in stuffing:
            prompt.append({"role": "user", "content": sample["input"]})
            prompt.append({"role": "assistant", "content": sample["ideal"]})

        # The sample being evaluated goes last, with no answer attached.
        prompt.append({"role": "user", "content": test_sample["input"]})

        model_prediction = self.query_model(prompt)
        return float(model_prediction), float(test_sample["ideal"])

In [3]:
# Read the dataset from one CSV file twice: `data` holds every row, while
# `data_sample` is limited to the first 100 rows.
csv_path = "Data/data.csv"
data = pd.read_csv(csv_path)
data_sample = pd.read_csv(csv_path, nrows=100)

In [4]:
# Build chat-format eval samples from `data_sample` and write them to
# samples.jsonl, one JSON object per line.

# Instructions shared by every sample; built once instead of on every row.
system_prompt = (
    "You will be given Global Mean Sea Level (GMSL) variation WITHOUT"
    " Global Isostatic Adjustment in mm with respect to TOPEX/Jason collinear mean reference."
    " You will also be given the year and fraction of year in decimal form of when the measurement happened."
    " Predict the global mean sea level variation WITH global isostatic adjustment (GMSL with GIA)"
    ", in mm, with respect to TOPEX/Jason collinear mean reference."
    " Your response should only include the predicted GMSL with GIA in mm."
)

samples = []

for index, row in data_sample.iterrows():
    # The instructions promise the measurement date, so the user message has to
    # carry it. The format matches the records built from the full dataset
    # for train.jsonl / test.jsonl.
    input_values = (
        f"Date of measurements: {row['year_fraction']} "
        f"GMSL without GIA: {row['GMSL_variation_no_GIA']}."
    )
    prompt = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": input_values},
    ]
    # Stored as a string so it can be compared with the model's text reply.
    ideal = str(row["GMSL_variation_with_GIA"])
    samples.append({"input": prompt, "ideal": ideal})

with open("samples.jsonl", "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")

In [None]:
def create_jsonl(prompt, ideal):
    """Bundle a prompt and its expected answer into one record.

    Args:
        prompt: Value stored under the "input" key.
        ideal: Value stored under the "ideal" key.

    Returns:
        A new dict with the keys "input" and "ideal", in that order.
    """
    record = dict(input=prompt, ideal=ideal)
    return record

In [None]:
# Turn every row of the full dataset into an {"input", "ideal"} record.
# "input" is the text shown to the model (measurement date plus GMSL without
# GIA); "ideal" is the GMSL-with-GIA value, stored as a string.
jsonl_data = []
for index, row in data.iterrows():
    input_values = f"Date of measurements: {row['year_fraction']} GMSL without GIA: {row['GMSL_variation_no_GIA']}."
    ideal = str(row["GMSL_variation_with_GIA"])
    jsonl_data.append(create_jsonl(input_values, ideal))


In [None]:
# Five-fold splitter; every other KFold option is left at its default.
kf = KFold(n_splits=5)

# Lists that receive the train and test records produced by the split.
train_samples_all, test_samples_all = [], []

In [None]:
# Split the records into one held-out fold and its complementary training set,
# then write each side to its own JSON Lines file.
#
# Only a single fold is used. Collecting the indices of every fold would put
# each record into the test list once and into the train list n_splits - 1
# times, so train.jsonl and test.jsonl would contain the same records and the
# test data would leak into the few-shot examples.
train_index, test_index = next(iter(kf.split(jsonl_data)))
train_samples = [jsonl_data[i] for i in train_index]
test_samples = [jsonl_data[i] for i in test_index]

# Refill the existing lists in place instead of extending them, so re-running
# this cell cannot duplicate records.
train_samples_all[:] = train_samples
test_samples_all[:] = test_samples

# Every record must land on exactly one side of the split.
assert len(train_samples_all) + len(test_samples_all) == len(jsonl_data)

# Write train samples to the train.jsonl file
with open("train.jsonl", "w") as file:
    for sample in train_samples_all:
        file.write(json.dumps(sample) + "\n")

# Write test samples to the test.jsonl file
with open("test.jsonl", "w") as file:
    for sample in test_samples_all:
        file.write(json.dumps(sample) + "\n")