In [None]:
### extracting locations (non-Google) from initial.ipynb
### superseded by calls to the Google Maps API
import pandas as pd
from geopy.geocoders import Nominatim
from time import sleep

# Build a one-column frame holding each distinct value of df.Location
locs = pd.DataFrame({"location": df["Location"].unique()})

# Nominatim geocoder client, identified to the service by its user agent string
geolocator = Nominatim(user_agent="geo_lookup")

# Function to get latitude and longitude
def get_coordinates(location):
    """
    Look up a location with the module-level `geolocator`.

    Args:
        location: Query passed straight to `geolocator.geocode` (10 s timeout).

    Returns:
        pd.Series: Two elements, [latitude, longitude]. Both are None when the
        geocoder returns no match or when any exception is raised; in the
        latter case the error is printed rather than propagated.
    """
    try:
        match = geolocator.geocode(location, timeout=10)
        coords = [match.latitude, match.longitude] if match else [None, None]
        return pd.Series(coords)
    except Exception as err:
        # best-effort lookup: report the failure and carry on with empty coordinates
        print(f"Error for {location}: {err}")
        return pd.Series([None, None])

# Apply function with a delay to avoid API rate limits.
# Nominatim's usage policy allows at most one request per second, so pause
# after every lookup instead of firing the requests back-to-back.
def _get_coordinates_throttled(location, delay=1.0):
    """Geocode `location` with get_coordinates, then wait `delay` seconds."""
    coords = get_coordinates(location)
    sleep(delay)
    return coords

locs[["latitude", "longitude"]] = locs["location"].apply(_get_coordinates_throttled)


In [None]:
### automatic clustering from analysis.ipynb
### superseded by tolerance-based clustering


# Pick a single study to demonstrate automatic clustering on
# test_doi = "10.3354/meps11591"
test_doi = carb_df.doi.unique()[1]
test_df = carb_df[carb_df.doi == test_doi].copy()

# assign species_types values to unique ints. This is handled later by grouping, but this is useful for a quick demo
# species_types = test_df["species_types"].unique()
# species_types_dict = {species_type: i for i, species_type in enumerate(species_types)}
# test_df.loc[:,"species_types"] = test_df["species_types"].map(species_types_dict)
# select control variables
# treatment_data = test_df[['t_in', 'phtot', 'irr', 'species_types']].dropna()
# only rows with every control variable present can be clustered
complete_rows = test_df[['t_in', 'phtot', 'irr']].notna().all(axis=1)
treatment_data = test_df.loc[complete_rows, ['t_in', 'phtot', 'irr']]
# determine optimal number of clusters in the data
optimal_k, score_list = utils.optimal_kmeans(treatment_data)
# fit KMeans with the optimal number of clusters
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
# fit_predict returns one label per *clustered* row, so write the labels back
# to those rows only; rows with a missing control variable stay <NA>.
# (Assigning to every row raised a length mismatch as soon as dropna() removed a row.)
test_df['treatment_group'] = pd.array([pd.NA] * len(test_df), dtype="Int64")
test_df.loc[complete_rows, 'treatment_group'] = kmeans.fit_predict(treatment_data)

# # visualise clustering score wrt number of clusters
# plt.figure(figsize=(4,4))
# plt.scatter(*zip(*score_list), marker='o', linestyle='-')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Silhouette Score (higher is better)')
# # mark a vertical line at the optimal number of clusters
# plt.axvline(optimal_k, color='r', linestyle='--', label='Optimal # of clusters')
# plt.legend()
# plt.show()

In [None]:
### I think I'm now using more recent functions? Need to check

def group_irradiance(df, irr_col='irr', rtol=0.10):
    """
    Assigns an 'irr_group' to values that are within a relative tolerance.

    Rows are sorted by irradiance and walked in ascending order. A new group
    starts whenever a value differs from the previous non-NaN value by more
    than `rtol` times the magnitude of that previous value. Because each value
    is compared with its immediate predecessor (not with the first value of
    the group), a run of closely spaced values can chain into one group.

    Rows whose irradiance is NaN receive a NaN 'irr_group' and do not affect
    the grouping of the other rows. Callers that group by 'irr_group' need
    `groupby(..., dropna=False)` if they want to keep those rows.

    Args:
        df (pd.DataFrame): Input dataframe with an 'irr' column.
        irr_col (str): Column name for irradiance values.
        rtol (float): Relative tolerance (e.g., 0.10 for 10%).

    Returns:
        pd.DataFrame: Copy of `df`, sorted by `irr_col`, with a new float
        'irr_group' column (1.0, 2.0, ... or NaN). The input is not modified.
    """
    df = df.sort_values(by=irr_col)  # new, sorted frame (NaNs last); the caller's frame is untouched
    # start from NaN so rows that never get a group (NaN irradiance) stay NaN
    irr_groups = np.full(len(df), np.nan)
    group_id = 0
    prev_irr = None

    for i, irr in enumerate(df[irr_col]):
        if np.isnan(irr):
            # leave the group as NaN and keep prev_irr pointing at the last real value
            continue

        # multiplicative form of |irr - prev| / |prev| > rtol: no division, so a
        # previous value of 0 is handled without warnings and negatives compare by magnitude
        if prev_irr is None or abs(irr - prev_irr) > rtol * abs(prev_irr):
            group_id += 1

        irr_groups[i] = group_id
        prev_irr = irr

    df['irr_group'] = irr_groups
    return df


# Single study used for spot checks
test_doi = "10.1111/jpy.13290"
print(test_doi)
test_df = carb_df.loc[carb_df["doi"] == test_doi].copy()

# variables that define a treatment
treatment_vars = ['t_in', 'phtot']
# columns of interest: identifiers and response first, then the treatment variables
cols = ['species_types', 'treatment_group', 'calcification', *treatment_vars]


def assign_treatment_groups(df: pd.DataFrame, t_atol: float=0.1, pH_atol: float=0.1, irr_rtol: float=0.1) -> pd.DataFrame:
    """
    Label every row as control or treatment for temperature and pH.

    Each study (rows sharing a 'doi') is first split into irradiance groups
    with `group_irradiance`, then into species within each irradiance group.
    Within such a group the control temperature is the minimum 't_in' and the
    control pH is the maximum 'phtot'. Every row gets a four-character label:

        'cT' / 'tT'  - 't_in' is / is not within `t_atol` of the control temperature
        'cP' / 'tP'  - 'phtot' is / is not within `pH_atol` of the control pH

    giving 'cTcP', 'cTtP', 'tTcP' or 'tTtP'. Closeness is tested with
    `np.isclose`, so its default relative tolerance applies on top of the
    absolute one.

    Args:
        df: Frame with 'doi', 'species_types', 't_in', 'phtot' columns plus the
            irradiance column expected by `group_irradiance`.
        t_atol: Absolute tolerance on temperature.
        pH_atol: Absolute tolerance on pH.
        irr_rtol: Relative tolerance passed to `group_irradiance`.

    Returns:
        A new frame (fresh 0..n-1 index, rows ordered group by group) with
        'irr_group' and 'treatment_group' columns added. The input is not
        modified. Rows with a missing 'doi' are not returned, because the
        per-study groupby drops NaN keys. An empty input yields an empty frame
        with the two extra columns.
    """
    treatment_dfs = []
    for _, study_df in df.groupby('doi'):
        # 1 – group by irradiance
        study_df = group_irradiance(study_df, rtol=irr_rtol)

        # 2 – process treatments within species and irradiance groups
        # dropna=False keeps rows whose irradiance group or species is NaN as
        # their own group; the default groupby would silently drop them
        for _, species_df in study_df.groupby(['irr_group', 'species_types'], dropna=False):
            # identify controls (min T, max pH) # TODO: optional replace with closest to climatology
            # min()/max() skip NaNs and return NaN when the whole column is missing
            control_T = species_df['t_in'].min()
            control_pH = species_df['phtot'].max()

            # NOTE(review): isclose is False for NaN, so a row with a missing reading
            # (or a group with no readings at all) is labelled as a treatment
            # ('tT' / 'tP'), not as unknown – confirm this is intended.
            close_T = np.isclose(species_df['t_in'].to_numpy(dtype=float, na_value=np.nan), control_T, atol=t_atol)
            close_pH = np.isclose(species_df['phtot'].to_numpy(dtype=float, na_value=np.nan), control_pH, atol=pH_atol)

            labels = np.select(
                [close_T & close_pH, close_T, close_pH],
                ['cTcP', 'cTtP', 'tTcP'],
                default='tTtP',
            )
            # assign() builds a new frame instead of writing into the groupby slice
            treatment_dfs.append(species_df.assign(treatment_group=labels))

    if not treatment_dfs:
        # nothing to concatenate (empty input): return an empty frame with the same schema
        return df.iloc[0:0].assign(
            irr_group=pd.Series(dtype=float),
            treatment_group=pd.Series(dtype=object),
        ).reset_index(drop=True)

    return pd.concat(treatment_dfs, ignore_index=True)

# Label every row of carb_df with its treatment group using the default tolerances,
# then show the labelled frame as the cell output
carb_df_tgs = assign_treatment_groups(df=carb_df)
carb_df_tgs
