In [74]:
import pandas as pd
import numpy as np
from pathlib import Path
from definitions import RAW_DATA_DIR, INTERIM_DATA_DIR
import re

In [75]:
# Clean IDs and join tables

def _format_cosinor_id(series: pd.Series) -> pd.Series:
    """Formats cosinor ID strings to match format of temp logger IDs in masterfile.
    exmples:
    - "M0122" -> 122
    - "M1058" -> 1058

    args:
        series (pd.Series): Series of cosinor ID strings.
    """
    cleaned = (
        series.astype(str)
        .str.replace(r"\D+", "", regex=True)  # drop non-digits
        .str.lstrip("0")
        .replace("", np.nan)
    )
    return pd.to_numeric(cleaned, errors="coerce").astype("Int64")


def _format_temp_logger_id(series: pd.Series) -> pd.Series:
    """Formats temp logger ID strings to be a string representation of an integer.
    examples:
    - "122.0 " -> 122

    args:
        series (pd.Series): Series of temp logger ID strings.
    """

    cleaned = (
        series.astype(str)
        .str.strip()
        .str.split(".", n=1)
        .str[0]
        .str.replace(r"\D+", "", regex=True)
        .str.lstrip("0")
        .replace("", np.nan)
    )
    return pd.to_numeric(cleaned, errors="coerce").astype("Int64")

def validate_column(df_to_validate, column_name, pattern_re_string, accept_nulls=True, ignore_duplicates=True):
    """Return a copy of ``df_to_validate`` without the rows whose ``column_name`` value is invalid.

    A value is invalid when its string form does not match ``pattern_re_string``
    (checked with ``Series.str.match``, i.e. anchored at the start of the
    string) or, when ``accept_nulls`` is False, when it is null. A summary and
    the offending values are printed.

    args:
        df_to_validate (pd.DataFrame): Frame to check; it is not modified.
        column_name (str): Column whose values are validated.
        pattern_re_string (str): Regular expression that valid values must match.
        accept_nulls (bool): If True, rows with a null value are kept and are
            exempt from the pattern check. If False, they are removed.
        ignore_duplicates (bool): If False, every row whose value occurs more
            than once (as reported by ``check_for_duplicates``) is removed too.

    returns:
        pd.DataFrame: Copy of the input restricted to the rows that passed.
    """
    print("Original df shape:", df_to_validate.shape)

    column = df_to_validate[column_name]
    return_df = df_to_validate.copy()

    # Rows removed as duplicates are remembered as a mask so that the
    # invalid-value filter further down cannot bring them back.
    duplicate_mask = pd.Series(False, index=df_to_validate.index)
    if not ignore_duplicates:
        duplicates_df = check_for_duplicates(df_to_validate, column_name, ignore_na=accept_nulls)
        if not duplicates_df.empty:
            print(f"\nRemoving {len(duplicates_df)} duplicate rows")
            duplicate_mask = pd.Series(
                df_to_validate.index.isin(duplicates_df.index),
                index=df_to_validate.index,
            )
            return_df = df_to_validate[~duplicate_mask].copy()
            print(f"New df shape after removing duplicates: {return_df.shape}")

    is_null = column.isna()
    if not accept_nulls:
        # Check for null values
        null_mask = is_null
    else:
        null_mask = pd.Series(False, index=df_to_validate.index)

    # Check for invalid patterns
    valid_pattern = re.compile(pattern=pattern_re_string)
    # astype(str) can render a null as text (e.g. "nan" / "None"), which would
    # then fail the pattern. Nulls are governed by `accept_nulls` only, so they
    # are excluded from the pattern check.
    pattern_mask = ~is_null & ~column.astype(str).str.match(valid_pattern, na=False)

    # Combine masks
    invalid_mask = null_mask | pattern_mask

    if invalid_mask.any():
        print(f"\nRows with null value: {null_mask.sum()}")
        print(f"Rows with invalid pattern: {pattern_mask.sum()}")
        print(f"Total invalid rows: {invalid_mask.sum()}")
        print(f"\nInvalid values in '{column_name}':")
        print(df_to_validate.loc[invalid_mask, column_name])

        print(f"\nRemoving {invalid_mask.sum()} rows with invalid values")
        # Filter on both masks: filtering the original frame on `invalid_mask`
        # alone would re-introduce the duplicates dropped above.
        return_df = df_to_validate[~(invalid_mask | duplicate_mask)].copy()
        print(f"New df shape: {return_df.shape}")
    else:
        print("No invalid IDs found.")

    return return_df


In [76]:

def check_for_duplicates(df_to_check, column_name, ignore_na=True):
    """Print and return the rows whose ``column_name`` value occurs more than once.

    args:
        df_to_check (pd.DataFrame): Frame to inspect; it is not modified.
        column_name (str): Column in which repeated values are searched.
        ignore_na (bool): If True, null values are never reported as
            duplicates. If False, repeated nulls count as duplicates of each other.

    returns:
        pd.DataFrame: Every row of ``df_to_check`` that shares its value with at
        least one other row (all occurrences, not just the repeats).
    """
    column = df_to_check[column_name]
    # keep=False flags every occurrence of a repeated value, not only the later ones.
    duplicated_mask = column.duplicated(keep=False)
    if ignore_na:
        # Nulls are treated as equal to each other by `duplicated`; take them out again.
        duplicated_mask = duplicated_mask & column.notna()

    if duplicated_mask.any():
        print(f"\nDuplicate entries found in column '{column_name}':")
        print(df_to_check.loc[duplicated_mask, column_name])
    else:
        print(f"No duplicates found in column '{column_name}'.")

    return df_to_check[duplicated_mask]

In [77]:
# Load the prefixes scraped from cosinor analysis files
cosinor_ids = pd.read_parquet(INTERIM_DATA_DIR / "cosinor_IDs.parquet")
# Validate cosinor_ids: drop rows whose ID is null or does not start with "M"/digits only
cosinor_ids = validate_column(cosinor_ids, "cosinor_id", r'^[M0-9]+$', accept_nulls=False)
cosinor_ids["cosinor_id_formatted"] = _format_cosinor_id(cosinor_ids["cosinor_id"])
# `temp_columns` is first assigned in a later cell, so appending to it here
# raised NameError on a fresh kernel (Restart & Run All). Create the tracker
# in this cell instead.
# NOTE(review): the masterfile-loading cell below re-assigns `temp_columns = []`,
# which discards this entry. Move that initialisation above this cell if
# "cosinor_id_formatted" is meant to stay tracked.
temp_columns = ["cosinor_id_formatted"]

Original df shape: (476, 2)
No invalid IDs found.


In [78]:
# Load the raw masterfile (it holds the "Temp logger # ..." columns joined on below)
masterfile = pd.read_parquet(
    RAW_DATA_DIR / "Heat Stress Masterfile May 2024 - RF Ewe.ram data.parquet"
)
# Names of the helper columns added from here on, collected so they can be removed later
temp_columns = list()

### Temp logger 2023 join
Join the cosinor IDs to the masterfile on the 2023 temp logger number.

In [79]:
masterfile["temp_logger_2023_formatted"] = _format_temp_logger_id(masterfile["Temp logger # 2023"])
temp_columns.append("temp_logger_2023_formatted")
# Check for duplicates in temp_logger_2023 column (rows without a logger ID are ignored)
duplicated_df = check_for_duplicates(masterfile, "temp_logger_2023_formatted")
duplicate_logger_ids = duplicated_df["temp_logger_2023_formatted"].to_list()

if duplicate_logger_ids:
    # are any of the duplicates also cosinor_ids?
    # Membership is tested against a set built once, with missing IDs dropped:
    # `d in some_list` rebuilt the list for every element and raises TypeError
    # as soon as the list contains <NA> (the truth value of `d == pd.NA` is ambiguous).
    cosinor_id_set = set(cosinor_ids["cosinor_id_formatted"].dropna().to_list())
    # Each affected logger ID is reported once, in ascending order.
    overlap = sorted({d for d in duplicate_logger_ids if d in cosinor_id_set})
    if not overlap:
        print("The dupes don't have a cosinor analysis, so we don't care about them")
    else:
        print(f"Dupes {overlap} have matching cosinor analysis. This is a PROBLEM")


Duplicate entries found in column 'temp_logger_2023_formatted':
274    718
901    718
Name: temp_logger_2023_formatted, dtype: Int64
The dupes don't have a cosinor analysis, so we don't care about them


In [80]:
# Left join: every cosinor ID is kept, matched to the masterfile row with the same 2023 logger ID.
# pandas matches null merge keys to each other, so a cosinor ID that formatted
# to <NA> would be joined to every masterfile row without a 2023 logger. Only
# masterfile rows that actually have a 2023 logger ID are offered as match
# candidates; for non-null cosinor IDs the result is unchanged.
joined_2023 = cosinor_ids.merge(
    masterfile.dropna(subset=["temp_logger_2023_formatted"]),
    how="left",
    left_on="cosinor_id_formatted",
    right_on="temp_logger_2023_formatted",
    suffixes=("", "_masterfile"),
    indicator=True, # adds a column called “_merge” with the source of each row
)

In [81]:
# Summarise the 2023 join: overall shape, match counts from the `_merge` indicator, and a preview
print(f"Joined table shape: {joined_2023.shape}")
print(f"Rows with match: {joined_2023['_merge'].eq('both').sum()}")
print(f"Rows without match: {joined_2023['_merge'].eq('left_only').sum()}")
print("\nFirst 5 rows:")
print(joined_2023.loc[:, ['cosinor_id', 'cosinor_id_formatted', 'Temp logger # 2023', '_merge']].head(5))

Joined table shape: (476, 123)
Rows with match: 460
Rows without match: 16

First 5 rows:
  cosinor_id  cosinor_id_formatted  Temp logger # 2023 _merge
0      M0122                   122                 122   both
1      M0123                   123                 123   both
2      M0124                   124                 124   both
3      M0125                   125                 125   both
4      M0126                   126                 126   both


In [82]:
# Save join of 2023 starters
joined_2023 = joined_2023.assign(starter_year=2023)
out_2023 = INTERIM_DATA_DIR / f"{JOIN_NAME}_2023.parquet"
joined_2023.to_parquet(out_2023, index=False)
out_2023

WindowsPath('E:/alexa/No-OneDrive/Code/proj/disco-baa-01/data/02_interim/cosinor_IDs_masterfile_2023.parquet')

In [83]:
# Repeat for 2024 starters
if "Temp logger # 2024" not in masterfile.columns:
    raise KeyError("Temp logger # 2024 column not found in masterfile")

masterfile["temp_logger_2024_formatted"] = _format_temp_logger_id(masterfile["Temp logger # 2024"])
# Track the helper column the same way the 2023 one is tracked; the membership
# check keeps a re-run of this cell from adding it twice.
if "temp_logger_2024_formatted" not in temp_columns:
    temp_columns.append("temp_logger_2024_formatted")

# pandas matches null merge keys to each other, so only masterfile rows that
# actually have a 2024 logger ID are offered as match candidates. Otherwise a
# cosinor ID that formatted to <NA> would be joined to every row without a
# 2024 logger. For non-null cosinor IDs the result is unchanged.
joined_2024 = cosinor_ids.merge(
    masterfile.dropna(subset=["temp_logger_2024_formatted"]),
    how="left",
    left_on="cosinor_id_formatted",
    right_on="temp_logger_2024_formatted",
    suffixes=("", "_masterfile"),
    indicator=True,  # adds a "_merge" column with the source of each row
)

joined_2024 = joined_2024.assign(starter_year=2024)
out_2024 = INTERIM_DATA_DIR / f"{JOIN_NAME}_2024.parquet"
joined_2024.to_parquet(out_2024, index=False)
out_2024

WindowsPath('E:/alexa/No-OneDrive/Code/proj/disco-baa-01/data/02_interim/cosinor_IDs_masterfile_2024.parquet')

In [84]:
# Sanity check: list masterfile rows that carry a value in BOTH temp logger columns
both_cols_2023 = ~masterfile["Temp logger # 2023"].isna()
both_cols_2024 = ~masterfile["Temp logger # 2024"].isna()

both_have_values = masterfile.loc[both_cols_2023 & both_cols_2024]

print(f"Number of rows with both temp logger values: {len(both_have_values)}")
if both_have_values.empty:
    print("\nNo rows have values for both temp logger columns.")
else:
    print("\nRows with both values:")
    print(both_have_values[["Temp logger # 2023", "Temp logger # 2024"]].head(20))


Number of rows with both temp logger values: 0

No rows have values for both temp logger columns.
