# Various features for article about THI affecting MY

## Note: set dpi to at least 300 in "plt.figure" / "plt.subplots" to get publication-quality plots
This script contains the following:
- Total Daily Milk Yield Produced Within Herds in Crude data
- Daily Yield With Expected Milk Yield and Mean THI
- Per herd expected MY against actual MY and THI on secondary y-axis
- Per herd expected MY divided by number of cows giving milk on the day in question, against actual MY divided by number of cows giving milk on the day in question, with THI on secondary y-axis
- THI and Insemination Events Within Herds to Study Ins Patterns
- THI and Insemination Events per Cow
- Estimate how often milk production on a farm decreases by 30%, without it being connected to an increase in THI
- Percentage of cows responding negatively to heat stress in farm
- This code calculates the proportion of cows in each farm that have shown a negative response (yield drop) in milk yield under heat stress conditions.
- This code returns results nested within herd, breed and parity. results generated to pdf
- This code returns results nested within herd, breed, parity, lactation stage. Generates results to pdf instead of on screen
- Hur stor del av korna minskar i mjölkavkastning, då hela besättningen faller under en period med värmestress?
- Hur stor är förändringen i mjölkavkastningen i genomsnitt?
- Daily impact of the THI on MY using a regression approach to quantify how much milk yield decreases (in kg) per unit increase in THI
- Probability Density of Milk Yield for Heat Stressed vs Non-Heat Stressed Cows in Herd
  - Probability Density of Milk Yield for Heat Stressed vs Non-Heat Stressed Cows in Herd overall
  - Single herd plotting of Milk Yield Deviation, calc intersecting area
  - Multiple herd plotting Milk Yield Deviation, calc intersecting area
  - Plotting one plot per cow-lactation combination, calc intersecting area
- Check calving patterns on herds across time period
- Check correlation THI and temperature
- Double checking THI adjusted calculation

In [None]:
from datetime import timedelta
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import matplotlib.dates as mdates

## Total Daily Milk Yield Produced Within Herds in Crude data

In [None]:
# Load the merged milk-yield / weather records
df_lact = pd.read_csv("../Data/MergedData/MY_weather.csv", low_memory=False)

# Just in case: drop repeated copies of the same milking record
df_lact = df_lact.drop_duplicates(subset=["SE_Number", "LactationNumber", "StartDate", "StartTime", "SessionNumber", "TotalYield"])

# Ensure the StartDate column is a datetime object
df_lact['StartDate'] = pd.to_datetime(df_lact['StartDate'])

# List of farms; rows without a farm name are skipped because they would give an
# empty selection below and make pd.date_range fail on NaT bounds
list_of_farms = list(df_lact["FarmName_Pseudo"].dropna().unique())

# Build the palette once (it does not change between iterations); at least 10
# colours are requested so a small number of farms keeps the same colours
farm_palette = sns.color_palette('bright', n_colors=max(len(list_of_farms), 10))

# Dictionary to store farm color mapping
farm_color_mapping = {}

# Loop through each farm and create a separate plot for each
for i, farm in enumerate(list_of_farms):
    # Create a new figure for each farm
    plt.figure(figsize=(12, 6), dpi=300)

    selected_farm = df_lact[df_lact['FarmName_Pseudo'] == farm]
    # nunique() ignores missing IDs, so a NaN SE_Number is not counted as a cow
    number_of_cows = selected_farm['SE_Number'].nunique()
    daily_yield = selected_farm.groupby('StartDate')['TotalYield'].sum()

    color = farm_palette[i]
    farm_color_mapping[farm] = color

    # Generate a complete date range and reindex the daily yield;
    # days without any record are shown as 0
    all_dates = pd.date_range(start=daily_yield.index.min(), end=daily_yield.index.max(), freq='D')
    daily_yield = daily_yield.reindex(all_dates, fill_value=0)

    # Plot data
    plt.plot(daily_yield.index, daily_yield.values, label=f'Farm {farm} \n{number_of_cows} cows', color=color)
    plt.title(f'Milk Data for Farm {farm}', fontsize=16)
    plt.ylabel('Daily Yield', fontsize=14)
    plt.legend(loc='upper right')
    plt.grid(True)

    # Set x-axis label
    plt.xlabel('Date', fontsize=14)

    # Show the plot
    plt.tight_layout()
    plt.show()

## Daily Milk Yield With Expected Milk Yield and Mean THI

In [None]:
# Load the per-record milk yield / weather data used for the single-cow plot
# NOTE(review): this reads from "CowData" while the cell above reads a file with the
# same name from "MergedData" — confirm the path is intended
cow_data = pd.read_csv("../Data/CowData/MY_weather.csv", low_memory=False)

In [None]:
# Pick a cow to study
SE_Number = ["SE-5c06d92d-3205"]
# .copy() makes the subset an independent frame, so the column assignment in the
# next cell does not raise pandas' SettingWithCopyWarning on a filtered slice
cow_data = cow_data[cow_data["SE_Number"].isin(SE_Number)].copy()

In [None]:
# Fill gaps in THI_adj with the value from MeanTHI_adj, then keep only the
# columns needed for the merge and the plot
col_keep = ["SE_Number", "LactationNumber", "StartDate", "DaysInMilk", "THI_adj"]
cow_data = cow_data.assign(
    THI_adj=cow_data['THI_adj'].fillna(cow_data['MeanTHI_adj'])
)[col_keep]
cow_data

Script based on the Wilmink lactation curve, used to estimate expected milk yield
- Note: this uses the code from "WilminksHeatApp.ipynb", not the code from the Quantile program!

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.optimize import curve_fit, OptimizeWarning
from tqdm import tqdm
import warnings
from scipy.stats import zscore
from statsmodels.tsa.stattools import acf, pacf
from scipy.optimize import minimize
from vqr import VectorQuantileRegressor
from vqr.solvers.regularized_lse import RegularizedDualVQRSolver
import statsmodels.api as sm


# Apply seaborn's default theme and the "notebook" plotting context to the figures below
sns.set_theme()
sns.set_context("notebook")
# IPython magics (notebook only): load the autoreload extension and switch it to mode 2
%load_ext autoreload
%autoreload 2

In [None]:
# Column dtypes for the cleaned yield file. Date/time columns are read as strings
# and parsed below; integer columns use the nullable 'Int64' dtype.
# (The original listed 'DateTime' twice; a repeated key in a dict literal is
# silently overwritten, so one entry is enough.)
dtype_dict = {
    'FarmName_Pseudo': 'str',
    'SE_Number': 'str',
    'AnimalNumber': 'Int64',
    'StartDate': 'str',
    'StartTime': 'str',
    'DateTime': 'str',
    'LactationNumber': 'Int64',
    'DaysInMilk': 'Int64',
    'YearSeason': 'str',
    'TotalYield': 'float',
    'BreedName': 'str',
    'Age': 'Int64',
    'Mother': 'str',
    'Father': 'str',
    'CullDecisionDate': 'str',
    'Temperature2': 'float',
    'RelativeHumidity': 'float',
    'THI_adj2': 'float',
    'HW': 'Int64',
    'cum_HW': 'Int64',
    'Temp15Threshold': 'Int64',
}

# Load the CSV with specified dtypes
data = pd.read_csv('../Data/MergedData/CleanedYieldData.csv', dtype=dtype_dict)
# data = data[data["DaysInMilk"].notna()]

# Convert date and time columns back to datetime and time objects;
# values that cannot be parsed become NaT (errors='coerce')
data['DateTime'] = pd.to_datetime(data['DateTime'], errors='coerce')
data['StartTime'] = pd.to_datetime(data['StartTime'], format='%H:%M:%S', errors='coerce').dt.time
data['StartDate'] = pd.to_datetime(data['StartDate'], errors='coerce')
data['CullDecisionDate'] = pd.to_datetime(data['CullDecisionDate'], errors='coerce')
data.head()

In [None]:
# Calculate the DailyYield for each cow each day (sum over that day's milking records)
data['DailyYield'] = data.groupby(['SE_Number', 'StartDate'])['TotalYield'].transform('sum')

# Collapse to one row per cow, farm and day
# ===========================================================>>> OBS change Temperature to Temperature2 and THI_adj to THI_adj2 when running filtered data
data = data.groupby(['SE_Number', 'FarmName_Pseudo', 'StartDate']).agg({
    'DailyYield': 'first',
    'HW': 'max',
    'Temperature2': 'mean',
    'THI_adj2': 'mean',
    'DaysInMilk': 'first',
    'YearSeason': 'first',
    'cum_HW': 'max',
    'Temp15Threshold': 'max',
    'Age': 'first',
    'BreedName': 'first',
    'LactationNumber': 'first'
}).reset_index()

# Previous recorded day's yield and the day-to-day change, per cow.
# This is computed AFTER the daily aggregation and keyed on SE_Number (the key
# used for DailyYield above and for the recalculation after curve fitting).
# The original shifted session-level rows grouped by AnimalNumber, which
#   * could pick up the same day's total as "previous" when a day had several rows, and
#   * would mix cows if AnimalNumber values are not unique across farms.
data = data.sort_values(['SE_Number', 'StartDate']).reset_index(drop=True)
previous_yield = data.groupby('SE_Number')['DailyYield'].shift(1)

# Insert right after DailyYield so the column layout matches the previous output
yield_pos = data.columns.get_loc('DailyYield')
data.insert(yield_pos + 1, 'PreviousDailyYield', previous_yield)
data.insert(yield_pos + 2, 'DailyYieldChange', data['DailyYield'] - previous_yield)

# Renaming and formatting
# ===========================================================>>> OBS change Temperature to Temperature2 and THI_adj to THI_adj2 when running filtered data
data.rename(columns={
    'Temperature2': 'MeanTemperature',
    'THI_adj2': 'MeanTHI_adj',
    'StartDate': 'Date'
}, inplace=True)
data['Date'] = pd.to_datetime(data['Date'])

# Display the first few rows of the transformed data
data.head()

In [None]:
# Compare the centre and spread of DailyYield between farms
yield_by_farm = data.groupby('FarmName_Pseudo')['DailyYield']
print("Mean of DailyYield:", yield_by_farm.mean())
print("Standard Deviation of DailyYield:", yield_by_farm.std())

In [None]:
def wilmink_lactation_curve(dim, a, b, c, d):
    """Evaluate the Wilmink lactation curve a + b*dim + c*exp(-d*dim).

    `dim` (days in milk) may be a scalar or any array-like; it is converted to a
    float NumPy array first. `a`, `b`, `c` and `d` are the curve parameters.
    Returns the curve value(s) as a NumPy array.
    """
    days = np.array(dim, dtype=float)
    linear_part = a + b * days
    decay_part = c * np.exp(-d * days)
    return linear_part + decay_part

def remove_outliers(group, threshold=3.5):
    """Return the rows of `group` whose 'DailyYield' lies strictly within
    `threshold` standard deviations of the group's mean 'DailyYield'.

    Mean and standard deviation are taken with np.mean / np.std over the
    'DailyYield' column of `group` itself.
    """
    yields = group['DailyYield']
    centre = np.mean(yields)
    spread = np.std(yields)
    lower_limit = centre - threshold * spread
    upper_limit = centre + threshold * spread
    within_limits = (yields > lower_limit) & (yields < upper_limit)
    return group[within_limits]

def smooth_data(group, window=5):
    """Return a copy of `group` whose 'DailyYield' is replaced by its trailing
    rolling mean over `window` rows (min_periods=1, so the first rows average
    over the rows available so far). The input frame is left untouched.
    """
    rolling_mean = group['DailyYield'].rolling(window, min_periods=1).mean()
    return group.assign(DailyYield=rolling_mean)

def fit_wilmink_lactation_curve(dataset, min_points=150, outlier_threshold=3.5, maxfev=30000):
    """Fit a Wilmink curve per cow-lactation and store the fitted expected yield.

    For every ('SE_Number', 'LactationNumber') segment the daily yields are
    cleaned with remove_outliers and smooth_data, the curve is fitted with
    scipy's curve_fit, and the fitted values are written to 'ExpectedYield'.

    Parameters
    ----------
    dataset : DataFrame with at least 'SE_Number', 'LactationNumber',
        'DaysInMilk' and 'DailyYield'. NOTE: the 'ExpectedYield' column is
        added to / reset on this frame in place.
    min_points : minimum number of rows (after outlier removal) a segment needs
        in order to be fitted (default 150).
    outlier_threshold : number of standard deviations passed to remove_outliers
        (default 3.5).
    maxfev : evaluation limit passed on to curve_fit (default 30000).

    Returns
    -------
    (dataset, params_dict)
        dataset : rows of the segments that passed the finite-value and size
            checks, with a fresh 0..n-1 index. Rows removed as outliers are not
            included. Segments whose fit failed keep NaN in 'ExpectedYield'.
        params_dict : {(SE_Number, LactationNumber): {'a', 'b', 'c', 'd'}} for
            every successfully fitted segment.
    """
    # Initialize the 'ExpectedYield' column to NaN
    dataset['ExpectedYield'] = np.nan
    params_dict = {}
    valid_indices = []

    segments = dataset.groupby(['SE_Number', 'LactationNumber'])
    for (animal_number, lactation_number), group in tqdm(segments, unit=" Segments"):
        # Prepare the data for fitting
        group = remove_outliers(group, threshold=outlier_threshold)
        group = smooth_data(group)

        # Explicit float conversion: missing values (including pd.NA in nullable
        # integer columns) become NaN, so the finite check below catches them
        # instead of letting them surface later as a ValueError inside curve_fit
        x_data = group['DaysInMilk'].to_numpy(dtype=float, na_value=np.nan)
        y_data = group['DailyYield'].to_numpy(dtype=float, na_value=np.nan)

        # Ensure there are no NaN or infinite values in the data
        if not (np.isfinite(x_data).all() and np.isfinite(y_data).all()):
            print(f"Non-finite values found for cow {animal_number}, lactation {lactation_number}, skipping.")
            continue

        # Ensure there are enough data points to fit the curve
        # (x_data and y_data come from the same rows, so one length check suffices)
        if len(x_data) < min_points:
            print(f"Insufficient data points for cow {animal_number}, lactation {lactation_number}, skipping.")
            continue

        valid_indices.extend(group.index)

        # Initial parameter guesses
        initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1]
        # Keep the decay rate d non-negative to prevent overflow in exp(-d * dim)
        bounds = ([-np.inf, -np.inf, -np.inf, 0], [np.inf, np.inf, np.inf, np.inf])

        try:
            # Turn OptimizeWarning into an exception so doubtful fits are skipped
            with warnings.catch_warnings():
                warnings.filterwarnings('error', category=OptimizeWarning)
                popt, _ = curve_fit(
                    wilmink_lactation_curve, x_data, y_data,
                    p0=initial_guesses, bounds=bounds, maxfev=maxfev
                )
        except OptimizeWarning:
            print(f"OptimizeWarning for cow {animal_number}, lactation {lactation_number}, skipping.")
        except RuntimeError as e:
            print(f"Curve fit failed for cow {animal_number}, lactation {lactation_number}: {e}")
        except ValueError as e:
            print(f"Value error for cow {animal_number}, lactation {lactation_number}: {e}")
        else:
            # Store the parameters in the dictionary
            params_dict[(animal_number, lactation_number)] = {'a': popt[0], 'b': popt[1], 'c': popt[2], 'd': popt[3]}

            # Predict the expected yield using the fitted model
            dataset.loc[group.index, 'ExpectedYield'] = wilmink_lactation_curve(x_data, *popt)

    # Keep only valid indices
    dataset = dataset.loc[valid_indices].reset_index(drop=True)

    return dataset, params_dict

# Fit the Wilmink curve per cow-lactation; the per-segment parameters are kept in params_dict
data, params_dict = fit_wilmink_lactation_curve(data)

# Segments without a successful fit have no ExpectedYield and are dropped
data = data.dropna(subset=['ExpectedYield'])

# Yield relative to expectation, plus the day-to-day change (absolute and relative)
expected_yield = data['ExpectedYield']
data.loc[:, 'NormalizedDailyYield'] = data['DailyYield'] / expected_yield

previous_yield = data.groupby('SE_Number')['DailyYield'].shift(1)
data.loc[:, 'PreviousDailyYield'] = previous_yield
data.loc[:, 'DailyYieldChange'] = data['DailyYield'] - data['PreviousDailyYield']
data.loc[:, 'NormalizedDailyYieldChange'] = data['DailyYieldChange'] / expected_yield

data

In [None]:
# Keep the expected-yield columns for the cow being studied
SE_Number = ["SE-5c06d92d-3205"]
col_keep = ["SE_Number", "LactationNumber", "DaysInMilk", "DailyYield", "ExpectedYield"]

data = data[col_keep]
is_selected_cow = data["SE_Number"].isin(SE_Number)
data = data[is_selected_cow]

In [None]:
# Attach DailyYield / ExpectedYield to the cow's records, matched on cow, lactation and days in milk;
# a left join keeps every cow_data row even when there is no match
merge_keys = ["SE_Number", "LactationNumber", "DaysInMilk"]
cow_data = cow_data.merge(data, on=merge_keys, how="left")
cow_data

In [None]:
# MY x MEANTHI
# Line plot for one cow: expected and actual daily milk yield on the left axis,
# THI on a secondary right axis, both against StartDate.

# Ensure StartDate is a datetime type
cow_data['StartDate'] = pd.to_datetime(cow_data['StartDate'])
# Sort chronologically so the lines are drawn in date order
cow_data = cow_data.sort_values(by='StartDate')

# Create a figure and a set of subplots with higher resolution (dpi=300)
fig, ax1 = plt.subplots(figsize=(16, 8), dpi=300)

# Plot the first y-axis (Milk Yield)
# NOTE: labels are set on the lines, but no legend is drawn in this cell
ax1.plot(cow_data['StartDate'], cow_data['ExpectedYield'],
         label='Expected Milk Yield per Cow', color='skyblue', linestyle='--', linewidth=2.5)
ax1.plot(cow_data['StartDate'], cow_data['DailyYield'],
         label='Actual Milk Yield per Cow', color='salmon', marker='o', markersize=1, linewidth=2.5)

# Fixed y-range for the yield axis
ax1.set_ylim(0, 60)
ax1.set_xlabel('Date', fontsize=15)
# NOTE(review): unit given as liters here, while the scatter alternative below labels it kg — confirm
ax1.set_ylabel('Daily Milk Yield, liters per day', fontsize=15)
ax1.tick_params(axis='y')
# plt.xticks / plt.title act on the current axes, which is still ax1 at this point
plt.xticks(rotation=45)
# The cow ID in the title is hard-coded; update it together with SE_Number
plt.title('Daily Milk Yield With Expected Milk Yield and Mean THI, \nSE-5c06d92d-3205', fontsize=20)

# One major tick per month, formatted as full dates
ax1.xaxis.set_major_locator(mdates.MonthLocator())
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

# Create a second y-axis that shares the same x-axis
ax2 = ax1.twinx()
# Fixed y-range for the THI axis
ax2.set_ylim(0, 80)
ax2.plot(cow_data['StartDate'], cow_data['THI_adj'], color='g')
ax2.set_ylabel('Mean THI', fontsize=15)
ax2.tick_params(axis='y')

# Adjust layout to prevent overlap
fig.tight_layout()
plt.show()


Alternative: scatter plot, to show MY as dots instead of a line

In [None]:
# PLOT MY x MEAN THI (scatter alternative)
# NOTE: `df` must already hold daily rows with 'Date', 'DailyYield', 'LactationNumber'
# and 'MeanTHI_adj'; it is not created in any cell above this one in notebook order.

# SUBSET CHOSEN COW(S)
SE_Number = ["SE-f454e660-0588"]
# Use a separate frame: the original overwrote `df` with this one-cow subset, which
# silently removed all other cows for any later cell that uses `df`
cow_df = df[df["SE_Number"].isin(SE_Number)]

#dates = pd.date_range(start='2022-01-01', end='2023-11-13', periods=30)

# Create a figure and a set of subplots
fig, ax1 = plt.subplots(figsize=(16, 8), dpi=300)

# Plot the first y-axis (Milk Yield); dots are coloured by lactation number
ax1.scatter(cow_df['Date'], cow_df['DailyYield'], c=cow_df['LactationNumber'])
ax1.set_xlabel('Date', fontsize=15)
ax1.set_ylabel('Total Daily Milk Yield, kg per day', fontsize=15)
ax1.tick_params(axis='y')
ax1.set_ylim(0, 100)
plt.xticks(rotation=45)
# Title corrected: the secondary axis shows MeanTHI_adj (labelled 'Mean THI'), not
# temperature; the cow ID is taken from SE_Number instead of being typed twice
plt.title(f'Total Daily Milk Yield With Mean THI, \n{", ".join(SE_Number)}', fontsize=20)
ax1.xaxis.set_major_locator(mdates.MonthLocator())

# Create a second y-axis that shares the same x-axis
ax2 = ax1.twinx()
ax2.plot(cow_df['Date'], cow_df['MeanTHI_adj'], color='r')
ax2.set_ylabel('Mean THI', fontsize=15)
ax2.tick_params(axis='y')

# Adjust layout to prevent overlap
fig.tight_layout()
plt.show()

## Per herd expected MY against actual MY and THI on secondary y-axis
Note: the dataset loaded here was produced after the full Wilmink program had been run, i.e. autocorrelation has already been handled for each cow, which affects the expected MY curves. Alternative: run the first part of the Wilmink program above and then continue from here to get smooth expected MY curves without the autocorrelation handling.

In [None]:
# Column dtypes for the heat-approach file, grouped by type:
# text columns (Date is parsed below), nullable integers, and floats
dtype_dict = {
    **dict.fromkeys(
        ['Date', 'FarmName_Pseudo', 'SE_Number', 'BreedName', 'YearSeason'],
        'str',
    ),
    **dict.fromkeys(
        ['Age', 'DaysInMilk', 'LactationNumber', 'HeatStress',
         'Temp15Threshold', 'HW', 'cum_HW'],
        'Int64',
    ),
    **dict.fromkeys(
        ['DailyYield', 'PreviousDailyYield', 'DailyYieldChange', 'ExpectedYield',
         'NormalizedDailyYield', 'NormalizedDailyYieldChange', 'MeanTemperature',
         'MeanTHI_adj', 'HeatLoad', 'CumulativeHeatLoad'],
        'float',
    ),
}

milk_data = pd.read_csv(
    '../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile61.csv',
    dtype=dtype_dict,
)
milk_data['Date'] = pd.to_datetime(milk_data['Date'], format='%Y-%m-%d')
# head(-5) displays every row except the last five
milk_data.head(-5)

In [None]:
# Work on an independent copy: pd.DataFrame(milk_data) does not copy the data by
# default, so changes made to `df` could leak back into `milk_data`
df = milk_data.copy()

# 'Date' is normally already datetime from the loading cell; converting again keeps
# this cell usable on its own
df['Date'] = pd.to_datetime(df['Date'])

# Sort values
df = df.sort_values(by=['FarmName_Pseudo', 'Date'])

# Summarize expected and actual yield by date within each herd
herd_day_keys = ['FarmName_Pseudo', 'Date']
df_sum1 = (
    df.groupby(herd_day_keys)['ExpectedYield'].sum().reset_index()
    .rename(columns={'ExpectedYield': 'SumExpectedYield'})
)
df_sum2 = (
    df.groupby(herd_day_keys)['DailyYield'].sum().reset_index()
    .rename(columns={'DailyYield': 'SumDailyYield'})
)

# Merge the herd-day sums back onto every row (left join keeps all rows)
df = pd.merge(df, df_sum1, on=herd_day_keys, how="left")
df = pd.merge(df, df_sum2, on=herd_day_keys, how="left")

# Export to CSV, ensuring the path is correct
df.to_csv("../Data/MergedData/test.csv", index=False)

In [None]:
# Plot one herd: summed expected vs. summed actual milk yield on the left axis and
# THI on a secondary right axis, all against Date.

# Ensure 'Date' is in datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Specify the herd you want to plot
specific_herd = '5f7f33d6'  # ================================================================>>> Replace with the actual herd name
# Rows of the chosen herd. The Sum* columns were merged back onto every row of df,
# so each herd-day total appears once per row of that herd and day.
herd_data = df[df['FarmName_Pseudo'] == specific_herd]

# Check if the data for the specific herd is not empty
if not herd_data.empty:
    # Set up a single plot
    fig, ax1 = plt.subplots(figsize=(12, 6), dpi=300)
    
    # Define colors for each line type
    expected_color = 'blue'
    actual_color = 'green'
    thi_color = 'red'
    
    # Primary y-axis for Expected and Actual Milk Yield
    ax1.plot(herd_data['Date'], herd_data['SumExpectedYield'], 
             label='Expected Milk Yield', color=expected_color, linestyle='--')
    ax1.plot(herd_data['Date'], herd_data['SumDailyYield'], 
             label='Actual Milk Yield', color=actual_color)
    
    # Set labels and title
    ax1.set_ylabel('Milk Yield (Liters)')
    ax1.set_title(f'Expected vs Actual Milk Yield with THI for {specific_herd}')
    
    # Secondary y-axis for THI
    ax2 = ax1.twinx()
    ax2.plot(herd_data['Date'], herd_data['MeanTHI_adj'], 
             label='THI', color=thi_color, linestyle=':', alpha=0.6)
    ax2.set_ylabel('THI')
    
    # Add legend: collect the handles of both axes so all three lines share one legend
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines + lines2, labels + labels2, loc='upper left')
    
    # Set x-axis label and rotate ticks
    ax1.set_xlabel('Date')
    # plt.xticks acts on the current axes (the twin axis was created last)
    plt.xticks(rotation=45)

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()
else:
    # Nothing to plot for this herd name
    print(f"No data available for the specified herd: {specific_herd}")


Alternative to get full panel, i.e. all herds

In [None]:
# Panel of all herds: one subplot per herd, stacked vertically with a shared x-axis.
# Each subplot shows summed expected vs. actual yield (left axis) and THI (right axis).

# Make sure 'Date' is datetime
df['Date'] = pd.to_datetime(df['Date'])

# Herds to plot and the matching grid of axes
herds = df['FarmName_Pseudo'].unique()
num_herds = len(herds)
fig, axs = plt.subplots(nrows=num_herds, ncols=1, figsize=(12, 6 * num_herds),
                        sharex=True, dpi=300)

# With a single herd plt.subplots returns one Axes object rather than an array
if num_herds == 1:
    axs = [axs]

# Colours per line type
expected_color = 'blue'
actual_color = 'green'
thi_color = 'red'

# Pair each herd with its own subplot
for ax1, herd in zip(axs, herds):
    herd_data = df[df['FarmName_Pseudo'] == herd]

    # Leave the subplot empty when there is nothing to draw
    if herd_data.empty:
        continue

    dates = herd_data['Date']

    # Left axis: expected and actual milk yield
    ax1.plot(dates, herd_data['SumExpectedYield'],
             color=expected_color, linestyle='--', label='Expected Milk Yield')
    ax1.plot(dates, herd_data['SumDailyYield'],
             color=actual_color, label='Actual Milk Yield')
    ax1.set_ylabel('Milk Yield (Liters)')
    ax1.set_title(f'Expected vs Actual Milk Yield with THI for {herd}')

    # Right axis: THI
    ax2 = ax1.twinx()
    ax2.plot(dates, herd_data['MeanTHI_adj'],
             color=thi_color, linestyle=':', alpha=0.6, label='THI')
    ax2.set_ylabel('THI')

    # One combined legend for the lines of both axes
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines + lines2, labels + labels2, loc='upper left')

# The x-axis label goes on the bottom subplot only
axs[-1].set_xlabel('Date')

# Show the plot
plt.tight_layout()
plt.show()

## Per herd expected MY divided by number of cows giving milk on the day in question, against actual MY divided by number of cows giving milk on the day in question, with THI on secondary y-axis
- Generate for single herd of interest
- Full panel for multiple herds

In [None]:
# Load the daily data and parse the date column
df = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile61.csv')
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

herd_day_keys = ['FarmName_Pseudo', 'Date']

# Cow count: number of distinct cows with a record in each herd on each day
df['CowCount'] = df.groupby(herd_day_keys)['SE_Number'].transform('nunique')

# Herd-day totals of expected and actual milk yield
df_sum1 = df.groupby(herd_day_keys)['ExpectedYield'].sum().reset_index()
df_sum1 = df_sum1.rename(columns={'ExpectedYield': 'SumExpectedYield'})
df_sum2 = df.groupby(herd_day_keys)['DailyYield'].sum().reset_index()
df_sum2 = df_sum2.rename(columns={'DailyYield': 'SumDailyYield'})

# Attach both totals to every row (left joins keep all rows)
df = df.merge(df_sum1, on=herd_day_keys, how="left")
df = df.merge(df_sum2, on=herd_day_keys, how="left")
df

In [None]:
import matplotlib.pyplot as plt

# Herd to show
specific_herd = '5f7f33d6'  # Replace with the desired herd name

# Per-cow yields: herd-day totals divided by the number of cows milked that day
df['ExpectedMY'] = df['SumExpectedYield'] / df['CowCount']
df['ActualMY'] = df['SumDailyYield'] / df['CowCount']

# Chronological order, then pick out the herd
df = df.sort_values(by=['Date'])
herd_data = df[df['FarmName_Pseudo'] == specific_herd]

if not herd_data.empty:
    fig, ax1 = plt.subplots(figsize=(12, 6), dpi=300)
    dates = herd_data['Date']

    # Left axis: expected yield as a dashed line, actual yield as dots
    ax1.plot(dates, herd_data['ExpectedMY'],
             color='skyblue', linestyle='--', label='Expected Milk Yield per Cow')
    ax1.scatter(dates, herd_data['ActualMY'],
                color='salmon', marker='o', label='Actual Milk Yield per Cow')
    ax1.set_ylabel('Milk Yield per Cow (Liters)')
    ax1.set_title(f'Expected vs Actual Milk Yield per Cow with THI for {specific_herd}')

    # Right axis: THI
    ax2 = ax1.twinx()
    ax2.plot(dates, herd_data['MeanTHI_adj'],
             color='green', linestyle=':', alpha=0.6, label='THI')
    ax2.set_ylabel('THI')

    # Single legend covering both axes
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

    # Slanted date ticks are easier to read
    ax1.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()
else:
    # The herd name does not occur in the data
    print(f"No data available for herd '{specific_herd}'")


In [None]:
import matplotlib.pyplot as plt

# Per-cow yields: herd-day totals divided by the number of cows milked that day
df['ExpectedMY'] = df['SumExpectedYield'] / df['CowCount']
df['ActualMY'] = df['SumDailyYield'] / df['CowCount']

# Order rows by herd, then by date
df = df.sort_values(by=['FarmName_Pseudo', 'Date'])

# One stacked subplot per herd, sharing the x-axis
herds = df['FarmName_Pseudo'].unique()
n_panels = len(herds)
fig, axs = plt.subplots(nrows=n_panels, ncols=1, figsize=(12, 6 * n_panels),
                        sharex=True, dpi=300)

# A single herd yields one Axes object instead of an array; wrap it for uniform handling
if n_panels == 1:
    axs = [axs]

for ax1, herd in zip(axs, herds):
    herd_data = df[df['FarmName_Pseudo'] == herd]
    dates = herd_data['Date']

    # Left axis: expected yield as a dashed line, actual yield as dots
    ax1.plot(dates, herd_data['ExpectedMY'],
             color='skyblue', linestyle='--', label='Expected Milk Yield per Cow')
    ax1.scatter(dates, herd_data['ActualMY'],
                color='salmon', marker='o', label='Actual Milk Yield per Cow')
    ax1.set_ylabel('Milk Yield per Cow (Liters)')
    ax1.set_title(f'Expected vs Actual Milk Yield per Cow with THI for {herd}')

    # Right axis: THI
    ax2 = ax1.twinx()
    ax2.plot(dates, herd_data['MeanTHI_adj'],
             color='green', linestyle=':', alpha=0.6, label='THI')
    ax2.set_ylabel('THI')

    # Single legend covering both axes
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

# The x-axis label goes on the bottom subplot only
axs[-1].set_xlabel('Date')

# Slanted date ticks on every subplot
for ax in axs:
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Plotting THI and insemination events within herds
Used to check whether there are periods with no reported inseminations (other than those caused by the 150-day data extraction filtering requirement)

In [None]:
# Load the "CR_W_MY61" file into df (this replaces any df left over from earlier cells)
df = pd.read_csv("../Data/CR_W_MY61.csv", low_memory=False)
df

In [None]:
# Add FarmName_Pseudo to df via a cow -> herd lookup table
df_herd = pd.read_csv("../Data/fertilityDF_W_MY61_filtered.csv", low_memory=False)
# df_herd = pd.read_csv("../Data/fertilityDF_W_MY67_filtered.csv", low_memory=False)

# One row per cow: keep the two lookup columns and the first herd listed for each SE_Number
col_keep = ["FarmName_Pseudo", "SE_Number"]
df_herd = df_herd[col_keep].drop_duplicates(subset=["SE_Number"])

# Left join keeps every row of df; cows missing from the lookup get NaN as herd
df = df.merge(df_herd, on=["SE_Number"], how="left")
df

In [None]:
# Build a date-indexed copy for plotting. The original changed `df` in place
# (StartDate was moved into the index), so running the cell a second time failed
# because the 'StartDate' column no longer existed.
plot_df = df.copy()
plot_df['StartDate'] = pd.to_datetime(plot_df['StartDate'])  # Ensure 'StartDate' is in datetime format
plot_df["InseminationDate"] = pd.to_datetime(plot_df["InseminationDate"])
plot_df = plot_df.set_index('StartDate')  # Use 'StartDate' as the index

# Sort by StartDate (now the index level name)
plot_df = plot_df.sort_values(by=["StartDate"])

# List of herds; cows without a herd after the left merge (NaN) are skipped,
# as they would only produce an empty figure titled "Herd nan"
herds = plot_df['FarmName_Pseudo'].dropna().unique()

# Loop through each herd and create a separate plot
for herd in herds:
    # Filter data for the current herd
    herd_data = plot_df[plot_df['FarmName_Pseudo'] == herd]

    # Initialize the plot for the current herd
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Plot THI on the primary y-axis
    ax1.plot(herd_data.index, herd_data['MeanTHI_adj'], color='red', label='THI', linewidth=2)
    ax1.set_xlabel('Date')
    ax1.set_ylabel('THI', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    # Plot insemination dates as dots along the bottom of the plot
    # (y is the lower axis limit as it stands after the THI line has been drawn)
    insemination_dates = herd_data['InseminationDate'].dropna()  # Avoid NaNs
    ax1.scatter(insemination_dates, [ax1.get_ylim()[0]] * len(insemination_dates),
                color='green', marker='o', label='Insemination Date', zorder=3)

    # Format the x-axis to show dates nicely
    ax1.xaxis.set_major_locator(mdates.MonthLocator())  # Adjust this to your date frequency
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    plt.xticks(rotation=45)

    # Add a legend
    ax1.legend(loc='upper left')

    # Add title for each plot with the herd name/number
    plt.title(f'Herd {herd}: THI and Insemination Dates')

    # Show the plot
    plt.tight_layout()
    plt.show()


## Plotting THI and insemination events for each cow
The plots are written to a PDF file instead of being shown on screen, because rendering one figure per cow in the notebook can make it freeze

In [None]:
# Load the "CR_W_MY61" file again, so df starts from the file contents for the per-cow plots
df = pd.read_csv("../Data/CR_W_MY61.csv", low_memory=False)

In [None]:
# Add FarmName_Pseudo to df via a cow -> herd lookup table
df_herd = pd.read_csv("../Data/fertilityDF_W_MY61_filtered.csv", low_memory=False)

# One row per cow: keep the two lookup columns and the first herd listed for each SE_Number
col_keep = ["FarmName_Pseudo", "SE_Number"]
df_herd = df_herd[col_keep].drop_duplicates(subset=["SE_Number"])

# Left join keeps every row of df; cows missing from the lookup get NaN as herd
df = df.merge(df_herd, on=["SE_Number"], how="left")
df

In [None]:
# PdfPages is used below but was not imported by any earlier cell, so the cell
# failed with NameError when the notebook was run top to bottom
from matplotlib.backends.backend_pdf import PdfPages

# Filtering which herd you are working with
selected_herd = 'a624fb9a'

# Filter the DataFrame for the selected herd; .copy() so the column changes below
# do not write into a slice of df
df_herd = df[df['FarmName_Pseudo'] == selected_herd].copy()

# Change time formats
df_herd['StartDate'] = pd.to_datetime(df_herd['StartDate'])  # Ensure 'StartDate' is in datetime format
df_herd["InseminationDate"] = pd.to_datetime(df_herd["InseminationDate"])
df_herd.set_index('StartDate', inplace=True)  # Set 'StartDate' as the index

# Sort by StartDate (now the index level name)
df_herd = df_herd.sort_values(by=["StartDate"])

# List of cows
cows = df_herd['SE_Number'].unique()

# Create a PDF file to save the plots (one page per cow)
output_pdf = f'THI_Inspattern_herd_{selected_herd}.pdf'  # Output file name
with PdfPages(output_pdf) as pdf:
    # Loop through each cow and create a separate plot
    for cow in cows:
        # Filter data for the current cow
        cow_data = df_herd[df_herd['SE_Number'] == cow]

        # Initialize the plot for the current cow
        fig, ax1 = plt.subplots(figsize=(10, 6))

        # Plot THI on the primary y-axis
        ax1.plot(cow_data.index, cow_data['MeanTHI_adj'], color='red', label='THI', linewidth=2)
        ax1.set_xlabel('Date')
        ax1.set_ylabel('THI', color='blue')
        ax1.tick_params(axis='y', labelcolor='blue')

        # Plot insemination dates as dots along the bottom of the plot
        # (y is the lower axis limit as it stands after the THI line has been drawn)
        insemination_dates = cow_data["InseminationDate"].dropna()  # Avoid NaNs
        ax1.scatter(insemination_dates, [ax1.get_ylim()[0]] * len(insemination_dates),
                    color="green", marker="o", label="Insemination Date", zorder=3)

        # Format the x-axis to show dates nicely
        ax1.xaxis.set_major_locator(mdates.MonthLocator())  # Adjust this to your date frequency
        ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        plt.xticks(rotation=45)

        # Add a legend
        ax1.legend(loc='upper left')

        # Add title for each plot with the herd name/number
        plt.title(f'Herd {selected_herd} - Cow {cow}: THI and Insemination Dates')

        # Adjust layout to prevent clipping
        plt.tight_layout()

        # Save the current figure to the PDF
        pdf.savefig(fig)
        plt.close(fig)  # Close the figure to free memory

print(f"Plots saved to '{output_pdf}'")

## Estimate how often milk production on a farm decreases by 30%, without it being connected to an increase in THI and heat stressed animals
Identifies and analyzes instances of significant milk production decreases that are not attributable to THI increases.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
# Load data
df = pd.read_csv("../Data/MergedData/MY_weather_filtered.csv", low_memory=False)

In [None]:
# If want to load specific herd
herd = ["5f7f33d6"]
df = df[df["FarmName_Pseudo"].isin(herd)]
print(f"No. milking samples from Berte herd: {df.shape}")

In [3]:
# Calc relative change in MY and flag instances where the decrease is at least 30%
# pct_change() compares every row with the row directly above it in df's current order.
# NOTE(review): no grouping or sorting is applied here, so consecutive rows may belong to
# different herds/cows/dates — confirm df is ordered and aggregated as intended.
df['MilkChange'] = df['TotalYield'].pct_change()  # Percent change
df['Decrease30'] = df['MilkChange'] <= -0.30  # Flag 30% decrease

In [None]:
df

In [5]:
# THI value above which a record is treated as heat stressed (67 here)
heat_stress_threshold = 67

# Working THI column: THI_adj where present, otherwise MeanTHI_adj
df["THI_adj2"] = df['THI_adj'].fillna(df['MeanTHI_adj'])

# True for records at or below the threshold, i.e. outside heat stress
df['NonHeatStress'] = df['THI_adj2'].le(heat_stress_threshold)

In [6]:
# ID periods with decrease in milk production of at least 30% that are not associated with high THIs
df['UnconnectedDecrease'] = df['Decrease30'] & df['NonHeatStress']

In [None]:
# Count how often these unconnected decreases occur
unconnected_decrease_count = df['UnconnectedDecrease'].sum()
total_days = len(df)
frequency = unconnected_decrease_count / total_days
print(f"Unconnected 30% decreases occurred {unconnected_decrease_count} times, or {frequency:.2%} of the time.")

In [None]:
# One summary record per herd: number of flagged rows, number of rows, and their ratio
herd_results = [
    {
        'Herd': herd,
        'UnconnectedDecreaseCount': group['UnconnectedDecrease'].sum(),
        'TotalDays': len(group),
        'Frequency': group['UnconnectedDecrease'].sum() / len(group),
    }
    for herd, group in df.groupby('FarmName_Pseudo')
]

# Tabulate the per-herd records and show them
herd_summary_df = pd.DataFrame(herd_results)
print(herd_summary_df)

# Percentage of cows responding negatively to heat stress on farm

In [1]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv("../Data/MergedData/QuantileRerunTHI61.csv", low_memory=False)

# df = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile61.csv', low_memory=False)
# df = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile67.csv', low_memory=False)

col_keep = ["Date", "FarmName_Pseudo", "SE_Number", "BreedName", "LactationNumber", "DaysInMilk", "DailyYield", "ExpectedYield", "MeanTHI_adj"]
milk_data = df[col_keep]

milk_data

## This code calculates the proportion of cows in each farm that have shown a negative response (yield drop) in milk yield under heat stress conditions.

In [None]:
# Step 1: THI threshold for heat stress ====================================================>>> Change THI threshold here
heat_stress_threshold = 61

# Step 2: Result container, farm id -> percentage of cow-lactations with a negative response
percentage_negative_responders_per_farm = {}

# Step 3: Walk through the herds in the order they first appear in milk_data
farm_ids = milk_data['FarmName_Pseudo'].unique()
for farm_id in farm_ids:
    # Work on a copy of this herd's rows so new columns do not touch milk_data
    farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id].copy()

    # Step 4: Flag heat-stress records and the deviation of actual from expected yield
    farm_data['HeatStress'] = farm_data['MeanTHI_adj'] >= heat_stress_threshold
    farm_data['DeviationFromExpected'] = farm_data['DailyYield'] - farm_data['ExpectedYield']

    # Step 5: A record is a negative response when it is under heat stress AND below expected yield
    under_expected = farm_data['DeviationFromExpected'] < 0
    farm_data['NegativeResponseToHeatStress'] = np.where(farm_data['HeatStress'] & under_expected, 1, 0)

    # Step 6: A cow-lactation counts once if any of its records is a negative response;
    # the mean of those 0/1 flags times 100 is the herd percentage
    negative_responders = farm_data.groupby(['SE_Number', 'LactationNumber'])['NegativeResponseToHeatStress'].max()
    percentage_negative_responders = negative_responders.mean() * 100
    percentage_negative_responders_per_farm[farm_id] = percentage_negative_responders

# Step 7: Report every herd
for farm_id, percentage in percentage_negative_responders_per_farm.items():
    print(f"Percentage of cows responding negatively to heat stress in farm {farm_id}: {percentage:.2f}%")


## This code returns results nested within herd, breed and parity. results generated to pdf

In [None]:
# Parity = LactationNumber, except that lactations 3 through 8 are all coded as 3
milk_data = milk_data.copy()
milk_data["Parity"] = milk_data["LactationNumber"].where(~milk_data['LactationNumber'].between(3, 8), 3)
milk_data

In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet
import numpy as np
import pandas as pd

# Heat stress threshold =====================================================================>>> Change your THI threshold here
heat_stress_threshold = 61

# Sample milk_data DataFrame setup
# milk_data = pd.read_csv('your_milk_data.csv')  # Replace with your data

# Per-record flags: heat stress, deviation from expected yield, and the combination of both
milk_data['HeatStress'] = milk_data['MeanTHI_adj'] >= heat_stress_threshold
milk_data['MilkYieldDifference'] = milk_data['DailyYield'] - milk_data['ExpectedYield']
below_expected = milk_data['MilkYieldDifference'] < 0
milk_data['NegativeResponseToHeatStress'] = (milk_data['HeatStress'] & below_expected).astype(int)

# PDF document and paragraph styles
output_pdf = 'heat_stress_analysis.pdf'
doc = SimpleDocTemplate(output_pdf, pagesize=letter)
styles = getSampleStyleSheet()
style_normal = styles['Normal']
style_title = styles['Title']  # Use the Title style for the title

# The report starts with its title
title = "Heat Stress Analysis for Dairy Cows"
content = [Paragraph(title, style_title)]

# One result record per farm / breed / parity combination present in the data
cow_estimates = []
for (farm_id, breed, lactation), lactation_data in milk_data.groupby(['FarmName_Pseudo', 'BreedName', 'Parity']):
    # Share of cows in the group with at least one negative-response record
    per_cow_flag = lactation_data.groupby('SE_Number')['NegativeResponseToHeatStress'].max()
    negative_responders = per_cow_flag.mean() * 100

    # Among cows that have heat-stress records: how many have any record below expected yield
    heat_stress_data = lactation_data[lactation_data['HeatStress']]
    affected_cows = heat_stress_data.groupby('SE_Number')['MilkYieldDifference'].apply(lambda x: (x < 0).any()).sum()
    total_cows = heat_stress_data['SE_Number'].nunique()
    if total_cows > 0:
        percentage_affected = (affected_cows / total_cows) * 100
    else:
        percentage_affected = 0

    cow_estimates.append({
        'FarmName_Pseudo': farm_id,
        'BreedName': breed,
        'Parity': lactation,
        'TotalCows': total_cows,
        'AffectedCows': affected_cows,
        'PercentageAffected': percentage_affected,
        'PercentageNegativeResponders': negative_responders
    })

# Order rows by farm, then breed, then parity (numerically)
cow_estimates.sort(key=lambda x: (x['FarmName_Pseudo'], x['BreedName'], x['Parity']))

# Header row followed by one formatted row per result record
table_data = [['Farm', 'Breed', 'Parity', 'Total Cows', 'Affected Cows', 'Percentage Affected', 'Percentage Negative Responders']]
for entry in cow_estimates:
    table_data.append([
        entry['FarmName_Pseudo'],
        entry['BreedName'],
        entry['Parity'],
        entry['TotalCows'],
        entry['AffectedCows'],
        f"{entry['PercentageAffected']:.2f}%",
        f"{entry['PercentageNegativeResponders']:.2f}%"
    ])

# Grey bold header, centred cells, thin black grid
table_style = TableStyle([
    ('BACKGROUND', (0, 0), (-1, 0), (0.8, 0.8, 0.8)),
    ('TEXTCOLOR', (0, 0), (-1, 0), (0, 0, 0)),
    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
    ('GRID', (0, 0), (-1, -1), 0.5, (0, 0, 0)),
])
table = Table(table_data)
table.setStyle(table_style)

# The table follows the title in the document
content.append(table)

# Build the PDF document
doc.build(content)

print(f"Report saved as {output_pdf}")

In [None]:
milk_data

Create histogram of MilkYieldDifference to look at data

In [None]:
import matplotlib.pyplot as plt

# Make the column finite before plotting: +/-inf become NaN, then every NaN is set to 0.
# This overwrites milk_data['MilkYieldDifference'] in place.
milk_data['MilkYieldDifference'] = (
    milk_data['MilkYieldDifference']
    .replace([float('inf'), float('-inf')], float('nan'))
    .fillna(0)
)

# Histogram of the cleaned differences over all herds
fig, ax = plt.subplots()
ax.hist(milk_data['MilkYieldDifference'], bins=20, color='skyblue', edgecolor='black')
ax.set_title('Histogram of Milk Yield Difference')
ax.set_xlabel('Milk Yield Difference')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# One histogram of MilkYieldDifference per herd, each in its own figure
for herd_name, group in milk_data.groupby('FarmName_Pseudo'):
    fig, ax = plt.subplots()
    ax.hist(group['MilkYieldDifference'], bins=20, color='skyblue', edgecolor='black')
    ax.set_title(f'Histogram of Milk Yield Difference for {herd_name}')
    ax.set_xlabel('Milk Yield Difference')
    ax.set_ylabel('Frequency')
    plt.show()

To estimate the actual decrease in milk yield per cow per day during heat stress

In [None]:
"""
This is the base code used below but without the pdf output option
# Filter for cows under heat stress and showing negative yield difference
heat_stressed_cows = milk_data[(milk_data['HeatStress']) & (milk_data['MilkYieldDifference'] < 0)]

# Group by meaningful categories and calculate average daily loss
grouped_loss = heat_stressed_cows.groupby(['FarmName_Pseudo', 'BreedName', 'Parity']) \
    .agg(AverageDailyLoss=('MilkYieldDifference', 'mean'),
         TotalMilkLoss=('MilkYieldDifference', 'sum'),
         CowsAffected=('SE_Number', 'nunique'))

# Reset the index for easier manipulation and viewing
grouped_loss = grouped_loss.reset_index()
print(grouped_loss)
"""

In [None]:
import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors

# Records that are under heat stress AND below the expected yield
is_loss_record = milk_data['HeatStress'] & (milk_data['MilkYieldDifference'] < 0)
heat_stressed_cows = milk_data[is_loss_record]

# Per farm / breed / parity: mean and summed deviation from ExpectedYield over those records,
# plus the number of distinct cows contributing; group keys are turned back into columns
grouped_loss = (
    heat_stressed_cows
    .groupby(['FarmName_Pseudo', 'BreedName', 'Parity'])
    .agg(AverageDailyLoss=('MilkYieldDifference', 'mean'),
         TotalMilkLoss=('MilkYieldDifference', 'sum'),
         CowsAffected=('SE_Number', 'nunique'))
    .reset_index()
)

# PDF document and styles
output_pdf = "heat_stressed_cows_report.pdf"
doc = SimpleDocTemplate(output_pdf, pagesize=letter)
styles = getSampleStyleSheet()
style_title = styles['Title']
style_normal = styles['Normal']

# Document content starts with the title
content = [Paragraph("Heat Stressed Cows - Milk Loss Metrics", style_title)]

# Header row, then one row per group (TotalMilkLoss is not shown in this table)
table_data = [["Farm", "Breed", "Parity",
               "Average Daily Loss (kg)", "Cows Affected"]]
group_columns = zip(
    grouped_loss['FarmName_Pseudo'],
    grouped_loss['BreedName'],
    grouped_loss['Parity'],
    grouped_loss['AverageDailyLoss'],
    grouped_loss['CowsAffected'],
)
for farm_name, breed_name, parity_value, average_loss, cows_affected in group_columns:
    table_data.append([farm_name, breed_name, parity_value, f"{average_loss:.2f}", cows_affected])

# Grey header with light text, beige data rows, centred cells, thin black grid
table_style = TableStyle([
    ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
    ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
    ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
    ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
])
table = Table(table_data)
table.setStyle(table_style)
content.append(table)

# Build the PDF; a failure is reported on screen instead of stopping the notebook
try:
    doc.build(content)
    print(f"Report saved as {output_pdf}")
except Exception as e:
    print(f"Error occurred: {e}")

In [None]:
grouped_loss

In [None]:
plt.hist(grouped_loss['AverageDailyLoss'], bins=20, color='skyblue', edgecolor='black')
plt.title('Histogram of Average Daily Milk Loss')
plt.xlabel('Milk Yield Loss')
plt.ylabel('Frequency')
plt.show()

Plot the average daily losses

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar plot of the average daily loss per parity, one bar colour per breed.
# The x variable is Parity (lactation number class), not the Early/Mid/Late lactation stage,
# so the title and axis label name it as such.
sns.barplot(data=grouped_loss, x='Parity', y='AverageDailyLoss', hue='BreedName')
plt.title('Average Daily Milk Yield Loss by Parity and Breed')
plt.xlabel('Parity')
plt.ylabel('Average Daily Milk Yield Loss (kg)')
plt.legend(title='Breed')
plt.show()


Plot herd-wise

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One bar plot per herd, in the order the herds appear in grouped_loss
for herd, herd_data in grouped_loss.groupby('FarmName_Pseudo', sort=False):
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(data=herd_data, x='Parity', y='AverageDailyLoss', hue='BreedName', ax=ax)
    ax.set_title(f'Average Daily Milk Yield Loss for Herd: {herd}')
    ax.set_xlabel('Lactation')
    ax.set_ylabel('Average Daily Milk Yield Loss (kg)')
    ax.legend(title='Breed')

    # Write the figure to a PNG named after the herd, then display it
    fig.savefig(f'herd_{herd}_milk_yield_loss.png')
    plt.show()

Step 6: Estimate Overall Economic Impact
  - Last recorded milk price for Arlas producers is 6,18 kr for conventional milk from December, 2024

In [None]:
import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors

# Milk price in SEK per kg
milk_price_per_kg = 6.18

# Keep parities 1, 2 and 3 only. Take an explicit copy so the new revenue columns are
# written to an independent frame rather than to a filtered slice (SettingWithCopyWarning).
grouped_loss = grouped_loss[grouped_loss['Parity'].isin([1, 2, 3])].copy()

# Revenue change per farm / breed / parity group: summed yield deviation times milk price.
# TotalMilkLoss is a sum of negative deviations, so the resulting values are negative.
grouped_loss['RevenueLoss'] = grouped_loss['TotalMilkLoss'] * milk_price_per_kg

# Same figure divided by the number of distinct affected cows in the group
grouped_loss['RevenueLossPerCow'] = grouped_loss['RevenueLoss'] / grouped_loss['CowsAffected']

# Print the grouped_loss with added revenue columns for verification
print(grouped_loss[['FarmName_Pseudo', 'BreedName', 'Parity', 'CowsAffected', 'AverageDailyLoss', 'RevenueLossPerCow']])

# Prepare the PDF report
output_pdf = "economic_impact_report.pdf"
doc = SimpleDocTemplate(output_pdf, pagesize=letter)

# Styles for the document
styles = getSampleStyleSheet()
style_normal = styles['Normal']
style_title = styles['Title']

# Add title to the content
content = []
content.append(Paragraph("Economic Impact of Heat Stress on Milk Yield", style_title))

# Prepare the table data for the PDF
table_data = [
    ["Farm", "Breed", "Parity", "Cows Affected", "Average Daily Loss (kg)", "Revenue Loss per Cow per Lactation (SEK)"]
]

# Populate the table with data from the grouped_loss DataFrame
for _, row in grouped_loss.iterrows():
    table_data.append([
        row['FarmName_Pseudo'],
        row['BreedName'],
        row['Parity'],
        f"{int(row['CowsAffected'])}",  # a head count: shown as a whole number, not "12.00"
        f"{row['AverageDailyLoss']:.2f}",
        f"{row['RevenueLossPerCow']:.2f}"
    ])

# Create a Table object
table = Table(table_data)

# Define table styles
table.setStyle(TableStyle([
    ('BACKGROUND', (0, 0), (-1, 0), colors.grey),  # Header background color
    ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),  # Header text color
    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),  # Center align all cells
    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),  # Header font
    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),  # Header padding
    ('GRID', (0, 0), (-1, -1), 0.5, colors.black),  # Add grid lines
    ('BACKGROUND', (0, 1), (-1, -1), colors.beige),  # Background for data rows
]))

# Add the table to the content
content.append(table)

"""
# Add summary information
total_herds = grouped_loss['FarmName_Pseudo'].nunique()
total_milk_loss = grouped_loss['TotalMilkLoss'].sum()
total_revenue_loss = grouped_loss['RevenueLoss'].sum()
total_cows_affected = grouped_loss['CowsAffected'].sum()
average_loss_per_herd = total_revenue_loss / total_herds if total_herds > 0 else 0
average_loss_per_cow = total_revenue_loss / total_cows_affected if total_cows_affected > 0 else 0

summary_text = (
    f"<para><b>Summary Metrics:</b><br/>"
    f"Total Herds: {total_herds}<br/>"
    f"Total Cows Affected: {total_cows_affected}<br/>"
    f"Total Milk Loss (kg): {total_milk_loss:.2f}<br/>"
    f"Total Revenue Loss (SEK): {total_revenue_loss:.2f}<br/>"
    f"Average Revenue Loss per Herd (SEK): {average_loss_per_herd:.2f}<br/>"
    f"Average Revenue Loss per Cow (SEK): {average_loss_per_cow:.2f}<br/></para>"
)
content.append(Paragraph(summary_text, style_normal))
"""

# Build the PDF
doc.build(content)

print(f"Report saved as {output_pdf}")


# This code returns results nested within herd, breed, parity, lactation stage. Generates results to pdf instead of on screen

In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet
import numpy as np
import pandas as pd

# Define heat stress threshold
heat_stress_threshold = 61

# Sample milk_data DataFrame setup
# milk_data = pd.read_csv('your_milk_data.csv')  # Replace with your data

# Process milk_data for heat stress analysis and calculate additional fields
milk_data['HeatStress'] = milk_data['MeanTHI_adj'] >= heat_stress_threshold
milk_data['MilkYieldDifference'] = milk_data['DailyYield'] - milk_data['ExpectedYield']
milk_data['NegativeResponseToHeatStress'] = ((milk_data['HeatStress']) & 
                                              (milk_data['MilkYieldDifference'] < 0)).astype(int)

# Function to get lactation stage
def get_lactation_stage(DaysInMilk):
    """Classify a days-in-milk (DIM) value into a lactation stage label.

    Args:
        DaysInMilk: Numeric DIM value (integer or float; may be NaN).

    Returns:
        'Early' for 1 <= DIM <= 60, 'Mid' for 60 < DIM <= 150, 'Late' for
        DIM > 150, and 'Invalid DIM' for values below 1 or NaN.
    """
    # Written as `not (x >= 1)` rather than `x < 1` so that NaN also ends up here.
    if not DaysInMilk >= 1:
        return 'Invalid DIM'
    # Contiguous upper bounds: a fractional DIM such as 60.5 belongs to the next
    # stage instead of falling between two integer ranges.
    if DaysInMilk <= 60:
        return 'Early'
    if DaysInMilk <= 150:
        return 'Mid'
    return 'Late'

# Label every record with its lactation stage
milk_data['LactationStage'] = milk_data['DaysInMilk'].apply(get_lactation_stage)

# PDF document, paragraph style and (initially empty) content list
output_pdf = 'heat_stress_analysis2.pdf'
doc = SimpleDocTemplate(output_pdf, pagesize=letter)
styles = getSampleStyleSheet()
style_normal = styles['Normal']
content = []

# Reporting order of the stages; records with any other stage label are left out
lactation_order = ['Early', 'Mid', 'Late']
staged_records = milk_data[milk_data['LactationStage'].isin(lactation_order)]

# One result record per farm / breed / parity / stage combination present in the data
cow_estimates = []
group_keys = ['FarmName_Pseudo', 'BreedName', 'Parity', 'LactationStage']
for (farm_id, breed, lactation, lactation_stage), lactation_stage_data in staged_records.groupby(group_keys):
    # Share of cows in the group with at least one negative-response record
    per_cow_flag = lactation_stage_data.groupby('SE_Number')['NegativeResponseToHeatStress'].max()
    negative_responders = per_cow_flag.mean() * 100

    # Among cows that have heat-stress records: how many have any record below expected yield
    heat_stress_data = lactation_stage_data[lactation_stage_data['HeatStress']]
    affected_cows = heat_stress_data.groupby('SE_Number')['MilkYieldDifference'].apply(lambda x: (x < 0).any()).sum()
    total_cows = heat_stress_data['SE_Number'].nunique()
    if total_cows > 0:
        percentage_affected = (affected_cows / total_cows) * 100
    else:
        percentage_affected = 0

    cow_estimates.append({
        'FarmName_Pseudo': farm_id,
        'BreedName': breed,
        'Parity': lactation,
        'LactationStage': lactation_stage,
        'TotalCows': total_cows,
        'AffectedCows': affected_cows,
        'PercentageAffected': percentage_affected,
        'PercentageNegativeResponders': negative_responders
    })

# Order rows by farm, breed, parity and finally stage (Early, Mid, Late)
cow_estimates.sort(key=lambda x: (
    x['FarmName_Pseudo'],
    x['BreedName'],
    x['Parity'],
    lactation_order.index(x['LactationStage'])
))

# Header row followed by one formatted row per result record
table_data = [['Farm', 'Breed', 'Lactation', 'Stage', 'Total Cows', 'Affected Cows', 'Percentage Affected', 'Percentage Negative Responders']]
for entry in cow_estimates:
    table_data.append([
        entry['FarmName_Pseudo'],
        entry['BreedName'],
        entry['Parity'],
        entry['LactationStage'],
        entry['TotalCows'],
        entry['AffectedCows'],
        f"{entry['PercentageAffected']:.2f}%",
        f"{entry['PercentageNegativeResponders']:.2f}%"
    ])

# Grey bold header, centred cells, thin black grid
table_style = TableStyle([
    ('BACKGROUND', (0, 0), (-1, 0), (0.8, 0.8, 0.8)),
    ('TEXTCOLOR', (0, 0), (-1, 0), (0, 0, 0)),
    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
    ('GRID', (0, 0), (-1, -1), 0.5, (0, 0, 0)),
])
table = Table(table_data)
table.setStyle(table_style)

# The table is the first element of the document
content.append(table)

# After the table: one text paragraph per result record
for entry in cow_estimates:
    result_text = (
        f"<para><b>Farm {entry['FarmName_Pseudo']}:</b><br/>"
        f" - <b>Breed {entry['BreedName']} - Lactation {entry['Parity']}</b><br/>"
        f" - Lactation Stage {entry['LactationStage']}:<br/>"
        f"  No. of cows producing in the herd: <b>{entry['TotalCows']}</b><br/>"
        f"  Percentage of cows affected by heat stress: <b>{entry['PercentageAffected']:.2f}%</b><br/>"
        f"  Percentage of cows responding negatively to heat stress: <b>{entry['PercentageNegativeResponders']:.2f}%</b><br/><br/></para>")
    content.append(Paragraph(result_text, style_normal))

# Build the PDF document
doc.build(content)

print(f"Report saved as {output_pdf}")

# Hur stor del av korna minskar i mjölkavkastning, då hela besättningen faller under en period med värmestress?
Simplified calc

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load data
df = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile61.csv', low_memory=False)
# df = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile67.csv', low_memory=False)

col_keep = ["Date", "FarmName_Pseudo", "SE_Number", "LactationNumber", "DaysInMilk", "DailyYield", "ExpectedYield", "MeanTHI_adj", "YearSeason"]
milk_data = df[col_keep]

In [None]:
milk_data

In [None]:
# Heat stress threshold ======================================================================>>> Change THI threshold here
heat_stress_threshold = 61

# Work on a copy and add the heat-stress flag and the actual-minus-expected yield
milk_data = milk_data.copy()
milk_data['HeatStress'] = milk_data['MeanTHI_adj'] >= heat_stress_threshold
milk_data['MilkYieldDifference'] = milk_data['DailyYield'] - milk_data['ExpectedYield']

# Herds in the order they first appear in the data
farm_ids = milk_data['FarmName_Pseudo'].unique()

# One record per herd: cows with heat-stress records, and how many of them dropped below expected
cow_estimates = []
for farm_id in farm_ids:
    # Heat-stress records belonging to this herd
    farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id]
    heat_stress_data = farm_data[farm_data['HeatStress']]

    # A cow is "affected" if any of its heat-stress records is below the expected yield
    dropped_per_cow = heat_stress_data.groupby('SE_Number')['MilkYieldDifference'].apply(lambda x: (x < 0).any())
    affected_cows = dropped_per_cow.sum()

    # Denominator: distinct cows with at least one heat-stress record in this herd
    total_cows = heat_stress_data['SE_Number'].nunique()
    if total_cows > 0:
        percentage_affected = (affected_cows / total_cows) * 100
    else:
        percentage_affected = 0

    cow_estimates.append({
        'FarmName_Pseudo': farm_id,
        'TotalCows': total_cows,
        'AffectedCows': affected_cows,
        'PercentageAffected': percentage_affected
    })

    print(f"Percentage of cows in farm {farm_id} affected by heat stress: {percentage_affected:.2f}% at THI {heat_stress_threshold} degrees")

# Collect the per-herd records in a DataFrame and show it
results_df = pd.DataFrame(cow_estimates)
print(results_df)


# Hur stor är förändringen i mjölkavkastningen i genomsnitt?

In [13]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats

In [50]:
# Load data
df = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile61.csv', low_memory=False)
# df = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile67.csv', low_memory=False)

# Parity = LactationNumber, with lactations 3 through 8 coded as 3
df = df.copy()
df["Parity"] = df["LactationNumber"]
df.loc[(df['LactationNumber'] >= 3) & (df['LactationNumber'] <= 8), 'Parity'] = 3

# LactationNumber is kept alongside Parity: the per-farm calculation in this section
# groups by ['SE_Number', 'LactationNumber'] and raises KeyError without the column.
col_keep = ["Date", "FarmName_Pseudo", "SE_Number", "BreedName", "LactationNumber", "Parity", "DaysInMilk", "DailyYield", "ExpectedYield", "MeanTHI_adj", "YearSeason"]
milk_data = df[col_keep]

In [None]:
# Heat stress threshold ======================================================================>>> Change your THI threshold here
heat_stress_threshold = 61

# Work on a copy and add the heat-stress flag and the actual-minus-expected yield per record
milk_data = milk_data.copy()
milk_data['HeatStress'] = milk_data['MeanTHI_adj'] >= heat_stress_threshold
milk_data['MilkYieldDifference'] = milk_data['DailyYield'] - milk_data['ExpectedYield']

# Herds in the order they first appear in the data
farm_ids = milk_data['FarmName_Pseudo'].unique()

# farm id -> mean over cow-lactations of their mean below-expected deviation under heat stress
herd_yield_decrease_per_cow = {}
for farm_id in farm_ids:
    farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id]
    heat_stress_data = farm_data[farm_data['HeatStress']]

    # Only heat-stress records below the expected yield enter the average.
    # NOTE: this grouping needs a 'LactationNumber' column in milk_data.
    loss_records = heat_stress_data[heat_stress_data['MilkYieldDifference'] < 0]
    cow_avg_yield_decrease = loss_records.groupby(['SE_Number', 'LactationNumber'])['MilkYieldDifference'].mean()

    # Herds without any such record are reported as 0
    if cow_avg_yield_decrease.empty:
        avg_yield_decrease_per_cow = 0
    else:
        avg_yield_decrease_per_cow = cow_avg_yield_decrease.mean()

    herd_yield_decrease_per_cow[farm_id] = avg_yield_decrease_per_cow

# Report every herd
for farm_id, avg_decrease_per_cow in herd_yield_decrease_per_cow.items():
    print(f"Average decrease in milk yield (kg) per day per cow during heat stress in farm {farm_id}: {avg_decrease_per_cow:.2f} kg")


Including Breed and Parity

In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph
from reportlab.lib.styles import getSampleStyleSheet

# Heat stress threshold ======================================================================>>> Change your THI threshold here
heat_stress_threshold = 61

# Work on a copy and add the heat-stress flag and the actual-minus-expected yield per record
milk_data = milk_data.copy()
milk_data['HeatStress'] = milk_data['MeanTHI_adj'] >= heat_stress_threshold
milk_data['MilkYieldDifference'] = milk_data['DailyYield'] - milk_data['ExpectedYield']

# Herds in the order they first appear in the data
farm_ids = milk_data['FarmName_Pseudo'].unique()

# (farm, breed, parity) -> mean over cows of their mean below-expected deviation under heat stress
herd_yield_decrease_details = {}
for farm_id in farm_ids:
    farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id]

    for breed in farm_data['BreedName'].unique():
        breed_data = farm_data[farm_data['BreedName'] == breed]

        for parity in breed_data['Parity'].unique():
            parity_data = breed_data[breed_data['Parity'] == parity]

            # Heat-stress records of this group that are below the expected yield
            heat_stress_data = parity_data[parity_data['HeatStress']]
            loss_records = heat_stress_data[heat_stress_data['MilkYieldDifference'] < 0]

            # Mean deviation per cow, then the mean of those per-cow means; 0 when there are none
            cow_avg_yield_decrease = loss_records.groupby(['SE_Number'])['MilkYieldDifference'].mean()
            if cow_avg_yield_decrease.empty:
                avg_yield_decrease_per_cow = 0
            else:
                avg_yield_decrease_per_cow = cow_avg_yield_decrease.mean()

            herd_yield_decrease_details[(farm_id, breed, parity)] = avg_yield_decrease_per_cow

# PDF document with a title followed by the results table
output_pdf = 'herd_heat_stress_analysis.pdf'
doc = SimpleDocTemplate(output_pdf, pagesize=letter)
styles = getSampleStyleSheet()
content = [Paragraph("Herd Heat Stress Analysis", styles['Title'])]

# Header row plus one row per (farm, breed, parity) key, in insertion order
table_data = [['Farm', 'Breed', 'Parity', 'Avg Decrease in Milk Yield (kg)']]
table_data += [
    [farm_id, breed, parity, f"{avg_decrease_per_cow:.2f} kg"]
    for (farm_id, breed, parity), avg_decrease_per_cow in herd_yield_decrease_details.items()
]

# Grey bold header, centred cells, thin black grid
table_style = TableStyle([
    ('BACKGROUND', (0, 0), (-1, 0), (0.8, 0.8, 0.8)),  # Header background
    ('TEXTCOLOR', (0, 0), (-1, 0), (0, 0, 0)),         # Header text color
    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),             # Center align all cells
    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),   # Bold font for header
    ('GRID', (0, 0), (-1, -1), 0.5, (0, 0, 0)),        # Grid lines
    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),            # Padding for header
])
table = Table(table_data)
table.setStyle(table_style)
content.append(table)

# Build the PDF
doc.build(content)

print(f"Results have been saved to {output_pdf}")


# Analyze the daily impact of the Temperature Humidity Index (THI) on milk yield in dairy cattle, using a regression approach to quantify how much milk yield decreases (in kg) per unit increase in THI.

In [49]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats

from pygam import LinearGAM, s, f

In [98]:
# Columns needed for the THI regression cells below
col_keep = [
    "Date",
    "FarmName_Pseudo",
    "SE_Number",
    "LactationNumber",
    "BreedName",
    "DaysInMilk",
    "DailyYield",
    "ExpectedYield",
    "MeanTHI_adj",
    "YearSeason",
]

# Read the cleaned yield data and keep only those columns
milk_data = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile61.csv', low_memory=False)
# milk_data = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile67.csv', low_memory=False)
milk_data = milk_data.loc[:, col_keep]

In [None]:
# Build the HYS label by concatenating the farm pseudonym and the year-season label as strings
hys_parts = milk_data[["FarmName_Pseudo", "YearSeason"]].astype(str)
milk_data["HYS"] = hys_parts["FarmName_Pseudo"] + hys_parts["YearSeason"]
milk_data

In [None]:
# Add lagged THI columns (THI1d, THI2d, THI3d): the THI value 1, 2 and 3 rows earlier
# within the same cow-lactation. The first rows of each group have no earlier row and get NaN.
# NOTE(review): a row shift equals a day shift only if rows are sorted by Date with one
# row per day inside each cow-lactation — confirm against the input file.
thi_by_cow_lactation = milk_data.groupby(["SE_Number", "LactationNumber"])["MeanTHI_adj"]
for lag_rows in (1, 2, 3):
    milk_data[f"THI{lag_rows}d"] = thi_by_cow_lactation.shift(lag_rows)
milk_data

In [101]:
# Derive Parity from the lactation number, pooling lactations 3 through 8 into class 3
# NOTE(review): lactation numbers above 8 keep their own value — confirm that none occur
# in the data or that this is intended.
is_lactation_3_to_8 = (milk_data['LactationNumber'] >= 3) & (milk_data['LactationNumber'] <= 8)
milk_data["Parity"] = milk_data["LactationNumber"]
milk_data.loc[is_lactation_3_to_8, 'Parity'] = 3

In [None]:
# Display the frame so the columns added above can be inspected
milk_data

LMM

In [None]:
# Linear mixed model of daily yield on same-day THI and previous-day THI, adjusted for
# parity and breed, with a random intercept per cow (SE_Number).

# Ensure consistent data types: categorical fixed effects, string cow IDs for grouping
milk_data['Parity'] = pd.Categorical(milk_data['Parity'])
milk_data['BreedName'] = pd.Categorical(milk_data['BreedName'])
milk_data['SE_Number'] = milk_data['SE_Number'].astype(str)

# Drop rows without a previous-day THI value (THI1d is NaN on the first row of each
# cow-lactation because of the shift), and only then reset the index. Resetting before
# the filter, as was done previously, left gaps in the index again.
milk_data = milk_data[milk_data["THI1d"].notna()].reset_index(drop=True)

# Mixed-effects model, where `SE_Number` is included as a random effect
model = smf.mixedlm("DailyYield ~ MeanTHI_adj + THI1d + Parity + BreedName", milk_data, groups=milk_data["SE_Number"])
result = model.fit()

# Output the summary of the model
print(result.summary())

In [None]:
# Fit a linear mixed model to estimate the change in daily milk yield per unit of THI,
# then print fit statistics and draw residual diagnostics.
# Formula: DailyYield ~ MeanTHI_adj + Parity + BreedName, with a random intercept per
# cow (SE_Number) to account for repeated records on the same animal.

# Verify the column names are correct and consistent within milk_data
assert 'MeanTHI_adj' in milk_data.columns, "MeanTHI_adj column not found in milk_data"
assert 'DailyYield' in milk_data.columns, "DailyYield column not found in milk_data"
assert 'SE_Number' in milk_data.columns, "SE_Number column not found in milk_data"

# Reset index to ensure no indexing issues
milk_data = milk_data.reset_index(drop=True)

# Mixed-effects model, where `SE_Number` is included as a random effect
model = smf.mixedlm("DailyYield ~ MeanTHI_adj + Parity + BreedName", milk_data, groups=milk_data["SE_Number"])
result = model.fit()

# Output the summary of the model
print(result.summary())

# Print additional fit statistics and model diagnostics
print("\n--- Additional Fit Statistics ---")
print(f"AIC: {result.aic}")
print(f"BIC: {result.bic}")
print(f"Log-Likelihood: {result.llf}")
# print(f"Converged: {result.mle_retvals['converged']}")
print(f"Number of Observations: {result.nobs}")
# model.groups holds one label per observation, so its size is the row count;
# n_groups is the number of distinct groups (cows).
print(f"Number of Groups (Cows): {result.model.n_groups}")
print(f"Random Effect Variance: {result.cov_re.iloc[0, 0]}")
print(f"Residual Variance: {result.scale}")

# Extract residuals
residuals = result.resid

# Plot distribution of residuals: three panels side by side
plt.figure(figsize=(12, 6))

# Histogram of residuals
plt.subplot(1, 3, 1)
sns.histplot(residuals, kde=True, color='skyblue')
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")

# Q-Q plot to assess normality
plt.subplot(1, 3, 2)
stats.probplot(residuals, dist="norm", plot=plt)
plt.title("Q-Q Plot of Residuals")

# Residuals vs. fitted values to check for homoscedasticity
plt.subplot(1, 3, 3)
fitted_values = result.fittedvalues
plt.scatter(fitted_values, residuals, color='purple', alpha=0.5)
plt.axhline(0, color='red', linestyle='--')
plt.title("Residuals vs Fitted Values")
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")

plt.tight_layout()
plt.show()

# Print Shapiro-Wilk test for normality of residuals
# NOTE(review): with many thousands of records the test rejects normality for very small
# deviations, so interpret the p-value together with the plots above.
shapiro_test = stats.shapiro(residuals)
print("\n--- Shapiro-Wilk Test for Normality ---")
print(f"Statistic: {shapiro_test.statistic}, p-value: {shapiro_test.pvalue}")

1. Fixed Effects Coefficients

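The fixed effects section of the summary shows the estimated coefficients for the model's fixed effects: the intercept, MeanTHI_adj, and the Parity and BreedName levels (plus THI1d in the model that includes the lagged THI). The points below focus on the intercept and MeanTHI_adj.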

- Intercept: The intercept represents the expected DailyYield when MeanTHI_adj is zero and Parity and BreedName are at their reference levels. If THI never reaches zero in the data, this is an extrapolated baseline rather than an observed average milk yield.
- MeanTHI_adj Coefficient: This coefficient represents the estimated change in DailyYield (in kg per day) for a one-unit increase in MeanTHI_adj. If this coefficient is negative, it suggests that higher THI values are associated with lower milk yield, indicating that heat stress may be negatively impacting milk production.

2. Random Effects

The random effects section gives insight into the variability in DailyYield attributed to the grouping factor, SE_Number (presumably individual cows).

- Group Variance (SE_Number Var): This variance component indicates the variability in DailyYield due to differences between individual cows, capturing how much milk yield varies across cows regardless of MeanTHI_adj. A larger variance here suggests that individual cows differ substantially in their baseline milk yield levels. Differences between cows in sensitivity to THI are not captured, because the model has a random intercept only and no random slope for THI.
- Residual Variance: The residual variance represents the remaining variability in DailyYield not explained by MeanTHI_adj or the random effect (SE_Number). Lower residual variance implies that the model explains more of the variation in milk yield.

3. Statistical Significance of Fixed Effects

The summary output also includes statistical tests (t-statistic and p-value) for the fixed effects.

- p-value for MeanTHI_adj: If this p-value is below a common significance threshold (e.g., 0.05), it indicates that the association between MeanTHI_adj and DailyYield is statistically significant. A significant p-value suggests confidence that THI has a measurable impact on milk yield.

4. Model Fit Statistics

- Log-Likelihood: A higher log-likelihood value generally indicates a better model fit, but it is more meaningful when comparing models.
- AIC (Akaike Information Criterion) and BIC (Bayesian Information Criterion): Lower AIC and BIC values indicate a better fit, considering both the model’s accuracy and its complexity. These metrics are useful for comparing different models to see which one balances goodness-of-fit with simplicity.

Example Interpretation

If the output showed:

- Intercept: 30 kg (suggesting an average baseline milk yield of 30 kg per day when MeanTHI_adj is zero)
- MeanTHI_adj Coefficient: -0.2 kg per THI unit (indicating that each unit increase in THI reduces milk yield by 0.2 kg/day)
- p-value for MeanTHI_adj: < 0.05 (indicating a statistically significant effect of THI on milk yield)

Then we could conclude that higher THI values are significantly associated with decreased milk yield, potentially due to heat stress impacts on dairy cows.

Explanation of the Residual Analysis

- Histogram of Residuals: Displays the distribution of residuals. A roughly symmetric, bell-shaped histogram suggests that the residuals are approximately normally distributed, meeting one of the model assumptions.

- Q-Q Plot of Residuals: Compares the quantiles of the residuals to a standard normal distribution. If the points fall approximately along a straight line, the residuals are likely normally distributed.

- Residuals vs. Fitted Values Plot: Shows residuals against predicted values. This plot helps in identifying patterns (e.g., funnel shapes) that indicate heteroscedasticity (non-constant variance), which violates assumptions of the linear mixed model.

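- Shapiro-Wilk Test for Normality: A statistical test for normality of the residuals. A high p-value (>0.05) suggests that the residuals do not significantly differ from a normal distribution, supporting the model assumption. With very large samples the test flags even trivial deviations from normality, so read the p-value together with the histogram and Q-Q plot.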

# Probability Density of Milk Yield for Heat Stressed vs Non-Heat Stressed Cows in Herd

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np

In [5]:
# Columns needed for the density plots below
col_keep = [
    "Date",
    "FarmName_Pseudo",
    "SE_Number",
    "LactationNumber",
    "DaysInMilk",
    "DailyYield",
    "ExpectedYield",
    "MeanTHI_adj",
]

# Read the cleaned yield data; df keeps every column, milk_data only the ones listed above
df = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile61.csv', low_memory=False)
# df = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile67.csv', low_memory=False)
milk_data = df.loc[:, col_keep]

## Probability Density of Milk Yield for Heat Stressed vs Non-Heat Stressed Cows in Herd

In [None]:
# Per farm, overlay the kernel density of DailyYield on heat-stress days (MeanTHI_adj at or
# above the threshold) and on the remaining days, one subplot row per farm.

# Define the threshold for heat stress ===================================================================>>> Change THI threshold here
heat_stress_threshold = 67

# Get unique farm IDs dynamically from the dataset
farm_ids = milk_data['FarmName_Pseudo'].unique()

# Define the number of rows for the subplots based on the number of farms
num_rows = len(farm_ids)

# Set up the subplots - one row per farm with individual x-axes
fig, axes = plt.subplots(num_rows, 1, figsize=(10, num_rows * 5), sharex=False)  # sharex=False to ensure all subplots have their own x-axis

# plt.subplots returns a bare Axes (not an array) when there is a single row;
# wrap it so the loop below also works for a dataset with one farm.
axes = np.atleast_1d(axes)

# Loop through each farm_id and each axis to create individual subplots
for farm_id, ax in zip(farm_ids, axes):
    # Filter data for the current farm
    farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id].copy()

    # Identify heat stress periods for the current farm
    farm_data['HeatStress'] = farm_data['MeanTHI_adj'] >= heat_stress_threshold

    # Split records into heat-stress days and non-heat-stress days
    heat_stress_data = farm_data[farm_data['HeatStress']]
    non_heat_stress_data = farm_data[~farm_data['HeatStress']]

    # Plot the probability density plot for cows under heat stress
    sns.kdeplot(data=heat_stress_data, x='DailyYield', fill=True, bw_adjust=0.5, color='red', label='Heat Stress', ax=ax)

    # Plot the probability density plot for cows not under heat stress
    sns.kdeplot(data=non_heat_stress_data, x='DailyYield', fill=True, bw_adjust=0.5, color='blue', label='No Heat Stress', ax=ax)

    # Start the x-axis at 0 (the y-axis limits are left to matplotlib)
    ax.set_xlim(left=0)

    # Add title for each subplot
    ax.set_title(f'Probability Density of Milk Yield for Heat Stressed vs Non-Heat Stressed Cows in Herd {farm_id} at THI {heat_stress_threshold} degrees')
    ax.set_ylabel('Density')

    # Tick every 2 kg from 0 up to just past the farm's maximum daily yield
    ax.set_xticks(np.arange(0, farm_data['DailyYield'].max() + 2, 2))
    ax.set_xlabel('Milk Yield (kg/day)')

    # Add a legend
    ax.legend()

    # Add grid
    ax.grid(True, which='both', linestyle='-', linewidth=0.7, color='blue', alpha=0.3)

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()


Single herd plotting of Milk Yield Deviation

In [None]:
# For one farm, overlay the kernel density of the milk yield deviation (DailyYield minus
# ExpectedYield) on heat-stress days and on the remaining days, and shade the area where
# the two density curves overlap.

# Define heat stress threshold =========================================================================================>>> Change your THI threshold here
heat_stress_threshold = 61

# Specify the farm_id you want to analyze 
specific_farm_id = '5f7f33d6'  # =======================================================================================>>> Change this to the desired farm ID

# Filter data for the specific farm
farm_data = milk_data[milk_data['FarmName_Pseudo'] == specific_farm_id].copy()

# Calculate Milk Yield Difference
farm_data['MilkYieldDifference'] = farm_data['DailyYield'] - farm_data['ExpectedYield']

# Identify heat stress periods for the current farm
farm_data['HeatStress'] = farm_data['MeanTHI_adj'] >= heat_stress_threshold

# Split records into heat-stress days and non-heat-stress days
heat_stress_data = farm_data[farm_data['HeatStress']]
non_heat_stress_data = farm_data[~farm_data['HeatStress']]

# Set up the plot for the specific farm
fig, ax = plt.subplots(figsize=(10, 5))

# Plot the probability density plot for cows under heat stress
sns.kdeplot(data=heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='red', label='Heat Stress', ax=ax)

# Plot the probability density plot for cows not under heat stress
sns.kdeplot(data=non_heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='blue', label='No Heat Stress', ax=ax)

# Access the lines in the plot to extract the x and y values
kde_lines = ax.get_lines()

# A curve is missing when one of the two subsets could not be plotted (for example when
# it is empty); indexing kde_lines would then raise IndexError, so guard the overlap step.
if len(kde_lines) >= 2:
    # First line drawn is the Heat Stress curve, second is the No Heat Stress curve
    x_heat, y_heat = kde_lines[0].get_data()
    x_no_heat, y_no_heat = kde_lines[1].get_data()

    # Evaluate both curves on the union of their x grids
    common_x = np.union1d(x_heat, x_no_heat)

    # Outside the range a curve was evaluated on, treat its density as 0; np.interp
    # would otherwise extend the curve's endpoint value across the other curve's range.
    y_heat_interp = np.interp(common_x, x_heat, y_heat, left=0.0, right=0.0)
    y_no_heat_interp = np.interp(common_x, x_no_heat, y_no_heat, left=0.0, right=0.0)

    # The overlap is the area under the pointwise minimum of the two curves
    overlap_y_min = np.minimum(y_heat_interp, y_no_heat_interp)
    overlap_area = np.trapz(overlap_y_min, common_x)

    # Use fill_between to highlight the overlap area
    ax.fill_between(common_x, overlap_y_min, color='purple', alpha=0.3, label=f'Intersecting Area: {overlap_area:.2f}')
else:
    print(f"Warning: fewer than two density curves for Farm {specific_farm_id}. Skipping overlap calculation.")

# Set title and labels for the plot
ax.set_title(f'Probability Density of Milk Yield Difference for Farm {specific_farm_id} at THI {heat_stress_threshold} degrees')
ax.set_xlabel('Milk Yield Deviation (kg/day)')
ax.set_ylabel('Density')

# Show the legend, including the intersecting-area entry when it was computed
ax.legend(loc='upper right')

# Add grid for better readability
ax.grid(True, which='both', linestyle='-', linewidth=0.7, color='blue', alpha=0.3)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()


Multiple herd plotting Milk Yield Deviation

In [None]:
# For every farm, overlay the kernel density of the milk yield deviation (DailyYield minus
# ExpectedYield) on heat-stress days and on the remaining days, and shade the area where
# the two density curves overlap. One subplot row per farm, sharing the x-axis.

# Define heat stress threshold =================================================================================>>> Change your THI threshold here
heat_stress_threshold = 61

# Get unique farm IDs dynamically from the dataset
farm_ids = milk_data['FarmName_Pseudo'].unique()

# Set up the subplots - one row per farm
fig, axes = plt.subplots(len(farm_ids), 1, figsize=(10, len(farm_ids) * 5), sharex=True)

# plt.subplots returns a bare Axes (not an array) when there is a single row;
# wrap it so the loop below also works for a dataset with one farm.
axes = np.atleast_1d(axes)

# Loop through each farm_id and each axis to create individual subplots
for farm_id, ax in zip(farm_ids, axes):
    # Filter data for the current farm
    farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id].copy()

    # Calculate Milk Yield Difference
    farm_data['MilkYieldDifference'] = farm_data['DailyYield'] - farm_data['ExpectedYield']

    # Identify heat stress periods for the current farm
    farm_data['HeatStress'] = farm_data['MeanTHI_adj'] >= heat_stress_threshold

    # Split records into heat-stress days and non-heat-stress days
    heat_stress_data = farm_data[farm_data['HeatStress']]
    non_heat_stress_data = farm_data[~farm_data['HeatStress']]

    # Plot the probability density plot for cows under heat stress
    sns.kdeplot(data=heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='red', label='Heat Stress', ax=ax)

    # Plot the probability density plot for cows not under heat stress
    sns.kdeplot(data=non_heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='blue', label='No Heat Stress', ax=ax)

    # Access the lines in the plot to extract the x and y values
    kde_lines = ax.get_lines()

    # A curve is missing when one of the two subsets could not be plotted (for example
    # when it is empty); indexing kde_lines would then raise IndexError, so guard it.
    if len(kde_lines) >= 2:
        # First line drawn is the Heat Stress curve, second is the No Heat Stress curve
        x_heat, y_heat = kde_lines[0].get_data()
        x_no_heat, y_no_heat = kde_lines[1].get_data()

        # Evaluate both curves on the union of their x grids
        common_x = np.union1d(x_heat, x_no_heat)

        # Outside the range a curve was evaluated on, treat its density as 0; np.interp
        # would otherwise extend the curve's endpoint value across the other curve's range.
        y_heat_interp = np.interp(common_x, x_heat, y_heat, left=0.0, right=0.0)
        y_no_heat_interp = np.interp(common_x, x_no_heat, y_no_heat, left=0.0, right=0.0)

        # The overlap is the area under the pointwise minimum of the two curves
        overlap_y_min = np.minimum(y_heat_interp, y_no_heat_interp)
        overlap_area = np.trapz(overlap_y_min, common_x)

        # Use fill_between to highlight the overlap area
        ax.fill_between(common_x, overlap_y_min, color='purple', alpha=0.3, label=f'Intersecting Area: {overlap_area:.2f}')
    else:
        print(f"Warning: fewer than two density curves for Farm {farm_id}. Skipping overlap calculation.")

    # Set title and labels for each subplot
    ax.set_title(f'Probability Density of Milk Yield Difference for Farm {farm_id} at THI {heat_stress_threshold} degrees')
    ax.set_xlabel('Milk Yield Deviation (kg/day)')
    ax.set_ylabel('Density')

    # Show the legend, including the intersecting-area entry when it was computed
    ax.legend(loc='upper right')

    # Add grid for better readability
    ax.grid(True, which='both', linestyle='-', linewidth=0.7, color='blue', alpha=0.3)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()


# Plotting one plot per cow-lactation combination - beware screen of death without PDF alternative!

Run this separately for each herd: producing one plot per cow for all herds at once may crash Python because of the amount of data.

In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np

In [18]:
# Columns needed for the per-cow density plots below
col_keep = [
    "Date",
    "FarmName_Pseudo",
    "SE_Number",
    "LactationNumber",
    "DaysInMilk",
    "DailyYield",
    "ExpectedYield",
    "MeanTHI_adj",
]

# Read the cleaned yield data; df keeps every column, milk_data only the ones listed above
df = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile61.csv', low_memory=False)
# df = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile67.csv', low_memory=False)
milk_data = df.loc[:, col_keep]

## Herd 5b581702

In [None]:
# For the selected herd, write one PDF page per cow-lactation showing the kernel density of
# the milk yield deviation (DailyYield minus ExpectedYield) on heat-stress days versus the
# remaining days, with the overlap between the two curves shaded.

# Define list of farm IDs you want to analyze
farm_ids = ['5b581702']

# Define heat stress threshold
heat_stress_threshold = 61

# Build the output name from the selected herd(s) so the file written and the message
# printed at the end always agree
cow_plots_pdf = f"{'_'.join(farm_ids)}_individual_cow_lactation_plots.pdf"

# Filter data for selected farms; .copy() gives an independent frame to add columns to
farm_data_filtered = milk_data[milk_data['FarmName_Pseudo'].isin(farm_ids)].copy()

# Compute the yield deviation and the heat-stress flag once for the whole herd.
# Assigning them inside the loop wrote into groupby slices, which pandas may flag with
# SettingWithCopyWarning, and repeated the same work for every group.
farm_data_filtered['MilkYieldDifference'] = farm_data_filtered['DailyYield'] - farm_data_filtered['ExpectedYield']
farm_data_filtered['HeatStress'] = farm_data_filtered['MeanTHI_adj'] >= heat_stress_threshold

# Group the rows by cow-lactation combination
cow_lactation_combinations = farm_data_filtered.groupby(['SE_Number', 'LactationNumber'])

# Create a PDF file to save the plots
with PdfPages(cow_plots_pdf) as pdf:
    # Loop through each cow-lactation combination to create individual plots
    for (cow_id, lactation_number), cow_lactation_data in cow_lactation_combinations:
        # Create a new figure for each cow-lactation combination
        fig, ax = plt.subplots(figsize=(10, 5))

        # Split records into heat-stress days and non-heat-stress days
        heat_stress_data = cow_lactation_data[cow_lactation_data['HeatStress']]
        non_heat_stress_data = cow_lactation_data[~cow_lactation_data['HeatStress']]

        # Plot the probability density plot for periods under heat stress
        sns.kdeplot(data=heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='red', label='Heat Stress', ax=ax, warn_singular=False)

        # Plot the probability density plot for periods without heat stress
        sns.kdeplot(data=non_heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='blue', label='No Heat Stress', ax=ax, warn_singular=False)

        # Access the lines in the plot to extract the x and y values
        kde_lines = ax.get_lines()

        # Both curves are needed for the overlap; one is missing when a subset could not be plotted
        if len(kde_lines) >= 2:
            # First line drawn is the Heat Stress curve, second is the No Heat Stress curve
            x_heat, y_heat = kde_lines[0].get_data()
            x_no_heat, y_no_heat = kde_lines[1].get_data()

            # Evaluate both curves on the union of their x grids
            common_x = np.union1d(x_heat, x_no_heat)

            # Outside the range a curve was evaluated on, treat its density as 0; np.interp
            # would otherwise extend the curve's endpoint value across the other curve's range.
            y_heat_interp = np.interp(common_x, x_heat, y_heat, left=0.0, right=0.0)
            y_no_heat_interp = np.interp(common_x, x_no_heat, y_no_heat, left=0.0, right=0.0)

            # The overlap is the area under the pointwise minimum of the two curves
            overlap_y_min = np.minimum(y_heat_interp, y_no_heat_interp)
            overlap_area = np.trapz(overlap_y_min, common_x)

            # Fill the overlapping area
            ax.fill_between(common_x, overlap_y_min, color='purple', alpha=0.3, label=f'Intersecting Area: {overlap_area:.2f}')
        else:
            # The page is still written, just without the shaded overlap
            print(f"Warning: No overlapping data for Cow {cow_id}, Lactation {lactation_number}. Skipping overlap calculation.")

        # Set title and labels for each subplot
        ax.set_title(f'Probability Density of Milk Yield Difference for Cow {cow_id}, Lactation {lactation_number}')
        ax.set_xlabel('Milk Yield Deviation (kg/day)')
        ax.set_ylabel('Density')

        # Add legend and grid
        ax.legend(loc='upper right')
        ax.grid(True, which='both', linestyle='-', linewidth=0.7, color='blue', alpha=0.3)

        # Save the current figure to a new page in the PDF
        pdf.savefig(fig)

        # Close the figure to release memory
        plt.close(fig)

print(f"PDF file '{cow_plots_pdf}' has been saved with each cow-lactation combination on a new page.")


## Herd 5c06d92d

In [None]:
# For the selected herd, write one PDF page per cow-lactation showing the kernel density of
# the milk yield deviation (DailyYield minus ExpectedYield) on heat-stress days versus the
# remaining days, with the overlap between the two curves shaded.

# Define list of farm IDs you want to analyze
farm_ids = ['5c06d92d']

# Define heat stress threshold
heat_stress_threshold = 61

# Build the output name from the selected herd(s) so the file written and the message
# printed at the end always agree
cow_plots_pdf = f"{'_'.join(farm_ids)}_individual_cow_lactation_plots.pdf"

# Filter data for selected farms; .copy() gives an independent frame to add columns to
farm_data_filtered = milk_data[milk_data['FarmName_Pseudo'].isin(farm_ids)].copy()

# Compute the yield deviation and the heat-stress flag once for the whole herd.
# Assigning them inside the loop wrote into groupby slices, which pandas may flag with
# SettingWithCopyWarning, and repeated the same work for every group.
farm_data_filtered['MilkYieldDifference'] = farm_data_filtered['DailyYield'] - farm_data_filtered['ExpectedYield']
farm_data_filtered['HeatStress'] = farm_data_filtered['MeanTHI_adj'] >= heat_stress_threshold

# Group the rows by cow-lactation combination
cow_lactation_combinations = farm_data_filtered.groupby(['SE_Number', 'LactationNumber'])

# Create a PDF file to save the plots
with PdfPages(cow_plots_pdf) as pdf:
    # Loop through each cow-lactation combination to create individual plots
    for (cow_id, lactation_number), cow_lactation_data in cow_lactation_combinations:
        # Create a new figure for each cow-lactation combination
        fig, ax = plt.subplots(figsize=(10, 5))

        # Split records into heat-stress days and non-heat-stress days
        heat_stress_data = cow_lactation_data[cow_lactation_data['HeatStress']]
        non_heat_stress_data = cow_lactation_data[~cow_lactation_data['HeatStress']]

        # Plot the probability density plot for periods under heat stress
        sns.kdeplot(data=heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='red', label='Heat Stress', ax=ax, warn_singular=False)

        # Plot the probability density plot for periods without heat stress
        sns.kdeplot(data=non_heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='blue', label='No Heat Stress', ax=ax, warn_singular=False)

        # Access the lines in the plot to extract the x and y values
        kde_lines = ax.get_lines()

        # Both curves are needed for the overlap; one is missing when a subset could not be plotted
        if len(kde_lines) >= 2:
            # First line drawn is the Heat Stress curve, second is the No Heat Stress curve
            x_heat, y_heat = kde_lines[0].get_data()
            x_no_heat, y_no_heat = kde_lines[1].get_data()

            # Evaluate both curves on the union of their x grids
            common_x = np.union1d(x_heat, x_no_heat)

            # Outside the range a curve was evaluated on, treat its density as 0; np.interp
            # would otherwise extend the curve's endpoint value across the other curve's range.
            y_heat_interp = np.interp(common_x, x_heat, y_heat, left=0.0, right=0.0)
            y_no_heat_interp = np.interp(common_x, x_no_heat, y_no_heat, left=0.0, right=0.0)

            # The overlap is the area under the pointwise minimum of the two curves
            overlap_y_min = np.minimum(y_heat_interp, y_no_heat_interp)
            overlap_area = np.trapz(overlap_y_min, common_x)

            # Fill the overlapping area
            ax.fill_between(common_x, overlap_y_min, color='purple', alpha=0.3, label=f'Intersecting Area: {overlap_area:.2f}')
        else:
            # The page is still written, just without the shaded overlap
            print(f"Warning: No overlapping data for Cow {cow_id}, Lactation {lactation_number}. Skipping overlap calculation.")

        # Set title and labels for each subplot
        ax.set_title(f'Probability Density of Milk Yield Difference for Cow {cow_id}, Lactation {lactation_number}')
        ax.set_xlabel('Milk Yield Deviation (kg/day)')
        ax.set_ylabel('Density')

        # Add legend and grid
        ax.legend(loc='upper right')
        ax.grid(True, which='both', linestyle='-', linewidth=0.7, color='blue', alpha=0.3)

        # Save the current figure to a new page in the PDF
        pdf.savefig(fig)

        # Close the figure to release memory
        plt.close(fig)

print(f"PDF file '{cow_plots_pdf}' has been saved with each cow-lactation combination on a new page.")


## Herd 5f7f33d6

In [None]:
# For the selected herd, write one PDF page per cow-lactation showing the kernel density of
# the milk yield deviation (DailyYield minus ExpectedYield) on heat-stress days versus the
# remaining days, with the overlap between the two curves shaded.

# Define list of farm IDs you want to analyze
farm_ids = ['5f7f33d6']

# Define heat stress threshold
heat_stress_threshold = 61

# Build the output name from the selected herd(s) so the file written and the message
# printed at the end always agree
cow_plots_pdf = f"{'_'.join(farm_ids)}_individual_cow_lactation_plots.pdf"

# Filter data for selected farms; .copy() gives an independent frame to add columns to
farm_data_filtered = milk_data[milk_data['FarmName_Pseudo'].isin(farm_ids)].copy()

# Compute the yield deviation and the heat-stress flag once for the whole herd.
# Assigning them inside the loop wrote into groupby slices, which pandas may flag with
# SettingWithCopyWarning, and repeated the same work for every group.
farm_data_filtered['MilkYieldDifference'] = farm_data_filtered['DailyYield'] - farm_data_filtered['ExpectedYield']
farm_data_filtered['HeatStress'] = farm_data_filtered['MeanTHI_adj'] >= heat_stress_threshold

# Group the rows by cow-lactation combination
cow_lactation_combinations = farm_data_filtered.groupby(['SE_Number', 'LactationNumber'])

# Create a PDF file to save the plots
with PdfPages(cow_plots_pdf) as pdf:
    # Loop through each cow-lactation combination to create individual plots
    for (cow_id, lactation_number), cow_lactation_data in cow_lactation_combinations:
        # Create a new figure for each cow-lactation combination
        fig, ax = plt.subplots(figsize=(10, 5))

        # Split records into heat-stress days and non-heat-stress days
        heat_stress_data = cow_lactation_data[cow_lactation_data['HeatStress']]
        non_heat_stress_data = cow_lactation_data[~cow_lactation_data['HeatStress']]

        # Plot the probability density plot for periods under heat stress
        sns.kdeplot(data=heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='red', label='Heat Stress', ax=ax, warn_singular=False)

        # Plot the probability density plot for periods without heat stress
        sns.kdeplot(data=non_heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='blue', label='No Heat Stress', ax=ax, warn_singular=False)

        # Access the lines in the plot to extract the x and y values
        kde_lines = ax.get_lines()

        # Both curves are needed for the overlap; one is missing when a subset could not be plotted
        if len(kde_lines) >= 2:
            # First line drawn is the Heat Stress curve, second is the No Heat Stress curve
            x_heat, y_heat = kde_lines[0].get_data()
            x_no_heat, y_no_heat = kde_lines[1].get_data()

            # Evaluate both curves on the union of their x grids
            common_x = np.union1d(x_heat, x_no_heat)

            # Outside the range a curve was evaluated on, treat its density as 0; np.interp
            # would otherwise extend the curve's endpoint value across the other curve's range.
            y_heat_interp = np.interp(common_x, x_heat, y_heat, left=0.0, right=0.0)
            y_no_heat_interp = np.interp(common_x, x_no_heat, y_no_heat, left=0.0, right=0.0)

            # The overlap is the area under the pointwise minimum of the two curves
            overlap_y_min = np.minimum(y_heat_interp, y_no_heat_interp)
            overlap_area = np.trapz(overlap_y_min, common_x)

            # Fill the overlapping area
            ax.fill_between(common_x, overlap_y_min, color='purple', alpha=0.3, label=f'Intersecting Area: {overlap_area:.2f}')
        else:
            # The page is still written, just without the shaded overlap
            print(f"Warning: No overlapping data for Cow {cow_id}, Lactation {lactation_number}. Skipping overlap calculation.")

        # Set title and labels for each subplot
        ax.set_title(f'Probability Density of Milk Yield Difference for Cow {cow_id}, Lactation {lactation_number}')
        ax.set_xlabel('Milk Yield Deviation (kg/day)')
        ax.set_ylabel('Density')

        # Add legend and grid
        ax.legend(loc='upper right')
        ax.grid(True, which='both', linestyle='-', linewidth=0.7, color='blue', alpha=0.3)

        # Save the current figure to a new page in the PDF
        pdf.savefig(fig)

        # Close the figure to release memory
        plt.close(fig)

print(f"PDF file '{cow_plots_pdf}' has been saved with each cow-lactation combination on a new page.")


## Herd 752efd72

In [None]:
# For the selected herd, write one PDF page per cow-lactation showing the kernel density of
# the milk yield deviation (DailyYield minus ExpectedYield) on heat-stress days versus the
# remaining days, with the overlap between the two curves shaded.

# Define list of farm IDs you want to analyze
farm_ids = ['752efd72']

# Define heat stress threshold
heat_stress_threshold = 61

# Build the output name from the selected herd(s) so the file written and the message
# printed at the end always agree
cow_plots_pdf = f"{'_'.join(farm_ids)}_individual_cow_lactation_plots.pdf"

# Filter data for selected farms; .copy() gives an independent frame to add columns to
farm_data_filtered = milk_data[milk_data['FarmName_Pseudo'].isin(farm_ids)].copy()

# Compute the yield deviation and the heat-stress flag once for the whole herd.
# Assigning them inside the loop wrote into groupby slices, which pandas may flag with
# SettingWithCopyWarning, and repeated the same work for every group.
farm_data_filtered['MilkYieldDifference'] = farm_data_filtered['DailyYield'] - farm_data_filtered['ExpectedYield']
farm_data_filtered['HeatStress'] = farm_data_filtered['MeanTHI_adj'] >= heat_stress_threshold

# Group the rows by cow-lactation combination
cow_lactation_combinations = farm_data_filtered.groupby(['SE_Number', 'LactationNumber'])

# Create a PDF file to save the plots
with PdfPages(cow_plots_pdf) as pdf:
    # Loop through each cow-lactation combination to create individual plots
    for (cow_id, lactation_number), cow_lactation_data in cow_lactation_combinations:
        # Create a new figure for each cow-lactation combination
        fig, ax = plt.subplots(figsize=(10, 5))

        # Split records into heat-stress days and non-heat-stress days
        heat_stress_data = cow_lactation_data[cow_lactation_data['HeatStress']]
        non_heat_stress_data = cow_lactation_data[~cow_lactation_data['HeatStress']]

        # Plot the probability density plot for periods under heat stress
        sns.kdeplot(data=heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='red', label='Heat Stress', ax=ax, warn_singular=False)

        # Plot the probability density plot for periods without heat stress
        sns.kdeplot(data=non_heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='blue', label='No Heat Stress', ax=ax, warn_singular=False)

        # Access the lines in the plot to extract the x and y values
        kde_lines = ax.get_lines()

        # Both curves are needed for the overlap; one is missing when a subset could not be plotted
        if len(kde_lines) >= 2:
            # First line drawn is the Heat Stress curve, second is the No Heat Stress curve
            x_heat, y_heat = kde_lines[0].get_data()
            x_no_heat, y_no_heat = kde_lines[1].get_data()

            # Evaluate both curves on the union of their x grids
            common_x = np.union1d(x_heat, x_no_heat)

            # Outside the range a curve was evaluated on, treat its density as 0; np.interp
            # would otherwise extend the curve's endpoint value across the other curve's range.
            y_heat_interp = np.interp(common_x, x_heat, y_heat, left=0.0, right=0.0)
            y_no_heat_interp = np.interp(common_x, x_no_heat, y_no_heat, left=0.0, right=0.0)

            # The overlap is the area under the pointwise minimum of the two curves
            overlap_y_min = np.minimum(y_heat_interp, y_no_heat_interp)
            overlap_area = np.trapz(overlap_y_min, common_x)

            # Fill the overlapping area
            ax.fill_between(common_x, overlap_y_min, color='purple', alpha=0.3, label=f'Intersecting Area: {overlap_area:.2f}')
        else:
            # The page is still written, just without the shaded overlap
            print(f"Warning: No overlapping data for Cow {cow_id}, Lactation {lactation_number}. Skipping overlap calculation.")

        # Set title and labels for each subplot
        ax.set_title(f'Probability Density of Milk Yield Difference for Cow {cow_id}, Lactation {lactation_number}')
        ax.set_xlabel('Milk Yield Deviation (kg/day)')
        ax.set_ylabel('Density')

        # Add legend and grid
        ax.legend(loc='upper right')
        ax.grid(True, which='both', linestyle='-', linewidth=0.7, color='blue', alpha=0.3)

        # Save the current figure to a new page in the PDF
        pdf.savefig(fig)

        # Close the figure to release memory
        plt.close(fig)

print(f"PDF file '{cow_plots_pdf}' has been saved with each cow-lactation combination on a new page.")


## Herd a624fb9a

In [None]:
# Herds to analyse; the output file name below is derived from this list
farm_ids = ['a624fb9a']

# Days with MeanTHI_adj at or above this value count as heat-stressed
heat_stress_threshold = 61

# Keep only the records that belong to the selected herds
farm_data_filtered = milk_data[milk_data['FarmName_Pseudo'].isin(farm_ids)]

# One group per cow-lactation (SE_Number, LactationNumber) combination
cow_lactation_combinations = farm_data_filtered.groupby(['SE_Number', 'LactationNumber'])

# np.trapz is deprecated since NumPy 2.0; prefer np.trapezoid when it exists
integrate_trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz

# Single source for the file name so the PDF and the final message cannot drift apart
pdf_path = f"{'_'.join(farm_ids)}_individual_cow_lactation_plots.pdf"

# Write one page per cow-lactation combination to a single PDF
with PdfPages(pdf_path) as pdf:
    for (cow_id, lactation_number), cow_lactation_data in cow_lactation_combinations:
        # Build the derived columns on a new frame instead of assigning into the
        # groupby slice, which triggers pandas' SettingWithCopyWarning
        cow_lactation_data = cow_lactation_data.assign(
            MilkYieldDifference=cow_lactation_data['DailyYield'] - cow_lactation_data['ExpectedYield'],
            # NOTE(review): a missing MeanTHI_adj compares as False, so such rows land
            # in the "No Heat Stress" group - confirm the input has no gaps
            HeatStress=cow_lactation_data['MeanTHI_adj'] >= heat_stress_threshold,
        )

        # Split the records into heat-stressed and non-heat-stressed days
        heat_stress_data = cow_lactation_data[cow_lactation_data['HeatStress']]
        non_heat_stress_data = cow_lactation_data[~cow_lactation_data['HeatStress']]

        fig, ax = plt.subplots(figsize=(10, 5))
        try:
            # Density of the yield deviation under heat stress (red) and without (blue)
            sns.kdeplot(data=heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='red', label='Heat Stress', ax=ax, warn_singular=False)
            sns.kdeplot(data=non_heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='blue', label='No Heat Stress', ax=ax, warn_singular=False)

            # The curves are read back from the axes in the order they were drawn
            kde_lines = ax.get_lines()

            if len(kde_lines) >= 2:
                x_heat, y_heat = kde_lines[0].get_data()
                x_no_heat, y_no_heat = kde_lines[1].get_data()

                # Evaluate both curves on the union of their x grids
                common_x = np.union1d(x_heat, x_no_heat)

                # Outside a curve's own grid its density is taken as 0; np.interp's
                # default would repeat the end value and inflate the overlap area
                y_heat_interp = np.interp(common_x, x_heat, y_heat, left=0.0, right=0.0)
                y_no_heat_interp = np.interp(common_x, x_no_heat, y_no_heat, left=0.0, right=0.0)

                # Area under the pointwise minimum of the two densities
                overlap_y_min = np.minimum(y_heat_interp, y_no_heat_interp)
                overlap_area = integrate_trapezoid(overlap_y_min, common_x)

                ax.fill_between(common_x, overlap_y_min, color='purple', alpha=0.3, label=f'Intersecting Area: {overlap_area:.2f}')
            else:
                # Fewer than two curves were drawn, so there is nothing to intersect;
                # the page is still written
                print(f"Warning: No overlapping data for Cow {cow_id}, Lactation {lactation_number}. Skipping overlap calculation.")

            ax.set_title(f'Probability Density of Milk Yield Difference for Cow {cow_id}, Lactation {lactation_number}')
            ax.set_xlabel('Milk Yield Deviation (kg/day)')
            ax.set_ylabel('Density')

            # Only build a legend when something was drawn (avoids a matplotlib warning)
            legend_handles, _ = ax.get_legend_handles_labels()
            if legend_handles:
                ax.legend(loc='upper right')
            ax.grid(True, which='both', linestyle='-', linewidth=0.7, color='blue', alpha=0.3)

            # Each figure becomes a new page in the PDF
            pdf.savefig(fig)
        finally:
            # Release the figure even if plotting or saving failed
            plt.close(fig)

print(f"PDF file '{pdf_path}' has been saved with each cow-lactation combination on a new page.")


## Herd ab18b151

In [None]:
# Herds to analyse; the output file name below is derived from this list
farm_ids = ['ab18b151']

# Days with MeanTHI_adj at or above this value count as heat-stressed
heat_stress_threshold = 61

# Keep only the records that belong to the selected herds
farm_data_filtered = milk_data[milk_data['FarmName_Pseudo'].isin(farm_ids)]

# One group per cow-lactation (SE_Number, LactationNumber) combination
cow_lactation_combinations = farm_data_filtered.groupby(['SE_Number', 'LactationNumber'])

# np.trapz is deprecated since NumPy 2.0; prefer np.trapezoid when it exists
integrate_trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz

# Single source for the file name so the PDF and the final message cannot drift apart
pdf_path = f"{'_'.join(farm_ids)}_individual_cow_lactation_plots.pdf"

# Write one page per cow-lactation combination to a single PDF
with PdfPages(pdf_path) as pdf:
    for (cow_id, lactation_number), cow_lactation_data in cow_lactation_combinations:
        # Build the derived columns on a new frame instead of assigning into the
        # groupby slice, which triggers pandas' SettingWithCopyWarning
        cow_lactation_data = cow_lactation_data.assign(
            MilkYieldDifference=cow_lactation_data['DailyYield'] - cow_lactation_data['ExpectedYield'],
            # NOTE(review): a missing MeanTHI_adj compares as False, so such rows land
            # in the "No Heat Stress" group - confirm the input has no gaps
            HeatStress=cow_lactation_data['MeanTHI_adj'] >= heat_stress_threshold,
        )

        # Split the records into heat-stressed and non-heat-stressed days
        heat_stress_data = cow_lactation_data[cow_lactation_data['HeatStress']]
        non_heat_stress_data = cow_lactation_data[~cow_lactation_data['HeatStress']]

        fig, ax = plt.subplots(figsize=(10, 5))
        try:
            # Density of the yield deviation under heat stress (red) and without (blue)
            sns.kdeplot(data=heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='red', label='Heat Stress', ax=ax, warn_singular=False)
            sns.kdeplot(data=non_heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='blue', label='No Heat Stress', ax=ax, warn_singular=False)

            # The curves are read back from the axes in the order they were drawn
            kde_lines = ax.get_lines()

            if len(kde_lines) >= 2:
                x_heat, y_heat = kde_lines[0].get_data()
                x_no_heat, y_no_heat = kde_lines[1].get_data()

                # Evaluate both curves on the union of their x grids
                common_x = np.union1d(x_heat, x_no_heat)

                # Outside a curve's own grid its density is taken as 0; np.interp's
                # default would repeat the end value and inflate the overlap area
                y_heat_interp = np.interp(common_x, x_heat, y_heat, left=0.0, right=0.0)
                y_no_heat_interp = np.interp(common_x, x_no_heat, y_no_heat, left=0.0, right=0.0)

                # Area under the pointwise minimum of the two densities
                overlap_y_min = np.minimum(y_heat_interp, y_no_heat_interp)
                overlap_area = integrate_trapezoid(overlap_y_min, common_x)

                ax.fill_between(common_x, overlap_y_min, color='purple', alpha=0.3, label=f'Intersecting Area: {overlap_area:.2f}')
            else:
                # Fewer than two curves were drawn, so there is nothing to intersect;
                # the page is still written
                print(f"Warning: No overlapping data for Cow {cow_id}, Lactation {lactation_number}. Skipping overlap calculation.")

            ax.set_title(f'Probability Density of Milk Yield Difference for Cow {cow_id}, Lactation {lactation_number}')
            ax.set_xlabel('Milk Yield Deviation (kg/day)')
            ax.set_ylabel('Density')

            # Only build a legend when something was drawn (avoids a matplotlib warning)
            legend_handles, _ = ax.get_legend_handles_labels()
            if legend_handles:
                ax.legend(loc='upper right')
            ax.grid(True, which='both', linestyle='-', linewidth=0.7, color='blue', alpha=0.3)

            # Each figure becomes a new page in the PDF
            pdf.savefig(fig)
        finally:
            # Release the figure even if plotting or saving failed
            plt.close(fig)

print(f"PDF file '{pdf_path}' has been saved with each cow-lactation combination on a new page.")


## Herd ad0a39f5

In [None]:
# Herds to analyse; the output file name below is derived from this list
farm_ids = ['ad0a39f5']

# Days with MeanTHI_adj at or above this value count as heat-stressed
heat_stress_threshold = 61

# Keep only the records that belong to the selected herds
farm_data_filtered = milk_data[milk_data['FarmName_Pseudo'].isin(farm_ids)]

# One group per cow-lactation (SE_Number, LactationNumber) combination
cow_lactation_combinations = farm_data_filtered.groupby(['SE_Number', 'LactationNumber'])

# np.trapz is deprecated since NumPy 2.0; prefer np.trapezoid when it exists
integrate_trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz

# Single source for the file name so the PDF and the final message cannot drift apart
pdf_path = f"{'_'.join(farm_ids)}_individual_cow_lactation_plots.pdf"

# Write one page per cow-lactation combination to a single PDF
with PdfPages(pdf_path) as pdf:
    for (cow_id, lactation_number), cow_lactation_data in cow_lactation_combinations:
        # Build the derived columns on a new frame instead of assigning into the
        # groupby slice, which triggers pandas' SettingWithCopyWarning
        cow_lactation_data = cow_lactation_data.assign(
            MilkYieldDifference=cow_lactation_data['DailyYield'] - cow_lactation_data['ExpectedYield'],
            # NOTE(review): a missing MeanTHI_adj compares as False, so such rows land
            # in the "No Heat Stress" group - confirm the input has no gaps
            HeatStress=cow_lactation_data['MeanTHI_adj'] >= heat_stress_threshold,
        )

        # Split the records into heat-stressed and non-heat-stressed days
        heat_stress_data = cow_lactation_data[cow_lactation_data['HeatStress']]
        non_heat_stress_data = cow_lactation_data[~cow_lactation_data['HeatStress']]

        fig, ax = plt.subplots(figsize=(10, 5))
        try:
            # Density of the yield deviation under heat stress (red) and without (blue)
            sns.kdeplot(data=heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='red', label='Heat Stress', ax=ax, warn_singular=False)
            sns.kdeplot(data=non_heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='blue', label='No Heat Stress', ax=ax, warn_singular=False)

            # The curves are read back from the axes in the order they were drawn
            kde_lines = ax.get_lines()

            if len(kde_lines) >= 2:
                x_heat, y_heat = kde_lines[0].get_data()
                x_no_heat, y_no_heat = kde_lines[1].get_data()

                # Evaluate both curves on the union of their x grids
                common_x = np.union1d(x_heat, x_no_heat)

                # Outside a curve's own grid its density is taken as 0; np.interp's
                # default would repeat the end value and inflate the overlap area
                y_heat_interp = np.interp(common_x, x_heat, y_heat, left=0.0, right=0.0)
                y_no_heat_interp = np.interp(common_x, x_no_heat, y_no_heat, left=0.0, right=0.0)

                # Area under the pointwise minimum of the two densities
                overlap_y_min = np.minimum(y_heat_interp, y_no_heat_interp)
                overlap_area = integrate_trapezoid(overlap_y_min, common_x)

                ax.fill_between(common_x, overlap_y_min, color='purple', alpha=0.3, label=f'Intersecting Area: {overlap_area:.2f}')
            else:
                # Fewer than two curves were drawn, so there is nothing to intersect;
                # the page is still written
                print(f"Warning: No overlapping data for Cow {cow_id}, Lactation {lactation_number}. Skipping overlap calculation.")

            ax.set_title(f'Probability Density of Milk Yield Difference for Cow {cow_id}, Lactation {lactation_number}')
            ax.set_xlabel('Milk Yield Deviation (kg/day)')
            ax.set_ylabel('Density')

            # Only build a legend when something was drawn (avoids a matplotlib warning)
            legend_handles, _ = ax.get_legend_handles_labels()
            if legend_handles:
                ax.legend(loc='upper right')
            ax.grid(True, which='both', linestyle='-', linewidth=0.7, color='blue', alpha=0.3)

            # Each figure becomes a new page in the PDF
            pdf.savefig(fig)
        finally:
            # Release the figure even if plotting or saving failed
            plt.close(fig)

print(f"PDF file '{pdf_path}' has been saved with each cow-lactation combination on a new page.")


## Herd afdd9a78

In [None]:
# Herds to analyse; the output file name below is derived from this list
farm_ids = ['afdd9a78']

# Days with MeanTHI_adj at or above this value count as heat-stressed
heat_stress_threshold = 61

# Keep only the records that belong to the selected herds
farm_data_filtered = milk_data[milk_data['FarmName_Pseudo'].isin(farm_ids)]

# One group per cow-lactation (SE_Number, LactationNumber) combination
cow_lactation_combinations = farm_data_filtered.groupby(['SE_Number', 'LactationNumber'])

# np.trapz is deprecated since NumPy 2.0; prefer np.trapezoid when it exists
integrate_trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz

# Single source for the file name so the PDF and the final message cannot drift apart
pdf_path = f"{'_'.join(farm_ids)}_individual_cow_lactation_plots.pdf"

# Write one page per cow-lactation combination to a single PDF
with PdfPages(pdf_path) as pdf:
    for (cow_id, lactation_number), cow_lactation_data in cow_lactation_combinations:
        # Build the derived columns on a new frame instead of assigning into the
        # groupby slice, which triggers pandas' SettingWithCopyWarning
        cow_lactation_data = cow_lactation_data.assign(
            MilkYieldDifference=cow_lactation_data['DailyYield'] - cow_lactation_data['ExpectedYield'],
            # NOTE(review): a missing MeanTHI_adj compares as False, so such rows land
            # in the "No Heat Stress" group - confirm the input has no gaps
            HeatStress=cow_lactation_data['MeanTHI_adj'] >= heat_stress_threshold,
        )

        # Split the records into heat-stressed and non-heat-stressed days
        heat_stress_data = cow_lactation_data[cow_lactation_data['HeatStress']]
        non_heat_stress_data = cow_lactation_data[~cow_lactation_data['HeatStress']]

        fig, ax = plt.subplots(figsize=(10, 5))
        try:
            # Density of the yield deviation under heat stress (red) and without (blue)
            sns.kdeplot(data=heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='red', label='Heat Stress', ax=ax, warn_singular=False)
            sns.kdeplot(data=non_heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='blue', label='No Heat Stress', ax=ax, warn_singular=False)

            # The curves are read back from the axes in the order they were drawn
            kde_lines = ax.get_lines()

            if len(kde_lines) >= 2:
                x_heat, y_heat = kde_lines[0].get_data()
                x_no_heat, y_no_heat = kde_lines[1].get_data()

                # Evaluate both curves on the union of their x grids
                common_x = np.union1d(x_heat, x_no_heat)

                # Outside a curve's own grid its density is taken as 0; np.interp's
                # default would repeat the end value and inflate the overlap area
                y_heat_interp = np.interp(common_x, x_heat, y_heat, left=0.0, right=0.0)
                y_no_heat_interp = np.interp(common_x, x_no_heat, y_no_heat, left=0.0, right=0.0)

                # Area under the pointwise minimum of the two densities
                overlap_y_min = np.minimum(y_heat_interp, y_no_heat_interp)
                overlap_area = integrate_trapezoid(overlap_y_min, common_x)

                ax.fill_between(common_x, overlap_y_min, color='purple', alpha=0.3, label=f'Intersecting Area: {overlap_area:.2f}')
            else:
                # Fewer than two curves were drawn, so there is nothing to intersect;
                # the page is still written
                print(f"Warning: No overlapping data for Cow {cow_id}, Lactation {lactation_number}. Skipping overlap calculation.")

            ax.set_title(f'Probability Density of Milk Yield Difference for Cow {cow_id}, Lactation {lactation_number}')
            ax.set_xlabel('Milk Yield Deviation (kg/day)')
            ax.set_ylabel('Density')

            # Only build a legend when something was drawn (avoids a matplotlib warning)
            legend_handles, _ = ax.get_legend_handles_labels()
            if legend_handles:
                ax.legend(loc='upper right')
            ax.grid(True, which='both', linestyle='-', linewidth=0.7, color='blue', alpha=0.3)

            # Each figure becomes a new page in the PDF
            pdf.savefig(fig)
        finally:
            # Release the figure even if plotting or saving failed
            plt.close(fig)

print(f"PDF file '{pdf_path}' has been saved with each cow-lactation combination on a new page.")


## Herd f454e660

In [None]:
# Herds to analyse; the output file name below is derived from this list
farm_ids = ['f454e660']

# Days with MeanTHI_adj at or above this value count as heat-stressed
heat_stress_threshold = 61

# Keep only the records that belong to the selected herds
farm_data_filtered = milk_data[milk_data['FarmName_Pseudo'].isin(farm_ids)]

# One group per cow-lactation (SE_Number, LactationNumber) combination
cow_lactation_combinations = farm_data_filtered.groupby(['SE_Number', 'LactationNumber'])

# np.trapz is deprecated since NumPy 2.0; prefer np.trapezoid when it exists
integrate_trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz

# Single source for the file name so the PDF and the final message cannot drift apart
pdf_path = f"{'_'.join(farm_ids)}_individual_cow_lactation_plots.pdf"

# Write one page per cow-lactation combination to a single PDF
with PdfPages(pdf_path) as pdf:
    for (cow_id, lactation_number), cow_lactation_data in cow_lactation_combinations:
        # Build the derived columns on a new frame instead of assigning into the
        # groupby slice, which triggers pandas' SettingWithCopyWarning
        cow_lactation_data = cow_lactation_data.assign(
            MilkYieldDifference=cow_lactation_data['DailyYield'] - cow_lactation_data['ExpectedYield'],
            # NOTE(review): a missing MeanTHI_adj compares as False, so such rows land
            # in the "No Heat Stress" group - confirm the input has no gaps
            HeatStress=cow_lactation_data['MeanTHI_adj'] >= heat_stress_threshold,
        )

        # Split the records into heat-stressed and non-heat-stressed days
        heat_stress_data = cow_lactation_data[cow_lactation_data['HeatStress']]
        non_heat_stress_data = cow_lactation_data[~cow_lactation_data['HeatStress']]

        fig, ax = plt.subplots(figsize=(10, 5))
        try:
            # Density of the yield deviation under heat stress (red) and without (blue)
            sns.kdeplot(data=heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='red', label='Heat Stress', ax=ax, warn_singular=False)
            sns.kdeplot(data=non_heat_stress_data, x='MilkYieldDifference', fill=False, bw_adjust=0.5, color='blue', label='No Heat Stress', ax=ax, warn_singular=False)

            # The curves are read back from the axes in the order they were drawn
            kde_lines = ax.get_lines()

            if len(kde_lines) >= 2:
                x_heat, y_heat = kde_lines[0].get_data()
                x_no_heat, y_no_heat = kde_lines[1].get_data()

                # Evaluate both curves on the union of their x grids
                common_x = np.union1d(x_heat, x_no_heat)

                # Outside a curve's own grid its density is taken as 0; np.interp's
                # default would repeat the end value and inflate the overlap area
                y_heat_interp = np.interp(common_x, x_heat, y_heat, left=0.0, right=0.0)
                y_no_heat_interp = np.interp(common_x, x_no_heat, y_no_heat, left=0.0, right=0.0)

                # Area under the pointwise minimum of the two densities
                overlap_y_min = np.minimum(y_heat_interp, y_no_heat_interp)
                overlap_area = integrate_trapezoid(overlap_y_min, common_x)

                ax.fill_between(common_x, overlap_y_min, color='purple', alpha=0.3, label=f'Intersecting Area: {overlap_area:.2f}')
            else:
                # Fewer than two curves were drawn, so there is nothing to intersect;
                # the page is still written
                print(f"Warning: No overlapping data for Cow {cow_id}, Lactation {lactation_number}. Skipping overlap calculation.")

            ax.set_title(f'Probability Density of Milk Yield Difference for Cow {cow_id}, Lactation {lactation_number}')
            ax.set_xlabel('Milk Yield Deviation (kg/day)')
            ax.set_ylabel('Density')

            # Only build a legend when something was drawn (avoids a matplotlib warning)
            legend_handles, _ = ax.get_legend_handles_labels()
            if legend_handles:
                ax.legend(loc='upper right')
            ax.grid(True, which='both', linestyle='-', linewidth=0.7, color='blue', alpha=0.3)

            # Each figure becomes a new page in the PDF
            pdf.savefig(fig)
        finally:
            # Release the figure even if plotting or saving failed
            plt.close(fig)

print(f"PDF file '{pdf_path}' has been saved with each cow-lactation combination on a new page.")


# Check calving patterns on herds across time period

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [None]:
# Read the source CSV and keep a single row (the first encountered) per cow-lactation,
# carrying only the identifiers and the calving date
df = pd.read_csv(
    'C:/Users/pagd0001/Desktop/Gigacow/Data/20241009/Gigacow-tools/Projects/HeatStressEvaluation/Data/MergedData/MY_weather_filtered.csv',
    low_memory=False,
)
col_keep = ["SE_Number", "LactationNumber", "CalvingDate"]
data = df[col_keep].drop_duplicates(subset=["SE_Number", "LactationNumber"])

data 

In [None]:
# Read the yield table and reduce it to the columns used below
df = pd.read_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile61.csv', low_memory=False)
col_keep = ["Date", "FarmName_Pseudo", "SE_Number", "LactationNumber", "DaysInMilk", "DailyYield", "ExpectedYield", "MeanTHI_adj", "YearSeason"]
milk_data = df[col_keep]

# Attach the per-cow-lactation columns already held in `data`; the left join keeps
# every yield record even when no matching cow-lactation row exists
data = milk_data.merge(data, on=["SE_Number", "LactationNumber"], how="left")
data

In [None]:
# Parse the calving dates so that year and month can be read from them
data = data.assign(CalvingDate=pd.to_datetime(data['CalvingDate']))

# Map a date to its season code via the calendar month
def get_season(date):
    """Return the season code for *date*.

    The code is looked up from ``date.month``: 1 for December-February,
    2 for March-May, 3 for June-August and 4 for September-November.
    ``None`` is returned when the month is not one of 1-12.
    """
    season_by_month = {
        12: 1, 1: 1, 2: 1,
        3: 2, 4: 2, 5: 2,
        6: 3, 7: 3, 8: 3,
        9: 4, 10: 4, 11: 4,
    }
    return season_by_month.get(date.month)


# Build the 'CalvingYearSeason' label as "<year>0<season code>" for every calving date.
# Rows without a calving date are left missing; formatting them would otherwise
# produce the bogus label "nan0None", which downstream code would treat as a real season.
data['CalvingYearSeason'] = data['CalvingDate'].apply(
    lambda x: f"{x.year}0{get_season(x)}" if pd.notna(x) else None
)

data

In [None]:
# Convert 'Date' and 'CalvingDate' to datetime format.
# 'Date' is only a column the first time this cell runs (afterwards it is the index),
# so the guard keeps the cell re-runnable instead of failing with a KeyError.
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'])
    data = data.set_index('Date')
data['CalvingDate'] = pd.to_datetime(data['CalvingDate'])

# Sort chronologically once, before the per-herd plots
data = data.sort_values(by="Date")

# List of herds (unique values)
herds = data['FarmName_Pseudo'].unique()

# One figure per herd
for herd in herds:
    # Records of the current herd
    herd_data = data.loc[data['FarmName_Pseudo'] == herd]

    fig, ax1 = plt.subplots(figsize=(10, 6))

    # THI over time on the y-axis
    ax1.plot(herd_data.index, herd_data['MeanTHI_adj'], color='red', label='THI', linewidth=2)
    ax1.set_xlabel('Date')
    ax1.set_ylabel('THI', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')
    ax1.set_ylim(0, 80)

    # Mark the calving dates just above the x-axis. The calving date is repeated on
    # every milk record of a lactation, so draw each distinct date once rather than
    # stacking one identical marker per record.
    calving_dates = herd_data['CalvingDate'].dropna().drop_duplicates()
    ax1.scatter(calving_dates, [2] * len(calving_dates), color='green', marker='o', label='Calving Date', zorder=5)

    # Monthly major ticks, labelled as full dates
    ax1.xaxis.set_major_locator(mdates.MonthLocator())  # Adjust this to your date frequency
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    plt.xticks(rotation=45)

    ax1.legend(loc='upper left')

    # Title with the herd identifier
    plt.title(f'Herd {herd}: THI and Calving Dates')

    plt.tight_layout()
    plt.show()


# Checking correlation THI and temperature readings

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import seaborn as sns

In [None]:
# Read the cleaned yield table
df = pd.read_csv("../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile61.csv", low_memory=False)

# Parity: lactations 3 through 8 are pooled into class 3, every other value is kept as is
# NOTE(review): lactation numbers above 8 are therefore not pooled - confirm none occur
lactation_number = df["LactationNumber"]
df["Parity"] = lactation_number.mask(lactation_number.between(3, 8), 3)

# Reduce the frame to the columns used in the correlation checks
col_keep = ["FarmName_Pseudo", "SE_Number", "BreedName", "Parity", "Date", "DailyYield", "MeanTHI_adj", "MeanTemperature"]
df = df[col_keep]
df

Visual Inspection with Scatterplot

In [None]:
# Temperature on x, adjusted THI on y; one semi-transparent marker per record
scatter_fig, scatter_ax = plt.subplots()
scatter_ax.scatter(df['MeanTemperature'], df['MeanTHI_adj'], alpha=0.5)
scatter_ax.set_title("Scatterplot: Temperature vs THI")
scatter_ax.set_xlabel("Temperature")
scatter_ax.set_ylabel("THI")
plt.show()

Correlation Coefficient: Calculate the correlation coefficient using Pearson’s correlation, which measures linear relationships, or Spearman’s correlation, which is for monotonic relationships:

In [None]:
# Linear (Pearson) correlation between temperature and adjusted THI
temperature = df['MeanTemperature']
thi_adjusted = df['MeanTHI_adj']
pearson_corr = temperature.corr(thi_adjusted, method='pearson')
print(f"Pearson correlation between Temperature and THI: {pearson_corr:.2f}")

# Rank-based (Spearman) alternative, for a monotonic but non-linear relationship
# spearman_corr = df['MeanTemperature'].corr(df['MeanTHI_adj'], method='spearman')
# print(f"Spearman correlation between Temperature and THI: {spearman_corr:.2f}")

Statistical Test for Significance

In [None]:
# Pearson correlation and p-value.
# Unlike Series.corr, scipy's pearsonr does not skip missing values (a single NaN
# makes the result NaN), so restrict the test to rows where both readings are present.
complete_pairs = df[['MeanTemperature', 'MeanTHI_adj']].dropna()
corr, p_value = pearsonr(complete_pairs['MeanTemperature'], complete_pairs['MeanTHI_adj'])
print(f"Pearson Correlation: {corr:.2f}, p-value: {p_value:.4f}")

Heatmap for Multiple Variables

In [None]:
# Annotated heatmap of the pairwise correlation matrix of the two readings
heatmap_columns = ['MeanTemperature', 'MeanTHI_adj']
correlation_matrix = df[heatmap_columns].corr()
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

# Double checking MeanTHI_adj calc

In [None]:
# Read the pre-processed weather file for herd f454e660 and keep the columns
# needed to recompute the adjusted THI
col_keep = ["Tid", "Temperatur", "Relativ fuktighet", "Vindhastighet", "Global irradiance", "THI_adj", "StartDate"]
weather_raw = pd.read_csv("../Data/WeatherData/AllPreProcessedWeatherData/processed_data_f454e660.csv")
df = weather_raw[col_keep]
df

In [116]:
# Short aliases for the relative-humidity and global-irradiance columns
for alias, source_column in (("RF", "Relativ fuktighet"), ("GI", "Global irradiance")):
    df[alias] = df[source_column]

In [None]:
# Temperature-humidity index from air temperature and relative humidity
def THI_funct(Temperatur, RF):
    """Return ``0.8 * Temperatur + RF * (Temperatur - 14.4) + 46.4``.

    Works on scalars as well as on array-like operands, since only
    arithmetic operators are used.

    NOTE(review): the commonly cited form of this index multiplies by the
    relative humidity as a fraction (RH / 100); confirm that the RF values
    passed in are on a 0-1 scale rather than in percent.
    """
    temperature_term = 0.8 * Temperatur
    humidity_term = RF * (Temperatur - 14.4)
    return temperature_term + humidity_term + 46.4

# Compute THI for every row. THI_funct only uses arithmetic operators, so it can
# be called on whole columns at once; this gives the same values as a row-wise
# DataFrame.apply without the per-row Python overhead.
df['THI'] = THI_funct(df['Temperatur'], df['RF'])

# Print the DataFrame with the new column
print(df.head())

In [None]:
# Adjusted THI: THI corrected for wind speed and global irradiance
def THI_funct2(THI, Vindhastighet, GI):
    """Return ``4.51 + THI - 1.992 * Vindhastighet + 0.0068 * GI``.

    Works on scalars as well as on array-like operands, since only
    arithmetic operators are used.
    """
    wind_term = 1.992 * Vindhastighet
    irradiance_term = 0.0068 * GI
    return 4.51 + THI - wind_term + irradiance_term

# Compute the adjusted THI for every row. THI_funct2 only uses arithmetic
# operators, so it can be called on whole columns at once; this gives the same
# values as a row-wise DataFrame.apply without the per-row Python overhead.
df['THIadj'] = THI_funct2(df['THI'], df['Vindhastighet'], df["GI"])

# Print the DataFrame with the new column
print(df.head())