# FILTERING MY_weather.csv for HS-MY manuscript
Edits are performed on dataset "MY_weather.csv" and "updateDF.csv" i.e. after "BuildingDataframe.ipynb" has been run.
- keep only NRDC, SH, SJB and dairy crosses
- require that milking starts within 1-40 DIM and continues until 100-400 DIM
- keep only lactations 1-8 (recoded as parity 1, 2 and 3+)
- keep only MY between 2.5 and 60 kg to handle outliers, kick-offs and incomplete milkings
- remove colostrum, assume first 4 days

Generates filtered dataset "MY_weather_filtered.csv" with full structure

In [None]:
from datetime import timedelta
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob

In [None]:
# FILTER MILK YIELD DATA
# Read the merged milk-yield/weather file that all later filtering steps operate on.
# low_memory=False parses the file in one pass so column dtypes are inferred consistently.
df3 = pd.read_csv(
    "../Data/MergedData/MY_weather.csv",
    low_memory=False,
)
n_rows_and_columns = df3.shape
print(f"No. milking events in MY_weather.csv: {n_rows_and_columns}")

In [None]:
# Number of rows with a non-missing StartDate for every herd in the raw data
events_by_herd = df3.groupby(["FarmName_Pseudo"])["StartDate"].count()
count_my_rec = events_by_herd.reset_index()
print(f"No. milking events per herd: \n", count_my_rec.to_string(index=False))

Start by removing herds with missing MY

In [None]:
# Pseudonymised herd IDs to exclude (herds with missing milk yield, per the note above this cell)
Herds = ["6d38bc90", "a756bc39"]
is_excluded_herd = df3["FarmName_Pseudo"].isin(Herds)
df3 = df3[~is_excluded_herd]

print(f"No. milking events in the 9 herds: {df3.shape}")

# Rows with a non-missing StartDate per remaining herd
events_by_herd = df3.groupby(["FarmName_Pseudo"])["StartDate"].count()
count_my_rec = events_by_herd.reset_index()
print(f"No. milking events per herd: \n", count_my_rec.to_string(index=False))

Check breed distribution and filter on breed

In [None]:
def plot_breed_pie(breed_counts, title):
    """Draw a pie chart of the number of rows per breed.

    Parameters
    ----------
    breed_counts : pandas.Series
        Counts indexed by breed label, e.g. the result of ``df['Breed'].value_counts()``.
    title : str
        Title shown above the chart.
    """
    plt.figure(figsize=(10, 10))
    plt.pie(breed_counts, labels=breed_counts.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
    plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    plt.title(title)
    plt.show()


# Breed distribution in the crude data (table + pie chart)
value_counts = df3['Breed'].value_counts()
print(value_counts)
plot_breed_pie(value_counts, 'Distribution of Milking Events Across Breeds in MY_weather.csv')

# Keep only rows whose Breed code is NRDC, SLB, DairyCross or SJB.
# .copy() detaches the result from the unfiltered frame so the in-place
# operations in later cells act on an independent dataframe.
BREEDS_TO_KEEP = ["NRDC", "SLB", "DairyCross", "SJB"]
df3 = df3[df3["Breed"].isin(BREEDS_TO_KEEP)].copy()

# Breed distribution after filtering; the title now says so instead of
# repeating the title of the unfiltered chart.
value_counts = df3['Breed'].value_counts()
plot_breed_pie(value_counts, 'Distribution of Milking Events Across Breeds after Breed Filtering')

# No. milking events, lactations and cows in data
print(f"No. milking events in SRB, SH, dairy Crosses and SJB cows: {df3.shape}")
test = df3.drop_duplicates(subset=["SE_Number", "LactationNumber"])
print(f"No. of lactations from SRB, SH, SJB and dairy crosses: {test.shape}")
test = test.drop_duplicates(subset=["SE_Number"])
print(f"No. of cows from SRB, SH, SJB and dairy crosses: {test.shape}")

In [None]:
# Rows with a non-missing StartDate per herd, after the breed filter
events_by_herd = df3.groupby(["FarmName_Pseudo"])["StartDate"].count()
count_my_rec = events_by_herd.reset_index()
print(f"No. milking events per herd: \n", count_my_rec.to_string(index=False))

In [None]:
# Number of non-missing TotalYield values per herd (count() skips NaN)
yields_by_herd = df3.groupby(["FarmName_Pseudo"])["TotalYield"].count()
count_my_rec = yields_by_herd.reset_index()
print(f"No. milking yield occurrences per herd: \n", count_my_rec.to_string(index=False))

Transfer lactation-level data (calving date, upper limit) to every row of the full dataframe

In [None]:
# Transfer basic data about lactation to every observation, including those missing MY data.
# For each (SE_Number, LactationNumber) pair, CalvingDate and upper_limit are taken from the
# first row that has a DaysInMilk value and written onto all rows of that pair.
# NOTE(review): this cell mutates df3 in place and drops the DaysInMilk column at the end
# (it is not in new_column_order), so it cannot be re-run without re-running the cells above.
print("Original shape of df3:", df3.shape)

# Select columns to keep and remove duplicates from df
# Only rows with a known DaysInMilk are used as the source of lactation information.
df = df3[df3['DaysInMilk'].notna()]
col_keep = ["SE_Number", "CalvingDate", "LactationNumber", "upper_limit"]
# One row per cow-lactation; drop_duplicates keeps the first occurrence.
df = df[col_keep].drop_duplicates(subset=["SE_Number", "LactationNumber"])
# df.to_csv("../Data/MY3.csv", index=False)

# Use (SE_Number, LactationNumber) as the index in both DataFrames so that
# update() can align rows on the cow-lactation key.
df.set_index(['SE_Number', "LactationNumber"], inplace=True)
df3.set_index(['SE_Number', "LactationNumber"], inplace=True)

# Use .update() to overwrite values
# update() modifies df3 in place: for the columns present in df (CalvingDate, upper_limit)
# it overwrites df3 with the non-NA values from df at matching index labels.
df3.update(df)

# Reset index
# Turns SE_Number and LactationNumber back into ordinary columns.
df3.reset_index(inplace=True)

# Calculate DaysInMilk2 and save
# DaysInMilk2 = whole days between calving and milking date, plus 1
# (so a milking on the calving date gets DaysInMilk2 == 1).
df3["StartDate"] = pd.to_datetime(df3["StartDate"])
df3["CalvingDate"] = pd.to_datetime(df3["CalvingDate"])
df3["DaysInMilk2"] = (df3["StartDate"] - df3["CalvingDate"]).dt.days + 1
# Final column selection and order; any column not listed here is dropped from df3.
new_column_order = ["FarmName_Pseudo", "SE_Number", "AnimalNumber", "Del_Cow_Id", "Breed", "LactationNumber", "CalvingDate", "upper_limit", "DaysInMilk2",
                    "StartDate", "StartTime", "SessionNumber", "TotalYield", 
                    "TotalYieldLF", "TotalYieldRF", "TotalYieldLR", "TotalYieldRR",
                    "AverageFlowLF", "AverageFlowLR", "AverageFlowRF", "AverageFlowRR",
                    "PeakFlowLF", "PeakFlowLR", "PeakFlowRF", "PeakFlowRR",
                    "BloodLF", "BloodLR", "BloodRF", "BloodRR",
                    "ConductivityLF", "ConductivityLR", "ConductivityRF", "ConductivityRR",
                    "Occ",
                    "Temperature", "RelativeHumidity", "WindSpeed", "WindDirection", "Crosswind", "Precipitation",
                    "Snow", "PrecipitationType", "Cloudiness", "Visibility", "AirPressure", "GlobalIrradiance",
                    "THI_adj", "HW", "cum_HW", "Temp15Threshold",
                    "MeanTemperature", "MeanRelativeHumidity", "MeanWindSpeed", "MeanPrecipitation", "MeanGlobalIrradiance",
                    "MeanTHI_adj", "MeanHW", "Meancum_HW", "MeanTemp15Threshold"]
df3 = df3[new_column_order]
# df3.to_csv("../Data/MY4.csv", index=False)
print("New shape of df3:", df3.shape)

In [None]:
# Display df3 (pandas shows a truncated preview) to inspect the reordered columns
df3

In [None]:
# Rows with a non-missing StartDate per herd, after the lactation data transfer
events_by_herd = df3.groupby(["FarmName_Pseudo"])["StartDate"].count()
count_my_rec = events_by_herd.reset_index()
print(f"No. milking events per herd: \n", count_my_rec.to_string(index=False))

First and last DIM with a recorded milk yield

In [None]:
# First and last DIM with a recorded milk yield, per cow-lactation.
# min/max are used instead of GroupBy first/last: first/last return the first and last
# rows in file order, which is only the earliest/latest DIM if the data happen to be
# sorted chronologically within every lactation. The columns keep the names
# 'first' and 'last' because later cells refer to them.
df4 = df3[df3['TotalYield'].notna()]
first_last_df = (
    df4.groupby(['SE_Number', 'LactationNumber'])['DaysInMilk2']
    .agg(['min', 'max'])
    .rename(columns={'min': 'first', 'max': 'last'})
    .reset_index()
)

# Histograms of the first and the last DIM over all lactations
for dim_column, dim_label in (("first", "First"), ("last", "Last")):
    plt.figure()
    plt.hist(first_last_df[dim_column], color='blue')
    plt.title(f"{dim_label} DIM in Lactation")
    plt.xlabel(f'{dim_label} DIM')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

# Attach first/last DIM to every row of the lactation. Any 'first'/'last' columns left
# over from an earlier run of this cell are dropped first, so re-running the cell does
# not produce suffixed duplicates (first_x/first_y) that break the cells below.
df3 = df3.drop(columns=["first", "last"], errors="ignore")
df3 = df3.merge(first_last_df, on=["SE_Number", "LactationNumber"], how="left")
# df3.to_csv("../Data/MY3.csv", index=False)

# Mean and SD of first/last DIM per herd, rounded to whole days.
# NOTE(review): these statistics are computed over all rows of df3, so lactations with
# more rows weigh more; confirm whether a per-lactation summary is intended.
for dim_column, dim_label in (("first", "First"), ("last", "Last")):
    average_production = (df3.groupby(['FarmName_Pseudo'])[dim_column].agg(['mean', 'std']).reset_index())
    average_production['mean'] = average_production['mean'].round(0)
    average_production['std'] = average_production['std'].round(0)
    average_production.rename(columns={'mean': f'Mean{dim_label}DIM', 'std': f'SD{dim_label}DIM'}, inplace=True)
    print(f"Mean and SD {dim_label} DIM: \n", average_production.to_string(index=False))

Require first milking within 1-40 DIM and last milking within 100-400 DIM

In [None]:
# Filter to demand started milking by 1-40 DIM and maintain milking until 100-400 DIM
def filter_first_last(group):
    """Decide whether one cow-lactation group passes the DIM requirements.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain the columns 'first' and 'last'.

    Returns
    -------
    bool
        True when the 'first' value of the group's first row lies in [1, 40]
        and the 'last' value of the group's last row lies in [100, 400].
    """
    first_dim, last_dim = group['first'].iloc[0], group['last'].iloc[-1]
    started_in_time = 1 <= first_dim <= 40
    return started_in_time and (100 <= last_dim <= 400)


# Keep only the cow-lactations for which filter_first_last returns True
lactation_groups = df3.groupby(["SE_Number", "LactationNumber"])
filtered_df = lactation_groups.filter(filter_first_last)

# Remaining data volume: rows, then unique cow-lactations, then unique cows
print(f"No. milking events in SRB, SH, SJB and dairy crosses cows within 1-40 DIM and 100-400 DIM: {filtered_df.shape}")

test = filtered_df.drop_duplicates(subset=["SE_Number", "LactationNumber"])
print(f"No. of lactations from SRB, SH, SJB and dairy crosses within 1-40DIM and 100-400DIM: {test.shape}")

test = filtered_df.drop_duplicates(subset=["SE_Number"])
print(f"No. of cows from SRB, SH, SJB and dairy crosses within 1-40DIM and 100-400DIM: {test.shape}")
# filtered_df.to_csv("../Data/MY3.csv", index=False)

Distribution of milk yield data over various lactations

In [None]:
# Number of rows per lactation number before restricting the range
value_counts = filtered_df['LactationNumber'].value_counts()
print(value_counts)

# Keep only rows with LactationNumber up to and including 8
within_lactation_limit = filtered_df["LactationNumber"].le(8)
df_lact = filtered_df[within_lactation_limit]
print(f"No. milking events in SRB, SH, SJB and dairy crosses cows within 1-40 DIM and 100-400 DIM in lactation 1-8: "
      f"{df_lact.shape}")

# Unique cow-lactations and unique cows that remain
test = df_lact.drop_duplicates(subset=["SE_Number", "LactationNumber"])
print(f"No. of lactations from SRB, SH, SJB and dairy crosses within 1-40DIM and 100-400DIM in lactation 1-8: {test.shape}")
test = df_lact.drop_duplicates(subset=["SE_Number"])
print(f"No. of cows from SRB, SH, SJB and dairy crosses within 1-40DIM and 100-400DIM in lactation 1-8: {test.shape}")

# Parity column: equal to LactationNumber, except that lactations 3-8 are all coded as 3.
# Work on an explicit copy so the new column is not written into a slice of filtered_df.
df_lact = df_lact.copy()
is_third_or_later = df_lact['LactationNumber'].ge(3) & df_lact['LactationNumber'].le(8)
df_lact["Parity"] = df_lact["LactationNumber"]
df_lact.loc[is_third_or_later, 'Parity'] = 3
# df_lact.to_csv("../Data/MY3.csv", index=False)

In [None]:
# Rows with a non-missing StartDate per herd, after the DIM and lactation filters
events_by_herd = df_lact.groupby(["FarmName_Pseudo"])["StartDate"].count()
count_my_rec = events_by_herd.reset_index()
print(f"No. milking events per herd: \n", count_my_rec.to_string(index=False))

Check distribution of total yield column

In [None]:
# Summary statistics of TotalYield in df_lact
yield_column = df_lact['TotalYield']
summary_stats = yield_column.describe()

# Percentiles are computed on the non-missing yields; None marks "nothing to compute"
observed_yields = yield_column.dropna()
percentiles = None if observed_yields.empty else np.percentile(observed_yields, [1, 5, 10, 90, 95, 99])

print("Descriptive Statistics:\n", summary_stats)
if percentiles is None:
    print("\nPercentiles cannot be calculated due to empty or missing data.")
else:
    print("\nPercentiles (1%, 5%, 10%, 90%, 95%, 99%):", percentiles)

# One row per cow-lactation (not used further in this cell)
count_my_rec = df_lact.drop_duplicates(subset=["SE_Number", "LactationNumber"])

# Box plot of yield per herd; rows lacking the herd or the yield are left out
plot_data = df_lact.dropna(subset=['FarmName_Pseudo', 'TotalYield'])
ax = sns.boxplot(x='FarmName_Pseudo', y='TotalYield', data=plot_data)
ax.set_title('Box Plot of Total Yield per Milking Event Grouped by Herd')
ax.set_xlabel('Herd')
ax.set_ylabel('Total Yield per Milking Event')
plt.xticks(rotation=45)
plt.tight_layout()  # avoid clipped tick labels
plt.show()

Total yield 2.5kg - 60kg

In [None]:
# =================================================================================>>> OBS THIS STEP REMOVES MISSING RECORDS FOR TOTAL YIELD, ie dry period
# df_lact = pd.read_csv("../Data/MY3.csv", low_memory=False)

# Accepted TotalYield range. Both limits are inclusive so that the rows removed are
# exactly the rows counted below (previously the counts used > 60 and < 2.5 while the
# filters used < 60 and > 2.5, silently dropping rows equal to a limit).
MIN_TOTAL_YIELD = 2.5
MAX_TOTAL_YIELD = 60

# Comparisons with NaN are False, so rows without TotalYield are removed by the
# filters below; report how many rows that affects.
count_missing_total_yield = int(df_lact['TotalYield'].isna().sum())
print(f"Number of instances with missing TotalYield (removed): {count_missing_total_yield}")

# Count and remove rows above the upper limit
count_invalid_total_yield = df_lact[df_lact['TotalYield'] > MAX_TOTAL_YIELD].shape[0]
print(f"Number of instances with TotalYield greater than {MAX_TOTAL_YIELD}: {count_invalid_total_yield}")
df_lact2 = df_lact[df_lact['TotalYield'] <= MAX_TOTAL_YIELD]

# Count and remove rows below the lower limit
count_invalid_total_yield = df_lact2[df_lact2['TotalYield'] < MIN_TOTAL_YIELD].shape[0]
print(f"Number of instances with TotalYield less than {MIN_TOTAL_YIELD} kg: {count_invalid_total_yield}")
df_lact2 = df_lact2[df_lact2['TotalYield'] >= MIN_TOTAL_YIELD]
# df_lact2.to_csv("../Data/MY4.csv", index=False)

# Remaining data volume at four levels of aggregation
df3a = df_lact2.drop_duplicates(subset=['SE_Number', "LactationNumber", "StartDate", "StartTime", "SessionNumber"])
print(f"No. of milking events in MY file: {df3a.shape}")
df3a = df_lact2.drop_duplicates(subset=['SE_Number', "LactationNumber", "StartDate"])
print(f"No. of milking days in MY file: {df3a.shape}")
df3a = df_lact2.drop_duplicates(subset=['SE_Number', "LactationNumber"])
print(f"No. of lactations in MY file: {df3a.shape}")
df3a = df_lact2.drop_duplicates(subset=['SE_Number'])
print(f"No. of cows in MY file: {df3a.shape}")

In [None]:
# Rows with a non-missing StartDate per herd, after the TotalYield range filter
events_by_herd = df_lact2.groupby(["FarmName_Pseudo"])["StartDate"].count()
count_my_rec = events_by_herd.reset_index()
print(f"No. milking events per herd: \n", count_my_rec.to_string(index=False))

Remove colostrum

In [None]:
# Rows at DaysInMilk2 <= 4 are treated as colostrum milkings and removed
is_colostrum = df_lact2['DaysInMilk2'].le(4)
count_colostrum_days = df_lact2[is_colostrum].shape[0]
print(f"Number of milking events <= 4DIM: {count_colostrum_days}")

# Shape before and after the removal
print(df_lact2.shape)
after_colostrum = df_lact2['DaysInMilk2'].gt(4)
df_lact2 = df_lact2[after_colostrum]
print(df_lact2.shape)

In [None]:
# Rows with a non-missing StartDate per herd, after removing DaysInMilk2 <= 4
events_by_herd = df_lact2.groupby(["FarmName_Pseudo"])["StartDate"].count()
count_my_rec = events_by_herd.reset_index()
print(f"No. milking events per herd: \n", count_my_rec.to_string(index=False))

Basic descriptive statistics

In [None]:
# Summary statistics of TotalYield in the filtered data (df_lact2)
yield_column = df_lact2['TotalYield']
summary_stats = yield_column.describe()

# Percentiles are computed on the non-missing yields; None marks "nothing to compute"
observed_yields = yield_column.dropna()
percentiles = None if observed_yields.empty else np.percentile(observed_yields, [1, 5, 10, 90, 95, 99])

print("Descriptive Statistics:\n", summary_stats)
if percentiles is None:
    print("\nPercentiles cannot be calculated due to empty or missing data.")
else:
    print("\nPercentiles (1%, 5%, 10%, 90%, 95%, 99%):", percentiles)

# Box plot of yield per herd; rows lacking the herd or the yield are left out
plot_data = df_lact2.dropna(subset=['FarmName_Pseudo', 'TotalYield'])
ax = sns.boxplot(x='FarmName_Pseudo', y='TotalYield', data=plot_data)
ax.set_title('Box Plot of Total Yield per Milking Event Grouped by Herd')
ax.set_xlabel('Herd')
ax.set_ylabel('Total Yield per Milking Event')
plt.xticks(rotation=45)
plt.tight_layout()  # avoid clipped tick labels
plt.show()

Save

In [None]:
# Write the filtered dataset to disk; index=False keeps the row index out of the file
filtered_output_path = "../Data/MergedData/MY_weather_filtered.csv"
df_lact2.to_csv(filtered_output_path, index=False)

# Descriptive statistics for Table 1

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reload the filtered dataset from disk so this section can be run on its own
df_lact = pd.read_csv(
    "../Data/MergedData/MY_weather_filtered.csv",
    low_memory=False,
)
print(f"No. milking events in filtered dataset: {df_lact.shape}")

# Rows with a non-missing StartDate per herd in the filtered dataset
events_by_herd = df_lact.groupby(["FarmName_Pseudo"])["StartDate"].count()
count_my_rec = events_by_herd.reset_index()
print(f"No. milking events per herd: \n", count_my_rec.to_string(index=False))

In [None]:
# MILKING RECORDS: rows with a non-missing StartDate per parity, and per parity and breed
count_my_rec = df_lact.groupby(["Parity"])["StartDate"].count().reset_index()
print(f"No. of milking records divided over parities: \n", count_my_rec.to_string(index=False))

count_my_rec = df_lact.groupby(["Parity", "Breed"])["StartDate"].count().reset_index()
print(f"No. of milking records divided over parities and breeds: \n", count_my_rec.to_string(index=False))

# LACTATIONS: one row per (SE_Number, LactationNumber)
for_my_rec5 = df_lact.drop_duplicates(subset=["SE_Number", "LactationNumber"])
print(f"No. of lactations in file: {for_my_rec5.shape}")

# Grouping is over every breed present in the file, so the label says "per breed"
count_my_rec = for_my_rec5.groupby(["Breed"])["SE_Number"].count().reset_index()
print(f"No. of lactations per breed: \n", count_my_rec.to_string(index=False))

# for_my_rec5 has one row per lactation, so count() here gives lactations, not cows
count_my_rec = for_my_rec5.groupby(["Parity", "Breed"])["SE_Number"].count().reset_index()
print(f"No. of lactations within breed and parity: \n", count_my_rec.to_string(index=False))

# Distinct cows per parity and breed: a cow with several lactations that share the
# same Parity value is counted once.
count_my_rec = for_my_rec5.groupby(["Parity", "Breed"])["SE_Number"].nunique().reset_index()
print(f"No. of cows within breed and parity: \n", count_my_rec.to_string(index=False))

# COWS: one row per SE_Number (first occurrence kept)
for_my_rec4 = df_lact.drop_duplicates(subset=["SE_Number"])
print(f"No. of cows in file: {for_my_rec4.shape}")

# for_my_rec4 is already unique on SE_Number, so no second de-duplication is needed;
# the name for_my_rec5 is still rebound to the cow-level frame as before.
for_my_rec5 = for_my_rec4
count_my_rec = for_my_rec5.groupby(["Breed"])["SE_Number"].count().reset_index()
print(f"No. of cows per breed: \n", count_my_rec.to_string(index=False))