# Checking robot milking file

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the raw robot milking export (semicolon-delimited).
my_robot = pd.read_csv("Del_Milk_Robot240805.csv", delimiter=';', low_memory=False)
print(f"No. of milking records in robot milking file: {my_robot.shape}")  # 2,208,343, 56col
# print(my_robot.dtypes)

# Yield and flow columns use a decimal comma in the export; swap it for a dot and cast to float.
columns_to_modify = ["TotalYield", "TotalYieldLF", "TotalYieldRF", "TotalYieldLR", "TotalYieldRR",
                     "AverageFlowLF", "AverageFlowLR", "AverageFlowRF", "AverageFlowRR",
                     "PeakFlowLF", "PeakFlowLR", "PeakFlowRF", "PeakFlowRR"]
my_robot[columns_to_modify] = my_robot[columns_to_modify].replace(',', '.', regex=True).astype(float)

# Split MilkingStartDateTime into StartDate and StartTime.
# Values that cannot be parsed become NaT.
my_robot["MilkingStartDateTime"] = pd.to_datetime(my_robot["MilkingStartDateTime"], errors="coerce")

# Handle SE-a624fb9a-1259 where the time is missing in the date-time variable at 2019-12-03.
# Only that animal's unparsed rows are repaired. Previously every NaT in the file was stamped
# 2019-12-03, which silently moved any other missing/garbled timestamp to that date and made
# the later "Remove missing time stamps" filter a no-op.
known_missing_time = my_robot["MilkingStartDateTime"].isna() & (my_robot["SE_Number"] == "SE-a624fb9a-1259")
my_robot.loc[known_missing_time, "MilkingStartDateTime"] = pd.Timestamp('2019-12-03 00:00:00')

# normalize() truncates to midnight and keeps the datetime64 dtype, so no round trip through
# Python date objects is needed. Rows that are still NaT stay NaT in both derived columns.
my_robot["StartDate"] = my_robot["MilkingStartDateTime"].dt.normalize()
my_robot["StartTime"] = my_robot["MilkingStartDateTime"].dt.time

In [None]:
# Key sets used to define a milking record, a session and a time-stamped session.
session_keys = ["SE_Number", "StartDate", "StartTime", "SessionNumber"]
timestamp_keys = ["SE_Number", "StartDate", "StartTime"]

# Drop exact repeats of a milking record (same session and same yield); this is the only
# step in this cell that changes the content of my_robot.
my_robot = my_robot.sort_values(by=session_keys)
my_robot = my_robot.drop_duplicates(subset=session_keys + ["TotalYield"])
print(f"No. of milking records in robot milking file: {my_robot.shape}")
my_robotx = my_robot.loc[my_robot["TotalYield"].notna()]
print(f"No. of milking records in robot milking file: {my_robotx.shape}")

# Session-level counts: first unique sessions, then rows carrying a SessionNumber.
my_robot = my_robot.sort_values(by=session_keys)
my_robotx = my_robot.drop_duplicates(subset=session_keys)
print(f"No. of sessions in robot milking file: {my_robotx.shape}")
my_robotx = my_robot.loc[my_robot["SessionNumber"].notna()]
print(f"No. of sessions in robot milking file: {my_robotx.shape}")

# Time-stamp-level counts: first unique cow/date/time, then rows carrying a StartTime.
my_robot = my_robot.sort_values(by=timestamp_keys)
my_robotx = my_robot.drop_duplicates(subset=timestamp_keys)
print(f"No. of unique sessions with time stamps in robot milking file: {my_robotx.shape}")
my_robotx = my_robot.loc[my_robot["StartTime"].notna()]
print(f"No. of unique sessions with time stamps in robot milking file: {my_robotx.shape}")

# Which herds have robot?
col_keep = ["FarmName_Pseudo"]
my_robot_unique = my_robot.drop_duplicates(subset=["FarmName_Pseudo"])[col_keep]
print(my_robot_unique.to_string(index=False))

In [None]:
# Keep only milkings that have a recorded total yield.
my_robot = my_robot.loc[my_robot["TotalYield"].notna()]
print(f"No. of milking records with recorded yield in robot milking file: {my_robot.shape}")  # 2,055,513
# Keep only milkings that have a start time.
my_robot = my_robot.loc[my_robot["StartTime"].notna()]
print(f"No. of unique sessions with time stamps in robot milking file: {my_robot.shape}")  # 2,055,513

# Columns carried forward, grouped by theme (the order is the column order of my_robot.csv).
col_keep = (
    ["FarmName_Pseudo", "SE_Number", "StartDate", "StartTime", "SessionNumber", "EquipmentName"]
    + ["TotalYieldLF", "TotalYieldRF", "TotalYieldLR", "TotalYieldRR", "TotalYield"]
    + ["KickOffLF", "KickOffLR", "KickOffRF", "KickOffRR"]
    + ["IncompleteLF", "IncompleteLR", "IncompleteRF", "IncompleteRR"]
    + ["NotMilkedTeatLF", "NotMilkedTeatLR", "NotMilkedTeatRF", "NotMilkedTeatRR"]
    + ["AverageFlowLF", "AverageFlowLR", "AverageFlowRF", "AverageFlowRR"]
    + ["PeakFlowLF", "PeakFlowLR", "PeakFlowRF", "PeakFlowRR"]
    + ["Occ", "MilkingUnitNumber", "MilkingUnitName", "Action", "Various_User"]
    + ["Various_MilkDestination", "Various_ReasonForMilkSeperation"]
)
my_robot = my_robot[col_keep]
my_robot = my_robot.sort_values(by=["SE_Number", "StartDate", "StartTime", "SessionNumber"])

# Persist the trimmed, sorted milking records.
my_robot.to_csv("my_robot.csv", index=False)

In [None]:
# No. robots in herds: list the distinct EquipmentName values, then the distinct
# MilkingUnitName values, seen in each herd.
for unit_col in ("EquipmentName", "MilkingUnitName"):
    unique_values = (
        my_robot.groupby("FarmName_Pseudo")[unit_col]
        .apply(lambda names: list(names.unique()))
        .reset_index()
    )
    print(unique_values.to_string(index=False))

# No. MY records per milking unit (non-null StartDate values per herd and unit).
count_my_rec = (
    my_robot.groupby(["FarmName_Pseudo", "MilkingUnitName"])["StartDate"]
    .count()
    .reset_index()
)
print(count_my_rec.to_string(index=False))

# No. cows: one row per SE_Number.
my_robotx = my_robot.sort_values(by=["SE_Number"]).drop_duplicates(subset=["SE_Number"])
print(f"No. cows in robot milking file: {my_robotx.shape}")

In [None]:
# Add calving date, lactation number and dry off dates to MY records.
# Step 1: calving file. Each calving gets the cow's next calving date as "upper_limit";
# the last calving per cow has no next one and is left missing here.
col_keep = ["SE_Number", "CalvingDate"]
calving = pd.read_csv("Del_Calving240531.csv", sep=";", low_memory=False)[col_keep]
calving = calving.sort_values(by=col_keep).drop_duplicates(subset=col_keep)
# NOTE(review): CalvingDate is still the raw CSV value when sorted — presumably ISO-formatted
# strings, so the order is chronological; confirm.
calving["upper_limit"] = calving.groupby(["SE_Number"])["CalvingDate"].shift(-1)
calving.to_csv("calving.csv", index=False)

# Step 2: culling data, one row per distinct (cow, cull decision date).
col_keep = ["SE_Number", "CullDecisionDate"]
culling = pd.read_csv("Del_Cow240531.csv", sep=";", low_memory=False)[col_keep]
culling = culling.sort_values(by=col_keep).drop_duplicates(subset=col_keep)
culling.to_csv("culling.csv", index=False)

# Attach the cull decision to every calving. Where no next calving exists,
# the cull decision date (if any) becomes the upper limit.
for_my_rec = pd.merge(calving, culling, on=["SE_Number"], how="left")
no_next_calving = for_my_rec["upper_limit"].isna()
for_my_rec.loc[no_next_calving, "upper_limit"] = for_my_rec["CullDecisionDate"]
for_my_rec.to_csv("for_my_rec0.csv", index=False)

# Step 3: dry-off data, one row per distinct (cow, dry-off date).
col_keep = ["SE_Number", "DryOffDate"]
dry_off = pd.read_csv("Del_DryOff240531.csv", sep=";", low_memory=False)[col_keep]
dry_off = dry_off.sort_values(by=col_keep).drop_duplicates(subset=col_keep)
dry_off.to_csv("dry_off.csv", index=False)

# Pair every calving of a cow with every dry-off date of that cow; the matching
# pairs are picked out afterwards by comparing the dates.
for_my_rec = pd.merge(for_my_rec, dry_off, on=["SE_Number"], how="left")
for date_col in ("CalvingDate", "DryOffDate", "upper_limit"):
    for_my_rec[date_col] = pd.to_datetime(for_my_rec[date_col])


def data(row):
    """Return 1 if the row's DryOffDate lies in (CalvingDate, upper_limit], else 0.

    `row` is anything indexable by the three column names (e.g. a DataFrame row).
    A comparison that evaluates false — including against a missing value —
    yields 0.
    """
    dried_off_in_lactation = row["CalvingDate"] < row["DryOffDate"] <= row["upper_limit"]
    return 1 if dried_off_in_lactation else 0


# Flag calving x dry-off pairs where the dry-off date lies in (CalvingDate, upper_limit].
# Vectorised form of the row-wise check: comparisons against NaT are False, so pairs with
# a missing dry-off date or a missing upper limit get 0, as they did with apply().
dry_in_lactation = (
    (for_my_rec["CalvingDate"] < for_my_rec["DryOffDate"])
    & (for_my_rec["DryOffDate"] <= for_my_rec["upper_limit"])
)
for_my_rec["dryoff"] = dry_in_lactation.astype("int64")

# Keep all records with dry off dates fitted within lactation
df_ones = for_my_rec[for_my_rec['dryoff'] == 1]
# Keep last record where open and lacking dry off date
df_last_zero = for_my_rec[for_my_rec['dryoff'] == 0].groupby("SE_Number").tail(1)
# The last unmatched row of a cow can belong to a lactation that already has a matched
# dry-off row (e.g. a cow with several dry-off dates, culled after her last dry-off).
# Keeping it would list that lactation twice and duplicate every milking merged onto it
# later, so only lactations with no matched dry-off are taken from the unmatched rows.
lactation_keys = ["SE_Number", "CalvingDate"]
df_last_zero = df_last_zero.merge(
    df_ones[lactation_keys].drop_duplicates(), on=lactation_keys, how="left", indicator=True
)
df_last_zero = df_last_zero[df_last_zero["_merge"] == "left_only"].drop(columns="_merge")
# Concatenate dataframe and sort to maintain original order
df_combined = pd.concat([df_ones, df_last_zero])
df_combined = df_combined.sort_values(by=["SE_Number", "CalvingDate"]).reset_index(drop=True)
df_combined.to_csv("for_my_rec1.csv", index=False)

# Get today's date for current lactation missing upper_limit
# (midnight of the day the notebook is run, so results depend on the run date).
df_combined['TodayDate'] = pd.to_datetime('today').normalize()
# Lactations with neither a following calving nor a cull decision are treated as open until today.
df_combined.loc[df_combined["upper_limit"].isna() & df_combined["CullDecisionDate"].isna(), "upper_limit"] = (
    df_combined)["TodayDate"]

# Any upper_limit still missing, where the dry-off date precedes the calving date, takes the dry-off date.
# NOTE(review): upper_limit was already back-filled from CullDecisionDate right after the culling
# merge, so after the fill above this mask looks like it can never match — confirm whether this
# step is still needed.
df_combined.loc[df_combined["upper_limit"].isna()
                & (df_combined["DryOffDate"] < df_combined["CalvingDate"]), "upper_limit"] = df_combined["DryOffDate"]
# Rows whose dry-off date did not fall inside the lactation lose that date (stored as missing).
df_combined.loc[df_combined["dryoff"] == 0, "DryOffDate"] = np.nan
# Intermediate dump; the same file name is written again further down in this cell.
df_combined.to_csv("for_my_rec2.csv", index=False)

# Step 4: lactation numbers. After sorting, the first LactationInfoDate row is kept
# for each (cow, lactation number).
col_keep = ["SE_Number", "LactationInfoDate", "LactationNumber"]
lact = pd.read_csv("Del_Lactation240531.csv", sep=";", low_memory=False)[col_keep]
lact = lact.sort_values(by=col_keep)
lact = lact.drop_duplicates(subset=["SE_Number", "LactationNumber"])
lact.to_csv("lact.csv", index=False)

# Pair every lactation record with every calving window of the same cow.
for_my_rec2 = pd.merge(lact, df_combined, on=["SE_Number"], how="left")
# Rows whose LactationInfoDate is the bare string "2022-05" are dropped before parsing.
for_my_rec2 = for_my_rec2.loc[for_my_rec2["LactationInfoDate"].ne("2022-05")]
for_my_rec2["LactationInfoDate"] = pd.to_datetime(for_my_rec2["LactationInfoDate"])


def data1(row):
    """Return 1 if LactationInfoDate lies in [CalvingDate, upper_limit], else 0.

    Both ends of the window are inclusive; a comparison that evaluates false
    (including against a missing value) yields 0.
    """
    if not (row["CalvingDate"] <= row["LactationInfoDate"] <= row["upper_limit"]):
        return 0
    return 1


# Keep the lactation record only for the calving window that contains its LactationInfoDate
# ([CalvingDate, upper_limit], both ends inclusive). Vectorised form of the row-wise check:
# comparisons against NaT are False, so rows with a missing bound are flagged 0 as before,
# and an empty frame no longer goes through DataFrame.apply.
info_in_window = (
    (for_my_rec2["CalvingDate"] <= for_my_rec2["LactationInfoDate"])
    & (for_my_rec2["LactationInfoDate"] <= for_my_rec2["upper_limit"])
)
for_my_rec2["lact"] = info_in_window.astype("int64")
for_my_rec2 = for_my_rec2[for_my_rec2['lact'] == 1]
print(f"No. of lactations in dataframe: {for_my_rec2.shape}")  # 9907 lact

# Step 5: breed information, one row per distinct (cow, breed name).
col_keep = ["SE_Number", "BreedName"]
breed = pd.read_csv("Del_Cow240531.csv", sep=";", low_memory=False)[col_keep]
breed = breed.sort_values(by=col_keep).drop_duplicates(subset=col_keep)
breed.to_csv("breed.csv", index=False)

# Attach breed to each lactation and trim to the columns used from here on.
# NOTE(review): a cow listed under more than one BreedName would get one row per breed — confirm
# that Del_Cow holds a single breed per SE_Number.
col_keep = ["SE_Number", "BreedName", "LactationNumber", "CalvingDate", "upper_limit", "CullDecisionDate", "DryOffDate"]
for_my_rec2 = pd.merge(for_my_rec2, breed, on=["SE_Number"], how="left")[col_keep]
for_my_rec2.to_csv("for_my_rec2.csv", index=False)

# Check breed information: number of cows per breed.
cow_breed_pairs = for_my_rec2.drop_duplicates(subset=["SE_Number", "BreedName"])
count_my_rec = cow_breed_pairs.groupby(["BreedName"])["SE_Number"].count().reset_index()
print("No. of cows of respective breed: \n", count_my_rec.to_string(index=False))

# Merge with robot milking file: every lactation of a cow is paired with every milking
# of that cow; lactations without any milking are kept with empty robot columns.
for_my_rec3 = pd.merge(for_my_rec2, my_robot, on=["SE_Number"], how="left")


def data2(row):
    """Return 1 if the milking StartDate lies in [CalvingDate, upper_limit], else 0.

    Both ends of the window are inclusive; a comparison that evaluates false
    (including against a missing value) yields 0.
    """
    return 1 if row["CalvingDate"] <= row["StartDate"] <= row["upper_limit"] else 0


# Keep each milking only under the lactation whose window [CalvingDate, upper_limit]
# (both ends inclusive) contains its StartDate. Vectorised form of the row-wise check, which
# matters here because the merge holds one row per lactation x milking. Comparisons against
# NaT are False, so lactations without milkings and rows with a missing bound are flagged 0
# as before.
milked_in_lactation = (
    (for_my_rec3["CalvingDate"] <= for_my_rec3["StartDate"])
    & (for_my_rec3["StartDate"] <= for_my_rec3["upper_limit"])
)
for_my_rec3["lact"] = milked_in_lactation.astype("int64")
for_my_rec3 = for_my_rec3[for_my_rec3['lact'] == 1]

# Export the SRB and SLB milkings. The same frame is written twice: "for_my_rec2.csv" is
# read back by the yield cells further down, "Robot_MY.csv" is a second copy.
for_my_rec_x = for_my_rec3[for_my_rec3["BreedName"].isin(["01 SRB", "02 SLB"])]
for_my_rec_x.to_csv("for_my_rec2.csv", index=False)
for_my_rec_x.to_csv("Robot_MY.csv", index=False)

# Descriptive statistics

In [None]:
# Overall: milkings with a recorded yield, then split by breed for SRB and SLB.
for_my_rec4 = for_my_rec3.loc[for_my_rec3["TotalYield"].notna()]
print(f"No. of milking records with recorded yield in robot milking file: {for_my_rec4.shape}")  # 1,777,014
for_my_rec5 = for_my_rec4.loc[for_my_rec4["BreedName"].isin(["01 SRB", "02 SLB"])]
count_my_rec = for_my_rec5.groupby(["BreedName"])["StartDate"].count().reset_index()
print("No. of milking records from SRB and SLB: \n", count_my_rec.to_string(index=False))

In [None]:
# One row per (cow, lactation number), taken from the yield-filtered frame for_my_rec4.
lactation_key = ["SE_Number", "LactationNumber"]
for_my_rec5 = for_my_rec4.drop_duplicates(subset=lactation_key)
print(f"No. of lactations in robot milking file: {for_my_rec5.shape}")  # 2,365

# Lactations per breed, SRB and SLB only.
for_my_rec6 = for_my_rec5.loc[for_my_rec5["BreedName"].isin(["01 SRB", "02 SLB"])]
count_my_rec = for_my_rec6.groupby(["BreedName"])["LactationNumber"].count().reset_index()
print("No. of lactations from SRB and SLB: \n", count_my_rec.to_string(index=False))

In [None]:
# One row per cow.
for_my_rec4 = for_my_rec3.drop_duplicates(subset="SE_Number")
print(f"No. of cows in robot milking file: {for_my_rec4.shape}")  # 1,378

# Cows per breed, SRB and SLB only (for_my_rec4 is already unique per cow, so the second
# de-duplication leaves the rows unchanged).
for_my_rec5 = for_my_rec4.drop_duplicates(subset="SE_Number")
for_my_rec6 = for_my_rec5.loc[for_my_rec5["BreedName"].isin(["01 SRB", "02 SLB"])]
count_my_rec = for_my_rec6.groupby(["BreedName"])["SE_Number"].count().reset_index()
print("No. of cows from SRB and SLB: \n", count_my_rec.to_string(index=False))

In [None]:
# SRB and SLB cows per herd: one row per (herd, cow), then count cows within each herd.
herd_cow_key = ["FarmName_Pseudo", "SE_Number"]
for_my_rec5 = for_my_rec3.drop_duplicates(subset=herd_cow_key)
for_my_rec6 = for_my_rec5.loc[for_my_rec5["BreedName"].isin(["01 SRB", "02 SLB"])]
count_my_rec = for_my_rec6.groupby(["FarmName_Pseudo"])["SE_Number"].count().reset_index()
print("No. of SRB and SLB cows in different herds with robot milking: \n", count_my_rec.to_string(index=False))

In [None]:
# SRB and SLB cows per herd and breed: one row per (herd, cow), counted within herd x breed.
herd_cow_key = ["FarmName_Pseudo", "SE_Number"]
for_my_rec5 = for_my_rec3.drop_duplicates(subset=herd_cow_key)
for_my_rec6 = for_my_rec5.loc[for_my_rec5["BreedName"].isin(["01 SRB", "02 SLB"])]
count_my_rec = (
    for_my_rec6.groupby(["FarmName_Pseudo", "BreedName"])["SE_Number"]
    .count()
    .reset_index()
)
print("No. of SRB and SLB cows in different herds with robot milking: \n", count_my_rec.to_string(index=False))

In [None]:
# Calendar year of each milking; the column is added to for_my_rec3 and reused by the next cell.
for_my_rec3["MilkingYear"] = for_my_rec3["StartDate"].dt.year

# SRB and SLB cows per herd and year: one row per (herd, cow, year).
herd_cow_year_key = ["FarmName_Pseudo", "SE_Number", "MilkingYear"]
for_my_rec5 = for_my_rec3.drop_duplicates(subset=herd_cow_year_key)
for_my_rec6 = for_my_rec5.loc[for_my_rec5["BreedName"].isin(["01 SRB", "02 SLB"])]
count_my_rec = (
    for_my_rec6.groupby(["FarmName_Pseudo", "MilkingYear"])["SE_Number"]
    .count()
    .reset_index()
)
print("No. of yearly cows (SRB and SLB) in different herds with robot milking: \n", count_my_rec.to_string(index=False))

In [None]:
# SRB and SLB cows per herd, breed and year: one row per (herd, cow, year).
# Relies on the MilkingYear column added to for_my_rec3 in the previous cell.
herd_cow_year_key = ["FarmName_Pseudo", "SE_Number", "MilkingYear"]
for_my_rec5 = for_my_rec3.drop_duplicates(subset=herd_cow_year_key)
for_my_rec6 = for_my_rec5.loc[for_my_rec5["BreedName"].isin(["01 SRB", "02 SLB"])]
count_my_rec = (
    for_my_rec6.groupby(["FarmName_Pseudo", "BreedName", "MilkingYear"])["SE_Number"]
    .count()
    .reset_index()
)
print("No. of yearly SRB and SLB cows in different herds with robot milking: \n", count_my_rec.to_string(index=False))

In [None]:
# By lactation: milking records per lactation number, then per lactation number and breed.
for group_cols, heading in (
    (["LactationNumber"], "No. of milking records divided over lactation numbers: \n"),
    (["LactationNumber", "BreedName"], "No. of milking records divided over lactation numbers and breeds: \n"),
):
    count_my_rec = for_my_rec3.groupby(group_cols)["StartDate"].count().reset_index()
    print(heading, count_my_rec.to_string(index=False))

# Cows per lactation number: one row per (cow, lactation number), then group sizes.
lactation_key = ["SE_Number", "LactationNumber"]
for_my_rec4 = for_my_rec3.drop_duplicates(subset=lactation_key)
lactation_counts = for_my_rec4.groupby("LactationNumber").size().rename("NumberOfCows").reset_index()
print("No. of cows divided over lactation numbers: \n", lactation_counts.to_string(index=False))

# Lactations per lactation number and breed, SRB and SLB only (for_my_rec4 is already unique
# per lactation, so the second de-duplication leaves the rows unchanged).
for_my_rec5 = for_my_rec4.drop_duplicates(subset=lactation_key)
for_my_rec6 = for_my_rec5.loc[for_my_rec5["BreedName"].isin(["01 SRB", "02 SLB"])]
count_my_rec = for_my_rec6.groupby(["LactationNumber", "BreedName"])["SE_Number"].count().reset_index()
print("No. of lactations from SRB and SLB cows: \n", count_my_rec.to_string(index=False))

In [None]:
# Average milk production per herd.
# Reload the milking records written to for_my_rec2.csv and keep the columns needed here.
col_keep = ["FarmName_Pseudo", "SE_Number", "BreedName", "LactationNumber", "CalvingDate", "StartDate", "StartTime",
            "TotalYield"]
df = pd.read_csv("for_my_rec2.csv", low_memory=False)[col_keep]

# Dates come back from CSV as text; parse them, then derive days in milk (calving day = 1)
# and the calendar year of the milking.
df = df.assign(
    StartDate=lambda frame: pd.to_datetime(frame["StartDate"]),
    CalvingDate=lambda frame: pd.to_datetime(frame["CalvingDate"]),
)
df["DaysInMilk"] = (df["StartDate"] - df["CalvingDate"]).dt.days + 1
df["MilkingYear"] = df["StartDate"].dt.year

# Calculate the total yield for each lactation (sum of TotalYield over all its milkings).
lactation_yield = (
    df.groupby(["SE_Number", "LactationNumber"])["TotalYield"]
    .sum()
    .reset_index()
    .rename(columns={"TotalYield": "TotalLactationYield"})
)
lactation_yield.to_csv("lact2.csv", index=False)

# Put the lactation total on every milking row of that lactation.
df2 = pd.merge(df, lactation_yield, on=["SE_Number", "LactationNumber"], how="left")
df2.to_csv("for_my_rec3.csv", index=False)

In [None]:
"""
# Plot Mean Milk Production By Herd
df3 = df2.drop_duplicates(subset=["SE_Number", "LactationNumber"])
df3 = df3[(df3["BreedName"] == "01 SRB") | (df3["BreedName"] == "02 SLB")]

herd_ids = df3["FarmName_Pseudo"].unique()

for herd_id in herd_ids:
    plt.figure()
    subset = df3[df3["FarmName_Pseudo"] == herd_id]
    plt.hist(subset["TotalLactationYield"], bins=5, alpha=0.7, color='blue')
    plt.title(f'Herd {herd_id} - Total Milk Production in Lactation from SRB and SH Cows')
    plt.xlabel('Milk Production')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
"""

In [None]:
# Mean and SD milk production in herds: one row per lactation, SRB and SLB only.
lactation_key = ["SE_Number", "LactationNumber"]
summary_names = {'mean': 'MeanMilkProduction', 'std': 'SDMilkProduction'}

df3 = df2.drop_duplicates(subset=lactation_key)
df3 = df3.loc[df3["BreedName"].isin(["01 SRB", "02 SLB"])]
df3.to_csv("for_my_rec4.csv", index=False)
# Rounding happens while the herd is still the index, so only mean and std are rounded.
average_production = (
    df3.groupby(['FarmName_Pseudo'])['TotalLactationYield']
    .agg(['mean', 'std'])
    .round(0)
    .reset_index()
    .rename(columns=summary_names)
)
print("Mean milk production in herds: \n", average_production.to_string(index=False))

# Mean and SD milk production in herds by year. MilkingYear is the year of the row kept
# for each lactation by the de-duplication.
df3 = df2.drop_duplicates(subset=lactation_key)
df3 = df3.loc[df3["BreedName"].isin(["01 SRB", "02 SLB"])]
average_production = (
    df3.groupby(['FarmName_Pseudo', 'MilkingYear'])['TotalLactationYield']
    .agg(['mean', 'std'])
    .round(0)
    .reset_index()
    .rename(columns=summary_names)
)
print("Mean milk production in herds by production year: \n", average_production.to_string(index=False))

In [None]:
# Filter non-complete lactation: find the first and last DaysInMilk value of every lactation.
df2 = pd.read_csv("for_my_rec3.csv", low_memory=False)
lactation_groups = df2.groupby(["SE_Number", "LactationNumber"])

# first()/last() return the first/last non-missing value in file order.
# NOTE(review): this equals the smallest/largest DaysInMilk only if the rows of each lactation
# are in chronological order in for_my_rec3.csv — confirm.
first_obs = lactation_groups["DaysInMilk"].first().rename("FirstDayInMilk").reset_index()
last_obs = lactation_groups["DaysInMilk"].last().rename("LastDayInMilk").reset_index()
col_keep = ["SE_Number", "LactationNumber", "LastDayInMilk"]

obs = pd.merge(first_obs, last_obs, on=["SE_Number", "LactationNumber"], how="left")
# This replaces the lactation-yield table previously saved under the same file name.
obs.to_csv("lact2.csv", index=False)

In [None]:
"""
# Plot variation in first day of milking in lactation
plt.hist(obs["FirstDayInMilk"], bins=5, edgecolor='black')
plt.title("Histogram of First Day In Milk")
plt.xlabel("First Day In Milk")
plt.ylabel("Frequency")
plt.show()

# Plot variation in flast day of milking in lactation
plt.hist(obs["LastDayInMilk"], bins=5, edgecolor='black')
plt.title("Histogram of Last Day In Milk")
plt.xlabel("Last Day In Milk")
plt.ylabel("Frequency")
plt.show()
"""

In [None]:
# Put first/last day in milk on every milking row, then summarise the last day per herd.
# The statistics are taken over milking rows, not over one row per lactation.
df3 = pd.merge(df2, obs, on=["SE_Number", "LactationNumber"], how="left")
last_dim_by_herd = df3.groupby(["FarmName_Pseudo"])["LastDayInMilk"]
df4 = last_dim_by_herd.agg(['mean', 'std']).reset_index()
print("Mean and SD of last day in milk: \n", df4.to_string(index=False))

MINOR FILTERING AND DESCRIPTIVE STATISTICS

In [None]:
# Filter to include only the first 365 days in milk, and lactation numbers up to 7.
print(df3.shape)  # 1,540,006

df_365 = df3.loc[df3["DaysInMilk"].le(365)]
print(f"No. milking events in SRB and SH cows within 365 DIM: {df_365.shape}")  # 1,317,044
df_lact = df_365.loc[df_365["LactationNumber"].le(7)]
print(f"No. milking events in SRB and SH cows within 365 DIM in lactation 1-7: {df_lact.shape}")  # 1,307,751

# Make Parity 1-3: lactation numbers 3 to 7 are pooled into parity 3. Everything above 7
# was removed above, so capping at 3 gives the same result as recoding the 3-7 range.
df_lact = df_lact.copy()
df_lact["Parity"] = df_lact["LactationNumber"].clip(upper=3)

print(df_lact.shape)
df_lact = df_lact.sort_values(by=['SE_Number', 'LactationNumber', 'StartDate', 'StartTime'])
df_lact.to_csv("for_my_rec4.csv", index=False)

In [None]:
# DESCRIPTIVE STATISTICS FILTERED DATA
# Milking records per parity, then per parity and breed.
for group_cols, heading in (
    (["Parity"], "No. of milking records divided over parities: \n"),
    (["Parity", "BreedName"], "No. of milking records divided over parities and breeds: \n"),
):
    count_my_rec = df_lact.groupby(group_cols)["StartDate"].count().reset_index()
    print(heading, count_my_rec.to_string(index=False))

In [None]:
# By parity: one row per (cow, lactation number) in the filtered data.
lactation_key = ["SE_Number", "LactationNumber"]
for_my_rec5 = df_lact.drop_duplicates(subset=lactation_key)
print(f"No. of parities in robot milking file: {for_my_rec5.shape}")  # 1984

# Lactations per parity class and breed.
parity_breed_groups = for_my_rec5.groupby(["Parity", "BreedName"])
count_my_rec = parity_breed_groups["SE_Number"].count().reset_index()
print("No. of parities from SRB and SLB: \n", count_my_rec.to_string(index=False))

In [None]:
# By cows: reload the filtered milkings and keep one row per cow.
df_lact = pd.read_csv("for_my_rec4.csv", low_memory=False)
for_my_rec4 = df_lact.drop_duplicates(subset="SE_Number")
print(f"No. of cows in robot milking file: {for_my_rec4.shape}")  # 1,164

# Cows per breed (for_my_rec4 is already unique per cow, so the second de-duplication
# leaves the rows unchanged).
for_my_rec5 = for_my_rec4.drop_duplicates(subset="SE_Number")
count_my_rec = for_my_rec5.groupby(["BreedName"])["SE_Number"].count().reset_index()
print("No. of cows from SRB and SLB: \n", count_my_rec.to_string(index=False))

In [None]:
# Average milk production in filtered data.
# Milking events per herd: one row per (cow, date, time), counted within each herd.
df_lact = pd.read_csv("for_my_rec4.csv", low_memory=False)
event_key = ["SE_Number", "StartDate", "StartTime"]
for_my_rec5 = df_lact.drop_duplicates(subset=event_key)
count_my_rec = for_my_rec5.groupby(["FarmName_Pseudo"])["StartTime"].count().reset_index()
print("No. of milking events from SRB and SH in different herds: \n", count_my_rec.to_string(index=False))

In [None]:
# Mean and SD milk production in herds.
# Calculate the 365DIM total yield for each lactation from the filtered milkings.
df_lact = pd.read_csv("for_my_rec4.csv", low_memory=False)
print(df_lact.shape)

lactation_key = ["SE_Number", "LactationNumber"]
df_lact = df_lact.sort_values(by=['SE_Number', 'LactationNumber', 'StartDate', 'StartTime'])
lactation_yield = (
    df_lact.groupby(lactation_key)["TotalYield"]
    .sum()
    .reset_index()
    .rename(columns={"TotalYield": "TotalLactationYield_365DIM"})
)
# lactation_yield.to_csv("lact2.csv", index=False)

# Put the 365 DIM total on every milking row and write it back to the same file.
df_lact2 = pd.merge(df_lact, lactation_yield, on=lactation_key, how="left")
df_lact2.to_csv("for_my_rec4.csv", index=False)

# Herd-level mean and SD over one row per lactation. Rounding happens while the herd is
# still the index, so only mean and std are rounded.
df3 = df_lact2.drop_duplicates(subset=lactation_key)
average_production = (
    df3.groupby(['FarmName_Pseudo'])['TotalLactationYield_365DIM']
    .agg(['mean', 'std'])
    .round(0)
    .reset_index()
    .rename(columns={'mean': 'Mean_MY_365DIM', 'std': 'SD_MY_365DIM'})
)
print("Mean milk production in herds: \n", average_production.to_string(index=False))

In [None]:
# Cows per herd in the filtered data: one row per (herd, cow), counted within each herd.
herd_cow_key = ["FarmName_Pseudo", "SE_Number"]
for_my_rec5 = df3.drop_duplicates(subset=herd_cow_key)
cows_by_herd = for_my_rec5.groupby(["FarmName_Pseudo"])["SE_Number"]
count_my_rec = cows_by_herd.count().reset_index()
print("No. of cows in different herds with robot milking: \n", count_my_rec.to_string(index=False))

In [None]:
# Plot Mean Milk Production By Herd: one histogram of the 365 DIM lactation totals per herd.
df3 = df_lact2.drop_duplicates(subset=["SE_Number", "LactationNumber"])
herd_ids = df3["FarmName_Pseudo"].unique()

for herd_id in herd_ids:
    # A fresh figure per herd, drawn through the axes object.
    fig, ax = plt.subplots()
    subset = df3.loc[df3["FarmName_Pseudo"].eq(herd_id)]
    ax.hist(subset["TotalLactationYield_365DIM"], bins=5, alpha=0.7, color='blue')
    ax.set_title(f'Herd {herd_id} - Total MY in 365 DIM Lactations from SRB and SH Cows')
    ax.set_xlabel('Milk Production 365 DIM')
    ax.set_ylabel('Frequency')
    ax.grid(True)
    plt.show()