# Filtering dataframe for fertility analysis

Edits are performed on the dataset "fertilityDF_W.csv", i.e. after "BuildingDataframe.ipynb" and "FertilityTraits.ipynb" have been run. The code generates a clean dataset for further analysis in "FertilityAnalysis.ipynb".

The following filtering steps are according to NAV, apart from choice of breeds.
- Add 305d MY from raw data and check missing MY records (a756bc39, 6d38bc90)
- Keep only data from SH and NRDC
- Keep only lactation 1-8 
- Make parity 1, 2, >=3 
- Records within 150 days from data extraction are excluded from the data set 
- Only the first 10 inseminations are accepted for CR 
- Age at first calving: 550d - 1100d 
- CI maximum 2 years for cows
- CFS 20 - 230d
- FLS max 365d

In addition, thresholds are put on the classical fertility traits according to NAV or µ ± 2 SD of the respective trait (values outside the range are set to missing)
- CFI: 20-230
- CLI: 20-217
- FLI: 0-365
- CI: 301-730
- GL: 260-302

Allow for minimum 5 records in HYS groups (of insemination date) to help convergence in analysis

Add threshold for 305d MY (7,395 - 16,255kg)

Basic descriptive analysis at end of script

This script generates a clean dataset containing the following columns:
- SE_Number
- Breed 
    * NRDC: SRB, DR, SAB
    * SH
- LactationNumber
- Parity (1,2,>=3)
- InseminationDate
- HYS of insemination
- HeatStress (whether the cow experienced HS 7d prior to, during and 7d after insemination)
- Milk_Kg (305d)
- CFI (Interval between calving and first insemination)
- CLI (Interval between calving and last insemination)
- FLI (Interval between first and last insemination)
- NINS (Total number of inseminations during service period)
- CR0 (Conception rate)
- CI (Calving interval)
- GL (Gestation length)


In [None]:
from datetime import timedelta
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob

from fpdf import FPDF

import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import product

In [None]:
# LOAD DATA
# Read the fertility dataset produced by the upstream notebooks; the path is
# relative to the notebook's working directory. low_memory=False reads the file
# in one pass so each column's dtype is inferred from the whole column.
df3 = pd.read_csv("../Data/CowData/fertilityDF_W.csv", low_memory=False)

In [None]:
# Crude data distribution
# Size of the raw table at row, pregnancy-check, insemination, lactation and cow level.
print(f"No. observations in fertilityDF_W.csv: {df3.shape}")
test = df3.drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDate", "PregnancyCheckDate"])
print(f"Double check no. pregnancy checks in fertilityDF_W.csv: {test.shape}")
# NOTE(review): this difference is the number of rows that repeat an earlier row
# on the four keys above; confirm that it really equals "ins without pregnancy checks".
print(f'No. ins without pregnancy checks: {df3.shape[0] - test.shape[0]}')

test = df3.drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDate"])
print(f"No. inseminations in fertilityDF_W.csv: {test.shape}")
test = df3.drop_duplicates(subset=["SE_Number", "LactationNumber"])
print(f"No. lactations in fertilityDF_W.csv: {test.shape}")
test = df3.drop_duplicates(subset=["SE_Number"])
print(f"No. cows in fertilityDF_W.csv: {test.shape}")

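Check that herds have milk yield (MY_W) data. No herds are removed: the two herds with missing MY records in DelPro (a756bc39, 6d38bc90) are checked against the 305d MY records from the cow database.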

In [None]:
# Count number of inseminations in herds
# .count() counts non-missing InseminationDate values per herd, i.e. rows of df3,
# not unique inseminations.
ins_count = df3.groupby("FarmName_Pseudo")["InseminationDate"].count()
print(ins_count)

In [None]:
# How many rows of the two herds with MY-recording issues carry a HeatStress value?
herds_to_keep = ['a756bc39', '6d38bc90']
in_selected_herds = df3["FarmName_Pseudo"].isin(herds_to_keep)
df4 = df3.loc[in_selected_herds]

# One missing-value mask serves both counts
heat_stress_missing = df4['HeatStress'].isna()

non_nan_count = (~heat_stress_missing).sum()
print(f"Number of non-NaN values: {non_nan_count}")

is_nan_count = heat_stress_missing.sum()
print(f"Number of NaN values: {is_nan_count}")


In [None]:
# Check if herds with issues with MY recording in DelPro have raw data from cow database
# Build a (SE_Number, LactationNumber) -> Milk_Kg lookup from the raw lactation file.
# NOTE(review): absolute, machine-specific path; the other inputs use "../Data/CowData/".
MY = pd.read_csv("C:/Users/pagd0001/Desktop/Gigacow/Data/20241009/Gigacow-tools/Projects/HeatStressEvaluation/Data/CowData/Kok_LactationReturn240820.csv", low_memory=False, delimiter=";")
col_keep = ["BirthID", "CalvingNumber", "Milk_Kg"]
MY = MY[col_keep]
# Rename to the key names used in df3 so the two tables can be merged
MY.rename(columns={"BirthID": "SE_Number", "CalvingNumber": "LactationNumber"}, inplace=True)
# NOTE(review): Milk_Kg is part of the duplicate key, so a cow-lactation with two
# different Milk_Kg values keeps both rows; confirm that the keys are unique
# before this table is merged on (SE_Number, LactationNumber).
MY = MY.drop_duplicates(subset=["SE_Number", "LactationNumber", "Milk_Kg"])

In [None]:
# Display the 305d MY lookup table as the cell output
MY

In [None]:
# Add 305d MY records to fertilityDF_W.csv
# Left merge: every row of df3 is kept; Milk_Kg is NaN where MY has no match.
# NOTE(review): if MY holds more than one row for a (SE_Number, LactationNumber)
# pair, the matching df3 rows are repeated once per MY row; compare df3.shape
# before and after this cell.
df3 = pd.merge(df3, MY, on=["SE_Number", "LactationNumber"], how="left")
# Saved copy is re-read later as the "unfiltered" reference data
df3.to_csv("../Data/CowData/fertilityDF_W_MY.csv", index=False)
df3

In [None]:
# How many rows of the two herds with DelPro MY issues received a 305d MY record?
herds_to_keep = ['a756bc39', '6d38bc90']
in_selected_herds = df3["FarmName_Pseudo"].isin(herds_to_keep)
df4 = df3.loc[in_selected_herds]

# One missing-value mask serves both counts
milk_missing = df4['Milk_Kg'].isna()

non_nan_count = (~milk_missing).sum()
print(f"Number of non-NaN values: {non_nan_count}")

is_nan_count = milk_missing.sum()
print(f"Number of NaN values: {is_nan_count}")

In [None]:
# The triple-quoted text below is a bare string expression, not executed code:
# the herd-removal step it contains is disabled and no herds are dropped here.
""" =========================================================================>>> These herds have data reported to the cow database and can be included!
# Remove herds with missing MY records (a756bc39, 6d38bc90)
# Herds to remove
herds_to_remove = ['a756bc39', '6d38bc90']
df3 = df3[~df3['FarmName_Pseudo'].isin(herds_to_remove)]
"""

In [None]:
# Size of the table after adding 305d MY, at row, pregnancy-check, insemination,
# lactation and cow level.
print(f"No. observations in fertilityDF_W_MY.csv: {df3.shape}")
test = df3.drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDate", "PregnancyCheckDate"])
print(f"Double check no. pregnancy checks in fertilityDF_W_MY.csv: {test.shape}")
# NOTE(review): this difference is the number of rows that repeat an earlier row
# on the four keys above; confirm that it really equals "ins without pregnancy checks".
print(f'No. ins without pregnancy checks: {df3.shape[0] - test.shape[0]}')

test = df3.drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDate"])
print(f"No. inseminations in fertilityDF_W_MY.csv: {test.shape}")
test = df3.drop_duplicates(subset=["SE_Number", "LactationNumber"])
print(f"No. lactations in fertilityDF_W_MY.csv: {test.shape}")
test = df3.drop_duplicates(subset=["SE_Number"])
print(f"No. cows in fertilityDF_W_MY.csv: {test.shape}")

Data from SH and NRDC

In [None]:
# Keep only data from SH and NRDC
# NOTE(review): the headings say "SH" but the value filtered on is "SLB" —
# presumably the code used for Swedish Holstein in the Breed column; confirm.
breeds_to_keep = ["NRDC", "SLB"]
# Rows whose Breed is missing or any other value are dropped
df3 = df3[df3["Breed"].isin(breeds_to_keep)]

In [None]:
# Table size at row and pregnancy-check level
print("No. observations in file: {}".format(df3.shape))
test = df3.drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDate", "PregnancyCheckDate"])
print("Double check no. pregnancy checks in file: {}".format(test.shape))
print("No. ins without pregnancy checks: {}".format(len(df3) - len(test)))

# Same count at insemination, lactation and cow level
for label, keys in (
    ("No. inseminations in file", ["SE_Number", "LactationNumber", "InseminationDate"]),
    ("No. lactations in file", ["SE_Number", "LactationNumber"]),
    ("No. cows in file", ["SE_Number"]),
):
    test = df3.drop_duplicates(subset=keys)
    print(f"{label}: {test.shape}")

Keep only lactations 1-8

In [None]:
# Distribution of lactations
# One row per cow-lactation, so each lactation is counted once
check = df3.drop_duplicates(subset=["SE_Number", "LactationNumber"])
value_counts = check['LactationNumber'].value_counts()
print(value_counts)

In [None]:
# Keep only lactation 1-8
# Only an upper bound is applied; rows with a missing LactationNumber fail the
# comparison and are dropped as well.
df3 = df3.loc[df3["LactationNumber"].le(8)]

# No. lactations and cows in data
print("No. observations in file: {}".format(df3.shape))
test = df3.drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDate", "PregnancyCheckDate"])
print("Double check no. pregnancy checks in file: {}".format(test.shape))
print("No. ins without pregnancy checks: {}".format(len(df3) - len(test)))

for label, keys in (
    ("No. inseminations in file", ["SE_Number", "LactationNumber", "InseminationDate"]),
    ("No. lactations in file", ["SE_Number", "LactationNumber"]),
    ("No. cows in file", ["SE_Number"]),
):
    test = df3.drop_duplicates(subset=keys)
    print(f"{label}: {test.shape}")

In [None]:
# Make Parity 1-3
# df3 is the result of boolean filtering; take an explicit copy so the column
# assignments below act on an independent frame (avoids SettingWithCopyWarning).
df3 = df3.copy()
df3["Parity"] = df3["LactationNumber"]
# Lactations 3 to 8 are pooled into parity class 3; lactations 1 and 2 keep their number
df3.loc[df3["LactationNumber"].between(3, 8), "Parity"] = 3

Records within 150 days from data extraction are excluded from the data set (i.e. open records)

In [None]:
# Find open records
# df3 is the result of boolean filtering; copy it so the new columns are written
# to an independent frame (avoids SettingWithCopyWarning).
df3 = df3.copy()
df3["extraction_limit"] = pd.to_datetime(df3["extraction_limit"])
df3["InseminationDate"] = pd.to_datetime(df3["InseminationDate"])

# Days between the insemination and the extraction limit
df3["open_rec"] = (df3["extraction_limit"] - df3["InseminationDate"]).dt.days

# Plot histogram (the x axis is a number of days, not a date)
plt.hist(df3['open_rec'], bins=50, color='blue', edgecolor='black')
plt.xlabel('Days from insemination to extraction limit')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Count rows with and without a computed open_rec value
non_nan_count = df3['open_rec'].notna().sum()
print(f"Number of non-NaN values: {non_nan_count}")

is_nan_count = df3['open_rec'].isna().sum()
print(f"Number of NaN values: {is_nan_count}")

In [None]:
# Keep inseminations made more than 150 days before the extraction limit.
# Rows with a missing open_rec fail the comparison and are dropped as well.
df3 = df3[(df3["open_rec"] > 150)]

In [None]:
# Table size at row and pregnancy-check level
print("No. observations in file: {}".format(df3.shape))
test = df3.drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDate", "PregnancyCheckDate"])
print("Double check no. pregnancy checks in file: {}".format(test.shape))
print("No. ins without pregnancy checks: {}".format(len(df3) - len(test)))

# Same count at insemination, lactation and cow level
for label, keys in (
    ("No. inseminations in file", ["SE_Number", "LactationNumber", "InseminationDate"]),
    ("No. lactations in file", ["SE_Number", "LactationNumber"]),
    ("No. cows in file", ["SE_Number"]),
):
    test = df3.drop_duplicates(subset=keys)
    print(f"{label}: {test.shape}")

Keep only the first 10 inseminations per cow and lactation

In [None]:
# Only the first 10 inseminations are accepted
# Distribution of NINS
# Counted over all rows of df3, so a lactation contributes once per row it has
value_counts = df3["NINS"].value_counts()
print(value_counts)

In [None]:
# Count ins
# One row per (cow, lactation, insemination date); used next to number the inseminations
df4 = df3.drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDate"])
df4

In [None]:
# Number the inseminations 1, 2, 3, ... within each cow-lactation (InsCount)
df3["InseminationDate"] = pd.to_datetime(df3["InseminationDate"])

df4 = df4.copy()
df4['InseminationDate'] = pd.to_datetime(df4['InseminationDate'])

# cumcount numbers rows in their current order, so sort chronologically first;
# otherwise InsCount reflects the row order of the file rather than the date order.
df4 = df4.sort_values(by=["SE_Number", "LactationNumber", "InseminationDate"])
df4['InsCount'] = df4.groupby(['SE_Number', 'LactationNumber']).cumcount() + 1
col_keep = ["SE_Number", "LactationNumber", "InseminationDate", "InsCount"]
df4 = df4[col_keep]

# Attach the running number to every row of df3 (the left merge keeps df3's row order)
df5 = pd.merge(df3, df4, on=["SE_Number", "LactationNumber", "InseminationDate"], how="left")
df5

In [None]:
# Distribution of NINS 
# Counts rows per insemination number (InsCount), before the 10-insemination limit
value_counts = df5["InsCount"].value_counts()
print(value_counts)

# NOTE(review): writes a scratch file into the working directory; looks like a
# debugging dump — confirm whether it is still needed.
df5.to_csv("test.csv", index=False)

In [None]:
# The first 10 ins are accepted
# Sort by cow, lactation, and insemination_date
df5 = df5.sort_values(by=["SE_Number", "LactationNumber", "InseminationDate"])

# Drop every row that belongs to insemination number 11 or higher. Removing a
# single InsCount == 11 row per lactation would leave inseminations 12, 13, ...
# and any repeated rows of the 11th insemination in the data.
# Rows with a missing InsCount are kept (the comparison is False for NaN).
df5 = df5[~(df5["InsCount"] > 10)]

# NOTE(review): the NINS column is not recalculated here and may still exceed 10.
value_counts = df5["InsCount"].value_counts()
print(value_counts)

Age at first calving: 550d - 1100d 

In [None]:
# Calving history from the cow database: one row per cow and calving number,
# with the identifier columns renamed to the names used in this notebook.
col_keep = ["BirthID", "CalvingDate", "CalvingNumber"]
dfkok = (
    pd.read_csv(
        "C:/Users/pagd0001/Desktop/Gigacow/Data/20241009/Gigacow-tools/Projects/HeatStressEvaluation/Data/CowData/Kok_Calving240820.csv",
        delimiter=";",
        low_memory=False,
    )
    .loc[:, col_keep]
    .rename(columns={"BirthID": "SE_Number", "CalvingNumber": "LactationNumber"})
    .drop_duplicates(subset=["SE_Number", "LactationNumber"])
)
dfkok

In [None]:
# Birth dates from the lineage file: one row per cow, identifier renamed to SE_Number
col_keep = ["BirthID", "BirthDate"]
bd = (
    pd.read_csv(
        "C:/Users/pagd0001/Desktop/Gigacow/Data/20241009/Gigacow-tools/Projects/HeatStressEvaluation/Data/CowData/Kok_Lineage240821.csv",
        delimiter=";",
        low_memory=False,
    )
    .loc[:, col_keep]
    .rename(columns={"BirthID": "SE_Number"})
    .drop_duplicates(subset=["SE_Number"])
)
bd


In [None]:
# Attach each cow's birth date to her calving records (cows without a birth date get NaN)
dfkok = dfkok.merge(bd, how="left", on="SE_Number")

In [None]:
# Age at first calving in days: date of calving number 1 minus birth date
dfkok = (
    dfkok.loc[dfkok["LactationNumber"] == 1]
    .assign(
        BirthDate=lambda d: pd.to_datetime(d["BirthDate"]),
        CalvingDate=lambda d: pd.to_datetime(d["CalvingDate"]),
    )
    .assign(AgeFirstCalving=lambda d: (d["CalvingDate"] - d["BirthDate"]).dt.days)
)

# Keep only the cow identifier and the computed age
col_keep = ["SE_Number", "AgeFirstCalving"]
dfkok = dfkok[col_keep]
dfkok

In [None]:
# Plot
# Histogram of age at first calving (days) for all cows in the calving file, before filtering
plt.hist(dfkok['AgeFirstCalving'], bins=50, color='blue', edgecolor='black')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Add AgeFirstCalving to every insemination row of the same cow (NaN when unknown)
df6 = df5.merge(dfkok, how="left", on="SE_Number")
df6

In [None]:
# Count rows with and without an age at first calving
non_nan_count = df6['AgeFirstCalving'].notna().sum()
print(f"Number of non-NaN values: {non_nan_count}")

is_nan_count = df6['AgeFirstCalving'].isna().sum()
print(f"Number of NaN values: {is_nan_count}")

In [None]:
# Filter age at first calving keeping only inseminations from cows with an age between 550 and 1100days old
# between() includes both bounds; rows with a missing AgeFirstCalving are dropped
df6 = df6[df6['AgeFirstCalving'].between(550, 1100)]

In [None]:
# Plot
# Histogram of age at first calving after the 550-1100 d filter (one value per row of df6)
plt.hist(df6['AgeFirstCalving'], bins=50, color='blue', edgecolor='black')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Table size after the age-at-first-calving filter
print("No. observations in file: {}".format(df6.shape))

# Same count at insemination, lactation and cow level
for label, keys in (
    ("No. inseminations in file", ["SE_Number", "LactationNumber", "InseminationDate"]),
    ("No. lactations in file", ["SE_Number", "LactationNumber"]),
    ("No. cows in file", ["SE_Number"]),
):
    test = df6.drop_duplicates(subset=keys)
    print(f"{label}: {test.shape}")

Thresholds for classical fertility traits in editing full material (insemination wise)
- CI maximum 2 years for cows
- CFS 20 - 230d
- FLS max 365d

In [None]:
# Look at distribution of CI, CFI and FLI
# One row per cow-lactation so each lactation contributes a single value
df_fert = df6.drop_duplicates(subset=["SE_Number", "LactationNumber"])

# One histogram per trait, each labelled with the trait it shows
for trait in ("CI", "CFI", "FLI"):
    plt.hist(df_fert[trait], bins=50, color='blue', edgecolor='black')
    plt.xlabel(trait)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Thresholds
# Rows are removed only when the trait is present and out of range; rows where
# the trait is missing are kept.

# Filter for CI values less than or equal to 730 or NaN
df6 = df6[(df6["CI"] <= 730) | (df6["CI"].isna())]

# Filter for CFI values between 20 and 230 (inclusive) or NaN
df6 = df6[(df6["CFI"].between(20, 230, inclusive="both")) | (df6["CFI"].isna())]

# Filter for FLI values less than or equal to 365 or NaN
df6 = df6[(df6["FLI"] <= 365) | (df6["FLI"].isna())]

# Write to "../Data/..." (capital D): this file is read back from that exact
# path later on, and a lower-case "data" breaks on case-sensitive file systems.
df6.to_csv("../Data/CowData/fertilityDF_W_MY_filtered.csv", index=False)

In [None]:
# Size of the filtered table at row, insemination, lactation and cow level
print(f"No. observations in fertilityDF_W_MY_filtered.csv: {df6.shape}")

test = df6.drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDate"])
print(f"No. inseminations in filtered data: {test.shape}")
test = df6.drop_duplicates(subset=["SE_Number", "LactationNumber"])
print(f"No. lactations in filtered data: {test.shape}")
test = df6.drop_duplicates(subset=["SE_Number"])
print(f"No. cows in filtered data: {test.shape}")

Minimum and maximum thresholds on specific classical fertility traits
- i.e. values outside the range are set to missing (the record itself is kept)

In [None]:
# Load unfiltered data to check distribution
# Re-read the table saved right after the MY merge and keep one row per cow-lactation
fert_df = pd.read_csv("../Data/CowData/fertilityDF_W_MY.csv", low_memory=False)
fert_df = fert_df.drop_duplicates(subset=["SE_Number", "LactationNumber"])

In [None]:
# Look at distributions
# Summary statistics and selected percentiles of GL in the unfiltered data
summary_stats = fert_df['GL'].describe()
# Missing values are dropped before the percentiles are computed
percentiles = np.percentile(fert_df['GL'].dropna(), [1, 5, 10, 90, 95, 99])

print("Descriptive Statistics:\n", summary_stats)
print("\nPercentiles (1%, 5%, 10%, 90%, 95%, 99%):", percentiles)

In [None]:
# Histogram of CI in the unfiltered data (one value per cow-lactation)
plt.hist(fert_df['CI'], bins=50, color='blue', edgecolor='black')
plt.xlabel('CI')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Put thresholds on traits: values outside the valid range become missing,
# while the row itself stays in the data.
df6 = pd.read_csv("../Data/CowData/fertilityDF_W_MY_filtered.csv", low_memory=False)

# CFI: 79 +- 35 => 20-230d according to NAV (min 32, max 404)
#   Percentiles (1%, 5%, 10%, 90%, 95%, 99%): [ 37.  43.  48. 121. 146. 208.]
CFI_min, CFI_max = 20, 230

# CLI: 107 +- 55 => 20-217; if CFI min is 20 and FLI is 0 then CLI is also 20d, 217 = 107+(55*2)
#   (min 22, max 531) Percentiles (1%, 5%, 10%, 90%, 95%, 99%): [ 39.    48.    53.   181.   213.   293.46]
CLI_min, CLI_max = 20, 217

# FLI: 28 +- 45 => 0-365d according to NAV (min 0, max 471)
#   Percentiles (1%, 5%, 10%, 90%, 95%, 99%): [  0.     0.     0.    87.   115.   194.46]
FLI_min, FLI_max = 0, 365

# CI: 373 +- 72 => 301-730d; 301d from 373-72, 730d according to NAV
#   (min 1, max 822) Percentiles (1%, 5%, 10%, 90%, 95%, 99%): [  2.59 322.   329.   442.   472.   545.64]
CI_min, CI_max = 301, 730

# GL: 260 - 302 according to NAV.
#   Percentiles (1%, 5%, 10%, 90%, 95%, 99%): [-79.28 269.   273.   287.   288.   295.  ]
GL_min, GL_max = 260, 302

trait_limits = {
    "CFI": (CFI_min, CFI_max),
    "CLI": (CLI_min, CLI_max),
    "FLI": (FLI_min, FLI_max),
    "CI": (CI_min, CI_max),
    "GL": (GL_min, GL_max),
}

# Apply each range in turn; missing values fail both comparisons and stay missing
for trait, (lower, upper) in trait_limits.items():
    too_low = df6[trait] < lower
    too_high = df6[trait] > upper
    df6.loc[too_low | too_high, trait] = np.nan

In [None]:
# Double check distributions after handling outliers
# Summary statistics and selected percentiles of CFI after the range limits were applied
summary_stats = df6['CFI'].describe()
# Missing values are dropped before the percentiles are computed
percentiles = np.percentile(df6['CFI'].dropna(), [1, 5, 10, 90, 95, 99])

print("Descriptive Statistics:\n", summary_stats)
print("\nPercentiles (1%, 5%, 10%, 90%, 95%, 99%):", percentiles)

Allow for minimum 5 records in HYS groups (of insemination date)

In [None]:
# Make HYS of insemination
# df6 was re-read from CSV, so InseminationDate must be parsed to datetime again
df6['InseminationDate'] = pd.to_datetime(df6['InseminationDate'])


# Function to determine the season based on month
def get_season(date):
    """Return the season code for the month of *date*.

    Codes: 1 = December-February, 2 = March-May, 3 = June-August,
    4 = September-November. *date* only needs a ``month`` attribute.
    Returns None when the month is not one of 1-12.
    """
    season_by_month = {
        12: 1, 1: 1, 2: 1,
        3: 2, 4: 2, 5: 2,
        6: 3, 7: 3, 8: 3,
        9: 4, 10: 4, 11: 4,
    }
    return season_by_month.get(date.month)


# Apply the function to create a 'YearSeason' variable
# The value is the calendar year, a literal "0" and the season code, e.g. "202103".
# NOTE(review): December gets season 1 together with January-February of the SAME
# calendar year, so one "winter" group spans two non-adjacent periods; confirm
# that this is intended.
df6['YearSeason'] = df6['InseminationDate'].apply(lambda x: f"{x.year}0{get_season(x)}")

# Make HYS
# Herd-year-season key: herd pseudonym concatenated with YearSeason
df6["HYS"] = df6["FarmName_Pseudo"] + df6["YearSeason"]
df6 

In [None]:
# Check distribution of HYS
# Number of rows of df6 in each herd-year-season group
print(df6["HYS"].value_counts())

In [None]:
# Keep only HYS groups that hold at least 5 rows of df6; smaller groups and
# rows without an HYS value are removed.
hys_counts = df6["HYS"].value_counts()
is_large_enough = hys_counts >= 5
valid_hys = hys_counts.index[is_large_enough.to_numpy()]
df6 = df6.loc[df6["HYS"].isin(valid_hys)]

# Group sizes after the filter
print(df6["HYS"].value_counts())

Add threshold for 305d MY

In [None]:
# Occurrence of MY data
# Count rows with and without a 305d milk yield
non_nan_count = df6['Milk_Kg'].notna().sum()
print(f"Number of non-NaN values: {non_nan_count}")

is_nan_count = df6['Milk_Kg'].isna().sum()
print(f"Number of NaN values: {is_nan_count}")

In [None]:
# Plot MY
# Histogram over all rows of df6, so a lactation is counted once per row it has
plt.hist(df6['Milk_Kg'], bins=50, color='blue', edgecolor='black')
plt.xlabel('MY')
plt.ylabel('Frequency')
plt.show()

# Look at distribution
summary_stats = df6['Milk_Kg'].describe()
# Missing values are dropped before the percentiles are computed
percentiles = np.percentile(df6['Milk_Kg'].dropna(), [1, 5, 10, 90, 95, 99])

print("Descriptive Statistics:\n", summary_stats)
print("\nPercentiles (1%, 5%, 10%, 90%, 95%, 99%):", percentiles)

In [None]:
# µ: 11,825. SD: 2215 => Thresholds: 7,395 - 16,255
# i.e. mean minus / plus two standard deviations
MY_min = 7395
MY_max = 16255

# Out-of-range yields are set to missing; the rows themselves are kept
df6.loc[(df6["Milk_Kg"] < MY_min) | (df6["Milk_Kg"] > MY_max), "Milk_Kg"] = np.nan

In [None]:
# Histogram of 305d MY after the range limits were applied
plt.hist(df6['Milk_Kg'], bins=50, color='blue', edgecolor='black')
plt.xlabel('MY')
plt.ylabel('Frequency')
plt.show()

Save filtered dataset

In [None]:
# Save filtered data
# NOTE(review): same file name as the intermediate file written under
# "CowData/" earlier, but saved one directory up; confirm both copies are wanted.
df6.to_csv("../Data/fertilityDF_W_MY_filtered.csv", index=False)

In [None]:
# Clean dataset
# Keep only the columns needed for the downstream analysis and save them
col_keep = ["SE_Number", "Breed", "LactationNumber", "Parity", "InseminationDate", "HYS", "HeatStress", "Milk_Kg", "CFI", "CLI", "FLI", "NINS", "CR0", "CI", "GL"]
df7 = df6[col_keep]
df7.to_csv("../Data/fertility_filtered.csv", index=False)

# Descriptive statistics - filtered data

In [None]:
# INSEMINATION RECORDS
# Descriptive counts for the filtered dataset, read back from disk.
df_lact = pd.read_csv("../Data/fertilityDF_W_MY_filtered.csv", low_memory=False)

# Rows with a non-missing InseminationDate per parity class
# NOTE(review): counts rows of the file; presumably one row per insemination — confirm.
count_my_rec = df_lact.groupby(["Parity"])["InseminationDate"].count().reset_index()
print("No. of insemination records divided over parities: \n", count_my_rec.to_string(index=False))

count_my_rec = df_lact.groupby(["Parity", "Breed"])["InseminationDate"].count().reset_index()
print("No. of insemination records divided over parities and breeds: \n", count_my_rec.to_string(index=False))

# By lactation: one row per (cow, lactation)
for_my_rec5 = df_lact.drop_duplicates(subset=["SE_Number", "LactationNumber"])
print(f"No. of lactations in file: {for_my_rec5.shape}")

count_my_rec = for_my_rec5.groupby(["Breed"])["SE_Number"].count().reset_index()
print("No. of lactations from NRDC and SH: \n", count_my_rec.to_string(index=False))

# These are lactation counts: a cow with several lactations in parity class 3
# is counted once per lactation.
count_my_rec = for_my_rec5.groupby(["Parity", "Breed"])["SE_Number"].count().reset_index()
print("No. of lactations within breed and parity: \n", count_my_rec.to_string(index=False))

# By cows: one row per cow
for_my_rec4 = df_lact.drop_duplicates(subset=["SE_Number"])
print(f"No. of cows in file: {for_my_rec4.shape}")

count_my_rec = for_my_rec4.groupby(["Breed"])["SE_Number"].count().reset_index()
print("No. of cows from NRDC, SH: \n", count_my_rec.to_string(index=False))

# Herd info: list of distinct herds, taken from the table already loaded above
col_keep = ["FarmName_Pseudo"]
df_lact = df_lact.drop_duplicates(subset=col_keep)[col_keep]
print(df_lact.shape)
print("Herds in filtered data: \n", df_lact.to_string(index=False))