In [None]:
from datetime import timedelta
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob

LOAD DATA FROM COW DATABASE.
Requires the following data sets:
- Kok_Calving
- Kok_HerdEntryExit
- Kok_CowMilkSampling
- Kok_Lineage
- Kok_Reproduction

In [None]:
#
#
# LOAD COW DATABASE CALVING DATA
# Read the raw calving export, keep the four columns used downstream, rename them to the
# notebook's "...Kok" naming and keep one row per cow and calving date.
calving = pd.read_csv("Kok_Calving240820.csv", delimiter=';', low_memory=False)
col_keep = ["ActiveHerdNumber", "BirthID", "CalvingDate", "CalvingNumber"]
calving = (
    calving[col_keep]
    .rename(columns={'BirthID': 'SE_Number', "ActiveHerdNumber": "FarmName_Pseudo",
                     "CalvingDate": "CalvingDateKok", "CalvingNumber": "LactationNumberKok"})
    # CalvingDateKok is sorted as read from the CSV (it is not parsed to datetime in this cell)
    .sort_values(by=["SE_Number", "CalvingDateKok"])
    .drop_duplicates(subset=["SE_Number", "CalvingDateKok"])
)
# calving["upper_limit"] = calving.groupby(["SE_Number"])["CalvingDateKok"].shift(-1)
calving.to_csv("calving_kok.csv", index=False)

# Lactation (calving) records per herd, then in total
calving2 = (calving.groupby(["FarmName_Pseudo"])["LactationNumberKok"].count().reset_index()
            .rename(columns={'LactationNumberKok': 'CountLact'}))
print("No. of lactation records in cow database in different herds: \n", calving2.to_string(index=False))
calving2 = calving.drop_duplicates(subset=["SE_Number", "CalvingDateKok"])
print(f"No. of lactation records in cow database: {calving2.shape}")  # 23,688

# Cows per herd, then in total.
# nunique() counts each cow once per herd; count() would count one per calving row and
# simply repeat the lactation table above.
calving2 = (calving.groupby(["FarmName_Pseudo"])["SE_Number"].nunique().reset_index()
            .rename(columns={'SE_Number': 'CountCows'}))
print("No. of cows with calving data in cow database in different herds: \n", calving2.to_string(index=False))
calving2 = calving.drop_duplicates(subset=["SE_Number"])
print(f"No. of cows with calving data in cow database: {calving2.shape}")  # 9,168

In [None]:
#
#
# ADD ENTRY AND CULLING DATA FROM COW DATABASE
# Raw column name -> column name used in the rest of the notebook.
exit_column_map = {
    'BirthID': 'SE_Number',
    "ActiveHerdNumber": "FarmName_Pseudo",
    "EntryDate": "EntryDateKok",
    "ExitDate": "ExitDateKok",
    "ExitReason_PrimaryReason": "ExitReason_PrimaryReasonKok",
    "ExitReason_SecondaryReason1": "ExitReason_SecondaryReason1Kok",
    "ExitReason_SecondaryReason2": "ExitReason_SecondaryReason2Kok",
}
col_keep = list(exit_column_map)
cull = (
    pd.read_csv("Kok_HerdEntryExit240820.csv", delimiter=';', low_memory=False)[col_keep]
    .rename(columns=exit_column_map)
    .sort_values(by=["SE_Number", "ExitDateKok"])
)
print(f"Number of records in raw entry/culling file: {cull.shape}")  # 33,679
cull2 = cull.drop_duplicates(subset=["SE_Number", "ExitDateKok"])
print(f"NUmber of unique records in entry/culling file: {cull2.shape}")  # 33,668
# => 11 records with multiple culling reasons

# How many exit dates does each cow have (i.e. cows entering/exiting herds several times)?
cull3 = cull2.groupby("SE_Number")["ExitDateKok"].count().rename("CountExits").reset_index()
frequency_table = cull3['CountExits'].value_counts()
print("No. of cows with multiple exit records in cow database:")
print(frequency_table)

# The last record per cow (in the sort order above) is taken as her culling record
cull2 = cull2.groupby('SE_Number').tail(1)
cull2.to_csv("cullingKok.csv", index=False)

# Attach the culling record to every calving of the same cow in the same herd
dfKok = (
    calving.merge(cull2, on=["FarmName_Pseudo", "SE_Number"], how="left")
    .sort_values(by=["SE_Number", "CalvingDateKok"])
    .reset_index(drop=True)
)

# upper_limit = the cow's next calving date; used later to assign dry-off dates to a lactation
dfKok["upper_limit"] = dfKok.groupby("SE_Number")["CalvingDateKok"].shift(-1)

dfKok.to_csv("dfKok.csv", index=False)

In [None]:
#
#
# ADD DRY OFF DATA FROM COW DATABASE
# NOTE(review): the file name starts with "Kok_Kok_", unlike the other exports ("Kok_...") - confirm
col_keep = ["FarmName_Pseudo", "SE_Number", "DryOffDateKok", "VariousSystemInfo"]
dry_offKok = (
    pd.read_csv("Kok_Kok_CowMilkSampling240829.csv", delimiter=';', low_memory=False)
    .loc[:, ["BirthID", "ActiveHerdNumber", "SamplingDate", "VariousSystemInfo"]]
    .rename(columns={'BirthID': 'SE_Number', "ActiveHerdNumber": "FarmName_Pseudo",
                     "SamplingDate": "DryOffDateKok"})
    .loc[:, col_keep]
    # Sampling records in date order within cow, so "next row" below means "next sampling"
    .sort_values(by=["SE_Number", "DryOffDateKok"])
)

# InfoChange is True on the last row of every run of identical "VariousSystemInfo" values
# within a cow: the next row of the same cow has a different value, or there is no next row.
next_info = dry_offKok.groupby('SE_Number')['VariousSystemInfo'].shift(-1)
dry_offKok['InfoChange'] = next_info != dry_offKok['VariousSystemInfo']
dry_offKok.to_csv("DryOffKok1.csv", index=False)

# Dry off = last row of a run of "kod: 02" records
is_dry_off = (dry_offKok["VariousSystemInfo"] == "kod: 02") & dry_offKok["InfoChange"]
dry_offKok = dry_offKok[is_dry_off].drop(columns=['InfoChange', "VariousSystemInfo"])
dry_offKok.to_csv("DryOffKok.csv", index=False)

print(f"No. dry off records in cow database: {dry_offKok.shape}")  # 12,513
dry_offKok2 = dry_offKok.drop_duplicates(subset=["SE_Number"])
print(f"No. cows with dry off records in cow database: {dry_offKok2.shape}")  # 6,447

# Merge: every calving row is paired with every dry-off date of the same cow and herd;
# the pairs are matched to the right lactation by date further down.
dfKok = dfKok.merge(dry_offKok, on=["FarmName_Pseudo", "SE_Number"], how="left")
for date_col in ("CalvingDateKok", "DryOffDateKok", "upper_limit"):
    dfKok[date_col] = pd.to_datetime(dfKok[date_col])


def data(row):
    """Flag whether a row's dry-off date falls inside its lactation window.

    Parameters
    ----------
    row : mapping
        Must provide "CalvingDateKok", "DryOffDateKok" and "upper_limit".

    Returns
    -------
    int
        1 when CalvingDateKok < DryOffDateKok <= upper_limit, otherwise 0
        (including when a comparison is False because a value is missing).
    """
    within_lactation = row["CalvingDateKok"] < row["DryOffDateKok"] <= row["upper_limit"]
    return 1 if within_lactation else 0


# Flag every (calving, dry-off) pair whose dry-off date lies within that lactation
dfKok["dryoff"] = dfKok.apply(data, axis=1)

# Keep all records with dry off dates fitted within lactation
df_ones = dfKok[dfKok['dryoff'] == 1]
# Keep last record where open and lacking dry off date
# NOTE(review): only the LAST unmatched row per cow is kept, so an earlier lactation with no
# matching dry-off record disappears from the table here - confirm this is intended.
df_last_zero = dfKok[dfKok['dryoff'] == 0].groupby("SE_Number").tail(1)
# Concatenate dataframe and sort to maintain original order
df_combined = pd.concat([df_ones, df_last_zero])
df_combined = df_combined.sort_values(by=["SE_Number", "CalvingDateKok"]).reset_index(drop=True)

# Fill a missing upper_limit (no later calving) with, in order of preference:
#   1. the exit date of the cow, parsed explicitly so upper_limit stays a datetime column
#      (ExitDateKok itself is left untouched),
#   2. today's date, for cows that have no exit date either.
exit_dates = pd.to_datetime(df_combined["ExitDateKok"])
df_combined['TodayDate'] = pd.to_datetime('today').normalize()
df_combined["upper_limit"] = (
    df_combined["upper_limit"].fillna(exit_dates).fillna(df_combined["TodayDate"])
)
# After the two fills no upper_limit is missing, so a further fill conditioned on
# upper_limit.isna() could never select a row; none is applied.

# Rows kept only as the open lactation carry a dry-off date from another lactation: blank it
df_combined.loc[df_combined["dryoff"] == 0, "DryOffDateKok"] = pd.NaT
df_combined.to_csv("dfKok.csv", index=False)

In [None]:
#
#
# ADD BREED INFORMATION FROM COW DATABASE
df = pd.read_csv("Kok_Lineage240821.csv", delimiter=';', low_memory=False)
df = df.rename(columns={"BirthID": "SE_Number", "ActiveHerdNumber": "FarmName_Pseudo",
                        "Father_Breed": "SireBreedKok", "Mother_Breed": "DamBreedKok",
                        "MothersFather_Breed": "MGSBreedKok"})
col_keep = ["FarmName_Pseudo", "SE_Number", "SireBreedKok", "DamBreedKok", "MGSBreedKok"]
df = df[col_keep]

# Check for duplicates and sort
print(df.shape)  # 24,067
df = df.sort_values(by=["FarmName_Pseudo", "SE_Number"])
# df2: one row per distinct (cow, sire breed, dam breed, MGS breed) combination
df2 = df.drop_duplicates(subset=["SE_Number", "SireBreedKok", "DamBreedKok", "MGSBreedKok"])
print(f"No. cows with breed data: {df2.shape}")  # 22,653 => 1414 duplicates, all okay
# df3: one row per cow; the difference to df2 comes from cows with more than one combination
df3 = df.drop_duplicates(subset=["SE_Number"])
print(f"No. cows with different breed data recorded: {df3.shape}")  # 22,607 => 46 cows with conflicting breed

# The 46 cows with conflicting breed are a mess, e.g. LIM and SJB for the same sire, remove!
# Cows that still occur more than once in df2 have conflicting records: flag and drop them,
# leaving exactly one row per cow in df_ras.
duplicate_records = df2[df2['SE_Number'].duplicated(keep=False)].copy()
duplicate_records["DupBreed"] = 1
col_keep = ["SE_Number", "DupBreed"]
duplicate_records = duplicate_records[col_keep]
df_ras = df2.merge(duplicate_records, on=["SE_Number"], how="left")
df_ras = df_ras[df_ras['DupBreed'] != 1]
df_ras.to_csv("kok_ras.csv", index=False)

# Attach breed to the master table by cow only.
# df_ras holds a single row per cow, carrying whichever herd sorted first above, so also
# matching on FarmName_Pseudo would leave the breed columns empty for records of the same
# cow in any other herd. validate= guards the one-row-per-cow assumption.
dfKok = pd.read_csv("dfKok.csv", low_memory=False)
breed_per_cow = df_ras.drop(columns=["FarmName_Pseudo"])
dfKok = dfKok.merge(breed_per_cow, on="SE_Number", how="left", validate="many_to_one")
dfKok.to_csv("dfKok.csv", index=False)

In [None]:
#
#
# ADD INSEMINATION FROM COW DATABASE
df13 = pd.read_csv("Kok_Reproduction240820.csv", delimiter=';', low_memory=False)

# Count events that carry a sire ID.
# A plain `!= 0` test is also True for missing values (NaN != 0) and for every value of a
# text column, so it cannot tell events with a sire ID from events without one; the count
# noted earlier for it (95,369) equals the total of all event types listed below.
# Here an ID counts as present when it is neither missing, blank nor zero.
sire_id = df13["SireBull_SE_Number"]
mask = sire_id.notna() & ~sire_id.astype(str).str.strip().isin(["0", "0.0", ""])
count_non_zeros = mask.sum()
print(f"Number of events with sire ID in cow database: {count_non_zeros}")

# Count occurrences of each unique value in the EventType column
value_counts = df13["EventType"].value_counts()
print(value_counts)
# EventType
# Inseminering               47926
# Dräktighetsundersökning    41625
# Behandling                  2120
# Embryoinlägg                2092
# Betäckning                  1433
# Fri bet                      173

# Keep only data from insemination
df14 = df13[df13["EventType"] == "Inseminering"]

# Check for duplicates and sort
print(f"No. insemination records in raw file in cow database: {df14.shape}")  # 47,926, 27col
df14 = df14.sort_values(by=["BirthID", "EventDate"])
df15 = df14.drop_duplicates(subset=["BirthID", "EventDate"])
print(f"No. unique inseminations in cow database: {df15.shape}")  # 43,951, 27col

col_keep = ["FarmName_Pseudo", "SE_Number", "InseminationDateKok", "SireBull_SE_NumberKok"]
df15 = df15.rename(columns={"ActiveHerdNumber": "FarmName_Pseudo", "BirthID": "SE_Number",
                            "EventDate": "InseminationDateKok",
                            "SireBull_SE_Number": "SireBull_SE_NumberKok"})[col_keep]

# Debugging aid - restrict to chosen cows:
# SE_Number = ["SE-064c0cec-1189", "SE-5c06d92d-3145", "SE-5c06d92d-3177", "SE-5b581702-1742",
#              "SE-5b581702-1851", "SE-5c06d92d-2915", "SE-5b581702-2002", "SE-5c06d92d-2515"]
# df15 = df15[df15["SE_Number"].isin(SE_Number)]

# Merge with calving data
# NOTE(review): this is an inner merge, so lactations without any insemination record are
# dropped from dfKok.csv from here on - confirm this is intended.
dfkok2 = pd.read_csv("dfKok.csv", low_memory=False)
dfins6 = pd.merge(dfkok2, df15, on=["FarmName_Pseudo", "SE_Number"])

# Filter df for relevant inseminations sorted to correct lactation.
# All three columns are text at this point (read from CSV), so they are parsed to datetime
# for the comparison only; the stored columns are left unchanged.
ins_date = pd.to_datetime(dfins6["InseminationDateKok"])
calving_date = pd.to_datetime(dfins6["CalvingDateKok"])
lactation_end = pd.to_datetime(dfins6["upper_limit"])
dfins6 = dfins6[(ins_date >= calving_date) & (ins_date <= lactation_end)]
dfins6.to_csv("dfKok.csv", index=False)

In [None]:
#
#
# ADD PREGNANCY CHECKS FROM COW DATABASE
# next_ins = the following insemination date within the same cow and lactation; it bounds
# the window in which a pregnancy check is attributed to an insemination.
df20 = pd.read_csv("dfKok.csv", low_memory=False)
# df20 = pd.DataFrame(df20, columns=["SE_Number", "LactationNumberKok", "InseminationDateKok", "upper_limit"])
df20 = df20.assign(
    next_ins=df20.groupby(['SE_Number', 'LactationNumberKok'])['InseminationDateKok'].shift(-1)
)

# Inseminations that have a following insemination inside the lactation window
col_keep = ["SE_Number", "LactationNumberKok", "InseminationDateKok", "next_ins", "upper_limit"]
next_in_range = (df20["next_ins"] >= df20["InseminationDateKok"]) & (df20["next_ins"] <= df20["upper_limit"])
dfins7 = df20.loc[next_in_range, col_keep]
dfins7.to_csv("dfKok2.csv", index=False)

# Pregnancy checks: same raw file as the inseminations, different event type
preg = pd.read_csv("Kok_Reproduction240820.csv", delimiter=';', low_memory=False)
preg = preg.loc[preg["EventType"] == "Dräktighetsundersökning",
                ["BirthID", "EventDate", "PregnancyStatus"]]
preg = preg.rename(columns={"BirthID": "SE_Number", "EventDate": "PregnancyCheckDateKok",
                            "PregnancyStatus": "PregnancyStatusKok"})

print(f"No. pregnancy checks in cow database: {preg.shape}")  # 41,625 events, 3col
preg = preg.drop_duplicates(subset=["SE_Number", "PregnancyCheckDateKok"])
print(f"No. unique pregnancy checks in cow database: {preg.shape}")    # 38,320 unique events, 3col
preg = preg.sort_values(by=["SE_Number", "PregnancyCheckDateKok"])

# Pair every insemination with every pregnancy check of the same cow
df20 = df20[col_keep]
df21 = df20.join(preg.set_index("SE_Number"), on="SE_Number")

# The window test compares dates, so all four columns must be datetime
for date_col in ('InseminationDateKok', 'PregnancyCheckDateKok', 'next_ins', 'upper_limit'):
    df21[date_col] = pd.to_datetime(df21[date_col])

# Placeholder for the Yes/No window flag
df21['C'] = np.nan


# Define the filtering function
def filter_pregcheck(row):
    """Tell whether a pregnancy check belongs to the row's insemination.

    The check must lie between the insemination date and the end of its window
    (both bounds inclusive). The window ends at "next_ins" when that is present,
    otherwise at "upper_limit".

    Parameters
    ----------
    row : mapping
        Must provide "InseminationDateKok", "PregnancyCheckDateKok", "next_ins"
        and "upper_limit".

    Returns
    -------
    str
        "Yes" when the check falls inside the window, otherwise "No".
    """
    check_date = row["PregnancyCheckDateKok"]
    window_end = row["upper_limit"] if pd.isna(row["next_ins"]) else row["next_ins"]
    in_window = (check_date >= row["InseminationDateKok"]) and (check_date <= window_end)
    return "Yes" if in_window else "No"


# Apply the filter function to each row and keep the checks that fall in the window
df21['C'] = df21.apply(filter_pregcheck, axis=1)
dfins10 = df21[df21["C"] == "Yes"]
dfins10.to_csv("dfKok2.csv", index=False)

col_keep = ["SE_Number", "LactationNumberKok", "InseminationDateKok", "PregnancyCheckDateKok", "PregnancyStatusKok"]
dfins11 = dfins10[col_keep].copy()

# Add to master df.
# The master file stores InseminationDateKok as text while dfins11 holds datetimes. Matching
# on a stringified datetime only works when both texts happen to be formatted identically,
# and silently matches nothing otherwise. Merge on a parsed datetime key instead; the key is
# temporary, so the master's InseminationDateKok column is written back exactly as read.
dfins12 = pd.read_csv("dfKok.csv")
preg_by_ins = dfins11.rename(columns={"InseminationDateKok": "_InsDateKey"})
dfins13 = (
    dfins12.assign(_InsDateKey=pd.to_datetime(dfins12["InseminationDateKok"]))
    .merge(preg_by_ins, on=["SE_Number", "LactationNumberKok", "_InsDateKey"], how="left")
    .drop(columns="_InsDateKey")
)

# Debugging aid - restrict to a chosen cow:
# SE_Number = ["SE-064c0cec-1189"]
# dfins13 = dfins13[dfins13["SE_Number"].isin(SE_Number)]
dfins13.to_csv("dfKok.csv", index=False)