In [None]:
from datetime import timedelta
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob

LOAD DATA FROM COW DATABASE

Requires the following data to create dfKok.csv:
- Kok_Calving
- Kok_HerdEntryExit
- Kok_CowMilkSampling
- Kok_Lineage
- Kok_Reproduction

In [None]:
#
#
# LOAD COW DATABASE CALVING DATA
# Read the raw calving export, keep the four columns used downstream, switch to the *Kok
# column names used in the rest of the notebook, and keep one row per cow and calving date.
col_keep = ["ActiveHerdNumber", "BirthID", "CalvingDate", "CalvingNumber"]
calving = (
    pd.read_csv("Kok_Calving240820.csv", delimiter=';', low_memory=False)[col_keep]
    .rename(columns={
        "BirthID": "SE_Number",
        "ActiveHerdNumber": "FarmName_Pseudo",
        "CalvingDate": "CalvingDateKok",
        "CalvingNumber": "LactationNumberKok",
    })
    .sort_values(by=["SE_Number", "CalvingDateKok"])
    .drop_duplicates(subset=["SE_Number", "CalvingDateKok"])
)
# No "upper_limit" column is added at this stage (a previous attempt used
# calving.groupby(["SE_Number"])["CalvingDateKok"].shift(-1) here).
calving.to_csv("calving_kok.csv", index=False)

# Summary counts for the cow database calving data. Nothing here modifies `calving`;
# `calving2` is a scratch frame that is overwritten for each statistic.
# Lactation records per herd (non-missing LactationNumberKok values per FarmName_Pseudo).
calving2 = calving.groupby(["FarmName_Pseudo"])["LactationNumberKok"].count().reset_index()
calving2.rename(columns={'LactationNumberKok': 'CountLact'}, inplace=True)
print(f"No. of lactation records in cow database in different herds: \n", calving2.to_string(index=False))
calving2 = calving.drop_duplicates(subset=["SE_Number", "CalvingDateKok"])
print(f"No. of lactation records in cow database: {calving2.shape}")  # 23,688

# Cows per herd: count DISTINCT SE_Number values. count() would count calving rows per herd
# and merely repeat the lactation table above. A cow that calved in two herds is counted
# once in each of them.
calving2 = calving.groupby(["FarmName_Pseudo"])["SE_Number"].nunique().reset_index()
calving2.rename(columns={'SE_Number': 'CountCows'}, inplace=True)
print(f"No. of cows with calving data in cow database in different herds: \n", calving2.to_string(index=False))
calving2 = calving.drop_duplicates(subset=["SE_Number"])
print(f"No. of cows with calving data in cow database: {calving2.shape}")  # 9,168

In [None]:
#
#
# ADD ENTRY AND CULLING DATA FROM COW DATABASE
# Reads the herd entry/exit export, reduces it to a single record per cow and attaches that
# record to the calving rows of the same cow in the same herd. Writes cullingKok.csv and
# dfKok.csv. Requires `calving` from the cow database calving cell.
cull = pd.read_csv("Kok_HerdEntryExit240820.csv", delimiter=';', low_memory=False)
col_keep = ["BirthID", "ActiveHerdNumber", "EntryDate", "ExitDate", "ExitReason_PrimaryReason",
            "ExitReason_SecondaryReason1","ExitReason_SecondaryReason2"]
cull = cull[col_keep]
cull.rename(columns={'BirthID': 'SE_Number', "ActiveHerdNumber": "FarmName_Pseudo", "EntryDate": "EntryDateKok",
                     "ExitDate": "ExitDateKok", "ExitReason_PrimaryReason": "ExitReason_PrimaryReasonKok",
                     "ExitReason_SecondaryReason1": "ExitReason_SecondaryReason1Kok",
                     "ExitReason_SecondaryReason2": "ExitReason_SecondaryReason2Kok"}, inplace=True)
# Sorting by exit date matters below: tail(1) relies on this order.
cull = cull.sort_values(by=["SE_Number", "ExitDateKok"])
print(f"Number of records in raw entry/culling file: {cull.shape}")  # 33,679
# Keep the first row for each (cow, exit date) pair.
cull2 = cull.drop_duplicates(subset=["SE_Number", "ExitDateKok"])
print(f"NUmber of unique records in entry/culling file: {cull2.shape}")  # 33,668
# => 11 records with multiple culling reasons

# check how many cows enter/exit herds multiple times
# count() ignores missing ExitDateKok, so CountExits is the number of dated exits per cow.
cull3 = cull2.groupby(["SE_Number"])["ExitDateKok"].count().reset_index()
cull3.rename(columns={'ExitDateKok': 'CountExits'}, inplace=True)

frequency_table = cull3['CountExits'].value_counts()
print(f"No. of cows with multiple exit records in cow database:")
print(frequency_table)

# Assume last record within cow as culling date
# sort_values places missing ExitDateKok last by default, so a record without an exit date
# is the one kept when a cow has both dated and undated records.
cull2 = cull2.groupby('SE_Number').tail(1)
cull2.to_csv("cullingKok.csv", index=False)

# Merge
# The join key includes the herd: calvings recorded in a herd other than the one on the
# kept exit record receive no entry/exit information.
dfKok = calving.merge(cull2, on=["FarmName_Pseudo", "SE_Number"], how="left")

# Make upper_limit to sort dry off date later
# upper_limit = the cow's next calving date; missing for each cow's last recorded calving.
dfKok = dfKok.sort_values(by=["SE_Number", "CalvingDateKok"]).reset_index(drop=True)
dfKok["upper_limit"] = dfKok.groupby(["SE_Number"])["CalvingDateKok"].shift(-1)

dfKok.to_csv("dfKok.csv", index=False)

In [None]:
#
#
# ADD DRY OFF DATA FROM COW DATABASE
# Derives dry-off dates from the milk sampling export and joins them to dfKok.
# Writes DryOffKok1.csv (all sampling rows with the InfoChange flag) and DryOffKok.csv
# (the selected dry-off rows).
# NOTE(review): the file name starts with "Kok_Kok_", unlike the other Kok_* inputs - confirm
# this is the real file name and not a typo.
dry_offKok = pd.read_csv("Kok_Kok_CowMilkSampling240829.csv", delimiter=';', low_memory=False)
col_keep = ["BirthID", "ActiveHerdNumber", "SamplingDate", "VariousSystemInfo"]
dry_offKok = dry_offKok[col_keep]
dry_offKok.rename(columns={'BirthID': 'SE_Number', "ActiveHerdNumber": "FarmName_Pseudo",
                           "SamplingDate": "DryOffDateKok"}, inplace=True)
col_keep = ["FarmName_Pseudo", "SE_Number", "DryOffDateKok", "VariousSystemInfo"]
dry_offKok = dry_offKok[col_keep]

# Find dry off data and output last record for each cow (ie last time she's sampled for test day sampling)
# and by "kod: 02"
dry_offKok = dry_offKok.sort_values(by=["SE_Number", "DryOffDateKok"])
# Create a new column to track when the "VariousSystemInfo" changes, want the last one ie -1
# InfoChange is True when the cow's NEXT sampling row has a different VariousSystemInfo, and
# also on each cow's last row (shift(-1) yields a missing value there, which compares unequal).
dry_offKok['InfoChange'] = (dry_offKok.groupby('SE_Number')['VariousSystemInfo'].shift(-1) !=
                            dry_offKok['VariousSystemInfo'])
dry_offKok.to_csv("DryOffKok1.csv", index=False)
# Keep only the rows where there is a change in "VariousSystemInfo" AND "VariousSystemInfo" is 2 ie dry off
# i.e. the last row of every consecutive run of "kod: 02" rows within a cow.
dry_offKok = dry_offKok[(dry_offKok["VariousSystemInfo"] == "kod: 02") & (dry_offKok["InfoChange"] == True)]
# Drop columns
dry_offKok = dry_offKok.drop(columns=['InfoChange', "VariousSystemInfo"])
dry_offKok.to_csv("DryOffKok.csv", index=False)

print(f"No. dry off records in cow database: {dry_offKok.shape}")  # 12,513
dry_offKok2 = dry_offKok.drop_duplicates(subset=["SE_Number"])
print(f"No. cows with dry off records in cow database: {dry_offKok2.shape}")  # 6,447

# Merge
# Joining on herd + cow pairs every calving row with every dry-off date of that cow; the
# rows are matched to the right lactation afterwards via the dryoff flag.
# This reassigns dfKok from itself, so the cell is not idempotent: re-run the entry/culling
# cell first if this cell has to be executed again.
dfKok = dfKok.merge(dry_offKok, on=["FarmName_Pseudo", "SE_Number"], how="left")
dfKok["CalvingDateKok"] = pd.to_datetime(dfKok["CalvingDateKok"])
dfKok["DryOffDateKok"] = pd.to_datetime(dfKok["DryOffDateKok"])
dfKok["upper_limit"] = pd.to_datetime(dfKok["upper_limit"])


def data(row):
    """Flag whether a dry-off date belongs to the lactation described by ``row``.

    Returns 1 when CalvingDateKok < DryOffDateKok <= upper_limit, otherwise 0.
    Comparisons against a missing date are false, so such rows yield 0.
    """
    inside_lactation = row["CalvingDateKok"] < row["DryOffDateKok"] <= row["upper_limit"]
    return 1 if inside_lactation else 0


# Flag each (calving, dry-off date) pair: 1 when the dry-off date falls inside that lactation.
dfKok["dryoff"] = dfKok.apply(data, axis=1)

# Keep all records with dry off dates fitted within lactation
df_ones = dfKok[dfKok['dryoff'] == 1]
# Keep last record where open and lacking dry off date
# Only ONE unmatched row per cow is kept (the last one in the current row order); other
# lactations whose rows are all flagged 0 do not appear in df_combined.
df_last_zero = dfKok[dfKok['dryoff'] == 0].groupby("SE_Number").tail(1)
# Concatenate dataframe and sort to maintain original order
df_combined = pd.concat([df_ones, df_last_zero])
df_combined = df_combined.sort_values(by=["SE_Number", "CalvingDateKok"]).reset_index(drop=True)
# Intermediate snapshot; the file is overwritten again at the end of this cell.
df_combined.to_csv("dfKok.csv", index=False)

# Put ExitDateKok as upper_limit if upper_limit is missing from calving date (mostly last lactation)
df_combined.loc[df_combined["upper_limit"].isna() & df_combined["ExitDateKok"].notna(), "upper_limit"] = (
    df_combined)["ExitDateKok"]

# Get today's date for current lactation when missing upper_limit after adjusting using ExitDateKok
# NOTE: this makes dfKok.csv depend on the day the notebook is run.
df_combined['TodayDate'] = pd.to_datetime('today').normalize()
df_combined.loc[df_combined["upper_limit"].isna() & df_combined["ExitDateKok"].isna(), "upper_limit"] = (
    df_combined)["TodayDate"]

# NOTE(review): the two fills above cover both the "exit date present" and "exit date
# missing" cases, so no upper_limit should still be missing here and this assignment
# appears to select no rows - confirm the intent.
df_combined.loc[df_combined["upper_limit"].isna()
                & (df_combined["DryOffDateKok"] < df_combined["CalvingDateKok"]), "upper_limit"] = (df_combined)["DryOffDateKok"]
# Rows kept only as "open" lactations must not carry the unrelated dry-off date they were paired with.
df_combined.loc[df_combined["dryoff"] == 0, "DryOffDateKok"] = np.nan
df_combined.to_csv("dfKok.csv", index=False)

In [None]:
#
#
# ADD BREED INFORMATION FROM COW DATABASE
# Reads sire / dam / maternal-grandsire breed per cow from the lineage export, discards cows
# whose breed information is inconsistent, and adds the result to dfKok.csv.
df = pd.read_csv("Kok_Lineage240821.csv", delimiter=';', low_memory=False)
df.rename(columns={"BirthID": "SE_Number", "ActiveHerdNumber": "FarmName_Pseudo", "Father_Breed": "SireBreedKok",
                   "Mother_Breed": "DamBreedKok", "MothersFather_Breed": "MGSBreedKok"}, inplace=True)
col_keep = ["FarmName_Pseudo", "SE_Number", "SireBreedKok", "DamBreedKok", "MGSBreedKok"]
df = df[col_keep]

# Check for duplicates and sort
print(df.shape)  # 24,067
df = df.sort_values(by=["FarmName_Pseudo", "SE_Number"])
# df2: one row per distinct (cow, sire breed, dam breed, MGS breed) combination.
df2 = df.drop_duplicates(subset=["SE_Number", "SireBreedKok", "DamBreedKok", "MGSBreedKok"])
print(f"No. cows with breed data: {df2.shape}")  # 22,653 => 1414 duplicates, all okay
# df3: one row per cow; the difference in row count to df2 comes from cows with more than one breed combination.
df3 = df.drop_duplicates(subset=["SE_Number"])
print(f"No. cows with different breed data recorded: {df3.shape}")  # 22,607 => 46 cows with conflicting breed

# The 46 cows with conflicting breed are a mess, e.g. LIM and SJB for the same sire, remove!
# A cow appearing more than once in df2 has conflicting breed records; flag all of its rows.
duplicate_records = df2[df2['SE_Number'].duplicated(keep=False)]
duplicate_records = duplicate_records.copy()
duplicate_records["DupBreed"] = 1
col_keep = ["SE_Number", "DupBreed"]
duplicate_records = duplicate_records[col_keep]
df_ras = df2.merge(duplicate_records, on=["SE_Number"], how="left")
# Keeps cows without the flag. The DupBreed column itself (now all missing) stays in df_ras
# and is therefore carried into kok_ras.csv and dfKok.csv.
df_ras = df_ras[df_ras['DupBreed'] != 1]
df_ras.to_csv("kok_ras.csv", index=False)

# dfKok is reloaded from disk, so all columns come back with the dtypes read_csv infers
# (dates are plain strings again from here on).
dfKok = pd.read_csv("dfKok.csv", low_memory=False)
dfKok = dfKok.merge(df_ras, on=["FarmName_Pseudo", "SE_Number"], how="left")
dfKok.to_csv("dfKok.csv", index=False)

In [None]:
#
#
# ADD INSEMINATION FROM COW DATABASE
# Extracts insemination events from the reproduction export and attaches each one to the
# lactation it falls in. After this cell dfKok.csv holds one row per insemination.
df13 = pd.read_csv("Kok_Reproduction240820.csv", delimiter=';', low_memory=False)

# Create a boolean mask where SireBull_SE_Number is NE 0
# A missing SireBull_SE_Number also compares unequal to 0 and is therefore counted as well.
mask = df13["SireBull_SE_Number"] != 0
# Sum the mask to count the number of True values (i.e., the number of not 0s)
count_non_zeros = mask.sum()
print(f"Number of events with sire ID in cow database: {count_non_zeros}")  # 95,369

# Count occurrences of each unique value in the EventType column
value_counts = df13["EventType"].value_counts()
print(value_counts)
"""
EventType
Inseminering               47926
Dräktighetsundersökning    41625
Behandling                  2120
Embryoinlägg                2092
Betäckning                  1433
Fri bet                      173
"""
# Keep only data from insemination
df14 = df13[df13["EventType"] == "Inseminering"]

# Check for duplicates and sort
print(f"No. insemination records in raw file in cow database: {df14.shape}")  # 47,926, 27col
df14 = df14.sort_values(by=["BirthID", "EventDate"])
# At most one insemination per cow and date is kept.
df15 = df14.drop_duplicates(subset=["BirthID", "EventDate"])
print(f"No. unique inseminations in cow database: {df15.shape}")  # 43,951, 27col

df15 = df15.copy()
df15.rename(columns={"ActiveHerdNumber": "FarmName_Pseudo", "BirthID": "SE_Number", "EventDate": "InseminationDateKok",
                     "SireBull_SE_Number": "SireBull_SE_NumberKok"}, inplace=True)
col_keep = ["FarmName_Pseudo", "SE_Number", "InseminationDateKok", "SireBull_SE_NumberKok"]
df15 = df15[col_keep]

"""
# Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
df15 = df15[df15["SE_Number"].isin(SE_Number)]
"""
"""
# Subset chosen cows
SE_Number = ["SE-064c0cec-1189", "SE-5c06d92d-3145", "SE-5c06d92d-3177", "SE-5b581702-1742",
             "SE-5b581702-1851", "SE-5c06d92d-2915", "SE-5b581702-2002", "SE-5c06d92d-2515"]
df15 = df15[df15["SE_Number"].isin(SE_Number)]
"""

# Merge with calving data
# pd.merge defaults to an inner join: cows/herds without any insemination record are
# dropped from the frame that is written back to dfKok.csv below.
dfkok2 = pd.read_csv("dfKok.csv", low_memory=False)
dfins6 = pd.merge(dfkok2, df15, on=["FarmName_Pseudo", "SE_Number"])

# Filter df for relevant inseminations sorted to correct lactation
# Both sides were read from CSV without date parsing, so this compares the date columns as
# read - presumably ISO-formatted strings whose text order equals date order; TODO confirm.
dfins6 = dfins6[(dfins6["InseminationDateKok"] >= dfins6["CalvingDateKok"]) & (dfins6["InseminationDateKok"] <= dfins6["upper_limit"])]
dfins6.to_csv("dfKok.csv", index=False)

In [None]:
#
#
# ADD PREGNANCY CHECKS FROM COW DATABASE
# Make next_ins to sort pregnancy checks
# This part builds df21: every insemination of a cow paired with every pregnancy check of
# that cow, with all date columns parsed. The pairs are filtered further down.
df20 = pd.read_csv("dfKok.csv", low_memory=False)
# df20 = pd.DataFrame(df20, columns=["SE_Number", "LactationNumberKok", "InseminationDateKok", "upper_limit"])

# next_ins = the following insemination date within the same cow and lactation, taken in
# the row order of dfKok.csv (presumably chronological per cow - TODO confirm).
df20['next_ins'] = df20.groupby(['SE_Number', 'LactationNumberKok'])['InseminationDateKok'].shift(-1)

# only keep next_ins where falls within range
# dfins7 is only written to dfKok2.csv here; it is not used again in this cell.
dfins7 = df20[(df20["next_ins"] >= df20["InseminationDateKok"]) & (df20["next_ins"] <= df20["upper_limit"])]
col_keep = ["SE_Number", "LactationNumberKok", "InseminationDateKok", "next_ins", "upper_limit"]
dfins7 = dfins7[col_keep]
dfins7.to_csv("dfKok2.csv", index=False)

"""
#Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
dfins8 = dfins8[dfins8["SE_Number"].isin(SE_Number)]
#dfins8.to_csv("dataframe3.csv", index=False)
"""

# Load pregnancy check data, check for duplicates, sort
preg = pd.read_csv("Kok_Reproduction240820.csv", delimiter=';', low_memory=False)
# Keep only data from pregnancy checks
preg = preg[preg["EventType"] == "Dräktighetsundersökning"]

col_keep = ["BirthID", "EventDate", "PregnancyStatus"]
preg = preg[col_keep]
preg.rename(columns={"BirthID": "SE_Number", "EventDate": "PregnancyCheckDateKok",
                     "PregnancyStatus": "PregnancyStatusKok"}, inplace=True)

print(f"No. pregnancy checks in cow database: {preg.shape}")  # 41,625 events, 3col
preg = preg.drop_duplicates(subset=["SE_Number", "PregnancyCheckDateKok"])
print(f"No. unique pregnancy checks in cow database: {preg.shape}")    # 38,320 unique events, 3col
preg = preg.sort_values(by=["SE_Number", "PregnancyCheckDateKok"])

# Add to subset df
col_keep = ["SE_Number", "LactationNumberKok", "InseminationDateKok", "next_ins", "upper_limit"]
df20 = df20[col_keep]
# Join on cow only: each insemination row is repeated once per pregnancy check of that cow.
df21 = df20.join(preg.set_index(["SE_Number"]), on=["SE_Number"])

# Ensure datetime conversion
df21['InseminationDateKok'] = pd.to_datetime(df21['InseminationDateKok'])
df21['PregnancyCheckDateKok'] = pd.to_datetime(df21['PregnancyCheckDateKok'])
df21['next_ins'] = pd.to_datetime(df21['next_ins'])
df21['upper_limit'] = pd.to_datetime(df21['upper_limit'])

# Initialize 'C' column with NaN
df21['C'] = np.nan


# Define the filtering function
def filter_pregcheck(row):
    """Decide whether a pregnancy check belongs to the insemination in ``row``.

    The check must lie on or after InseminationDateKok and on or before the end of the
    insemination's window. The window ends at next_ins when a following insemination
    exists, otherwise at upper_limit.

    Returns "Yes" when the check is inside the window, otherwise "No".
    """
    window_end = row["upper_limit"] if pd.isna(row["next_ins"]) else row["next_ins"]
    check_date = row["PregnancyCheckDateKok"]
    if (check_date >= row["InseminationDateKok"]) and (check_date <= window_end):
        return "Yes"
    return "No"


# Apply the filter function to each row
# Keeps only the (insemination, pregnancy check) pairs where the check falls inside the
# insemination's window, then attaches them to the master frame in dfKok.csv.
df21['C'] = df21.apply(filter_pregcheck, axis=1)
dfins10 = df21[df21["C"] == "Yes"]
# Overwrites the dfKok2.csv written earlier in this cell.
dfins10.to_csv("dfKok2.csv", index=False)

col_keep = ["SE_Number", "LactationNumberKok", "InseminationDateKok", "PregnancyCheckDateKok", "PregnancyStatusKok"]
dfins11 = dfins10[col_keep]

# Convert the 'InseminationDate' column from datetime64[ns] to object for merging
# The merge key in dfKok.csv is text, so the parsed dates are turned back into strings.
# NOTE(review): the join only matches if this string form is identical to the date text
# stored in dfKok.csv - confirm with the two dtype printouts below.
print(dfins11.dtypes)
dfins11 = dfins11.copy()
dfins11['InseminationDateKok'] = dfins11['InseminationDateKok'].astype(str)

# Add to master df
dfins12 = pd.read_csv("dfKok.csv")
print(dfins12.dtypes)
# Left join: inseminations without a matching check keep missing check columns; an
# insemination with several matching checks is repeated once per check.
dfins13 = dfins12.merge(dfins11, on=["SE_Number", "LactationNumberKok", "InseminationDateKok"], how="left")

"""
#Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
dfins13 = dfins13[dfins13["SE_Number"].isin(SE_Number)]
"""
dfins13.to_csv("dfKok.csv", index=False)

LOAD DELPRO DATA

Requires the following data to create dfDelPro.csv:
- Del_Calving
- Del_DryOff
- Del_Lactation
- Del_Cow
- Del_Insemination
- Del_PregnancyCheck

In [None]:
#
#
# LOAD DELPRO CALVING DATA
# One row per cow and calving date, with upper_limit = that cow's next calving date
# (missing for the cow's last recorded calving). Note: this rebinds `calving`, which until
# now held the cow database calvings.
col_keep = ["FarmName_Pseudo", "SE_Number", "CalvingDate"]
calving = (
    pd.read_csv("Del_Calving240823.csv", delimiter=';', low_memory=False)[col_keep]
    .rename(columns={"CalvingDate": "CalvingDateDelPro"})
    .sort_values(by=["SE_Number", "CalvingDateDelPro"])
    .drop_duplicates(subset=["SE_Number", "CalvingDateDelPro"])
    .assign(upper_limit=lambda frame: frame.groupby(["SE_Number"])["CalvingDateDelPro"].shift(-1))
)
calving.to_csv("calving_delpro.csv", index=False)

# Summary counts for the DelPro calving data. Nothing here modifies `calving`;
# `calving2` is a scratch frame that is overwritten for each statistic.
# Lactation records per herd (non-missing calving dates per FarmName_Pseudo).
calving2 = calving.groupby(["FarmName_Pseudo"])["CalvingDateDelPro"].count().reset_index()
calving2.rename(columns={'CalvingDateDelPro': 'CountLact'}, inplace=True)
print(f"No. of lactation records in DelPro in different herds: \n", calving2.to_string(index=False))
calving2 = calving.drop_duplicates(subset=["SE_Number", "CalvingDateDelPro"])
print(f"No. of lactation records in DelPro: {calving2.shape}")  # 10,163

# Cows per herd: count DISTINCT SE_Number values. count() would count calving rows per herd
# and merely repeat the lactation table above. A cow that calved in two herds is counted
# once in each of them.
calving2 = calving.groupby(["FarmName_Pseudo"])["SE_Number"].nunique().reset_index()
calving2.rename(columns={'SE_Number': 'CountCows'}, inplace=True)
print(f"No. of cows with calving data in DelPro in different herds: \n", calving2.to_string(index=False))
calving2 = calving.drop_duplicates(subset=["SE_Number"])
print(f"No. of cows with calving data in DelPro: {calving2.shape}")  # 5,401

# LOAD DELPRO CULLING DATA
# Reads birth date and cull decision per cow from the DelPro cow export and attaches it to
# the DelPro calving rows. Writes cull_delpro.csv and dfDelPro.csv.
culling = pd.read_csv("Del_Cow240823.csv", delimiter=';', low_memory=False)
col_keep = ["SE_Number", "BirthDate", "CullDecisionDate", "CullReason1", "CullReason2"]
culling = culling[col_keep]
culling = culling.sort_values(by=["SE_Number", "CullDecisionDate"])
print(f"No. records in raw culling file: {culling.shape}")  # 25,105
culling2 = culling.drop_duplicates(subset=["SE_Number", "CullDecisionDate"])
# Report the de-duplicated frame; printing `culling` here would repeat the raw count.
print(f"No. records in culling file: {culling2.shape}")  # 25,105
culling2.to_csv("cull_delpro.csv", index=False)

# Merge
# Joined on cow only; a cow with several distinct cull decision dates in culling2 gets each
# of its calving rows repeated once per date.
for_my_rec = calving.merge(culling2, on=["SE_Number"], how="left")
# Where there is no later calving, the cull decision date closes the lactation.
for_my_rec.loc[pd.isna(for_my_rec["upper_limit"]), "upper_limit"] = for_my_rec["CullDecisionDate"]
for_my_rec.to_csv("dfDelPro.csv", index=False)

In [None]:
#
#
# LOAD DELPRO CULLING DATA
# NOTE: this cell repeats the culling load/merge that already ends the DelPro calving cell.
# It rebuilds for_my_rec from `calving`, so running it again is harmless, but one of the two
# copies is redundant.
culling = pd.read_csv("Del_Cow240823.csv", delimiter=';', low_memory=False)
col_keep = ["SE_Number", "BirthDate", "CullDecisionDate", "CullReason1", "CullReason2"]
culling = culling[col_keep]
culling = culling.sort_values(by=["SE_Number", "CullDecisionDate"])
print(f"No. records in raw culling file: {culling.shape}")  # 25,105
culling2 = culling.drop_duplicates(subset=["SE_Number", "CullDecisionDate"])
# Report the de-duplicated frame; printing `culling` here would repeat the raw count.
print(f"No. records in culling file: {culling2.shape}")  # 25,105
culling2.to_csv("cull_delpro.csv", index=False)

# Merge
# Joined on cow only; a cow with several distinct cull decision dates in culling2 gets each
# of its calving rows repeated once per date.
for_my_rec = calving.merge(culling2, on=["SE_Number"], how="left")
# Where there is no later calving, the cull decision date closes the lactation.
for_my_rec.loc[pd.isna(for_my_rec["upper_limit"]), "upper_limit"] = for_my_rec["CullDecisionDate"]
for_my_rec.to_csv("dfDelPro.csv", index=False)

In [None]:
#
#
# LOAD DELPRO DRY OFF DATA
# Reads the DelPro dry-off dates, keeps one row per cow and date, and pairs every calving
# row in for_my_rec with every dry-off date of the same cow. Writes dry_off_delpro.csv.
dry_off = pd.read_csv("Del_DryOff240823.csv", delimiter=';', low_memory=False)
col_keep = ["SE_Number", "DryOffDate"]
dry_off = dry_off[col_keep]
dry_off = dry_off.sort_values(by=["SE_Number", "DryOffDate"])
dry_off = dry_off.drop_duplicates(subset=["SE_Number", "DryOffDate"])
print(f"No. records in dry off file: {dry_off.shape}")  # 5,305
dry_off.to_csv("dry_off_delpro.csv", index=False)

# This reassigns for_my_rec from itself, so the cell is not idempotent: re-run the DelPro
# culling cell first if this cell has to be executed again.
for_my_rec = for_my_rec.merge(dry_off, on=["SE_Number"], how="left")
# Parse the three date columns so the lactation-window comparison works on real dates.
for_my_rec["CalvingDateDelPro"] = pd.to_datetime(for_my_rec["CalvingDateDelPro"])
for_my_rec["DryOffDate"] = pd.to_datetime(for_my_rec["DryOffDate"])
for_my_rec["upper_limit"] = pd.to_datetime(for_my_rec["upper_limit"])


def data(row):
    """Flag whether a DelPro dry-off date belongs to the lactation described by ``row``.

    Returns 1 when CalvingDateDelPro < DryOffDate <= upper_limit, otherwise 0.
    Comparisons against a missing date are false, so such rows yield 0.
    """
    inside_lactation = row["CalvingDateDelPro"] < row["DryOffDate"] <= row["upper_limit"]
    return 1 if inside_lactation else 0


# Flag each (calving, dry-off date) pair: 1 when the dry-off date falls inside that lactation.
for_my_rec["dryoff"] = for_my_rec.apply(data, axis=1)

# Keep all records with dry off dates fitted within lactation
df_ones = for_my_rec[for_my_rec['dryoff'] == 1]
# Keep last record where open and lacking dry off date
# Only ONE unmatched row per cow is kept (the last one in the current row order).
df_last_zero = for_my_rec[for_my_rec['dryoff'] == 0].groupby("SE_Number").tail(1)
# Concatenate dataframe and sort to maintain original order
# NOTE: df_combined is also the name used by the cow database dry-off cell; later cells see
# whichever of the two was executed last.
df_combined = pd.concat([df_ones, df_last_zero])
df_combined = df_combined.sort_values(by=["SE_Number", "CalvingDateDelPro"]).reset_index(drop=True)
# Intermediate snapshot; the file is overwritten again at the end of this cell.
df_combined.to_csv("dfDelPro.csv", index=False)

# Get today's date for current lactation missing upper_limit
# NOTE: this makes dfDelPro.csv depend on the day the notebook is run.
df_combined['TodayDate'] = pd.to_datetime('today').normalize()
df_combined.loc[df_combined["upper_limit"].isna() & df_combined["CullDecisionDate"].isna(), "upper_limit"] = (
    df_combined)["TodayDate"]

# NOTE(review): the culling cell already copies CullDecisionDate into a missing upper_limit,
# and the fill above handles rows without a cull date, so this assignment presumably selects
# few or no rows - confirm the intent.
df_combined.loc[df_combined["upper_limit"].isna()
                & (df_combined["DryOffDate"] < df_combined["CalvingDateDelPro"]), "upper_limit"] = df_combined[
    "DryOffDate"]
# Rows kept only as "open" lactations must not carry the unrelated dry-off date they were paired with.
df_combined.loc[df_combined["dryoff"] == 0, "DryOffDate"] = np.nan
df_combined.to_csv("dfDelPro.csv", index=False)

In [None]:
#
#
# LOAD DELPRO LACTATION NUMBER
# Reads lactation numbers per cow and pairs them with the DelPro lactation rows in
# df_combined; the pairs are filtered to the matching lactation window afterwards.
lact = pd.read_csv("Del_Lactation240823.csv", delimiter=';', low_memory=False)
col_keep = ["SE_Number", "LactationInfoDate", "LactationNumber"]
lact = lact[col_keep]
lact = lact.sort_values(by=["SE_Number", "LactationInfoDate", "LactationNumber"])
# After the sort, this keeps the earliest LactationInfoDate for each cow and lactation number.
lact = lact.drop_duplicates(subset=["SE_Number", "LactationNumber"])
lact.to_csv("lact.csv", index=False)

# NOTE: df_combined must be the DelPro frame here. The cow database dry-off cell assigns
# the same name, so this depends on the DelPro dry-off cell having been run last.
for_my_rec2 = lact.merge(df_combined, on=["SE_Number"], how="left")
# Rows whose LactationInfoDate is exactly the text "2022-05" are removed before parsing.
# NOTE(review): presumably a malformed date in the export - confirm why this value is excluded.
for_my_rec2 = for_my_rec2[for_my_rec2["LactationInfoDate"] != "2022-05"]
for_my_rec2["LactationInfoDate"] = pd.to_datetime(for_my_rec2["LactationInfoDate"])


def data1(row):
    """Flag whether a lactation-info record falls inside the lactation described by ``row``.

    Returns 1 when CalvingDateDelPro <= LactationInfoDate <= upper_limit (both ends
    inclusive), otherwise 0. Comparisons against a missing date are false, so such
    rows yield 0.
    """
    inside_lactation = row["CalvingDateDelPro"] <= row["LactationInfoDate"] <= row["upper_limit"]
    return 1 if inside_lactation else 0


# Keep only the pairs where the lactation-info date lies inside the lactation window, which
# assigns a LactationNumber to each DelPro calving row. Rows that match no window are dropped.
for_my_rec2["lact"] = for_my_rec2.apply(data1, axis=1)
for_my_rec2 = for_my_rec2[for_my_rec2['lact'] == 1]
for_my_rec2.to_csv("dfDelPro.csv", index=False)
print(f"No. of lactations in dataframe: {for_my_rec2.shape}")  # 10,361 lact - when DelPro data is used
# One row per cow, used only for the count printed below.
for_my_rec3 = for_my_rec2.drop_duplicates("SE_Number")
print(f"No. of cows in dataframe: {for_my_rec3.shape}")  # 5,397

In [None]:
#
#
# ADD DELPRO BREED INFORMATION
# Adds the DelPro breed name per cow, reduces the frame to the columns used downstream and
# gives them their *DelPro names. Writes breed.csv and dfDelPro.csv.
breed = pd.read_csv("Del_Cow240823.csv", delimiter=';', low_memory=False)
col_keep = ["SE_Number", "BreedName"]
breed = breed[col_keep]
breed = breed.sort_values(by=["SE_Number", "BreedName"])
# A cow recorded with more than one BreedName keeps one row per name and will therefore
# duplicate its lactation rows in the merge below.
breed = breed.drop_duplicates(subset=["SE_Number", "BreedName"])
breed.to_csv("breed.csv", index=False)

# This reassigns for_my_rec2 from itself and renames its columns, so the cell is not
# idempotent: re-run the lactation-number cell first if it has to be executed again.
for_my_rec2 = for_my_rec2.merge(breed, on=["SE_Number"], how="left")  # - when Del_Calving is used
# for_my_rec2 = df_combined.merge(breed, on=["SE_Number"], how="left") - when cow database is used
col_keep = ["FarmName_Pseudo", "SE_Number", "BreedName", "LactationNumber", "CalvingDateDelPro", "upper_limit",
            "CullDecisionDate", "CullReason1", "CullReason2", "DryOffDate"]
for_my_rec2 = for_my_rec2[col_keep]
# NOTE(review): "BirthDate" is not in col_keep above, so the BirthDate -> BirthDateDelPro
# entry renames nothing and the birth date is dropped - confirm whether it should be kept.
for_my_rec2.rename(columns={"BreedName": "BreedNameDelPro", "LactationNumber": "LactationNumberDelPro",
                            "BirthDate": "BirthDateDelPro",
                            "upper_limit": "UpperLimitDelPro", "CullDecisionDate": "CullDecisionDateDelPro",
                            "CullReason1": "CullReason1DelPro", "CullReason2": "CullReason2DelPro",
                            "DryOffDate": "DryOffDateDelPro"}, inplace=True)
for_my_rec2.to_csv("dfDelPro.csv", index=False)

In [None]:
#
#
# ADD INSEMINATION DATA FROM DELPRO
# Load data, keep cowid, insdate, check for duplicates, sort
# After this cell dfDelPro.csv holds one row per insemination, attached to the lactation
# whose window (calving date .. UpperLimitDelPro) contains it.
dfins = pd.read_csv("Del_Insemination240823.csv", delimiter=';', low_memory=False)
col_keep = ["FarmName_Pseudo", "SE_Number", "InseminationDate", "Breeder"]
dfins2 = dfins[col_keep]
print(dfins2.shape)  # 18,775 insemination events, 6col
# At most one insemination per cow and date is kept.
dfins2 = dfins2.drop_duplicates(subset=["SE_Number", "InseminationDate"])
print(dfins2.shape)  # 18,689 unique insemination events, 6col
dfins2 = dfins2.sort_values(by=["SE_Number", "InseminationDate"])

dfins2.rename(columns={"InseminationDate": "InseminationDateDelPro", "Breeder": "BreederDelPro"}, inplace=True)

# Merge with dataframe
# pd.merge defaults to an inner join: cows/herds without any insemination record are
# dropped from the frame that is written back to dfDelPro.csv below.
delpro = pd.read_csv("dfDelPro.csv", low_memory=False)
dfins4 = pd.merge(delpro, dfins2, on=["FarmName_Pseudo", "SE_Number"])
"""
# Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
dfins4 = dfins4[dfins4["SE_Number"].isin(SE_Number)]
"""
# Filter df for relevant insemination sorted to correct lactation
# Both sides were read from CSV without date parsing, so this compares the date columns as
# read - presumably ISO-formatted strings whose text order equals date order; TODO confirm.
dfins5 = dfins4[(dfins4["InseminationDateDelPro"] >= dfins4["CalvingDateDelPro"]) & (dfins4["InseminationDateDelPro"] <= dfins4["UpperLimitDelPro"])]
dfins5.to_csv("dfDelPro.csv", index=False)

In [None]:
#
#
# ADD PREGNANCY CHECKS FROM DELPRO
# Make next_ins to sort pregnancy checks
# This part builds df21: every insemination of a cow paired with every pregnancy check of
# that cow, with all date columns parsed. The pairs are filtered further down.
df20 = pd.read_csv("dfDelPro.csv", low_memory=False)
# next_ins = the following insemination date within the same cow and lactation, taken in
# the row order of dfDelPro.csv (presumably chronological per cow - TODO confirm).
df20['next_ins'] = df20.groupby(['SE_Number', 'LactationNumberDelPro'])['InseminationDateDelPro'].shift(-1)

# only keep next_ins where falls within range
# dfins7 is only written to dfDelPro2.csv here; it is not used again in this cell.
dfins7 = df20[(df20["next_ins"] >= df20["InseminationDateDelPro"]) & (df20["next_ins"] <= df20["UpperLimitDelPro"])]
col_keep = ["SE_Number", "LactationNumberDelPro", "InseminationDateDelPro", "next_ins", "UpperLimitDelPro"]
dfins7 = dfins7[col_keep]
dfins7.to_csv("dfDelPro2.csv", index=False)

"""
#Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
dfins8 = dfins8[dfins8["SE_Number"].isin(SE_Number)]
#dfins8.to_csv("dataframe3.csv", index=False)
"""

# Load pregnancy check data, check for duplicates, sort
preg = pd.read_csv("Del_PregnancyCheck240823.csv", delimiter=';', low_memory=False)

col_keep = ["FarmName_Pseudo", "SE_Number", "PregnancyCheckDate", "PregnancyCheckResult"]
preg = preg[col_keep]
preg.rename(columns={"PregnancyCheckDate": "PregnancyCheckDateDelPro",
                     "PregnancyCheckResult": "PregnancyCheckResultDelPro"}, inplace=True)

# These counts describe the DelPro export, so the messages name DelPro (they previously
# said "cow database", copied from the cow database cell).
print(f"No. pregnancy checks in DelPro: {preg.shape}")  # 14,169 events, 4col
preg = preg.drop_duplicates(subset=["SE_Number", "PregnancyCheckDateDelPro"])
print(f"No. unique pregnancy checks in DelPro: {preg.shape}")    # 14,146 unique events, 4col
preg = preg.sort_values(by=["SE_Number", "PregnancyCheckDateDelPro"])

# Add to subset df
col_keep = ["SE_Number", "LactationNumberDelPro", "InseminationDateDelPro", "next_ins", "UpperLimitDelPro"]
df20 = df20[col_keep]
# Join on cow only: each insemination row is repeated once per pregnancy check of that cow.
df21 = df20.join(preg.set_index(["SE_Number"]), on=["SE_Number"])
df21.to_csv("dfDelPro2.csv", index=False)

# Ensure datetime conversion
df21['InseminationDateDelPro'] = pd.to_datetime(df21['InseminationDateDelPro'])
df21['PregnancyCheckDateDelPro'] = pd.to_datetime(df21['PregnancyCheckDateDelPro'])
df21['next_ins'] = pd.to_datetime(df21['next_ins'])
df21['UpperLimitDelPro'] = pd.to_datetime(df21['UpperLimitDelPro'])

# Initialize 'C' column with NaN
df21['C'] = np.nan


# Define the filtering function
def filter_pregcheck(row):
    """Decide whether a DelPro pregnancy check belongs to the insemination in ``row``.

    The check must lie on or after InseminationDateDelPro and on or before the end of the
    insemination's window. The window ends at next_ins when a following insemination
    exists, otherwise at UpperLimitDelPro.

    Returns "Yes" when the check is inside the window, otherwise "No".
    """
    window_end = row["UpperLimitDelPro"] if pd.isna(row["next_ins"]) else row["next_ins"]
    check_date = row["PregnancyCheckDateDelPro"]
    if (check_date >= row["InseminationDateDelPro"]) and (check_date <= window_end):
        return "Yes"
    return "No"


# Apply the filter function to each row
# Keeps only the (insemination, pregnancy check) pairs where the check falls inside the
# insemination's window, then attaches them to the master frame in dfDelPro.csv.
df21['C'] = df21.apply(filter_pregcheck, axis=1)
dfins10 = df21[df21["C"] == "Yes"]
# Overwrites the dfDelPro2.csv written earlier in this cell.
dfins10.to_csv("dfDelPro2.csv", index=False)

col_keep = ["SE_Number", "LactationNumberDelPro", "InseminationDateDelPro", "PregnancyCheckDateDelPro",
            "PregnancyCheckResultDelPro"]
dfins11 = dfins10[col_keep]

# Convert the 'InseminationDate' column from datetime64[ns] to object for merging
# The merge key in dfDelPro.csv is text, so the parsed dates are turned back into strings.
# NOTE(review): the join only matches if this string form is identical to the date text
# stored in dfDelPro.csv - confirm with the two dtype printouts below.
print(dfins11.dtypes)
dfins11 = dfins11.copy()
dfins11['InseminationDateDelPro'] = dfins11['InseminationDateDelPro'].astype(str)

# Add to master df
dfins12 = pd.read_csv("dfDelPro.csv")
print(dfins12.dtypes)
# Left join: inseminations without a matching check keep missing check columns; an
# insemination with several matching checks is repeated once per check.
dfins13 = dfins12.merge(dfins11, on=["SE_Number", "LactationNumberDelPro", "InseminationDateDelPro"], how="left")
"""
#Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
dfins13 = dfins13[dfins13["SE_Number"].isin(SE_Number)]
"""
dfins13.to_csv("dfDelPro.csv", index=False)

PAIRING COW DATABASE AND DELPRO DATA
- Cow database data is used as the primary source; values missing there are filled from DelPro.

Creates the following datasets:
- calving.csv
- culling.csv
- dry_off.csv
- breed.csv
- insemination.csv
- pregnancy_checks.csv
- updateDF.csv

In [None]:
#
#
# PAIRING COW DATABASE AND DELPRO DATA FOR CALVING
cKok = pd.read_csv("dfKok.csv", low_memory=False)
cKok["CalvingDate"] = cKok["CalvingDateKok"]
col_keep = ["FarmName_Pseudo", "SE_Number", "CalvingDate", "CalvingDateKok", "LactationNumberKok"]
cKok = cKok[col_keep]
cKok = cKok.drop_duplicates(subset=["SE_Number", "CalvingDateKok"])

cDel = pd.read_csv("dfDelPro.csv", low_memory=False)
cDel["CalvingDate"] = cDel["CalvingDateDelPro"]
col_keep = ["FarmName_Pseudo", "SE_Number", "CalvingDate", "CalvingDateDelPro", "LactationNumberDelPro"]
cDel = cDel[col_keep]
cDel = cDel.drop_duplicates(subset=["SE_Number", "CalvingDateDelPro"])

df_sum = pd.merge(cKok, cDel, on=["FarmName_Pseudo", "SE_Number", "CalvingDate"], how='outer')
df_sum = df_sum.sort_values(by=["FarmName_Pseudo", "SE_Number", "CalvingDate"])

"""
# Subset chosen cow - stämmer!
# SE-169e580a-3418 has 5 lactations, 4 in cow database (missing lact 3), last three in DelPro
SE_Number = ["SE-064c0cec-1189"]
df_sum = df_sum[df_sum["SE_Number"].isin(SE_Number)]
"""

df_sum['LactationNumber'] = df_sum['LactationNumberKok'].fillna(df_sum['LactationNumberDelPro'])
col_keep = ["FarmName_Pseudo", "SE_Number", "CalvingDate", "LactationNumber"]
df_sum = df_sum[col_keep]
df_sum.to_csv("calving.csv", index=False)

In [None]:
#
#
# PAIRING COW DATABASE AND DELPRO FOR CULLING
# Builds culling.csv. Cow database exit data is the primary source; DelPro cull data only
# fills values the cow database lacks, and always from the SAME cow (matched on SE_Number).
cKok2 = pd.read_csv("dfKok.csv", low_memory=False)
cKok2 = cKok2.drop_duplicates(subset=["SE_Number", "ExitDateKok"])
cKok2["CullingDate"] = cKok2["ExitDateKok"]
col_keep = ["FarmName_Pseudo", "SE_Number", "CullingDate", "ExitDateKok", "ExitReason_PrimaryReasonKok",
            "ExitReason_SecondaryReason1Kok", "ExitReason_SecondaryReason2Kok"]
cKok2 = cKok2[col_keep]
# .copy() so the column assignments below act on an independent frame, not on a slice.
cKok2 = cKok2.drop_duplicates(subset=["SE_Number", "CullingDate"]).copy()

# NOTE(review): no cell shown in this notebook section writes "for_my_rec2.csv"; it has to
# exist from an earlier run or from elsewhere - confirm which DelPro snapshot is intended.
cDel2 = pd.read_csv("for_my_rec2.csv", low_memory=False)
cDel2 = cDel2.drop_duplicates(subset=["SE_Number", "CullDecisionDateDelPro"])
cDel2["CullingDate"] = cDel2["CullDecisionDateDelPro"]
col_keep = ["FarmName_Pseudo", "SE_Number", "CullingDate", "CullDecisionDateDelPro", "CullReason1DelPro",
            "CullReason2DelPro"]
cDel2 = cDel2[col_keep]
cDel2 = cDel2.drop_duplicates(subset=["SE_Number", "CullingDate"])

# One DelPro cull record per cow, indexed by SE_Number: rows without a cull decision date
# are ignored, and if a cow has several dated records the last one in sort order is used.
delpro_cull = (
    cDel2.dropna(subset=["CullDecisionDateDelPro"])
    .sort_values(by=["SE_Number", "CullDecisionDateDelPro"])
    .drop_duplicates(subset=["SE_Number"], keep="last")
    .set_index("SE_Number")
)

# Series.fillna(other_series) aligns on the row index. The row labels of cKok2 and cDel2
# come from two unrelated CSV files, so filling directly from cDel2 would take values from
# arbitrary cows. Mapping through SE_Number yields fill values that share cKok2's index and
# belong to the same cow.
cKok2['CullingDate'] = cKok2['ExitDateKok'].fillna(
    cKok2['SE_Number'].map(delpro_cull['CullDecisionDateDelPro']))
cKok2['CullingReason1'] = cKok2['ExitReason_PrimaryReasonKok'].fillna(
    cKok2['SE_Number'].map(delpro_cull['CullReason1DelPro']))
cKok2['CullingReason2'] = cKok2['ExitReason_SecondaryReason1Kok'].fillna(
    cKok2['SE_Number'].map(delpro_cull['CullReason2DelPro']))

"""
# Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
df_sum = df_sum[df_sum["SE_Number"].isin(SE_Number)]
"""

# filt = cKok[cKok["ExitDateKok"].isna()]
# filt = cKok[cKok["ExitReason_SecondaryReason2Kok"].notna()]

# ExitDateKok is now represented by CullingDate.
cKok2.drop(columns=["ExitDateKok"], inplace=True)
cKok2.to_csv("culling.csv", index=False)

In [None]:
#
#
# PAIRING COW DATABASE AND DELPRO FOR DRY OFF
# Builds dry_off.csv: one row per herd, cow and lactation number, with the dry-off date from
# the cow database when available and from DelPro otherwise.
cKok3 = pd.read_csv("dfKok.csv", low_memory=False)
cKok3 = cKok3.drop_duplicates(subset=["SE_Number", "DryOffDateKok"])
col_keep = ["FarmName_Pseudo", "SE_Number", "LactationNumberKok", "DryOffDateKok"]
cKok3 = cKok3[col_keep]
cKok3.rename(columns={"LactationNumberKok": "LactationNumber"}, inplace=True)
cKok3 = cKok3.drop_duplicates(subset=["SE_Number", "DryOffDateKok"])

# NOTE(review): no cell shown in this notebook section writes "for_my_rec2.csv"; it has to
# exist from an earlier run or from elsewhere - confirm which DelPro snapshot is intended.
cDel3 = pd.read_csv("for_my_rec2.csv", low_memory=False)
cDel3 = cDel3.drop_duplicates(subset=["SE_Number", "DryOffDateDelPro"])
col_keep = ["FarmName_Pseudo", "SE_Number", "LactationNumberDelPro", "DryOffDateDelPro"]
cDel3 = cDel3[col_keep]
cDel3.rename(columns={"LactationNumberDelPro": "LactationNumber"}, inplace=True)
cDel3 = cDel3.drop_duplicates(subset=["SE_Number", "DryOffDateDelPro"])

# The two sources are paired by lactation number, so they must number a cow's lactations
# identically for their dry-off dates to land on the same row.
merged_df = pd.merge(cKok3, cDel3, on=["FarmName_Pseudo", "SE_Number", "LactationNumber"], how='outer')
# Both columns live in merged_df, so combine_first is row-aligned: cow database date first,
# DelPro date where the cow database has none.
merged_df['DryOffDate'] = merged_df['DryOffDateKok'].combine_first(merged_df['DryOffDateDelPro'])
merged_df = merged_df.sort_values(by=["FarmName_Pseudo", "SE_Number", "LactationNumber"])

"""
# Subset chosen cow
# obs 867, SE-169e580a-2843, good example cow
SE_Number = ["SE-064c0cec-1189"]
merged_df = merged_df[merged_df["SE_Number"].isin(SE_Number)]
"""

merged_df.to_csv("dry_off.csv", index=False)

In [None]:
#
#
# PAIRING COW DATABASE AND DELPRO FOR BREED
# Puts the cow database breed columns and the DelPro breed name side by side, one row per
# herd and cow, and prints a sire-breed x dam-breed frequency table.
cKok4 = pd.read_csv("dfKok.csv", low_memory=False)
cKok4 = cKok4.drop_duplicates(subset=["SE_Number"])
col_keep = ["FarmName_Pseudo", "SE_Number", "SireBreedKok", "DamBreedKok", "MGSBreedKok"]
cKok4 = cKok4[col_keep]
cKok4 = cKok4.drop_duplicates(subset=["SE_Number"])

# NOTE(review): no cell shown in this notebook section writes "for_my_rec2.csv"; it has to
# exist from an earlier run or from elsewhere - confirm which DelPro snapshot is intended.
cDel4 = pd.read_csv("for_my_rec2.csv", low_memory=False)
cDel4 = cDel4.drop_duplicates(subset=["SE_Number"])
col_keep = ["FarmName_Pseudo", "SE_Number", "BreedNameDelPro"]
cDel4 = cDel4[col_keep]
cDel4 = cDel4.drop_duplicates(subset=["SE_Number"])

# Outer join keeps cows known to only one source. Note: this rebinds merged_df, the name
# also used for the dry-off pairing result.
merged_df = pd.merge(cKok4, cDel4, on=["FarmName_Pseudo", "SE_Number"], how='outer')

# Setting options to display more rows and columns
# These options are global and stay in effect for every later cell in the session.
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

# crosstab counts only rows where both breed columns are present.
frequency_table = pd.crosstab(merged_df['SireBreedKok'], merged_df['DamBreedKok'])
print(frequency_table)


# Define breeds in cow database
# Recognised (sire breed -> dam breeds) combinations, grouped by the breed category
# they are assigned.  The "Other" entries give the same result as the fallback in
# categorize1 and are listed only to keep the pure-breed cases explicit.
_KOK_PARENT_BREEDS = {
    "NRDC": {
        "SRB": ("SRB", "RB", "RB-SRB"),
        "RB": ("RB", "RB-SRB", "SRB", "SRB-RB"),
        "RB-SRB": ("RB-SRB", "SRB"),
        "SRB-RB": ("SRB",),
        "SAB": ("SAB",),
    },
    "SLB": {
        "SLB": ("SLB",),
    },
    "Other": {
        "FJÄ": ("FJÄ",),
        "SIM": ("SIM",),
    },
    "DairyCross": {
        "SLB": ("SRB", "RB", "RB-SLB", "RB-SRB", "SJB", "SJB-SLB", "SJB-SRB", "SKB-SJB", "SKB-SLB",
                "SLB-RB", "SLB-SJB", "SLB-SKB", "SLB-SRB", "SRB-RB", "SRB-SJB", "SRB-SLB",
                "SAB-SRB", "SLB-BSW", "SLB-SAB", "SRB-SAB"),
        "SRB": ("SLB", "RB-SJB", "SJB", "SJB-SLB", "SJB-SRB", "SLB-SRB", "SRB-SKB", "SRB-SLB",
                "SAB", "SAB-SRB", "SRB-SAB"),
        "RB": ("SJB", "SJB-SRB", "SLB", "SRB-SJB", "SRB-SLB", "SRB-SAB"),
        "SKB": ("FJÄ",),
        "SAB": ("SAB-SRB", "SRB"),
    },
    "DairyBeefCross": {
        "RB": ("MON-SLB",),
        "SLB": ("CHA-RB", "HER-SLB", "MON-RB", "MON-SLB", "MON-SRB", "SLB-HER", "SRB-MON", "SLB-MON"),
        "SRB": ("MON-SLB", "MON-SRB"),
        "SJB": ("SIM-SJB", "SJB-SIM"),
    },
    "BeefDairyCross": {
        "CHA": ("RB-SRB", "SLB-SRB"),
        "MON": ("SLB", "SRB", "RB-SLB", "RB-SRB", "SRB-SLB", "SLB-SRB"),
        "SIM": ("SJB", "SRB", "SRB-SJB", "SRB-SAB"),
        "HER": ("SLB-SRB",),
    },
}

# Flattened form used for the per-row lookup: (sire, dam) -> category.
# No (sire, dam) pair is listed under more than one category.
_KOK_BREED_BY_PARENTS = {
    (sire, dam): category
    for category, dams_by_sire in _KOK_PARENT_BREEDS.items()
    for sire, dams in dams_by_sire.items()
    for dam in dams
}


def categorize1(value):
    """Classify one cow into a breed category from its cow-database parent breeds.

    Parameters
    ----------
    value : mapping-like row (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide "SireBreedKok" and "DamBreedKok"; "MGSBreedKok" is read only
        for the SJB rule and for the all-missing check.

    Returns
    -------
    str or float
        "NRDC", "SLB", "SJB", "DairyCross", "DairyBeefCross", "BeefDairyCross" or
        "Other"; ``np.nan`` when sire, dam and MGS breed are all missing.
    """
    sire = value["SireBreedKok"]
    dam = value["DamBreedKok"]

    # A single table lookup replaces the former chain of pairwise comparisons
    # (which also contained two duplicated, unreachable branches).
    category = _KOK_BREED_BY_PARENTS.get((sire, dam))
    if category is not None:
        return category

    # SJB is the only category that additionally requires the MGS breed to match.
    if sire == "SJB" and dam == "SJB" and value["MGSBreedKok"] == "SJB":
        return "SJB"

    # No breed information at all -> leave missing so another source can fill it in.
    if pd.isna(sire) and pd.isna(dam) and pd.isna(value["MGSBreedKok"]):
        return np.nan

    # Any other (including partially missing) combination.
    return 'Other'


# Classify every cow from its cow-database parent breeds and save the result.
merged_df['BreedKok'] = merged_df.apply(categorize1, axis="columns")
merged_df.to_csv("breed.csv", index=False)

# Pie chart of the cow-database breed categories (missing values are not counted).
value_counts = merged_df['BreedKok'].value_counts()
plt.figure(figsize=(10, 10))
plt.pie(
    value_counts,
    labels=value_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    pctdistance=0.85,
)
plt.title('Distribution of Breeds of Individuals According to Cow Database')
plt.axis('equal')
plt.show()

# List the raw DelPro breed names and how often each occurs.
frequency_table = merged_df['BreedNameDelPro'].value_counts()
print(frequency_table)


# Define breeds in delpro
# DelPro breed name -> breed category.  Names not listed here fall back to "Other".
_DELPRO_BREED_CATEGORY = {
    "01 SRB": "NRDC",
    "06 RB": "NRDC",
    "02 SLB": "SLB",
    "04 SJB": "SJB",
    "03 SKB": "Other",
    "Unknown Breed": "Unknown",
    "186": "Unknown",
    "187": "Unknown",
    "99 Korsning/Obest Ras": "Unknown",
    "99 Korsning/övriga raser": "Unknown",
    "41 Fjällko": "Other",
    "08 Hereford": "Other",
    "11 Aberdeen Angus": "Other",
    "28 Fleckvieh": "Other",
    "27 Montbéliard": "Other",
}


def categorize2(value):
    """Classify one cow into a breed category from its DelPro breed name.

    Parameters
    ----------
    value : mapping-like row (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide "BreedNameDelPro".

    Returns
    -------
    str or float
        "NRDC", "SLB", "SJB", "Unknown" or "Other"; ``np.nan`` when the DelPro
        breed name is missing.
    """
    name = value["BreedNameDelPro"]

    # A missing DelPro name always yields a missing category.  The former extra
    # check on BreedKok == "Missing" returned the same value and is dropped.
    if pd.isna(name):
        return np.nan

    return _DELPRO_BREED_CATEGORY.get(name, "Other")


# Classify every cow from its DelPro breed name.
merged_df['BreedDelPro'] = merged_df.apply(categorize2, axis="columns")

# Final breed: cow-database category, falling back to the DelPro category where
# the cow database has none.
merged_df['Breed'] = merged_df['BreedKok'].fillna(merged_df['BreedDelPro'])
merged_df.to_csv("breed.csv", index=False)

# Pie chart of the combined breed categories.
value_counts = merged_df['Breed'].value_counts()
plt.figure(figsize=(10, 10))
plt.pie(
    value_counts,
    labels=value_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    pctdistance=0.85,
)
plt.title('Distribution of Breeds in Full Material')
plt.axis('equal')
plt.show()

# Print the same combined breed counts as a table.
frequency_table = merged_df['Breed'].value_counts()
print(frequency_table)

In [None]:
#
#
# PAIR COW DATABASE AND DELPRO INSEMINATION DATES
# Cow database side: de-duplicate per cow and insemination date, keep the key
# columns, switch to the shared lactation column name, de-duplicate again.
col_keep = ["FarmName_Pseudo", "SE_Number", "LactationNumberKok", "InseminationDateKok"]
cKok4 = (
    pd.read_csv("dfKok.csv", low_memory=False)
    .drop_duplicates(subset=["SE_Number", "InseminationDateKok"])[col_keep]
    .rename(columns={"LactationNumberKok": "LactationNumber"})
    .drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDateKok"])
)

# DelPro side: identical treatment of dfDelPro.csv.
col_keep = ["FarmName_Pseudo", "SE_Number", "LactationNumberDelPro", "InseminationDateDelPro"]
cDel4 = (
    pd.read_csv("dfDelPro.csv", low_memory=False)
    .drop_duplicates(subset=["SE_Number", "InseminationDateDelPro"])[col_keep]
    .rename(columns={"LactationNumberDelPro": "LactationNumber"})
    .drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDateDelPro"])
)

# Join the two sources per herd, cow and lactation, keeping unmatched rows from both.
merged_df = cKok4.merge(cDel4, how='outer', on=["FarmName_Pseudo", "SE_Number", "LactationNumber"])
# Cow-database date first; the DelPro date is used only where the former is missing.
merged_df['InseminationDate'] = merged_df['InseminationDateKok'].fillna(merged_df['InseminationDateDelPro'])
merged_df = merged_df.drop_duplicates(subset=["SE_Number", "InseminationDate"])

# Disabled debug aid: restrict the output to one example cow.
# # obs 867, SE-169e580a-2843, good example cow
# SE_Number = ["SE-064c0cec-1189"]
# merged_df = merged_df[merged_df["SE_Number"].isin(SE_Number)]

merged_df = merged_df.sort_values(by=["FarmName_Pseudo", "SE_Number", "LactationNumber", "InseminationDate"])
merged_df.to_csv("insemination.csv", index=False)

In [None]:
#
#
# PAIRING PREGNANCY CHECK FROM COW DATABASE AND DELPRO
# Cow database side: one row per cow, lactation and pregnancy-check date.
cKok5 = pd.read_csv("dfKok.csv", low_memory=False)
cKok5 = cKok5.drop_duplicates(subset=["SE_Number", "PregnancyCheckDateKok"])
col_keep = ["FarmName_Pseudo", "SE_Number", "LactationNumberKok", "PregnancyCheckDateKok", "PregnancyStatusKok"]
cKok5 = cKok5[col_keep]
cKok5.rename(columns={"LactationNumberKok": "LactationNumber"}, inplace=True)
cKok5 = cKok5.drop_duplicates(subset=["SE_Number", "LactationNumber", "PregnancyCheckDateKok"])

# DelPro side: one row per cow, lactation and pregnancy-check date.
cDel5 = pd.read_csv("dfDelPro.csv", low_memory=False)
# De-duplicate on the pregnancy-check date, mirroring the cow-database side above.
# This previously used InseminationDateDelPro (carried over from the insemination
# cell), which discarded every pregnancy check but the first one found for each
# insemination date.
cDel5 = cDel5.drop_duplicates(subset=["SE_Number", "PregnancyCheckDateDelPro"])
col_keep = ["FarmName_Pseudo", "SE_Number", "LactationNumberDelPro", "PregnancyCheckDateDelPro",
            "PregnancyCheckResultDelPro"]
cDel5 = cDel5[col_keep]
cDel5.rename(columns={"LactationNumberDelPro": "LactationNumber"}, inplace=True)
cDel5 = cDel5.drop_duplicates(subset=["SE_Number", "LactationNumber", "PregnancyCheckDateDelPro"])

# Outer merge keeps herd/cow/lactation combinations found in only one source.
merged_df = pd.merge(cKok5, cDel5, on=["FarmName_Pseudo", "SE_Number", "LactationNumber"], how='outer')
# Cow-database date and status take precedence; DelPro fills the gaps.
merged_df['PregnancyCheckDate'] = merged_df['PregnancyCheckDateKok'].fillna(merged_df['PregnancyCheckDateDelPro'])
merged_df['PregnancyStatus'] = merged_df['PregnancyStatusKok'].fillna(merged_df['PregnancyCheckResultDelPro'])
merged_df = merged_df.drop_duplicates(subset=["SE_Number", "PregnancyCheckDate"])

# The string literal below is disabled debug code (never executed); it would restrict
# merged_df to a single example cow.
"""
# Subset chosen cow
# obs 867, SE-169e580a-2843, good example cow
SE_Number = ["SE-064c0cec-1189"]
merged_df = merged_df[merged_df["SE_Number"].isin(SE_Number)]
"""

merged_df.to_csv("pregnancy_checks.csv", index=False)

In [None]:
#
#
# BUILD MASTER DATAFRAME
# Each step joins one more source onto the running frame and checkpoints the
# result to updateDF.csv (every write replaces the previous one).

# Calvings + breed (breed is joined per herd and cow).
test2a = pd.read_csv("calving.csv", low_memory=False)
col_keep = ["FarmName_Pseudo", "SE_Number", "Breed"]
test2b = pd.read_csv("breed.csv", low_memory=False)[col_keep]
test2c = test2a.merge(test2b, how='outer', on=["FarmName_Pseudo", "SE_Number"])
test2c.to_csv("updateDF.csv", index=False)

# + inseminations, joined per herd, cow and lactation.
col_keep = ["FarmName_Pseudo", "SE_Number", "LactationNumber", "InseminationDate"]
test2d = (
    pd.read_csv("insemination.csv", low_memory=False)[col_keep]
    .drop_duplicates(subset=["FarmName_Pseudo", "SE_Number", "LactationNumber", "InseminationDate"])
)
test2e = test2c.merge(test2d, how='outer', on=["FarmName_Pseudo", "SE_Number", "LactationNumber"])
test2e.to_csv("updateDF.csv", index=False)

# + pregnancy checks, joined per herd, cow and lactation, then sorted by event dates.
col_keep = ["FarmName_Pseudo", "SE_Number", "LactationNumber", "PregnancyCheckDate", "PregnancyStatus"]
test2f = (
    pd.read_csv("pregnancy_checks.csv", low_memory=False)[col_keep]
    .drop_duplicates(subset=["FarmName_Pseudo", "SE_Number", "LactationNumber", "PregnancyCheckDate"])
)
test2g = test2e.merge(test2f, how='outer', on=["FarmName_Pseudo", "SE_Number", "LactationNumber"])
# test2g = test2g.drop_duplicates(subset=["FarmName_Pseudo", "SE_Number", "LactationNumber", "PregnancyCheckDate"])
test2g = test2g.sort_values(
    by=["FarmName_Pseudo", "SE_Number", "LactationNumber", "InseminationDate", "PregnancyCheckDate"]
)
test2g.to_csv("updateDF.csv", index=False)

# + dry-off dates, joined per herd, cow and lactation.
col_keep = ["FarmName_Pseudo", "SE_Number", "LactationNumber", "DryOffDate"]
test2h = (
    pd.read_csv("dry_off.csv", low_memory=False)[col_keep]
    .drop_duplicates(subset=["FarmName_Pseudo", "SE_Number", "LactationNumber", "DryOffDate"])
)
test2i = test2g.merge(test2h, how='outer', on=["FarmName_Pseudo", "SE_Number", "LactationNumber"])

# Disabled debug aid: restrict test2i to one example cow.
# # obs 867, SE-169e580a-2843, good example cow
# SE_Number = ["SE-064c0cec-1189"]
# test2i = test2i[test2i["SE_Number"].isin(SE_Number)]
test2i.to_csv("updateDF.csv", index=False)

# + culling records, joined per herd and cow.
test2j = (
    pd.read_csv("culling.csv", low_memory=False)
    .drop_duplicates(subset=["FarmName_Pseudo", "SE_Number", "CullingDate"])
)
test2k = test2i.merge(test2j, how='outer', on=["FarmName_Pseudo", "SE_Number"])
test2k.to_csv("updateDF.csv", index=False)

# Disabled debug aid: restrict test2k to one example cow.
# # obs 867, SE-169e580a-2843, good example cow
# SE_Number = ["SE-064c0cec-1189"]
# test2k = test2k[test2k["SE_Number"].isin(SE_Number)]

# Event columns needed to match pregnancy checks to inseminations.
col_keep = ["SE_Number", "LactationNumber", "CalvingDate", "InseminationDate", "PregnancyCheckDate", "PregnancyStatus"]
test2k2 = test2k[col_keep]

# For every insemination, record the date of the next insemination row within the
# same cow and lactation (shift(-1) follows the current row order of the frame).
test2l = test2k2.drop_duplicates(subset=["SE_Number", "CalvingDate", "InseminationDate"]).copy()
test2l["next_ins"] = test2l.groupby(["SE_Number", "LactationNumber"])["InseminationDate"].shift(-1)
col_keep = ["SE_Number", "LactationNumber", "InseminationDate", "next_ins"]
test2l = test2l[col_keep]
test2m = test2k2.merge(test2l, how='outer', on=["SE_Number", "LactationNumber", "InseminationDate"])
# test2m.to_csv("updateDF2.csv", index=False)

# Blank out pregnancy-check dates that fall after the next insemination or before
# the insemination they are paired with.
test2m.loc[
    (test2m['PregnancyCheckDate'] > test2m['next_ins'])
    | (test2m['PregnancyCheckDate'] < test2m['InseminationDate']),
    'PregnancyCheckDate',
] = np.nan
# test2m.to_csv("updateDF3.csv", index=False)

# One row per insemination and remaining pregnancy-check date, in event order.
test2n = (
    test2m
    .drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDate", "PregnancyCheckDate"])
    .sort_values(by=["SE_Number", "LactationNumber", "InseminationDate", "PregnancyCheckDate"])
)
# test2n.to_csv("updateDF.csv", index=False)

# Rows that still carry a pregnancy-check date ...
df_not_missing = test2n.loc[test2n['PregnancyCheckDate'].notna()]
# df_not_missing.to_csv("updateDF4.csv", index=False)

# ... and the insemination keys of the rows that do not.
col_keep = ["SE_Number", "LactationNumber", "CalvingDate", "InseminationDate"]
test2o = test2n.loc[test2n['PregnancyCheckDate'].isna(), col_keep]
# test2o.to_csv("updateDF5.csv", index=False)

# Outer join on the insemination keys: inseminations found only in test2o are
# added with empty pregnancy-check columns.
test2p = df_not_missing.merge(
    test2o, how='outer', on=["SE_Number", "LactationNumber", "CalvingDate", "InseminationDate"]
)
# test2p.to_csv("updateDF2.csv", index=False)

# Lactation-level columns (herd, breed, dry-off, culling) from the master file.
test2q = pd.read_csv("updateDF.csv", low_memory=False)
col_keep = ["FarmName_Pseudo", "SE_Number", "LactationNumber", "CalvingDate", "Breed", "DryOffDate", "CullingDate",
            "ExitReason_PrimaryReasonKok", "ExitReason_SecondaryReason1Kok", "ExitReason_SecondaryReason2Kok",
            "CullingReason1", "CullingReason2"]
# The selected columns exclude the insemination and pregnancy-check columns, so rows
# that differed only in those become identical.  Drop the repeats before joining:
# otherwise every repeat is matched against every test2p row of the same lactation
# and the merge output is multiplied.  Only fully identical rows are removed.
test2q = test2q[col_keep].drop_duplicates()
# Attach the cleaned insemination / pregnancy-check rows to each lactation.
test2r = pd.merge(test2q, test2p, on=["SE_Number", "LactationNumber", "CalvingDate"], how='left')
test2r.to_csv("updateDF3.csv", index=False)

# Summary counts.  The figures in the trailing comments are from an earlier run.
df = pd.read_csv("updateDF3.csv", low_memory=False)
# 327,839 rows were reported before test2q was de-duplicated; re-run to refresh.
print(f"No. of pregnancy checks in database: {df.shape}")
df2 = df.drop_duplicates(subset=["SE_Number", "LactationNumber", "InseminationDate"])
print(f"No. of inseminations in dataset: {df2.shape}")  # 36,506 inseminations
df2 = df.drop_duplicates(subset=["SE_Number", "LactationNumber"])
print(f"No. of lactations in dataset: {df2.shape}")  # 20,683 lactations
df2 = df.drop_duplicates(subset=["SE_Number"])
print(f"No. of cows in dataset: {df2.shape}")  # 9,535 cows
df2 = df.drop_duplicates(subset=["FarmName_Pseudo"])
print(f"No. of herds in dataset: {df2.shape}")  # 57 herds

#
#
# ADD LINEAGE INFORMATION
# Birth date and parent identities per animal, keyed on SE_Number (BirthID in the source).
col_keep = ["BirthID", "BirthDate", "Father_SE_Number", "Mother_SE_Number"]
lin = (
    pd.read_csv("Kok_Lineage240821.csv", delimiter=';', low_memory=False)[col_keep]
    .rename(columns={'BirthID': 'SE_Number'})
)

# Row counts before and after removing fully repeated lineage records.
print(f"No. of unique cows in database: {lin.shape}")  # 24,067
lin = lin.drop_duplicates(subset=["SE_Number", "BirthDate", "Father_SE_Number", "Mother_SE_Number"])
print(f"No. of unique cows in database: {lin.shape}")  # 22,666

# Attach lineage to every row of the event table.
df = pd.read_csv("updateDF3.csv", low_memory=False)
lin2 = pd.merge(df, lin, how="left", on=["SE_Number"])

# Final column layout: identity, lineage, events, exit information.
new_column_order = [
    "FarmName_Pseudo", "SE_Number", "Breed",
    "BirthDate", "Father_SE_Number", "Mother_SE_Number",
    "CalvingDate", "LactationNumber", "InseminationDate",
    "PregnancyCheckDate", "PregnancyStatus", "DryOffDate", "CullingDate",
    "ExitReason_PrimaryReasonKok", "ExitReason_SecondaryReason1Kok", "ExitReason_SecondaryReason2Kok",
    "CullingReason1", "CullingReason2",
]
# Keep one row per cow, calving, lactation, insemination and pregnancy check.
lin2 = lin2[new_column_order].drop_duplicates(
    subset=["SE_Number", "CalvingDate", "LactationNumber", "InseminationDate", "PregnancyCheckDate"]
)
# Disabled debug aid: restrict lin2 to one example cow.
# SE_Number = ["SE-064c0cec-1189"]
# lin2 = lin2[lin2["SE_Number"].isin(SE_Number)]

# RECODE PregnancyStatus
# Show the raw status labels present before recoding.
unique_values = lin2['PregnancyStatus'].unique()
print(unique_values)

# Replace the text labels with numeric codes; labels not listed are left unchanged.
# NOTE(review): "Ej dräktig Analys eDr A" maps to 1 while the other analysis results
# use 51-53 ("Negative" is 51) -- confirm that 1 is intended here.
lin2["PregnancyStatus"] = lin2["PregnancyStatus"].replace({
    "Ej dräktig Analys eDr A": 1,
    "Dräktig (undersökt) Dr": 2,
    "Dräktig ? (undersökt) Dr?": 3,
    "Ej Dräktig (ej undersökt) eDr": 21,
    "Dräktig (ej undersökt) Dr": 22,
    "Ej dräktig (tjurbetäckt) eDr": 31,
    "Dräktig (tjurbetäckt) Dr": 32,
    "Dräktig ? (tjurbetäckt) Dr?": 33,
    "Dräktig, (sem/bet annan bes) Dr": 42,
    "Negative": 51,
    "Dräktig Analys Dr A": 52,
    "Positive": 52,
    "Dräktig ? Analys Dr? A": 53,
    "Uncertain": 53,
})

# Overwrite updateDF.csv with the final master table.
lin2.to_csv("updateDF.csv", index=False)