# SCRIPT TO GENERATE FERTILITY TRAITS

This script assumes that "HeatStressCleanWorkFlow.csv" has already been run to generate "updateDF.csv".

Program creates the following fertility traits
- NINS: Number of inseminations
- CFI: Interval from calving to first service
- CLI: Interval from calving to last service
- FLI: Interval from first to last service
- CI: Calving interval
- GL: Gestation length
- CR: Conception rate - NEEDS EDITING!

In [1]:
import pandas as pd
import numpy as np

In [9]:
# NINS - NUMBER OF INSEMINATIONS
# One NINS value per cow and lactation: the number of distinct insemination dates.
df_ins2d = pd.read_csv("../Data/updateDF.csv", low_memory=False)

# A later cell writes its result back to updateDF.csv, so a re-run may find an
# old NINS column; drop it so the merge below does not create NINS_x / NINS_y.
df_ins2d = df_ins2d.drop(columns=["NINS"], errors="ignore")

# nunique() ignores missing dates, so a lactation without any recorded
# insemination gets NINS = 0 instead of being counted as one service.
ins_count = (
    df_ins2d.groupby(["SE_Number", "LactationNumber"])["InseminationDate"]
    .nunique()
    .reset_index(name="NINS")
)
df_ins2d = df_ins2d.merge(ins_count, on=["SE_Number", "LactationNumber"], how="left")
df_ins2d.to_csv("../Data/fertility.csv", index=False)

In [10]:
# CFI - INTERVAL FROM CALVING TO FIRST SERVICE
# CLI - INTERVAL FROM CALVING TO LAST SERVICE
# FLI - INTERVAL FROM FIRST TO LAST SERVICE
lactation_keys = ["SE_Number", "LactationNumber"]

# Remove traits left over from a previous run so the final merge stays clean.
df_ins2d = df_ins2d.drop(
    columns=["CFI", "CLI", "FLI", "InseminationDate_last"], errors="ignore"
)

# Parse the dates and order each lactation chronologically, so that "first" and
# "last" really mean earliest and latest insemination rather than whatever
# order the rows happen to have in the CSV file.
ordered = df_ins2d.assign(
    CalvingDate=pd.to_datetime(df_ins2d["CalvingDate"]),
    InseminationDate=pd.to_datetime(df_ins2d["InseminationDate"]),
).sort_values(lactation_keys + ["InseminationDate"])

# Group by cow and lactation, and get the first and last inseminations
first_observations = ordered.groupby(lactation_keys).first().reset_index()
last_observations = ordered.groupby(lactation_keys).last().reset_index()

print(last_observations.columns)
print(len(last_observations.columns))

col_keep = ["SE_Number", "LactationNumber", "CalvingDate", "InseminationDate"]
first_observations = first_observations[col_keep]

# Rename the columns of the last observations df to distinguish them
last_observations = last_observations[col_keep].rename(columns={
    "CalvingDate": "CalvingDate_last",
    "InseminationDate": "InseminationDate_last",
})

# Join first and last observations on the lactation keys (not by row position)
df_ins2e = first_observations.merge(last_observations, on=lactation_keys, how="left")

# Calculate fertility traits (whole days)
df_ins2e["CFI"] = (df_ins2e["InseminationDate"] - df_ins2e["CalvingDate"]).dt.days
df_ins2e["CLI"] = (df_ins2e["InseminationDate_last"] - df_ins2e["CalvingDate_last"]).dt.days
df_ins2e["FLI"] = (df_ins2e["InseminationDate_last"] - df_ins2e["InseminationDate"]).dt.days
col_keep = ["SE_Number", "LactationNumber", "CFI", "CLI", "FLI", "InseminationDate_last"]
df_ins2e = df_ins2e[col_keep]

df_ins2f = df_ins2d.merge(df_ins2e, on=lactation_keys, how="left")
df_ins2f.to_csv("../Data/fertility.csv", index=False)

Index(['SE_Number', 'LactationNumber', 'Breed', 'FarmName_Pseudo',
       'AnimalNumber', 'Del_Cow_Id', 'BirthDate', 'Father_SE_Number',
       'Mother_SE_Number', 'CalvingDate', 'InseminationDate',
       'PregnancyCheckDate', 'PregnancyStatus', 'DryOffDate', 'CullingDate',
       'ExitReason_PrimaryReasonKok', 'ExitReason_SecondaryReason1Kok',
       'ExitReason_SecondaryReason2Kok', 'CullingReason1', 'CullingReason2',
       'next_calving', 'next_ins', 'prev_ins', 'shift_calf', 'upper_limit',
       'NINS'],
      dtype='object')
26


In [11]:
# CI - CALVING INTERVAL
# Days from the calving that starts the lactation to the next calving.
# Computed row by row: this avoids assigning into a slice and avoids the row
# multiplication that a merge on cow + lactation causes when one lactation
# carries more than one (CalvingDate, next_calving) combination.
df_ins2h = df_ins2f.drop(columns=["CI"], errors="ignore")  # also discards a stale CI
calving = pd.to_datetime(df_ins2h["CalvingDate"])
following_calving = pd.to_datetime(df_ins2h["next_calving"])
df_ins2h["CI"] = (following_calving - calving).dt.days

# Per-lactation overview of the calving interval (kept under its original name)
df_ins2g = df_ins2h[["SE_Number", "LactationNumber", "CI"]].drop_duplicates()

In [12]:
# GL - GESTATION LENGTH
# Days between the last insemination of the lactation and the next calving.
# NOTE(review): the result overwrites updateDF.csv, the same file the first
# cell reads as its input - confirm that this is intended.
last_service = pd.to_datetime(df_ins2h["InseminationDate_last"])
upcoming_calving = pd.to_datetime(df_ins2h["next_calving"])
df_ins2h["InseminationDate_last"] = last_service
df_ins2h["next_calving"] = upcoming_calving
df_ins2h["GL"] = (upcoming_calving - last_service).dt.days
df_ins2h.to_csv("../Data/updateDF.csv", index=False)

# CONCEPTION RATE - script not finished
- see Appendix 2 in "NAV official genetic evaluation of Dairy Cattle - data and genetic models" for full description

In [None]:
#
#
# PREPARATIONS
# Builds the working frame for the conception-rate rules: neighbouring
# insemination dates, the following calving date and an upper date limit.
df = pd.read_csv("../Data/updateDF.csv", low_memory=False)

# These helper columns are rebuilt below. If the input file already carries
# them, the merges would produce *_x / *_y columns and the column selection
# further down would fail, so drop any existing copies first.
df = df.drop(columns=["next_ins", "prev_ins", "shift_calf", "upper_limit"], errors="ignore")

df2 = df.drop_duplicates(subset=['SE_Number', "LactationNumber", "InseminationDate"]).copy()
# shift() looks at neighbouring rows, so order each lactation chronologically.
df2["_ins_order"] = pd.to_datetime(df2["InseminationDate"])
df2 = df2.sort_values(by=["SE_Number", "LactationNumber", "_ins_order"])
df2['next_ins'] = df2.groupby(['SE_Number', 'LactationNumber'])['InseminationDate'].shift(-1)
df2['prev_ins'] = df2.groupby(['SE_Number', 'LactationNumber'])['InseminationDate'].shift(1)
col_keep = ["SE_Number", "LactationNumber", "InseminationDate", "next_ins", "prev_ins"]
df2 = df2[col_keep]
df = pd.merge(df, df2, on=["SE_Number", "LactationNumber", "InseminationDate"], how="left")

# Calving date of the following recorded lactation of the same cow
df3 = df.drop_duplicates(subset=['SE_Number', "LactationNumber"]).copy()
df3 = df3.sort_values(by=["SE_Number", "LactationNumber"])
df3["shift_calf"] = df3.groupby(["SE_Number"])["CalvingDate"].shift(-1)
col_keep = ["SE_Number", "LactationNumber", "shift_calf"]
df3 = df3[col_keep]

df = pd.merge(df, df3, on=["SE_Number", "LactationNumber"], how="left")
col_keep = ["SE_Number", "LactationNumber", "CalvingDate", "prev_ins", "InseminationDate", "next_ins",
            "PregnancyCheckDate", "PregnancyStatus", "DryOffDate", "next_calving",
            "CullingDate", "GL", "NINS", "shift_calf"]
df = df[col_keep]
# Upper limit: dry-off date, else the following calving, else the culling date
df["upper_limit"] = df["DryOffDate"].fillna(df["shift_calf"]).fillna(df["CullingDate"])
df.to_csv("next_ins.csv", index=False)

"""
# Subset chosen cows
SE_Number = ["SE-064c0cec-1189", "SE-5c06d92d-3145", "SE-5c06d92d-3177", "SE-5b581702-1742",
             "SE-5b581702-1851", "SE-5c06d92d-2915", "SE-5b581702-2002", "SE-5c06d92d-2515",
             "SE-169e580a-0015"]
df = df[df["SE_Number"].isin(SE_Number)]
"""
# Keep only one obs per ins and within pregnancy check
df = df.drop_duplicates(subset=['SE_Number', "LactationNumber", "InseminationDate", "PregnancyCheckDate"])
print(df.shape)  # 6078 preg checks, 5938 ins, 3636 lact, 1873 cows

# Make insemination interval to sort insemination in the same cycle (<= 5 days)
df["next_ins"] = pd.to_datetime(df["next_ins"])
df["InseminationDate"] = pd.to_datetime(df["InseminationDate"])
df["ins_within_cycle"] = (df["next_ins"] - df["InseminationDate"]).dt.days
df.to_csv("dataframe4.csv", index=False)

In [None]:
# Defining phenotypes for conception rate (CR)
# Each new insemination is preliminarily set to successful (CR0 = 1); this is
# the starting value before any of the conception-rate rules are applied.
df["CR0"] = 1


def cr1(row):
    """Return the basic conception-rate phenotype (CR0) for one insemination row.

    Rules, in order of precedence:
      1. A new insemination within the same cycle (<= 5 days) -> missing (NaN).
      2. Any other following insemination -> 0 (not successful).
      3. No following insemination but a pregnancy check -> 0 for "Negative",
         1 for "Positive".
      4. Otherwise the insemination keeps its preliminary value 1.

    The same-cycle rule is tested first: "ins_within_cycle" is only available
    when "next_ins" exists, so testing "next_ins" first would make that rule
    unreachable.

    NOTE(review): the preparation cell keeps a column called "PregnancyStatus";
    confirm that "PregnancyCheckResult" is present when this function is applied.

    :param row: mapping/Series with "next_ins", "ins_within_cycle",
        "PregnancyCheckDate" and "PregnancyCheckResult".
    :return: 1, 0 or NaN.
    """
    days_to_next = row["ins_within_cycle"]
    if pd.notna(days_to_next) and days_to_next <= 5:
        return np.nan
    # Followed by a new insemination in a later cycle: this one failed.
    if pd.notna(row["next_ins"]):
        return 0
    # Last insemination: update according to the pregnancy check, if any.
    if pd.notna(row["PregnancyCheckDate"]):
        result = row["PregnancyCheckResult"]
        if result == "Negative":
            return 0
        if result == "Positive":
            return 1
    # Nothing contradicts the preliminary "successful" status.
    return 1


# Evaluate cr1 row by row, store the outcome as CR0 and save the frame
cr0_values = df.apply(cr1, axis=1)
df['CR0'] = cr0_values
df.to_csv("dataframe4.csv", index=False)

In [None]:
# If a cow faced an early abortion, i.e., insemination started again after successful
# pregnancy check, the last insemination before pregnancy check is left as successful.


def cr4(row):
    """Return the early-abortion phenotype (CR4) for one insemination row.

    The value 2 is the "rule does not apply" marker used by the CR columns.

    * Positive pregnancy check followed by a new insemination -> 1 (the
      insemination before the check stays successful).
    * Positive pregnancy check without a following insemination -> NaN.
      NOTE(review): this was an explicit choice in the original code; confirm
      whether these rows should rather be left untouched (2).
    * Everything else (no check, or a check that is not "Positive") -> 2.
      Previously a non-positive check fell through and returned None.

    :param row: mapping/Series with "PregnancyCheckDate",
        "PregnancyCheckResult" and "next_ins".
    :return: 1, NaN or 2.
    """
    checked = pd.notna(row["PregnancyCheckDate"])
    if checked and row["PregnancyCheckResult"] == "Positive":
        if pd.notna(row["next_ins"]):
            return 1
        return np.nan
    return 2


# Evaluate cr4 row by row, store the outcome as CR4 and save the frame
cr4_values = df.apply(cr4, axis=1)
df['CR4'] = cr4_values
df.to_csv("dataframe4.csv", index=False)

In [None]:
# If the pregnancy period is shorter, it is checked iteratively whether some of
# the former inseminations is within acceptable limits, in which case insemination
# for this day is set to 1. All inseminations that are newer than this successful
# insemination are set to missing values.

# Pregnancy period = whole days from the insemination to the next calving
next_calving_dates = pd.to_datetime(df["next_calving"])
df["next_calving"] = next_calving_dates
df["preg_period"] = (next_calving_dates - df['InseminationDate']).dt.days
# CR5 starts as 2 before the rule is evaluated
df["CR5"] = 2


def cr5(row):
    """Classify one insemination by its pregnancy period (CR5).

    :param row: mapping/Series with "preg_period" (days to the next calving).
    :return: 1 for 260-302 days, 0 for more than 302 days, and None for a
        shorter or missing period.
    """
    days = row["preg_period"]
    if days > 302:
        return 0
    if days >= 260:
        return 1
    # Shorter than 260 days, or no pregnancy period at all.
    return None


df['CR5'] = df.apply(cr5, axis=1)
print(df.shape)

# Ensure the DataFrame has a default integer index
df.reset_index(drop=True, inplace=True)

# A too-short pregnancy period (< 260 days) means the preceding insemination
# is taken as the successful one. Only a preceding row of the SAME cow and
# lactation may be credited; the plain "row above" can belong to another animal.
# NOTE(review): assumes rows are in chronological order within each lactation.
too_short = df["preg_period"] < 260
lactation_id = df[["SE_Number", "LactationNumber"]]
same_lactation_as_prev = lactation_id.eq(lactation_id.shift()).all(axis=1)
has_previous = too_short & same_lactation_as_prev

# No earlier insemination in this lactation to credit: the record stays missing.
df.loc[too_short & ~same_lactation_as_prev, "CR5"] = np.nan
# Mark the row directly before each too-short record as successful.
df.loc[has_previous.shift(-1, fill_value=False), "CR5"] = 1


df.to_csv("dataframe4.csv", index=False)
print(df.shape)

In [None]:
# Note, in principle two possibilities exist if the last insemination does not fit
# within the acceptable limits but results in too short a pregnancy period:
#   a cow calved too early or pregnant cow was inseminated.
# Based on the consultancy of a veterinarian, the latter is more common since
# AI technicians have skills to inseminate cows so that a possible pregnancy
# is not terminated. Therefore, the latter option was selected.

# Special case for defining phenotypes: if there is only one insemination record and
# positive pregnancy check result after this, the last insemination is accepted as
# successful, even if the pregnancy period is too short, i.e., too early calving occurred.
# e.g. SE-5b581702-1756


def cr6(row):
    """Return the single-insemination special case (CR6) for one row.

    :param row: mapping/Series with "CR5", "NINS" and "PregnancyCheckResult".
    :return: 1 when CR5 is missing, the lactation has exactly one insemination
        and the pregnancy check is "Positive"; otherwise 2.
    """
    cr5_missing = pd.isna(row["CR5"])
    if cr5_missing and row["NINS"] == 1 and row["PregnancyCheckResult"] == "Positive":
        return 1
    return 2


# Evaluate cr6 row by row, store the outcome as CR6, save and report the shape
cr6_values = df.apply(cr6, axis=1)
df["CR6"] = cr6_values
df.to_csv("dataframe4.csv", index=False)
print(df.shape)

In [None]:
# If a non-consecutive calving was noticed or a cow started with insemination records,
# inseminations that were done within 365 days from the new calving are considered.
# This means that with an average pregnancy period of 281 days and an average cycle
# of 21 days, as a maximum 5 inseminations are included for the new calving.
# All earlier inseminations are set to missing values.

# e.g. SE-5b581702-1742, lact 1 and lact 3, i.e. lact 2 missing
"""
#First fix index
# Find the number of levels in the MultiIndex
num_levels = df.index.nlevels
print(num_levels)  # 3
index_level_names = df.index.names
print("Index level names:", index_level_names)  # 'SE_Number', 'LactationNumber', None
# Rename index cuz reasons
df.index = df.index.set_names(['SE_Number_idx', 'LactationNumber_idx', 'None'])
"""

# One row per cow and lactation; copy so new columns are not written into a slice
col_keep = ["SE_Number", "LactationNumber"]
df2 = df[col_keep].drop_duplicates(subset=['SE_Number', 'LactationNumber']).copy()

# Make variable for non-consecutive calving, ie where != 1 means have non_concec_calving
# 'prev_lact' holds the lactation number of the cow's NEXT recorded lactation
# (shift(-1)); lactations are ordered first so the shift does not depend on
# the row order of the file. The assignment realigns on the index.
df2['prev_lact'] = df2.sort_values(by=col_keep).groupby('SE_Number')['LactationNumber'].shift(-1)
df2["non_consec_calving"] = (df2["prev_lact"] - df2["LactationNumber"])
df2.to_csv("dataframe5.csv", index=False)

# Merge onto other df
print(df2.shape)  # 3636
print(df.shape)  # 6078
df = pd.merge(df2, df, on=["SE_Number", "LactationNumber"], how="left")
print(df.shape)  # 6078
df.to_csv("dataframe4.csv", index=False)

# Occurence of non-consecutive calvings
subset_df = df.drop_duplicates(subset=['SE_Number', 'LactationNumber'])
print(subset_df.shape)  # 3636
subset_df2 = subset_df[subset_df["non_consec_calving"] > 1]
print(subset_df2.shape)  # 10 lact where missing MY data?

# To handle non-consecutive calving
# Parse upper_limit BEFORE copying it into next_calving: next_calving already
# holds datetimes, and writing unparsed text into it would mix types.
df["upper_limit"] = pd.to_datetime(df["upper_limit"])
df.loc[df["non_consec_calving"] > 1, "next_calving"] = df["upper_limit"]
df["ins_int"] = df["upper_limit"] - pd.Timedelta(days=365)

# Number the inseminations backwards within each lactation (1 = last one)
col_keep = ["SE_Number", "LactationNumber", "InseminationDate"]
df3 = df[col_keep]
df3 = df3.sort_values(by=['SE_Number', 'LactationNumber', 'InseminationDate'])
df3 = df3.drop_duplicates(subset=['SE_Number', 'LactationNumber', 'InseminationDate']).copy()
df3['NINS_bw'] = df3.groupby(['SE_Number', 'LactationNumber']).cumcount(ascending=False) + 1
print(df3.shape)  # 5938 ins
print(df.shape)  # 6078
df = pd.merge(df3, df, on=["SE_Number", "LactationNumber", "InseminationDate"], how="inner")
print(df.shape)  # 6078

df["CR7"] = 2


def cr7a(row):
    """Flag an insemination that lies inside the date window of its row.

    :param row: mapping/Series with "ins_int" (window start),
        "InseminationDate" and "upper_limit" (window end).
    :return: 1 when ins_int <= InseminationDate <= upper_limit, else NaN.
    """
    ins_date = row["InseminationDate"]
    inside_window = (row["ins_int"] <= ins_date) and (ins_date <= row["upper_limit"])
    return 1 if inside_window else np.nan


def cr7b(row):
    """Flag an insemination that is among the last five of its lactation.

    :param row: mapping/Series with "NINS_bw", the insemination number counted
        backwards within the lactation.
    :return: 1 when NINS_bw <= 5, else NaN.
    """
    return 1 if row["NINS_bw"] <= 5 else np.nan


# An insemination is accepted (1) only if it is BOTH inside the 365-day window
# (cr7a) AND among the last five of the lactation (cr7b); otherwise it is set
# to missing. Previously the cr7b result simply overwrote the cr7a result, so
# the date window had no effect.
within_window = df.apply(cr7a, axis=1)
within_last_five = df.apply(cr7b, axis=1)
df['CR7'] = np.where(within_window.eq(1) & within_last_five.eq(1), 1, np.nan)
df.to_csv("dataframe4.csv", index=False)
print(df.shape)  # 6078

In [None]:
# If a cow was sold during a service period, all subsequent inseminations are set to
# missing and those before accepted. If the service period occurred before or after
# the cow was sold, inseminations are accepted.

# Load culling reasons, keeping only the cow id, decision date and both reasons
col_keep = ["SE_Number", "CullDecisionDate", "CullReason1", "CullReason2"]
df4 = pd.read_csv("Del_Cow240531.csv", delimiter=';', low_memory=False)[col_keep]

# Show which culling reasons occur in each of the two reason columns
for reason_column in ("CullReason1", "CullReason2"):
    unique_values = df4[reason_column].unique()
    print(unique_values)

print(df4.shape)  # 24,473

In [None]:
"""
# Find duplicates where glitches are noticed, due to double-clicking when sending report to database on farm
# or when sold to other herds
df4["CullCount"] = df4.groupby(["SE_Number"]).cumcount() + 1
df4 = df4.sort_values(by=["SE_Number", "CullDecisionDate", "CullReason1", "CullReason2"])
df4.to_csv("dataframe5.csv", index=False)

# Also, herd specific records in del pro for calving but universal in database for culling, 
# hence double records where sold to other farm, followed by culling in that farm
test = pd.read_csv("Del_Calving240531.csv", delimiter=';', low_memory=False)
SE_Number = ["SE-ad0a39f5-2662"]
test = test[test["SE_Number"].isin(SE_Number)]
test.to_csv("dataframe6.csv", index=False)
"""
"""
SE-ad0a39f5-2651 obs 20,193
condition1 = df4["CullCount"] > 1
result = df4[condition1]
result.to_csv("dataframe5.csv", index=False)

result["count"] = 1
col_keep = ["SE_Number", "count"]
result2 = result[col_keep]
result2 = result2.drop_duplicates(subset=["SE_Number"])
print(result2.shape)  # 397 obs

df4 = pd.merge(df4, result2, on=["SE_Number"], how="left")
df4 = df4.sort_values(by=["SE_Number", "CullDecisionDate", "CullReason1", "CullReason2"])
condition1 = df4["count"] == 1
df5 = df4[condition1]
df5 = df5.drop_duplicates(subset=["SE_Number"])
print(df5.shape) # 397 obs
df5.to_csv("dataframe5.csv", index=False)
"""

In [None]:
# Keep one culling record per cow and decision date (the first after sorting)
cull_sort_order = ["SE_Number", "CullDecisionDate", "CullReason1", "CullReason2"]
df4 = df4.sort_values(by=cull_sort_order)
df_unique = df4.drop_duplicates(subset=["SE_Number", "CullDecisionDate"])
df_unique.to_csv("dataframe5.csv", index=False)

count_date_recorded = df_unique['CullDecisionDate'].count()
count_reason_recorded = df_unique['CullReason1'].count()
print(count_date_recorded)  # 7070
print(count_reason_recorded)  # 10080, ie 3000 missing date for record

# Remove "43 Tillfälligt ute", due to potential duplicated recordings caused by this
df5 = df_unique[df_unique["CullReason1"].ne("43 Tillfälligt ute")]

# Add culling reasons to cows in df
# NOTE(review): the preparation cell keeps "CullingDate", not "CullDecisionDate";
# confirm that df really has the merge key used here.
print(df.shape)  # 6078
df = df.merge(df5, on=["SE_Number", "CullDecisionDate"], how="left")
print(df.shape)  # 6078
df.to_csv("dataframe4.csv", index=False)

"""
# Find example cows
df7 = df6[df6["CullReason1"] == "01 Såld till liv"]

SE_Number = ["SE-ad0a39f5-2762", "SE-5c06d92d-2631"]
df = df[df["SE_Number"].isin(SE_Number)]
"""

# Define service periods, find first ins date, allow for 7ins ie 147d
first_ins1 = (
    df.groupby(['SE_Number', 'LactationNumber'])['InseminationDate']
    .min()
    .rename('first_ins_before_sold')
    .reset_index()
)
df = df.merge(first_ins1, on=['SE_Number', 'LactationNumber'], how='left')
df["first_ins_before_sold"] = pd.to_datetime(df["first_ins_before_sold"])
df["service_period1"] = df["first_ins_before_sold"] + pd.Timedelta(days=147)
df.to_csv("dataframe4.csv", index=False)

df = df.sort_values(by=['SE_Number', 'LactationNumber', 'InseminationDate'])


def ins_after_sold(df):
    """Add the first insemination after the cull decision and its service period.

    For every cow and lactation the earliest "InseminationDate" that lies after
    "CullDecisionDate" is stored as "first_ins_after_sold"; "service_period2"
    is that date plus 147 days. Lactations without such an insemination get
    missing values in both columns.

    Both date columns are parsed locally before they are compared, so the
    function also works when "CullDecisionDate" still holds text (comparing
    parsed dates with text fails). The input columns are left unchanged.

    :param df: DataFrame with "SE_Number", "LactationNumber",
        "InseminationDate" and "CullDecisionDate".
    :return: a new DataFrame with the two columns added.
    """
    keys = ['SE_Number', 'LactationNumber']
    ins_dates = pd.to_datetime(df["InseminationDate"])
    cull_dates = pd.to_datetime(df["CullDecisionDate"])

    # Rows where the insemination took place after the cull decision
    after_decision = ins_dates > cull_dates
    filtered = df.loc[after_decision, keys].copy()
    filtered["first_ins_after_sold"] = ins_dates[after_decision]

    # Earliest such insemination per cow and lactation
    first_ins2 = filtered.groupby(keys)["first_ins_after_sold"].min().reset_index()

    # Attach the result to every row of the lactation
    df = df.merge(first_ins2, on=keys, how='left')
    df["first_ins_after_sold"] = pd.to_datetime(df["first_ins_after_sold"])

    # Second service period: 147 days from the first insemination after the decision
    df["service_period2"] = df["first_ins_after_sold"] + pd.Timedelta(days=147)

    return df


# Parse the date columns BEFORE calling ins_after_sold: the function compares
# InseminationDate with CullDecisionDate, which must both be real dates.
df["InseminationDate"] = pd.to_datetime(df["InseminationDate"])
df["CullDecisionDate"] = pd.to_datetime(df["CullDecisionDate"])

# Call the function and save the result
df = ins_after_sold(df)

print(df.dtypes)

"""
SE_Number = ["SE-5c06d92d-2631"]
df = df[df["SE_Number"].isin(SE_Number)]
"""


def cr8(row):
    """Return the sold-cow phenotype (CR8) for one insemination row.

    :param row: mapping/Series with "CullDecisionDate", "CullReason1",
        "InseminationDate", "first_ins_before_sold", "service_period1",
        "first_ins_after_sold" and "service_period2".
    :return: 2 when the cow has no cull decision date or was not sold alive
        ("01 Såld till liv"); 1 for accepted inseminations; NaN for
        inseminations after the sale but inside the first service period;
        None when none of the rules matches.
    """
    sold_date = row["CullDecisionDate"]
    if pd.isna(sold_date) or row["CullReason1"] != "01 Såld till liv":
        return 2

    period_start = row["first_ins_before_sold"]
    ins_date = row["InseminationDate"]

    # Sold during a service period: inseminations before the sale are accepted
    if period_start <= ins_date < sold_date:
        return 1

    period_end = row["service_period1"]
    # Sold during a service period: later inseminations are set to missing
    if sold_date < ins_date < period_end:
        return np.nan

    # NOTE(review): the original tested this same condition twice, once per
    # rule below; the second rule ("service period after sold") was probably
    # meant to use a different condition - confirm.
    if period_end < sold_date:
        # Service period ended before the sale: accept its inseminations
        if period_start <= ins_date < period_end:
            return 1
        # Inseminations within the service period that started after the sale
        if row["first_ins_after_sold"] <= ins_date < row["service_period2"]:
            return 1

    return None


# Evaluate cr8 row by row, store the outcome as CR8, save and report the shape
cr8_values = df.apply(cr8, axis=1)
df['CR8'] = cr8_values
df.to_csv("dataframe4.csv", index=False)
print(df.shape)  # 6078

"""
Insdate, CullDecisionDate, first_ins_before_sold,service_period1,first_ins_after_sold,service_period2
2022-11-13,  2022-12-01            2022-11-13,        2023-04-09,     2022-12-15,         2023-05-11
2022-12-15   2022-12-01            2022-11-13,        2023-04-09,     2022-12-15,         2023-05-11,
"""

In [None]:
# When a cow is slaughtered, the last phenotype is left successful only in the case
# of a positive pregnancy check; otherwise, it is set to zero.

df = df.sort_values(by=['SE_Number', 'LactationNumber', 'InseminationDate'])

# Flag the rows that carry the cow's latest insemination date
latest_ins_per_cow = df.groupby(['SE_Number'])['InseminationDate'].transform('max')
df['last_ins'] = latest_ins_per_cow.eq(df['InseminationDate'])

# Flag the rows that carry the cow's latest pregnancy check date, in case of
# multiple preg checks for the last ins
df["PregnancyCheckDate"] = pd.to_datetime(df["PregnancyCheckDate"])
latest_check_per_cow = df.groupby(['SE_Number'])['PregnancyCheckDate'].transform('max')
df['last_preg_check'] = latest_check_per_cow.eq(df['PregnancyCheckDate'])


def cr9(row):
    """Return the slaughtered-cow phenotype (CR9) for one insemination row.

    The rule applies to rows without a next calving where neither culling
    reason is "01 Såld till liv" (sold alive). The original condition was
    `A and B or C`, which Python reads as `(A and B) or C`; almost every row
    therefore entered the rule, including rows with a next calving. All three
    conditions are now required together.

    NOTE(review): rows with no culling reason at all also satisfy the
    condition - confirm whether a recorded culling should be required.

    :param row: mapping/Series with "next_calving", "CullReason1",
        "CullReason2", "last_ins", "last_preg_check", "PregnancyCheckResult".
    :return: 1 for the cow's last insemination when its last pregnancy check
        is "Positive", 0 for the last insemination otherwise, and 2 when the
        rule does not apply.
    """
    sold_alive = "01 Såld till liv"
    rule_applies = (
        pd.isna(row["next_calving"])
        and row["CullReason1"] != sold_alive
        and row["CullReason2"] != sold_alive
    )
    if not rule_applies or not bool(row["last_ins"]):
        return 2
    if bool(row["last_preg_check"]) and row["PregnancyCheckResult"] == "Positive":
        return 1
    return 0


# Evaluate cr9 row by row, store the outcome as CR9 and save the frame
cr9_values = df.apply(cr9, axis=1)
df['CR9'] = cr9_values
df.to_csv("dataframe4.csv", index=False)

In [None]:
# Open records due to the data extraction: All data is used to define phenotypes
# before removing data that is too new (150-d gap).

# OPEN REC signified by no next_ins, next_calving or CullDecisionDate

# In this class of open records, the last CR is set to 0 if the lactation length
# is > 260 days and days from the data extraction to the last insemination is
# > 340 days. For the remaining open records, CR is set to 0.7, i.e., average NRR
# in heifers. The rationale behind this is that if there are no events during 150
# days before the extraction of data, it is very probable that a cow is pregnant
# and has not calved yet.

df["CalvingDate"] = pd.to_datetime(df["CalvingDate"])

# Date of data extraction, stored on every row
df["date_extraction"] = "2024-05-31"
df["date_extraction"] = pd.to_datetime(df["date_extraction"])

# Days from the insemination to the extraction date, and from calving to the insemination
df["extraction_to_last_ins"] = df["date_extraction"].sub(df["InseminationDate"]).dt.days
df["lact_length"] = df["InseminationDate"].sub(df["CalvingDate"]).dt.days

# Mark inseminations made within 150 days before the extraction
recent_ins = df["extraction_to_last_ins"] <= 150
df.loc[recent_ins, "rec_open"] = 1


def cr10(row):
    """Return the open-record phenotype (CR10) for one insemination row.

    A record is open when "next_ins", "next_calving" and "CullDecisionDate"
    are all missing.

    :param row: mapping/Series with those three columns plus "last_ins",
        "lact_length" and "extraction_to_last_ins".
    :return: 2 when the record is not open; 0 for an open record that is the
        last insemination with lact_length > 260 and
        extraction_to_last_ins > 340; 0.7 for every other open record.
    """
    # Any following event means the record is not open
    for event_column in ("next_ins", "next_calving", "CullDecisionDate"):
        if pd.notna(row[event_column]):
            return 2

    if row["last_ins"] == True and row["lact_length"] > 260 and row["extraction_to_last_ins"] > 340:
        return 0
    return 0.7


# Evaluate cr10 row by row, store the outcome as CR10, report the shape and save
cr10_values = df.apply(cr10, axis=1)
df['CR10'] = cr10_values
print(df.shape)  # 6078
df.to_csv("dataframe4.csv", index=False)

In [None]:
#### FIX PROGRAM!!!######


# SUMMARIZE PREG CHECKS
# Sum of CR0 per cow, lactation and insemination date
grouped = df.groupby(['SE_Number', 'LactationNumber', 'InseminationDate'])
summary = grouped["CR0"].agg(["sum"]).reset_index()
summary.to_csv("dataframe6.csv", index=False)


# Put CRs TOGETHER
# Update column 'CR' where column 'CR0-CR10' have data
col_keep = ["SE_Number", "LactationNumber", "CalvingDate", "InseminationDate", "PregnancyCheckDate",
            "PregnancyCheckResult", "CR0", "CR4", "CR5", "CR6", "CR7", "CR8", "CR9", "CR10"]
df = df[col_keep]
df.to_csv("dataframe5.csv", index=False)

# df["CR0u"] = df.groupby(["SE_Number", "LactationNumber", "InseminationDate"])["CR0"].shift(1)

"""
# Define a function to update the 'Value' column
def update_value(group):
    group["CR0u"] = group["CR0"]
    return group


# Group by 'AnimalID' and 'LactationNumber', then apply the update function
df = df.groupby(["SE_Number", "LactationNumber", "InseminationDate"]).apply(update_value).reset_index(drop=True)

"""

# Identify rows whose cow / lactation / insemination / pregnancy-check
# combination occurs more than once. Rows with a missing key value are never
# counted as multiple, matching groupby's default of dropping missing keys.
# (The earlier row-wise lookup had a misplaced parenthesis that passed a Series
# as the `axis` argument of df.apply, and rebuilt the lookup set for every row.)
check_keys = ["SE_Number", "LactationNumber", "InseminationDate", "PregnancyCheckDate"]
is_multiple = df.duplicated(subset=check_keys, keep=False) & df[check_keys].notna().all(axis=1)
multiple_observations = df[is_multiple]

# Split the DataFrame
df_multiple = df[is_multiple]
df_single = df[~is_multiple]

df_multiple.to_csv("dataframe6.csv", index=False)
df_single.to_csv("dataframe7.csv", index=False)

"""
df["CR"] = np.nan

df.loc[df["CR0"] != 2, "CR"] = df["CR0"]
df.loc[df["CR4"] != 2, "CR"] = df["CR4"]
df.loc[df["CR5"] != 2, "CR"] = df["CR5"]
df.loc[df["CR6"] != 2, "CR"] = df["CR6"]
df.loc[df["CR7"] != 2, "CR"] = df["CR7"]
df.loc[df["CR8"] != 2, "CR"] = df["CR8"]
df.loc[df["CR9"] != 2, "CR"] = df["CR9"]
df.loc[df["CR10"] != 2, "CR"] = df["CR10"]
"""


"""
# Subset chosen cows
SE_Number = ["SE-064c0cec-1189", "SE-5c06d92d-3145", "SE-5c06d92d-3177", "SE-5b581702-1742",
             "SE-5b581702-1851", "SE-5c06d92d-2915", "SE-5b581702-2002", "SE-5c06d92d-2515",
             "SE-169e580a-0015"]
df = df[df["SE_Number"].isin(SE_Number)]

new_order = ['B', 'C', 'A']
df = df[new_order]


"""

In [None]:
"""
# FILTERING
# Records within 150 days from data extraction are excluded from the data set
# Only the first 10 inseminations are accepted
# first insemination in heifers the lowest value (270 d) and the highest age (900 d)
# first calving age: lowest (550 d) and highest (Jersey 975 d, others 1100 d)
# heifers not older than 3.4 years without calving or culling,
# cows not longer than 2 years since last calving,
# ICF within limits of 20 - 230 days,
# IFL in maximum 365 days.

def filter(row):
    if row["extraction_to_last_ins"] < 150:
        return 2


# Apply the function to df
df['edit'] = df.apply(filter, axis=1)
df.to_csv("dataframe4.csv", index=False)
"""