# SCRIPT TO GENERATE FERTILITY TRAITS

This script assumes that "HeatStressCleanWorkFlow.csv" has been run to generate "updateDF.csv" with the following structure:
- SE_Number, LactationNumber, Breed, FarmName_Pseudo, AnimalNumber, Del_Cow_Id, BirthDate, Father_SE_Number, Mother_SE_Number
- CalvingDate, InseminationDate, PregnancyCheckDate, PregnancyStatus, DryOffDate, CullingDate,
- ExitReason_PrimaryReasonKok, ExitReason_SecondaryReason1Kok, ExitReason_SecondaryReason2Kok, CullingReason1, CullingReason2,
- next_calving, next_ins, prev_ins, shift_calf, upper_limit


Program creates the following fertility traits
- NINS: Number of inseminations
- CFI: Interval from calving to first service
- CLI: Interval from calving to last service
- FLI: Interval from first to last service
- CI: Calving interval
- GL: Gestation length
- CR: Conception rate - NEEDS EDITING!

In [None]:
import pandas as pd
import numpy as np

In [None]:
# LOAD DATA
df_ins2d = pd.read_csv("../Data/MergedData/updateDF.csv", low_memory=False)

In [None]:
# NINS - NUMBER OF INSEMINATIONS
# Count the distinct insemination dates per cow and lactation.
# nunique() ignores missing dates, so a lactation without any recorded
# insemination gets NINS = 0 instead of being counted as one insemination.
ins_count = (
    df_ins2d.groupby(["SE_Number", "LactationNumber"])["InseminationDate"]
    .nunique()
    .reset_index(name="NINS")
)
df_ins2d = df_ins2d.merge(ins_count, on=["SE_Number", "LactationNumber"], how="left")
df_ins2d.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
# CFI - INTERVAL FROM CALVING TO FIRST SERVICE
# CLI - INTERVAL FROM CALVING TO LAST SERVICE
# FLI - INTERVAL FROM FIRST TO LAST SERVICE
# Per cow and lactation, first()/last() give the first/last non-missing value
# of every column in row order
lactation_keys = ["SE_Number", "LactationNumber"]
per_lactation = df_ins2d.groupby(lactation_keys)
first_observations = per_lactation.first().reset_index()
last_observations = per_lactation.last().reset_index()

print(last_observations.columns)
print(len(last_observations.columns))

col_keep = lactation_keys + ["CalvingDate", "InseminationDate"]
first_observations = first_observations[col_keep]

# Suffix the date columns of the last record so both sets can sit side by side
last_observations = last_observations[col_keep].rename(columns={
    "CalvingDate": "CalvingDate_last",
    "InseminationDate": "InseminationDate_last",
})

# Both frames come from the same grouping, so their rows line up on the index;
# only the two renamed date columns are taken from the "last" frame
df_ins2e = pd.concat([first_observations, last_observations.iloc[:, 2:]], axis=1)

# Dates are needed as datetimes to subtract them
for date_col in ["InseminationDate", "InseminationDate_last", "CalvingDate", "CalvingDate_last"]:
    df_ins2e[date_col] = pd.to_datetime(df_ins2e[date_col])

# Fertility traits, expressed in whole days
first_service = df_ins2e["InseminationDate"]
last_service = df_ins2e["InseminationDate_last"]
df_ins2e["CFI"] = (first_service - df_ins2e["CalvingDate"]).dt.days
df_ins2e["CLI"] = (last_service - df_ins2e["CalvingDate_last"]).dt.days
df_ins2e["FLI"] = (last_service - first_service).dt.days
col_keep = lactation_keys + ["CFI", "CLI", "FLI", "InseminationDate_last"]
df_ins2e = df_ins2e[col_keep]

# Attach the traits to every insemination record of the lactation
df_ins2f = df_ins2d.merge(df_ins2e, on=lactation_keys, how="left")
df_ins2f.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
# CI - CALVING INTERVAL
# One row per distinct (cow, lactation, calving date, next calving date)
col_keep = ["SE_Number", "LactationNumber", "CalvingDate", "next_calving"]
df_ins2g = df_ins2f[col_keep].drop_duplicates()

# Days between this calving and the following one
calving = pd.to_datetime(df_ins2g["CalvingDate"])
following_calving = pd.to_datetime(df_ins2g["next_calving"])
df_ins2g = df_ins2g[["SE_Number", "LactationNumber"]].assign(
    CI=(following_calving - calving).dt.days
)

df_ins2h = df_ins2f.merge(df_ins2g, on=["SE_Number", "LactationNumber"], how="left")

In [None]:
# GL - GESTATION LENGTH
# Number of days from the last insemination to the next calving
last_service = pd.to_datetime(df_ins2h["InseminationDate_last"])
following_calving = pd.to_datetime(df_ins2h["next_calving"])
df_ins2h["InseminationDate_last"] = last_service
df_ins2h["next_calving"] = following_calving
df_ins2h["GL"] = (following_calving - last_service).dt.days
df_ins2h.to_csv("../Data/CowData/fertilityDF.csv", index=False)

# CONCEPTION RATE - script not finished
- see Appendix 2 in "NAV official genetic evaluation of Dairy Cattle - data and genetic models" for full description

In [None]:
# Load data
df = pd.read_csv("../Data/CowData/fertilityDF.csv", low_memory=False)

In [None]:
# PREPARATIONS
# Days from each insemination to the following one; used later to recognise
# inseminations that belong to the same cycle (<= 6 days apart)
for date_col in ("next_ins", "InseminationDate"):
    df[date_col] = pd.to_datetime(df[date_col])
df["ins_within_cycle"] = (df["next_ins"] - df["InseminationDate"]).dt.days
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
unique_values = df['PregnancyStatus'].value_counts()
print(unique_values)

In [None]:
# RECODE PregnancyStatus
# Numeric codes grouped by the label they are replaced with
status_codes = {
    "Positive": (2, 22, 42, 52),
    "Negative": (1, 21, 51),
    "Uncertain": (3, 53),
}
code_to_label = {
    code: label
    for label, codes in status_codes.items()
    for code in codes
}
df["PregnancyStatus"] = df["PregnancyStatus"].replace(code_to_label)

# Preliminary value
Each new insemination is preliminarily set to successful

In [None]:
# Defining phenotypes for conception rate
df["CR0"] = 1

# New ins, same cycle, pregnancy checks
CR0 is updated based on new data for each insemination record

In [None]:
def cr1(row):
    """Score one insemination record from later inseminations and pregnancy checks.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "ins_within_cycle", "next_ins", "PregnancyCheckDate"
        and "PregnancyStatus".

    Returns
    -------
    np.nan
        The next insemination came within 6 days (same cycle).
    0
        A later insemination followed, or the pregnancy check was "Negative".
    1
        No later insemination and the pregnancy check was "Positive".
    None
        None of the rules applies.
    """
    # The same-cycle rule has to be tested first: ins_within_cycle is the
    # distance to next_ins, so it is only non-missing when next_ins exists.
    # Testing next_ins first would return 0 and never reach this rule.
    if row["ins_within_cycle"] <= 6:
        return np.nan
    # Followed by a new insemination in a later cycle: the former one failed.
    if pd.notna(row["next_ins"]):
        return 0
    # Last insemination: use the pregnancy check result when there is one.
    if pd.notna(row["PregnancyCheckDate"]):
        if row["PregnancyStatus"] == "Negative":
            return 0
        if row["PregnancyStatus"] == "Positive":
            return 1
    return None


# Apply the function to df
df['CR1'] = df.apply(cr1, axis=1)

df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
# Count unique values including NaNs for several columns
columns_to_count = ['CR0', 'CR1']  # List of columns you want to count unique values for
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

# Print the result
print(value_counts)

In [None]:
# Check occurrence
df5 = df[pd.isna(df["CR1"])]
df5.to_csv("../Data/test.csv", index=False)

In [None]:
# ERRONEOUS UPDATE
# Update CR0 with CR1 wherever CR1 has data (including NaN, 0, 1)
df['CR0'] = df['CR1'].combine_first(df['CR0'])
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
df['CR0'] = df['CR1']

In [None]:
# Count unique values including NaNs for several columns
columns_to_count = ['CR0', 'CR1']  # List of columns you want to count unique values for
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

# Print the result
print(value_counts)

# Early abortion
If a cow faced an early abortion, i.e., insemination started again after successful pregnancy check, the last insemination before pregnancy check is left as successful. E.g. SE-5b581702-1851, LactationNumber 2.0

In [None]:
def cr4(row):
    """Flag an insemination that was confirmed pregnant but followed by a new one.

    Returns 1 when the record has a pregnancy check date, the status is
    "Positive" and a later insemination exists (next_ins is not missing).
    Every other record returns None and is left unscored by this rule.
    """
    if pd.isna(row["PregnancyCheckDate"]):
        return None
    if row["PregnancyStatus"] != "Positive":
        return None
    if pd.isna(row["next_ins"]):
        return None
    return 1


# Apply the function to df
df['CR4'] = df.apply(cr4, axis=1)
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
# Update CR0 with CR4 only where CR4 is not missing
df.loc[df['CR4'].notna(), 'CR0'] = df['CR4']
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
df['CR0'] = np.where(df['CR4'] == 1, 1, df['CR0'])

In [None]:
# Count unique values
columns_to_count = ['CR0', 'CR1', 'CR4']
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

# Print the result
print(value_counts)

# Pregnancy period
After calving, it is checked whether the last insemination is within the limits of acceptable pregnancy period (260-302 days).

If the pregnancy period is shorter, it is checked iteratively whether some of
the former/previous inseminations are within acceptable limits, in which case insemination
for this day is set to 1. All inseminations that are newer than/ie after this successful
insemination are set to missing values. E.g. SE-5b581702-1756, LactationNumber 2.0

In [None]:
# Calculate pregnancy period
df["next_calving"] = pd.to_datetime(df["next_calving"])
df["preg_period"] = (df['next_calving'] - df['InseminationDate']).dt.days
df["CR5"] = np.nan

# If the pregnancy period is longer, the last insemination is set to zero (possible that a farm bull was used).
def cr5(row):
    """Score an insemination from the length of the pregnancy period it implies.

    Returns 1 when preg_period lies in 260-302 days (both inclusive), 0 when
    it is longer than 302 days, and None when it is missing or shorter than
    260 days.
    """
    period = row["preg_period"]
    if pd.isna(period) or period < 260:
        return None
    return 1 if period <= 302 else 0

# Apply the cr5 function
df['CR5'] = df.apply(cr5, axis=1)


In [None]:
df['CR0'] = np.where(df['CR5'].notna(), df['CR5'], df['CR0'])

In [None]:
# Count unique values 
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5'] 
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

# Print the result
print(value_counts)

In [None]:
# Initiate with 2 because have to overwrite with 1s and NaNs
df["CR6"] = 2


# For every insemination with a too short pregnancy period (< 260 days), flag
# the closest earlier insemination OF THE SAME COW AND LACTATION whose
# pregnancy period is acceptable (260-302 days).
# The search runs per group, so a record belonging to another cow or lactation
# can never be flagged, and every group is walked only once.
# Rows with a missing SE_Number or LactationNumber are not part of any group.
for _, lactation in df.groupby(["SE_Number", "LactationNumber"], sort=False):
    last_valid_index = None
    for i, period in lactation["preg_period"].items():
        if 260 <= period <= 302:
            # Remember the most recent acceptable insemination seen so far
            last_valid_index = i
        elif period < 260 and last_valid_index is not None:
            df.at[last_valid_index, "CR6"] = 1  # Update CR6 to 1 for the last valid insemination

In [None]:
# Count unique values 
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6'] 
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

In [None]:
# Define a function to process each group
def cr6(group):
    """Blank out the other CR6 values of a group that has a flagged insemination.

    Parameters
    ----------
    group : pandas.DataFrame
        Records of one cow and lactation, with a "CR6" column.

    Returns
    -------
    pandas.DataFrame
        The same frame. If any record has CR6 == 1, those records keep 1 and
        all others become NaN; otherwise CR6 is left unchanged.
    """
    flagged = group['CR6'].eq(1)
    if flagged.any():
        # where() builds a new float column. Writing NaN in place into the
        # integer CR6 column depends on implicit upcasting, which pandas has
        # deprecated.
        group['CR6'] = group['CR6'].where(flagged)
    return group

# Apply the function group by group.
# group_keys=False keeps the original row index. Without it, pandas >= 2.0
# prepends SE_Number and LactationNumber as index levels, which makes them
# ambiguous with the columns of the same name in later groupby/merge calls.
df = df.groupby(['SE_Number', 'LactationNumber'], group_keys=False).apply(cr6)
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
# Define a function to process each group, =============================================================================================>>> does same as above, only run one of these cells!
def cr6(group):
    """Blank out the other CR6 values of a group that has a flagged insemination.

    Parameters
    ----------
    group : pandas.DataFrame
        Records of one cow and lactation, with a "CR6" column.

    Returns
    -------
    pandas.DataFrame
        The same frame. If any record has CR6 == 1, those records keep 1 and
        all others become NaN; otherwise CR6 is left unchanged.
    """
    flagged = group['CR6'] == 1
    if flagged.any():
        # Build the column with where() rather than writing NaN in place into
        # the integer CR6 column, which depends on deprecated implicit upcasting.
        group['CR6'] = group['CR6'].where(flagged)
    return group

# Apply the function group by group.
# group_keys=False keeps the original row index. Without it, pandas >= 2.0
# prepends SE_Number and LactationNumber as index levels, which makes them
# ambiguous with the columns of the same name in later groupby/merge calls.
df = df.groupby(['SE_Number', 'LactationNumber'], group_keys=False).apply(cr6)
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
df

In [None]:
df = pd.read_csv("../Data/CowData/fertilityDF.csv", low_memory=False)

In [None]:
df

In [None]:
# Count unique values 
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6']  
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

In [None]:
df['CR0'] = df['CR6'].where((df['CR6'] == 1) | (df['CR6'].isna()), df['CR0'])

In [None]:
# Count unique values 
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6']
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

In [None]:
print(df.index.names)
print(df.columns)

In [None]:
SE_Number = ["SE-5b581702-1756"]
df5 = df[df["SE_Number"].isin(SE_Number)]

col_keep = ["SE_Number", "LactationNumber", "CalvingDate", "InseminationDate", "PregnancyCheckDate", "prev_ins", "next_ins", "preg_period", "CR0", "CR5", "CR6"]
df5 = df5[col_keep]


In [None]:
df5

# NINS = 1
Special case for defining phenotypes: if there is only one insemination record (i.e. NINS = 1) and
positive pregnancy check result after this, the last insemination is accepted as
successful, even if the pregnancy period is too short (<260d), i.e., too early calving occurred.
E.g. SE-5b581702-1756, SE-4b8091ac-1472, LactationNumber 1.0

In [None]:
# Initiate with 2 and overwrite with 1s
df["CR7"] = 2


def cr7(row):
    """Accept a single insemination with a positive check despite early calving.

    Returns 1 when NINS == 1, PregnancyStatus is "Positive" and preg_period is
    shorter than 260 days; returns None for every other record.
    """
    if row["NINS"] != 1:
        return None
    if row["PregnancyStatus"] != "Positive":
        return None
    if row["preg_period"] < 260:
        return 1
    return None


# Apply the function to df
df["CR7"] = df.apply(cr7, axis=1)
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)
print(df.shape)

In [None]:
df['CR0'] = df['CR7'].where((df['CR7'] == 1), df['CR0'])

In [None]:
# Count unique values
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6', 'CR7'] 
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

# Non-consecutive calvings or (re-?)started inseminations
If a non-consecutive calving was noticed or a cow started with insemination records (again? after long period of pregnancy??),
inseminations that were done within 365 days from the new calving are considered.
This means that with an average pregnancy period of 281 days and an average cycle
of 21 days, as a maximum 5 inseminations are included for the new calving.
All earlier inseminations are set to missing values.
e.g. SE-5b581702-1742, lact 1 and lact 3, i.e. lact 2 missing

In [None]:
# Make variable for non-consecutive calving, i.e. a value > 1 means that at
# least one lactation number is skipped before the next recorded lactation
col_keep = ["SE_Number", "LactationNumber"]
# .copy() so the new columns are written to an independent frame, not to a slice of df
df2 = df[col_keep].drop_duplicates(subset=['SE_Number', 'LactationNumber']).copy()
# Sort so that shift(-1) really picks the following lactation of the same cow
df2 = df2.sort_values(by=['SE_Number', 'LactationNumber'])

# NOTE: despite its name, prev_lact holds the NEXT recorded lactation number
# of the cow (shift(-1)); the column name is kept because it ends up in
# fertilityDF.csv
df2['prev_lact'] = df2.groupby('SE_Number')['LactationNumber'].shift(-1)
df2["non_consec_calving"] = (df2["prev_lact"] - df2["LactationNumber"])
# df2 is no longer written to fertilityDF.csv here: that write replaced the
# full table with this 4-column summary until the write further down

# Merge onto other df
print(df2.shape)
print(df.shape)
df = pd.merge(df, df2, on=["SE_Number", "LactationNumber"], how="left")
print(df.shape)
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

# Occurrence of non-consecutive calvings
subset_df = df.drop_duplicates(subset=['SE_Number', 'LactationNumber'])
print(subset_df.shape)
subset_df2 = subset_df[subset_df["non_consec_calving"] > 1]
print(subset_df2.shape)  # 0 lact where missing lactations? ie don't have non-consecutive calvings in current dataframe

In [None]:
# To handle non-consecutive calving or ins within longer service period
# Where a lactation number is skipped (non_consec_calving > 1), next_calving is
# replaced by upper_limit.
# NOTE(review): preg_period is not recomputed in this cell after next_calving
# changes - confirm that the later scoring is meant to use the old value.
df.loc[df["non_consec_calving"] > 1, "next_calving"] = df["upper_limit"]
df["upper_limit"] = pd.to_datetime(df["upper_limit"])
df["InseminationDate"] = pd.to_datetime(df["InseminationDate"])

# ins_int = start of the accepted window, 365 days before upper_limit
df["ins_int"] = df["upper_limit"] - pd.Timedelta(days=365)
df["ins_int"] = pd.to_datetime(df["ins_int"])

#Make NINS that is counted backwards from calving i.e. last ins record is first NINS
col_keep = ["SE_Number", "LactationNumber", "InseminationDate"]
df3 = df[col_keep]
# Sort by date within cow and lactation so the descending cumcount gives 1 to
# the latest insemination date
df3 = df3.sort_values(by=['SE_Number', 'LactationNumber', 'InseminationDate'])
df3 = df3.drop_duplicates(subset=['SE_Number', 'LactationNumber', 'InseminationDate'])
df3['NINS_bw'] = df3.groupby(['SE_Number', 'LactationNumber']).cumcount(ascending=False) + 1
print(df3.shape)  
print(df.shape)  
# Default (inner) merge on cow, lactation and insemination date; the row order
# of the result follows df3, i.e. the sorted order
df = pd.merge(df3, df, on=["SE_Number", "LactationNumber", "InseminationDate"])
print(df.shape)  

# Initiate variable to indicate where have conditions
# (both columns are overwritten by the apply() calls later in this cell)
df["CR8a"] = 2
df["CR8b"] = 2


def cr8a(row):
    """Return 1 if InseminationDate lies in [ins_int, upper_limit], else NaN.

    Both ends of the interval are inclusive. A missing date makes the
    comparison false, so the record gets NaN.
    """
    inseminated = row["InseminationDate"]
    inside_window = row["ins_int"] <= inseminated <= row["upper_limit"]
    return 1 if inside_window else np.nan


def cr8b(row):
    """Return 1 for the five latest inseminations (NINS_bw <= 5), else NaN."""
    return 1 if row["NINS_bw"] <= 5 else np.nan


df['CR8a'] = df.apply(cr8a, axis=1)
df['CR8b'] = df.apply(cr8b, axis=1)
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)
print(df.shape)

In [None]:
df

In [None]:
value_counts1 = df["CR8a"].value_counts()
print(value_counts1)
value_counts2 = df["CR8b"].value_counts()
print(value_counts2)

In [None]:
# Now run previous functions for these data points to define their CR scoring
def create_CR8a(row):
    """Re-score an insemination that lies inside the accepted service window.

    A record is scored only when CR8a or CR8b equals 1; every other record
    gets NaN. The score starts from cr1(row), is replaced by cr4(row) when
    that is not missing, and is finally replaced by cr5(row) when that is
    not missing. This is the same precedence with which CR0 is updated from
    CR1, CR4 and CR5 earlier in the notebook.

    NOTE(review): the previous version tested
    ``pd.notna(cr1_value) or pd.isna(cr1_value)``, which is always true, so
    it returned the cr1 result and never looked at cr4 or cr5. Confirm that
    the precedence used here is the intended one.

    Returns
    -------
    0, 1 or np.nan
    """
    if not (row['CR8a'] == 1 or row['CR8b'] == 1):
        return np.nan

    score = cr1(row)
    # Later rules override earlier ones, but only when they give a result
    for rule in (cr4, cr5):
        update = rule(row)
        if pd.notna(update):
            score = update

    # The rule functions signal "no result" with None; report it as NaN
    return np.nan if score is None else score

# Apply the create_CR8a function to the dataframe
df['CR8aa'] = df.apply(create_CR8a, axis=1)

# Save to CSV
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
# Count unique values
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6', 'CR7', 'CR8aa']
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

In [None]:
# Rerun code for CR6 separately here due to issue with .any() and eq() from function, returning AttributeErrors for bool and float handling in python => called CR8bb

# If the pregnancy period is shorter, it is checked iteratively whether some of
# the former/previous inseminations are within acceptable limits, in which case insemination
# for this day is set to 1. All inseminations that are newer than/ie after this successful
# insemination are set to missing values. E.g. SE-5b581702-1756, LactationNumber 2.0
df["CR8bb"] = 2


# For every insemination with a too short pregnancy period (< 260 days), flag
# the closest earlier insemination OF THE SAME COW AND LACTATION whose
# pregnancy period is acceptable (260-302 days).
# The search runs per group, so a record belonging to another cow or lactation
# can never be flagged, and every group is walked only once.
# Rows with a missing SE_Number or LactationNumber are not part of any group.
for _, lactation in df.groupby(["SE_Number", "LactationNumber"], sort=False):
    last_valid_index = None
    for i, period in lactation["preg_period"].items():
        if 260 <= period <= 302:
            # Remember the most recent acceptable insemination seen so far
            last_valid_index = i
        elif period < 260 and last_valid_index is not None:
            df.at[last_valid_index, "CR8bb"] = 1  # Update CR8bb to 1 for the last valid insemination


# Define a function to process each group
def cr8c(group):
    """Blank out the other CR8bb values of a group that has a flagged insemination.

    Parameters
    ----------
    group : pandas.DataFrame
        Records of one cow and lactation, with a "CR8bb" column.

    Returns
    -------
    pandas.DataFrame
        The same frame. If any record has CR8bb == 1, those records keep 1
        and all others become NaN; otherwise CR8bb is left unchanged.
    """
    flagged = group['CR8bb'].eq(1)
    if flagged.any():
        # where() builds a new float column. Writing NaN in place into the
        # integer CR8bb column depends on implicit upcasting, which pandas
        # has deprecated.
        group['CR8bb'] = group['CR8bb'].where(flagged)
    return group

# Apply the function group by group.
# group_keys=False keeps the original row index. Without it, pandas >= 2.0
# prepends SE_Number and LactationNumber as index levels, which makes them
# ambiguous with the columns of the same name in later groupby/merge calls.
df = df.groupby(['SE_Number', 'LactationNumber'], group_keys=False).apply(cr8c)
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
df

In [None]:
df = pd.read_csv("../Data/CowData/fertilityDF.csv", low_memory=False)

In [None]:
df

In [None]:
# Now run function for cr7 creating CR8cc
def create_CR8b(row):
    """Apply the cr7 rule to records inside the accepted service window.

    Returns the cr7 result when CR8a or CR8b equals 1 and cr7 gives a
    non-missing value; returns NaN in every other case.
    """
    in_window = row['CR8a'] == 1 or row['CR8b'] == 1
    if not in_window:
        return np.nan

    cr7_value = cr7(row)
    return cr7_value if pd.notna(cr7_value) else np.nan

# Apply the create_CR7 function to the dataframe
df['CR8cc'] = df.apply(create_CR8b, axis=1)

# Save to CSV
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
# Count unique values
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6', 'CR7', 'CR8aa', 'CR8bb', 'CR8cc']
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

In [None]:
# Overwrite CR0 with CR8aa for every record, missing values included.
# (combine_first of a series with itself returns that series unchanged, so a
# plain assignment does the same.)
# NOTE(review): if CR0 should be kept where CR8aa is missing, this must be
# df['CR8aa'].combine_first(df['CR0']) instead.
df['CR0'] = df['CR8aa']

In [None]:
# Update CR0 with CR8bb where CR8bb has 1s or NaNs
df['CR0'] = df['CR8bb'].where((df['CR8bb'] == 1) | (df['CR8bb'].isna()), df['CR0'])

In [None]:
# Update CR0 with CR8cc where CR8cc has 1s
df['CR0'] = df['CR8cc'].where((df['CR8cc'] == 1), df['CR0'])
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
# Count unique values
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6', 'CR7', 'CR8aa', 'CR8bb', 'CR8cc']
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

# Sales data
If a cow was sold during a service period, all subsequent inseminations are set to
missing and those before accepted. If the service period occurred before or after
the cow was sold, inseminations are accepted.

# DelPro data - assume cow database is superior

In [None]:
# Collect sales date from raw data
# NOTE(review): absolute path on one user's machine, unlike the relative
# "../Data/..." paths used elsewhere in this notebook - adjust before running elsewhere.
# The file is semicolon-delimited.
sold_delpro = pd.read_csv("C:/Users/pagd0001/Desktop/Gigacow/Data/20241009/Gigacow-tools/Projects/HeatStressEvaluation/Data/CowData/Del_Cow240823.csv", low_memory=False, delimiter=";")
# Keep the cow ID, the arrival date and the culling decision date/reasons
col_keep = ["SE_Number", "ArrivalDate", "CullDecisionDate","CullReason1","CullReason2"]
sold_delpro = sold_delpro[col_keep]

In [None]:
sold_delpro

In [None]:
# Count and remove cows with missing IDs
missing_ids_count = sold_delpro['SE_Number'].isna().sum()
print(f"No. of cows with missing IDs: {missing_ids_count}")
print(f"No. of observations in dataframe: {sold_delpro.shape}")

sold_delpro = sold_delpro[sold_delpro["SE_Number"].notna()]
print(f"No. of observations in dataframe after removing missing cow IDs: {sold_delpro.shape}")

In [None]:
unique_values = sold_delpro['CullReason1'].unique()
print(unique_values)
unique_values = sold_delpro['CullReason2'].unique()
print(unique_values)

In [None]:
# Keep only records where either cull reason is the sale-for-life code
sale_reason = "01 Såld till liv"
is_sale = sold_delpro["CullReason1"].eq(sale_reason) | sold_delpro["CullReason2"].eq(sale_reason)
sold_delpro = sold_delpro[is_sale]
print(f"No. of observations with sales data: {sold_delpro.shape}")

# Drop records without a CullDecisionDate
sold_delpro = sold_delpro.dropna(subset=["CullDecisionDate"])
print(f"No. of observations with exit date: {sold_delpro.shape}")

In [None]:
# Check no. of observations with missing arrival date
test = sold_delpro[sold_delpro["ArrivalDate"].isna()]
# Report the size of the filtered subset, not of the whole sales table
print(f"No. of observaitons with missing arrival i.e. entry date in DelPro: {test.shape}")

In [None]:
sold_delpro

In [None]:
# Rename & keep only SE_Number and SalesDate for each cow
sold_delpro.rename(columns={"CullDecisionDate": "SalesDate_delpro"}, inplace=True)

col_keep = ["SE_Number", "SalesDate_delpro"]
sold_delpro = sold_delpro[col_keep]

In [None]:
# Count the number of times each cow has been sold
sales_count = sold_delpro.groupby('SE_Number').size().reset_index(name='times_sold')

# Count the number of cows for each "times_sold" value
summary = sales_count['times_sold'].value_counts().reset_index()
summary.columns = ['number_of_times_sold', 'number_of_cows']

# Sort by the number of times sold
summary = summary.sort_values(by='number_of_times_sold')

# Display the result
print(summary)

In [None]:
# Manually look at cows with multiple sales date
sold_delpro = pd.merge(sold_delpro, sales_count, on=["SE_Number"], how="left")
multiple_sold = sold_delpro[sold_delpro["times_sold"] == 2]

In [None]:
multiple_sold

In [None]:
print(sold_delpro.index.names)
print(sold_delpro.columns)

In [None]:
# Spread the sale dates of cows with two sales over one column per sale
# Sort first so the sequence number follows the order of the dates within cow
# (dates are still text here; assumes a year-first format so that text order
# equals date order - TODO confirm)
multiple_sold = multiple_sold.sort_values(by=['SE_Number', 'SalesDate_delpro']).copy()

# Sequential index of each sale within cow
multiple_sold['sequence'] = multiple_sold.groupby('SE_Number').cumcount() + 1

# Pivot: one row per cow, one column per sequence number
result = multiple_sold.pivot(index='SE_Number', columns='sequence', values='SalesDate_delpro')

# Name the columns SalesDate_delpro_1, SalesDate_delpro_2, ...
result.columns = [f'SalesDate_delpro_{i}' for i in result.columns]

# Bring SE_Number back as an ordinary column
result = result.reset_index()

# Display the result
print(result)

In [None]:
# Rename SalesDate_delpro to SalesDate_delpro_1, 
sold_delpro.rename(columns={"SalesDate_delpro": "SalesDate_delpro_1"}, inplace=True)

# Remove the cows with multiple obs from original dataset
sold_delpro = sold_delpro[~sold_delpro['SE_Number'].isin(multiple_sold['SE_Number'])]

# Initiate SalesDate_delpro_2 in original dataframe so can fill it with data from multiple_sold
sold_delpro = sold_delpro.copy()
sold_delpro["SalesDate_delpro_2"] = np.nan

In [None]:
sold_delpro

In [None]:
# concat multiple_sold back to original df
# Sort both datasets by cowID and date before concatenating
sold_delpro = sold_delpro.sort_values(by=['SE_Number', 'SalesDate_delpro_1', 'SalesDate_delpro_2'])
result = result.sort_values(by=['SE_Number', 'SalesDate_delpro_1', 'SalesDate_delpro_2'])

# Concatenate along rows
sold_delpro = pd.concat([sold_delpro, result], axis=0, ignore_index=True)

# Change times_sold to 2 for cows with multiple records
sold_delpro['times_sold'] = sold_delpro['times_sold'].fillna(2)

In [None]:
sold_delpro

In [None]:
sold_delpro.to_csv("../Data/CowData/sales_delpro.csv", index=False)

# Cow database sales data

In [None]:
# Collect sales date from raw data
sold_kok = pd.read_csv("C:/Users/pagd0001/Desktop/Gigacow/Data/20241009/Gigacow-tools/Projects/HeatStressEvaluation/Data/CowData/Kok_HerdEntryExit240820.csv", low_memory=False, delimiter=";")
col_keep = ["BirthID", "EntryDate", "ExitDate", "ExitReason_PrimaryReason", "ExitReason_SecondaryReason1", "ExitReason_SecondaryReason2"]
sold_kok = sold_kok[col_keep]

In [None]:
# How does the recording look? Entry-Exit pairing possible?
test2 = sold_kok[sold_kok["EntryDate"].isna()]
print(f"No. of cows with missing EntryDate: {test2.shape}")

test2 = sold_kok[sold_kok["ExitDate"].isna()]
print(f"No. of cows with missing ExitDate, i.e. still alive: {test2.shape}")

In [None]:
SE_Number = ["SE-a756bc39-1200"]
test = sold_kok[sold_kok["BirthID"].isin(SE_Number)]
test = test.sort_values(by=['BirthID', 'EntryDate'])

In [None]:
test

In [None]:
sold_kok = sold_kok.drop_duplicates(subset=["BirthID", "ExitDate"])
sold_kok.rename(columns={"BirthID": "SE_Number"}, inplace=True)
col_keep = ["SE_Number", "ExitDate"]
sold_kok = sold_kok[col_keep]

sold_kok = sold_kok.sort_values(by=["SE_Number", "ExitDate"])

In [None]:
# Remove those missing exit date, i.e. still alive
df2 = sold_kok[sold_kok["ExitDate"].notna()]

In [None]:
df2

Now pair sales data within each lactation within cow

In [None]:
df1 = pd.read_csv("../Data/CowData/fertilityDF.csv", low_memory=False)
col_keep = ["SE_Number", "LactationNumber", "CalvingDate", "upper_limit"]
df1 = df1[col_keep]

df1 = df1.drop_duplicates(subset=["SE_Number", "LactationNumber"])

In [None]:
df1

In [None]:
# Cows with open records have no upper_limit: use the date of data extraction
# as the last possible upper_limit ===========>>> change this date according to date of data extraction from Samo
df1["upper_limit"] = df1["upper_limit"].fillna("2024-08-18")

In [None]:
df1

In [None]:
# Convert date columns to datetime
for col in ['CalvingDate', 'upper_limit']:
    df1[col] = pd.to_datetime(df1[col])
# Work on a copy: df2 was produced by filtering sold_kok, and writing into it
# directly triggers pandas' SettingWithCopyWarning
df2 = df2.copy()
df2['ExitDate'] = pd.to_datetime(df2['ExitDate'])

# Pair every lactation with all exit dates of the same cow in a single merge,
# instead of filtering df2 once per lactation row
lactations = df1[['SE_Number', 'LactationNumber', 'CalvingDate', 'upper_limit']].reset_index(drop=True)
lactations['_order'] = range(len(lactations))  # remembers the row order of df1
candidates = lactations.merge(df2[['SE_Number', 'ExitDate']], on='SE_Number', how='left')

# Keep the exits that fall inside [CalvingDate, upper_limit], both ends inclusive
inside = candidates['ExitDate'].between(candidates['CalvingDate'], candidates['upper_limit'])
matched = candidates[inside].rename(columns={'ExitDate': 'matching_ExitDate'})

# Lactations without any exit in their interval get one row with a missing date
unmatched = lactations[~lactations['_order'].isin(matched['_order'])].assign(matching_ExitDate=pd.NaT)

# Back to the row order of df1; several matches of one lactation stay in df2 order
result_df = (
    pd.concat([matched, unmatched])
    .sort_values('_order', kind='stable')
    .drop(columns='_order')
    .reset_index(drop=True)
)
result_df.to_csv("../Data/CowData/sales_matched.csv", index=False)

# Display the resulting table
print(result_df)

In [None]:
# Check for sales between herds
test = pd.read_csv("../Data/CowData/sales_matched.csv", low_memory=False)
test2 = test[test["upper_limit"] == test["matching_ExitDate"]]
print(f"No. of observations where ExitDate is equal to upper_limit for given lactation: {test2.shape}")

In [None]:
test

In [None]:
# Count number of sales within cow and lactation
# count() only counts non-missing matching_ExitDate values, so lactations that
# only have the placeholder "no match" row get times_sold = 0 instead of 1
sales_count = (
    test.groupby(['SE_Number', "LactationNumber"])['matching_ExitDate']
    .count()
    .reset_index(name='times_sold')
)

# Count the number "times_sold"
summary = sales_count['times_sold'].value_counts().reset_index()
summary.columns = ['number_of_times_sold', 'number_of_cows']

# Sort by the number of times sold
summary = summary.sort_values(by='number_of_times_sold')

# Display the result
print(summary)

In [None]:
sales_count

In [None]:
# Spread matching_ExitDate over one column per exit within cow and lactation
# Create a new column for the sequential index within each group
test = test.copy()
test['sequence'] = test.groupby(['SE_Number', 'LactationNumber']).cumcount() + 1

# Pivot the table: one row per cow and lactation, one column per sequence number
result = test.pivot(index=['SE_Number', 'LactationNumber'], columns='sequence', values='matching_ExitDate')

# Name the columns SalesDate1, SalesDate2, ...
result.columns = [f'SalesDate{i}' for i in result.columns]

# Reset the index so SE_Number and LactationNumber become ordinary columns again
result = result.reset_index()

# Display the result
print(result)

In [None]:
result2 = result[pd.notna(result["SalesDate2"])]

In [None]:
result2

In [None]:
# Transfer sales data to fertilityDF.csv
df1 = pd.read_csv("../Data/CowData/fertilityDF.csv", low_memory=False)

df1 = pd.merge(df1, result, on=["SE_Number", "LactationNumber"], how="left")

In [None]:
df1

In [None]:
df1.to_csv("../Data/CowData/fertilityDF.csv", index=False)

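# Process code
- use if you want to match sales_delpro.csv with sales_kok.csv to create sales.csv (however, not matched with fertilityDF.csv)
- NOTE: sold_kok was reduced to SE_Number and ExitDate in the cells above, so reload the raw cow-database file before running the cells below, which use the ExitReason and EntryDate columns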

In [None]:
unique_values = sold_kok['ExitReason_PrimaryReason'].unique()
print(unique_values)
unique_values = sold_kok['ExitReason_SecondaryReason1'].unique()
print(unique_values)
unique_values = sold_kok['ExitReason_SecondaryReason2'].unique()
print(unique_values)

In [None]:
# Rename BirthID for concatenating
sold_kok.rename(columns=
                {"BirthID": "SE_Number",
                 "ExitDate": "SalesDate_kok",
                 "ExitReason_PrimaryReason": "ExitReason_PrimaryReason_kok"}, inplace=True)

In [None]:
# Count and remove cows with missing IDs
missing_ids_count = sold_kok['SE_Number'].isna().sum()
print(f"Number of cows with missing IDs: {missing_ids_count}")
print(f"Number of observations in dataframe: {sold_kok.shape}")

In [None]:
# Keep only data from sales
sold_kok = sold_kok[(sold_kok["ExitReason_PrimaryReason_kok"] == "Såld till liv")]

# Remove cows where missing ExitDate
sold_kok = sold_kok[sold_kok["SalesDate_kok"].notna()]
print(f"Number of cows with sales date: {sold_kok.shape}")

In [None]:
sold_kok

In [None]:
# Count the number of times each cow has been sold
sales_count = sold_kok.groupby('SE_Number').size().reset_index(name='times_sold')

# Count the number of cows for each "times_sold" value
summary = sales_count['times_sold'].value_counts().reset_index()
summary.columns = ['number_of_times_sold', 'number_of_cows']

# Sort by the number of times sold
summary = summary.sort_values(by='number_of_times_sold')

# Display the result
print(summary)

In [None]:
# Add "times_sold" to original df and sort for cows with multiple records
sold_kok = pd.merge(sold_kok, sales_count, on=["SE_Number"], how="left")
multiple_sold = sold_kok[sold_kok["times_sold"] > 1]

In [None]:
multiple_sold

In [None]:
# Transpose SalesDate into columns, one for each date within cow
#First sort to ensure getting sales into chronological order
multiple_sold = multiple_sold.sort_values(by=['SE_Number', 'SalesDate_kok'])

# Create a new column for the sequential index within each group
multiple_sold = multiple_sold.copy()
multiple_sold['sequence'] = multiple_sold.groupby('SE_Number').cumcount() + 1

# Pivot the table to transpose `SalesDate_kok` into one column per sale (by `sequence`)
result = multiple_sold.pivot(index='SE_Number', columns='sequence', values='SalesDate_kok')

# Rename the columns for clarity (optional)
result.columns = [f'SalesDate_kok_{i}' for i in result.columns]

# Reset the index to return a clean DataFrame
result = result.reset_index()
print(result)

In [None]:
sold_kok

In [None]:
# The single sale date becomes the first of the numbered sale-date columns
sold_kok.rename(columns={"SalesDate_kok": "SalesDate_kok_1"}, inplace=True)

# Keep only cows that are not in multiple_sold (cows with several sale records)
sold_more_than_once = sold_kok['SE_Number'].isin(multiple_sold['SE_Number'])
sold_kok = sold_kok[~sold_more_than_once].copy()

# Create empty SalesDate_kok_2 ... SalesDate_kok_9 columns so they can be
# filled with data from multiple_sold later
for sale_number in range(2, 10):
    sold_kok[f"SalesDate_kok_{sale_number}"] = np.nan

In [None]:
sold_kok

In [None]:
# Concatenate the pivoted multiple-sales table (result) back onto sold_kok.
sales_date_cols = [f'SalesDate_kok_{i}' for i in range(1, 10)]

# The pivot that built `result` only creates as many SalesDate_kok_n columns
# as the most-sold cow has sales. Add the missing ones as empty columns so
# the sort below does not fail with a KeyError when fewer than nine exist.
for col in sales_date_cols:
    if col not in result.columns:
        result[col] = np.nan

# Sort both datasets by cow ID and dates before concatenating
sold_kok = sold_kok.sort_values(by=['SE_Number', 'EntryDate'] + sales_date_cols)
result = result.sort_values(by=['SE_Number'] + sales_date_cols)

# Concatenate along rows
sold_kok = pd.concat([sold_kok, result], axis=0, ignore_index=True)

In [None]:
sold_kok

In [None]:
multiple_sold

In [None]:
# Update EntryDate and times_sold for cows with multiple records
col_keep = ["SE_Number", "EntryDate", "times_sold"]
# drop_duplicates returns a new frame, so its result must be assigned.
# Keeping one row per cow stops the left merge below from multiplying rows
# for cows that have several sale records.
multi_sold = multiple_sold[col_keep].drop_duplicates(subset="SE_Number")

sold_kok = pd.merge(sold_kok, multi_sold, on=["SE_Number"], how="left")

In [None]:
sold_kok

In [None]:
sold_kok["times_sold"] = sold_kok["times_sold_x"].fillna(sold_kok["times_sold_y"])
sold_kok["EntryDate"] = sold_kok["EntryDate_x"].fillna(sold_kok["EntryDate_y"])

In [None]:
sold_kok

In [None]:
col_keep = ["SE_Number", "EntryDate", "times_sold", 'SalesDate_kok_1', 'SalesDate_kok_2', 'SalesDate_kok_3', 'SalesDate_kok_4', 'SalesDate_kok_5', 'SalesDate_kok_6', 
            'SalesDate_kok_7', 'SalesDate_kok_8', 'SalesDate_kok_9']
sold_kok = sold_kok[col_keep]

sold_kok.to_csv("../Data/CowData/sales_kok.csv", index=False)

In [None]:
# PAIRING COW DATABASE AND DELPRO FOR SALES
ckok = pd.read_csv("../Data/CowData/sales_kok.csv", low_memory=False)
cDel = pd.read_csv("../Data/CowData/sales_delpro.csv", low_memory=False)

ckok2 = pd.merge(ckok, cDel, on=["SE_Number"], how="left")
ckok2.drop_duplicates(subset=["SE_Number"], inplace=True)
# ckok2['CullingDate'] = ckok2['ExitDateKok'].fillna(cDel['CullDecisionDateDelPro'])

In [None]:
ckok2

In [None]:
df

In [None]:
# How many of these cows are in our dataframe?
df = pd.read_csv("../Data/CowData/fertilityDF.csv", low_memory=False)
col_keep = ["FarmName_Pseudo", "SE_Number", "LactationNumber", "CalvingDate", "upper_limit"]
df_fert = df[col_keep]
df_fert = df_fert.copy()
df_fert.drop_duplicates(subset=["SE_Number", "LactationNumber"], inplace=True)
print(df_fert.shape)

df_fert = pd.merge(df_fert, ckok2, on=("SE_Number"), how="left")
print(df_fert.shape)

In [None]:
df_fert

In [None]:
df_fert.to_csv("../Data/CowData/sales.csv", index=False)

In [None]:
# What does this data look like?
non_null_counts = df_fert.groupby("FarmName_Pseudo").count()
print(f"Non-null counts in each column: {non_null_counts}")

# Continue with CR program based on cow database data

In [None]:
# Check culling reasons
col_keep = ["SE_Number", "CullingDate", "CullingReason1", "CullingReason2"]
df4 = df[col_keep]

unique_values = df4['CullingReason1'].unique()
print(unique_values)
unique_values = df4['CullingReason2'].unique()
print(unique_values)

print(df4.shape)

In [None]:
df = pd.read_csv("../Data/CowData/fertilityDF.csv", low_memory=False)

In [None]:
df

In [None]:
# Find example cows
df = pd.read_csv("../Data/CowData/fertilityDF.csv", low_memory=False)

# df2 = df[pd.notna(df["SalesDate1"])]
# df2 = df[df["InseminationDate"] > df["SalesDate1"]]

SE_Number = ["SE-a756bc39-1249"]
df2 = df[df["SE_Number"].isin(SE_Number)]

df2.to_csv("../Data/CowData/test.csv", index=False)

In [None]:
df = pd.read_csv("../Data/CowData/fertilityDF.csv", low_memory=False)

# Define service periods, find first ins date, allow for 7ins i.e. 147d
df["InseminationDate"] = pd.to_datetime(df["InseminationDate"])
first_ins1 = df.groupby(['SE_Number', 'LactationNumber'])['InseminationDate'].min().reset_index()
first_ins1.rename(columns={'InseminationDate': 'first_ins_before_sold'}, inplace=True)
df = df.merge(first_ins1, on=['SE_Number', 'LactationNumber'], how='left')

df["first_ins_before_sold"] = pd.to_datetime(df["first_ins_before_sold"])
df["service_period1_end"] = df["first_ins_before_sold"] + pd.Timedelta(days=147)
# df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

df = df.sort_values(by=['SE_Number', 'LactationNumber', 'InseminationDate', 'PregnancyCheckDate'])


def ins_after_sold(df):
    """Add the first insemination after the sale and its service-period end.

    For every (SE_Number, LactationNumber) group, the earliest
    InseminationDate that is strictly later than SalesDate1 is stored in
    ``first_ins_after_sold``. ``service_period2_end`` is that date plus
    147 days. Groups without an insemination after the sale get NaT in
    both columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain SE_Number, LactationNumber, InseminationDate and
        SalesDate1.

    Returns
    -------
    pandas.DataFrame
        The merge result with the two columns added. The frame passed in
        is not modified.
    """
    keys = ['SE_Number', 'LactationNumber']

    # Parse both date columns locally so the comparison is always
    # date-vs-date, even when the caller has not converted SalesDate1
    # (e.g. straight after read_csv, where dates arrive as strings).
    ins_date = pd.to_datetime(df["InseminationDate"])
    sale_date = pd.to_datetime(df["SalesDate1"])

    # Rows where the insemination took place after the sale
    is_after_sale = ins_date > sale_date
    after_sale = df.loc[is_after_sale, keys].copy()
    after_sale["first_ins_after_sold"] = ins_date[is_after_sale].to_numpy()

    # Earliest such insemination per cow and lactation
    first_ins2 = after_sale.groupby(keys)["first_ins_after_sold"].min().reset_index()

    # Attach the group-level date to every row of the group
    df = pd.merge(df, first_ins2, on=keys, how='left')

    # Groups without a match come back as missing; make sure the column is datetime
    df["first_ins_after_sold"] = pd.to_datetime(df["first_ins_after_sold"])

    # The second service period ends 147 days after that insemination
    df["service_period2_end"] = df["first_ins_after_sold"] + pd.Timedelta(days=147)

    return df


# Call the function and save the result
df = ins_after_sold(df)
# df.to_csv("../Data/CowData/fertilityDF.csv", index=False)
# df.to_csv("../Data/CowData/test.csv", index=False)

In [None]:
df

In [None]:
df = pd.read_csv("../Data/CowData/fertilityDF.csv", low_memory=False)

# Define service periods, find first ins date
df["InseminationDate"] = pd.to_datetime(df["InseminationDate"])
first_ins1 = df.groupby(['SE_Number', 'LactationNumber'])['InseminationDate'].min().reset_index()
first_ins1.rename(columns={'InseminationDate': 'first_ins_before_sold'}, inplace=True)
df = df.merge(first_ins1, on=['SE_Number', 'LactationNumber'], how='left')

# Convert date columns to datetime
df["InseminationDate"] = pd.to_datetime(df["InseminationDate"])
df["SalesDate1"] = pd.to_datetime(df["SalesDate1"])
df["first_ins_before_sold"] = pd.to_datetime(df["first_ins_before_sold"])

# Define end of service period (147 days after the first insemination)
df["service_period_end"] = df["first_ins_before_sold"] + pd.Timedelta(days=147)

# Initialize CR9 with a default value of 2
df["CR9"] = 2

# Sort dataframe to ensure it's ordered by SE_Number, LactationNumber, and InseminationDate
df = df.sort_values(by=['SE_Number', 'LactationNumber', 'InseminationDate'])

def handle_inseminations(df):
    """Flag inseminations in CR9 depending on when the cow was sold.

    Each (SE_Number, LactationNumber) group is classified once, using the
    values of its first row:

    * sold inside the service period (first_ins_before_sold <= SalesDate1
      <= service_period_end): inseminations later than the sale date get
      CR9 = NaN; the other rows keep whatever CR9 they already had.
    * otherwise (no sale date, or sale outside the service period): every
      row gets CR9 = 1.

    Returns a new DataFrame assembled from the per-row copies, in groupby
    iteration order; the input frame itself is not written to.
    """
    collected = []

    for _, lactation in df.groupby(['SE_Number', 'LactationNumber']):
        sold_on = lactation['SalesDate1'].iloc[0]
        period_start = lactation['first_ins_before_sold'].iloc[0]
        period_end = lactation['service_period_end'].iloc[0]

        # The same for every row of the group, so evaluate it once
        sold_in_period = pd.notna(sold_on) and period_start <= sold_on <= period_end

        for _, record in lactation.iterrows():
            if not sold_in_period:
                # Sale (if any) fell outside the service period: accept
                record['CR9'] = 1
            elif record['InseminationDate'] > sold_on:
                # Insemination after a sale inside the service period: missing
                record['CR9'] = np.nan

            collected.append(record)

    return pd.DataFrame(collected)

# Call the function to process inseminations
df_updated = handle_inseminations(df)

# Display the result
print(df_updated)

In [None]:
df

In [None]:
# Convert date columns to datetime
df["InseminationDate"] = pd.to_datetime(df["InseminationDate"])
df["SalesDate1"] = pd.to_datetime(df["SalesDate1"])
df["first_ins_before_sold"] = pd.to_datetime(df["first_ins_before_sold"])

# Initiate CR9 with 2 to be overwritten with 1s and NaNs
df["CR9a"] = 2


def cr9a(row):
    """Return 1 for an insemination that is accepted for a sold cow, else None.

    Only rows with a SalesDate1 are evaluated. An insemination is accepted
    when any of the following holds:

    1. it lies between the first insemination and the sale date;
    2. the first service period ended before the sale and the insemination
       lies inside that first service period;
    3. there is a first insemination after the sale and the insemination
       lies inside the second service period that starts there.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Reads SalesDate1, InseminationDate, first_ins_before_sold,
        service_period1_end, first_ins_after_sold and service_period2_end.

    Returns
    -------
    int or None
        1 when accepted, None when no rule applies or the cow was not sold.
    """
    sale_date = row["SalesDate1"]
    if pd.isna(sale_date):
        return None

    ins_date = row["InseminationDate"]

    # Rule 1: inseminations before the sale are accepted
    if row["first_ins_before_sold"] <= ins_date < sale_date:
        return 1

    # Rule 2: the first service period ended before the sale
    if row["service_period1_end"] < sale_date:
        if row["first_ins_before_sold"] <= ins_date < row["service_period1_end"]:
            return 1

    # Rule 3: the service period started after the sale, i.e. the first
    # insemination after the sale is later than the sale date. The guard
    # used to repeat rule 2's condition, so cows sold before their first
    # service could never be accepted here.
    if sale_date < row["first_ins_after_sold"]:
        if row["first_ins_after_sold"] <= ins_date < row["service_period2_end"]:
            return 1

    return None


# Apply the function to df
df['CR9a'] = df.apply(cr9a, axis=1)
# df.to_csv("../Data/CowData/fertilityDF.csv", index=False)
# df.to_csv("../data/CowData/test.csv", index=False)
print(df.shape)

In [None]:
# If a cow was sold during a service period, all subsequent inseminations are set to missing
# Step 1: Sort by cow (SE_Number), lactation, and insemination date
df = df.sort_values(by=['SE_Number', 'LactationNumber', 'InseminationDate'])

# Step 2: Define the function to handle inseminations based on sale date
def handle_inseminations_after_sale(df):
    """Add column CR9b marking inseminations made after the cow was sold.

    For each (SE_Number, LactationNumber) group the sale date is taken from
    the group's first row (assuming one sale date per cow per lactation).
    Inseminations later than that date get CR9b = NaN, all other rows of
    the group get CR9b = 1. Rows whose SE_Number or LactationNumber is
    missing belong to no group and keep the placeholder value 2.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain SE_Number, LactationNumber, InseminationDate and
        SalesDate1. The CR9b column is added to this frame in place.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in.
    """
    keys = ['SE_Number', 'LactationNumber']

    # Float from the start: the column has to hold NaN, and writing NaN
    # into an integer column relies on a deprecated silent upcast.
    df['CR9b'] = 2.0

    # groupby skips rows with a missing key; they keep the placeholder
    in_group = df[keys].notna().all(axis=1).to_numpy()
    if not in_group.any():
        return df

    # Work on a positionally indexed copy so duplicate index labels in df
    # cannot misalign anything.
    sub = df.loc[in_group, keys + ['InseminationDate', 'SalesDate1']].reset_index(drop=True)
    ins_date = pd.to_datetime(sub['InseminationDate'])
    sale_date = pd.to_datetime(sub['SalesDate1'])

    # Broadcast the sale date of each group's FIRST row to the whole group:
    # blank out all other rows, then take the only remaining value per group.
    is_first_row = ~sub.duplicated(subset=keys)
    group_sale = (
        sale_date.where(is_first_row)
        .groupby([sub[key] for key in keys])
        .transform('first')
    )

    # Comparisons against a missing sale date are False, so cows that were
    # never sold end up with 1 on every insemination.
    is_after_sale = (ins_date > group_sale).to_numpy()
    df.loc[in_group, 'CR9b'] = np.where(is_after_sale, np.nan, 1.0)

    return df

# Step 6: Apply the function to the dataset
df = handle_inseminations_after_sale(df)
print(df)

In [None]:
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6', 'CR7', 'CR8aa', 'CR8bb', 'CR8cc', 'CR9a', 'CR9b']
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

In [None]:
# Update CR0 with CR9a where CR9a has 1s
df["CR0"] = df["CR9a"].where((df["CR9a"] == 1), df["CR0"])

In [None]:
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6', 'CR7', 'CR8aa', 'CR8bb', 'CR8cc', 'CR9a', 'CR9b']
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

In [None]:
df["CR0"] = df["CR9b"].where((df["CR9b"].isna()), df["CR0"])

In [None]:
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6', 'CR7', 'CR8aa', 'CR8bb', 'CR8cc', 'CR9a', 'CR9b']
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

In [None]:
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

# Slaughter data
When a cow is slaughtered (i.e. all other exit codes but sold or moved), the last phenotype is left successful only in the case
of a positive pregnancy check; otherwise, it is set to zero.

In [None]:
df = pd.read_csv("../Data/CowData/fertilityDF.csv", low_memory=False)

In [None]:
unique_values = df['CullingReason1'].unique()
print(unique_values)
unique_values = df['CullingReason2'].unique()
print(unique_values)

In [None]:
# Find last ins within cow and lactation
df = df.sort_values(by=['SE_Number', 'LactationNumber', 'InseminationDate', 'PregnancyCheckDate'])
df["InseminationDate"] = pd.to_datetime(df["InseminationDate"])
df['last_ins'] = (df.groupby(['SE_Number', 'LactationNumber'])['InseminationDate'].transform('max') == df['InseminationDate'])

# Find last pregnancy check date within cow and lactation in case of multiple preg checks for last ins
df["PregnancyCheckDate"] = pd.to_datetime(df["PregnancyCheckDate"])
df['last_preg_check'] = (df.groupby(['SE_Number', 'LactationNumber'])['PregnancyCheckDate'].transform('max') == df['PregnancyCheckDate'])
df["CR10"] = np.nan


def cr10(row):
    """Score a row of a lactation that ended in culling (not a live sale).

    A row qualifies when it has no next_calving, has a CullingReason1, and
    neither culling reason is "Såld till liv". For a qualifying row the
    result is 1 if it is the last insemination, the last pregnancy check
    and the pregnancy status is "Positive"; every other qualifying row
    gets 0. Rows that do not qualify return None.
    """
    sold_alive = "Såld till liv"

    culled = (
        pd.isna(row["next_calving"])
        and pd.notna(row["CullingReason1"])
        and row["CullingReason1"] != sold_alive
        and row["CullingReason2"] != sold_alive
    )
    if not culled:
        return None

    confirmed_pregnant = (
        row["last_ins"]
        and row["last_preg_check"]
        and row["PregnancyStatus"] == "Positive"
    )
    return 1 if confirmed_pregnant else 0


# Apply the function to df
df['CR10'] = df.apply(cr10, axis=1)

In [None]:
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6', 'CR7', 'CR8aa', 'CR8bb', 'CR8cc', 'CR9a', 'CR9b', 'CR10']
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

In [None]:
# Update CR0 with CR10 only where have data
df['CR0'] = np.where(df['CR10'].notna(), df['CR10'], df['CR0'])

In [None]:
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6', 'CR7', 'CR8aa', 'CR8bb', 'CR8cc', 'CR9a', 'CR9b', 'CR10']
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

In [None]:
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

# Open records
Open records due to the data extraction: All data is used to define phenotypes
before removing data that is too new (150-d gap).

OPEN REC: signified by no next_ins, next_calving or CullingDate, i.e. all three are missing (pd.isna)

In this class of open records, the last CR is set to 0 if the lactation length
is > 260 days and days from the data extraction to the last insemination is > 340 days. For the remaining open records, CR is set to 0.7, i.e., average NRR
in heifers. The rationale behind this is that if there are no events during 150
days before the extraction of data, it is very probable that a cow is pregnant
and has not calved yet.

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv("../Data/CowData/fertilityDF.csv", low_memory=False)

In [None]:
# When is the maximum of pregnancy checks happening?
df["PregnancyCheckDate"] = pd.to_datetime(df["PregnancyCheckDate"])
max_value = df['PregnancyCheckDate'].max()
print(max_value)

In [None]:
# Define date for extraction from Samo
df["date_extraction"] = "2024-08-18" # ==========================================================================>>> Change this date to correspond with raw data extraction date from Samo
df["date_extraction"] = pd.to_datetime(df["date_extraction"])

# Set interval for open records, i.e. 150d from data extraction
df["extraction_limit"] = df["date_extraction"] - pd.Timedelta(days=150)
df["extraction_limit"] = pd.to_datetime(df["extraction_limit"])

# Define lactation length, between calving date and dry off date
df["CalvingDate"] = pd.to_datetime(df["CalvingDate"])
df["DryOffDate"] = pd.to_datetime(df["DryOffDate"])
df["lact_length"] = (df["DryOffDate"] - df["CalvingDate"]).dt.days

# Define interval from data extraction to last ins
df["InseminationDate_last"] = pd.to_datetime(df["InseminationDate_last"])
df["extraction_to_last_ins"] = (df["date_extraction"] - df["InseminationDate_last"]).dt.days

In [None]:
df

In [None]:
def cr11(row):
    """Score an open record (no next_ins, next_calving or CullingDate).

    Returns 0 for the last insemination when the lactation is longer than
    260 days and more than 340 days lie between the last insemination and
    the data extraction. Every other open record returns 0.7. Rows that
    are not open records return None.

    Previously the 0.7 branch was attached to the ``last_ins`` test, so a
    last insemination that failed the day thresholds fell through and got
    no value at all.
    """
    is_open = (
        pd.isna(row["next_ins"])
        and pd.isna(row["next_calving"])
        and pd.isna(row["CullingDate"])
    )
    if not is_open:
        return None

    # `== True` is deliberate: a missing (NaN) flag is truthy in a plain
    # `if`, but must not count as "last insemination".
    is_last_ins = row["last_ins"] == True  # noqa: E712
    if is_last_ins and row["lact_length"] > 260 and row["extraction_to_last_ins"] > 340:
        return 0

    return 0.7


# Apply the function to df
df['CR11'] = df.apply(cr11, axis=1)
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)

In [None]:
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6', 'CR7', 'CR8aa', 'CR8bb', 'CR8cc', 'CR9a', 'CR9b', 'CR10', 'CR11']
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

In [None]:
# Update CR0 with CR11 where CR11 has 0s or 0.7s.
# cr11 only ever returns 0, 0.7 or nothing, so the previous test for
# CR11 == 1 could never match and the 0s never reached CR0.
df["CR0"] = df["CR11"].where((df["CR11"] == 0) | (df["CR11"] == 0.7), df["CR0"])

In [None]:
columns_to_count = ['CR0', 'CR1', 'CR4', 'CR5', 'CR6', 'CR7', 'CR8aa', 'CR8bb', 'CR8cc', 'CR9a', 'CR9b', 'CR10', 'CR11']
value_counts = df[columns_to_count].apply(lambda x: x.value_counts(dropna=False))

print(value_counts)

In [None]:
df.to_csv("../Data/CowData/fertilityDF.csv", index=False)