In [None]:
#IMPORT CALVING DATA
import pandas as pd
import numpy as np
# Load the semicolon-separated calving export in a single pass
# (low_memory=False makes pandas infer each column's dtype from the whole file).
df = pd.read_csv("Del_Calving240531.csv", sep=";", low_memory=False)

#CHECK CALVING EASE
# Print every distinct raw label so the recode table further down can be
# compared against what is actually present in the file.
unique_values = pd.unique(df["CalvingEase"])
print(unique_values)

"""
#CHECK DISTRIBUTION CE
import matplotlib.pyplot as plt
df["CalvingEase"] = df["CalvingEase"].astype(str)
CE = df["CalvingEase"]
plt.hist(CE)
plt.xticks(rotation=45)
plt.show()
"""

#RECODE CE
# Collapse the raw CalvingEase labels (English and Swedish wording plus a few
# bare numeric values) onto one set of string codes in a new "CE" column.
# Labels that have no entry in the table are carried over unchanged.
ce_codes = {
    "1 Normal delivery" : "11",
    "2 Difficult delivery" : "13",
    "9 Early calving (215-240 days)" : "9",
    "11 Easy, without assistance" : "11",
    "12 Easy, with assistance" : "12",
    "13 Difficult, without veterinary assistance" : "13",
    "14 Difficult, with veterinary assistance" : "14",
    "15 Not specified" : "15",
    # Missing and unrecognised numeric entries share the code of "15 Not specified".
    "nan" : "15",
    "485" : "15",
    "486" : "15",
    "487" : "15",
    "Normal" : "11",
    "3 Abnormal position" : "3",
    "8 Kastning (<215 dagar)" : "8",
    "09 Tidig kalvning" : "9",
    "12 Lätt med hjälp" : "12",
    "13 Svår utan veterinärhjälp" : "13",
    "14 Svår med veterinärhjälp" : "14",
}

# Cast to str before replacing: missing values come out of read_csv as float
# NaN, and the "nan" key can only match once they are rendered as the text
# "nan". The cast used to live in the disabled histogram snippet, so on a
# fresh kernel missing CalvingEase stayed NaN instead of becoming "15".
df["CE"] = df["CalvingEase"].astype(str).replace(ce_codes)

"""
#CHECK DISTRIBUTION
CE = df["CE"]
plt.hist(CE)
plt.xticks(rotation=45)
plt.show()
"""

#KEEP RELEVANT COLUMNS
col_keep = [
    "FarmName_Pseudo",
    "SE_Number",
    "AnimalNumber",
    "Del_Cow_Id",
    "CalvingDate",
    "CalvingSireBullID",
    "CE",
]
df2 = df.loc[:, col_keep]

#CHECK FOR DUPLICATES
# A calving event is identified by cow (SE_Number) plus CalvingDate; the first
# row of each pair is kept and the result is ordered by that same key.
calving_key = ["SE_Number", "CalvingDate"]
print(df2.shape)  # author's run: 9917 calving events, 7 columns
df_unique = df2.drop_duplicates(subset=calving_key)
print(df_unique.shape)  # author's run: 9710 unique calving events, 7 columns
sort_df = df_unique.sort_values(by=calving_key)

# Persist the de-duplicated, sorted calving table.
sort_df.to_csv("dataframe.csv", index=False)

In [None]:
#MAKE UPPER LIMIT FOR EACH LACTATION USING CALVING DATA
# upper_limit of a calving row = the same cow's next CalvingDate.
# A cow's last recorded calving has no successor and gets NaN.
key_cols = ["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id","CalvingDate"]
dfm2 = sort_df[key_cols].copy()

# Grouped shift: within each SE_Number, pull the following row's CalvingDate up
# by one position. sort_df is already ordered by SE_Number then CalvingDate, so
# "following row" means "next calving". groupby does not need the groups to be
# contiguous and gives NaN for rows whose SE_Number is missing, where the
# earlier np.unique/np.split approach could not order a column mixing strings
# and NaN.
dfm2["upper_limit"] = dfm2.groupby("SE_Number")["CalvingDate"].shift(-1)
dfm2.to_csv("dataframe2.csv", index=False)

#Merge onto master df
dfm3 = pd.merge(sort_df, dfm2, on=key_cols)
dfm3.to_csv("dataframe.csv", index=False)

In [None]:
#ADD CULLING AND BIRTH DATA
#"DeathDate" and ExitDate not used in database? use CullDecisionDate instead
df6 = pd.read_csv("Del_Cow240531.csv", sep=";", low_memory=False)
col_keep = [
    "Del_Cow_Id",
    "FarmName_Pseudo",
    "SE_Number",
    "AnimalNumber",
    "Mother",
    "Father",
    "BreedName",
    "BirthDate",
    "CullDecisionDate",
]
df7 = df6.loc[:, col_keep]

#Check for duplicates
# Rows count as duplicates when cow id, breed, birth date and cull decision
# date all coincide; the first occurrence is kept.
cow_key = ["SE_Number", "BreedName", "BirthDate", "CullDecisionDate"]
print(df7.shape)  # author's run: 24,473 rows
df8 = df7.drop_duplicates(subset=cow_key)
print(df8.shape)  # author's run: 24,222 rows

#Drop rows where SE_Number is na
# Per-column missing counts; author's run: 369, 9044, 9044, 17185 for
# "SE_Number","BreedName","BirthDate","CullDecisionDate"
print(df8.isna().sum())
df9 = df8[df8["SE_Number"].notna()]
print(df9.shape)  # author's run: 23,886 rows

#Drop first row with strange entry
# Label-based slice: keeps index labels >= 1, i.e. removes the row labelled 0
# if it survived the steps above.
df10 = df9.loc[1:]
df10.to_csv("dataframe2.csv", index=False)

#Merge onto master df
# Outer join: keeps calving rows without a cow record and cow records without
# any calving row (the latter get NaN in the calving columns).
dfm4 = pd.merge(dfm3, df10, on=["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id"], how="outer")

#If upper_limit missing, use culling data
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form operates on a temporary object
# and does not update dfm4 when pandas Copy-on-Write is active.
dfm4["upper_limit"] = dfm4["upper_limit"].fillna(dfm4["CullDecisionDate"])
dfm4.to_csv("dataframe.csv", index=False)

In [None]:
#ADD DRY OFF DATE
#Load data, keep variables, check for duplicates, sort
df11 = pd.read_csv("Del_DryOff240531.csv", sep=";", low_memory=False)
col_keep = ["FarmName_Pseudo", "SE_Number", "AnimalNumber", "Del_Cow_Id", "DryOffDate"]
df12 = df11.loc[:, col_keep]
print(df12.shape)  # author's run: 4989 dry-off events

# One dry-off event per (cow, date); de-duplicate and order by the same key.
dryoff_key = ["SE_Number", "DryOffDate"]
df13 = df12.drop_duplicates(subset=dryoff_key).sort_values(by=dryoff_key)
print(df13.shape)  # author's run: 4989 unique dry-off events
#df13.to_csv("dataframe2.csv", index=False)

#Merge dry off data onto master df
# The join key holds cow identifiers only, so each calving row of a cow is
# paired with every dry-off record of that cow; the pairing is narrowed by date
# in a later step.
# NOTE(review): default inner join -- cows with no dry-off record at all drop
# out of dfm5 here; confirm that is intended.
dfm5 = dfm4.merge(df13, on=["FarmName_Pseudo", "SE_Number", "AnimalNumber", "Del_Cow_Id"])
dfm5.to_csv("dataframe.csv", index=False)
"""
#Subset chosen cows
SE_Number = ["SE-5c06d92d-2950", "SE-5c06d92d-3010", "SE-5c06d92d-3068"]
dfm6 = dfm5[dfm5["SE_Number"].isin(SE_Number)]
dfm6.to_csv("dataframe3.csv", index=False)
"""
"""
#Subset chosen cows
SE_Number = ["SE-064c0cec-1189", "SE-169e580a-3221", "SE-169e580a-3418", "SE-169e580a-3634", "SE-169e580a-2843"]
dfm6 = dfm5[dfm5["SE_Number"].isin(SE_Number)]
"""

#Sort which dry off data belongs to which lactation
# Work on a copy: a plain `dfm6 = dfm5` only aliases the frame, so the dtype
# conversions and DryOffDate overwrites below would silently rewrite dfm5 too.
dfm6 = dfm5.copy()

#Convert columns to datetime
for date_col in ["DryOffDate", "BirthDate", "CullDecisionDate", "CalvingDate", "upper_limit"]:
    dfm6[date_col] = pd.to_datetime(dfm6[date_col])

#Set obs to NaT where values in upper_limit = culling date, ie doesn't have drying off date
dfm6.loc[dfm6["upper_limit"] == dfm6["CullDecisionDate"], "DryOffDate"] = pd.NaT
#Also set dryoffdate to missing when both upper_limit and culling date are missing
dfm6.loc[dfm6["upper_limit"].isna() & dfm6["CullDecisionDate"].isna(), "DryOffDate"] = pd.NaT
#dfm6.to_csv("dataframe2.csv", index=False)

#Filter df for relevant lactations
# Keep a (calving, dry-off) pairing only when the dry-off falls strictly
# between that calving and its upper_limit.
dfm7 = dfm6[(dfm6["DryOffDate"] > dfm6["CalvingDate"]) & (dfm6["DryOffDate"] < dfm6["upper_limit"])]
#dfm7.to_csv("dataframe2.csv", index=False)

#Also filter df to keep last lactation, ie doesn't have a drying off date yet (might also miss upper_limit)
#Keep only rows where dryoff is missing; blanking DryOffDate above leaves one
#identical row per dry-off record, so collapse them to one row per lactation.
dfm8 = dfm6[dfm6["DryOffDate"].isna()]
dfm8 = dfm8.drop_duplicates(subset=["SE_Number", "CalvingDate", "upper_limit"])
#dfm8.to_csv("dataframe3.csv", index=False)

#Concatenate dfs
frames = [dfm7, dfm8]
dfm9 = pd.concat(frames)
dfm9 = dfm9.sort_values(by=["SE_Number", "CalvingDate", "upper_limit"])
dfm9.to_csv("dataframe.csv", index=False)

from datetime import datetime
#Get today's date for current lactations missing upper_limit
# Wrap the date in a pd.Timestamp (midnight today) so the value matches the
# datetime64 upper_limit column. A bare datetime.date is not a compatible
# scalar for that dtype: depending on the pandas version the assignment either
# upcasts the column to object (breaking later date comparisons) or is rejected.
today_date = pd.Timestamp(datetime.today().date())
dfm9.loc[dfm9["upper_limit"].isna() & dfm9["CullDecisionDate"].isna(), "upper_limit"] = today_date
dfm9.to_csv("dataframe.csv", index=False)

In [None]:
#ADD LACTATION NUMBER
df3 = pd.read_csv("Del_Lactation240531.csv", sep=";", low_memory=False)
col_keep = [
    "FarmName_Pseudo",
    "SE_Number",
    "AnimalNumber",
    "Del_Cow_Id",
    "LactationInfoDate",
    "LactationNumber",
]
# Select the needed columns and order the records by cow, then by
# LactationInfoDate (still the raw text read from the file at this point).
df4 = df3.loc[:, col_keep].sort_values(by=["SE_Number", "LactationInfoDate"])
# Written with the default index column, unlike the other exports.
df4.to_csv("dataframe2.csv")

"""
#Following cow has an error in date column
#Create a frequency table for the date column
frequency_table = df4["LactationInfoDate"].value_counts()
frequency_table.to_csv("dataframe3.csv") #2022-05,1
filt = df4["LactationInfoDate"] == "2022-05"
print(df4.loc[filt]) #[1row, 6col]

#Subset chosen cow
SE_Number = ["SE-a756bc39-1002"] #813obs
df5 = df4[df4["SE_Number"].isin(SE_Number)]
df5.to_csv("dataframe3.csv", index=False)
"""
#Check for duplicates
print(df4.shape) #author's run: 2,158,630 events, 6col

#Delete row where this cow has messed up date using boolean indexing
# Done BEFORE de-duplication: drop_duplicates keeps the first row of each
# (SE_Number, LactationNumber) pair, so if the malformed "2022-05" row were the
# one kept, removing it afterwards would delete that whole lactation.
valid_date = df4["LactationInfoDate"] != "2022-05"
df5 = df4[valid_date].drop_duplicates(subset=["SE_Number","LactationNumber"])
print(df5.shape) #author's run (bad row removed after dedup): 9,728 unique lactations, 6col

#Convert LactationInfoDate to datetime
# assign() builds a new frame rather than writing into a filtered slice, which
# avoids SettingWithCopyWarning. Note that the name df6 is reused here: it held
# the raw cow table in an earlier cell.
df6 = df5.assign(LactationInfoDate=pd.to_datetime(df5["LactationInfoDate"]))

#Merge
# The join key holds cow identifiers only, so every lactation record of a cow
# is paired with every calving row of that cow; the date filter below keeps the
# pairing whose LactationInfoDate lies inside the lactation window.
dfm10 = pd.merge(dfm9, df6, on=["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id"])
dfm10.to_csv("dataframe.csv", index=False)

# Make sure upper_limit is datetime64 BEFORE it is compared: the "today" fill
# for open lactations may have left non-Timestamp date objects in the column,
# and comparing those against datetime64 values is unreliable across pandas
# versions. Converting on dfm10 (not on a filtered slice) also avoids
# SettingWithCopyWarning.
dfm10["upper_limit"] = pd.to_datetime(dfm10["upper_limit"])

#Filter df for relevant lactations
# Inclusive on both ends: CalvingDate <= LactationInfoDate <= upper_limit.
in_lactation = (dfm10["LactationInfoDate"] >= dfm10["CalvingDate"]) & (dfm10["LactationInfoDate"] <= dfm10["upper_limit"])

#LactationInfoDate was only needed to pick the matching lactation; drop it
dfm11 = dfm10[in_lactation].drop(columns="LactationInfoDate")
# NOTE(review): written with the default index column, unlike the
# index=False exports elsewhere -- confirm downstream readers expect it.
dfm11.to_csv("dataframe.csv")

"""
#Subset chosen cows
SE_Number = ["SE-5c06d92d-2950", "SE-5c06d92d-3010", "SE-5c06d92d-3068"]
dfm12 = dfm11[dfm11["SE_Number"].isin(SE_Number)]
"""