In [1]:
#IMPORT CALVING DATA
import pandas as pd
import numpy as np
df=pd.read_csv("Del_Calving240531.csv", delimiter=';', low_memory=False)

#CHECK CALVING EASE
unique_values = df['CalvingEase'].unique()
print(unique_values)

"""
#CHECK DISTRIBUTION CE
import matplotlib.pyplot as plt
df["CalvingEase"] = df["CalvingEase"].astype(str)
CE = df["CalvingEase"]
plt.hist(CE)
plt.xticks(rotation=45)
plt.show()
"""

#REORDER CE
df["CE"]=df["CalvingEase"].replace(
    {"1 Normal delivery" : "11",
    "2 Difficult delivery" : "13",
    "9 Early calving (215-240 days)" : "9",
    "11 Easy, without assistance" : "11",
    "12 Easy, with assistance" : "12",
    "13 Difficult, without veterinary assistance" : "13",
    "14 Difficult, with veterinary assistance" : "14",
    "15 Not specified" : "15",
    "nan" : "15",
    "485" : "15",
    "486" : "15",
    "487" : "15",
    "Normal" : "11",
    "3 Abnormal position" : "3",
    "8 Kastning (<215 dagar)" : "8",
    "09 Tidig kalvning" : "9",
    "12 Lätt med hjälp" : "12",
    "13 Svår utan veterinärhjälp" : "13",
    "14 Svår med veterinärhjälp" : "14"})

"""
#CHECK DISTRIBUTION
CE = df["CE"]
plt.hist(CE)
plt.xticks(rotation=45)
plt.show()
"""

#KEEP RELEVANT COLUMNS
col_keep = ["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id","CalvingDate","CalvingSireBullID","CE"]
df2 = df[col_keep]

#CHECK FOR DUPLICATES
## Remove duplicate rows based cowid and calving date, sort
print(df2.shape) #9917 calving events, 7 col
df_unique = df2.drop_duplicates(subset=["SE_Number", "CalvingDate"])
print(df_unique.shape) #9710 unique calving events, 7col
sort_df = df_unique.sort_values(by=["SE_Number", "CalvingDate"])

#save df
sort_df.to_csv("dataframe.csv", index=False)

['11 Easy, without assistance' '12 Easy, with assistance'
 '13 Difficult, without veterinary assistance' '15 Not specified'
 '2 Difficult delivery' '1 Normal delivery'
 '9 Early calving (215-240 days)' nan
 '14 Difficult, with veterinary assistance' '485'
 '14 Svår med veterinärhjälp' '486' '13 Svår utan veterinärhjälp'
 '8 Kastning (<215 dagar)' '487' '12 Lätt med hjälp' '09 Tidig kalvning'
 'Normal' '3 Abnormal position']
(9917, 7)
(9710, 7)


In [2]:
#MAKE UPPER LIMIT FOR EACH LACTATION USING CALVING DATA
#To fill non-missing values and shift them up in a DataFrame to a new column called upper_limit
dfm2 = pd.DataFrame(sort_df, columns=["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id","CalvingDate"])

#apply grouped shifting
arr = dfm2[["SE_Number", "CalvingDate"]].values
grouper = np.split(arr[:,1], np.unique(arr[:, 0], return_index=True)[1][1:])
shift = [np.append(np.roll(i,-1)[:-1], np.nan) for i in grouper]
new_col = np.hstack(shift)
dfm2["upper_limit"] = new_col
#dfm2.to_csv("dataframe2.csv", index=False)

#Merge onto master df
dfm3 = pd.merge(sort_df, dfm2, on=["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id","CalvingDate"])
dfm3.to_csv("dataframe.csv", index=False)

In [3]:
#ADD CULLING AND BIRTH DATA
#"DeathDate" and ExitDate not used in database? use CullDecisionDate instead
df6=pd.read_csv("Del_Cow240531.csv", delimiter=';', low_memory=False)
col_keep = ["Del_Cow_Id","FarmName_Pseudo","SE_Number","AnimalNumber","Mother","Father","BreedName","BirthDate","CullDecisionDate"]
df7=df6[col_keep]

#Check for duplicates
print(df7.shape) #24,473 events, 4col
df8 = df7.drop_duplicates(subset=["SE_Number","BreedName","BirthDate","CullDecisionDate"])
print(df8.shape) #24,222 unique events, 4col

#Drop rows where SE_Number is na
print(df8.isna().sum()) #369, 9044, 9044, 17185 for "SE_Number","BreedName","BirthDate","CullDecisionDate"
df9 = df8.dropna(subset=["SE_Number"])
print(df9.shape) #23,886 unique records, 4col
#Drop first row with strange entry
df10 = df9.loc[1:] #1-23,887 ie removing index row 0
#df10.to_csv("dataframe2.csv", index=False)

#Merge onto master df
dfm4 = pd.merge(dfm3, df10, on=["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id"], how="outer")

#If upper_limit missing, use culling data
dfm4["upper_limit"].fillna(dfm4["CullDecisionDate"], inplace=True)
dfm4.to_csv("dataframe.csv", index=False)

(24473, 9)
(24255, 9)
Del_Cow_Id              0
FarmName_Pseudo         1
SE_Number             369
AnimalNumber         8674
Mother               9620
Father               9750
BreedName            9044
BirthDate            9044
CullDecisionDate    17185
dtype: int64
(23886, 9)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfm4["upper_limit"].fillna(dfm4["CullDecisionDate"], inplace=True)


In [4]:
#ADD DRY OFF DATE
#Load data, keep variables, check for duplicates, sort
df11=pd.read_csv("Del_DryOff240531.csv", delimiter=';', low_memory=False)
col_keep = ["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id","DryOffDate"]
df12 = df11[col_keep]
print(df12.shape) #4989 dryoff events, 2col
df13 = df12.drop_duplicates(subset=["SE_Number", "DryOffDate"])
print(df13.shape) #4989 unique dryoff events, 2col
df13 = df13.sort_values(by=["SE_Number", "DryOffDate"])
#df13.to_csv("dataframe2.csv", index=False)

#Merge dry off data onto master df
dfm5 = pd.merge(dfm4, df13, on=["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id"])
dfm5.to_csv("dataframe.csv", index=False)
"""
#Subset chosen cows
SE_Number = ["SE-5c06d92d-2950", "SE-5c06d92d-3010", "SE-5c06d92d-3068"]
dfm6 = dfm5[dfm5["SE_Number"].isin(SE_Number)]
dfm6.to_csv("dataframe3.csv", index=False)
"""
"""
#Subset chosen cows
SE_Number = ["SE-064c0cec-1189", "SE-169e580a-3221", "SE-169e580a-3418", "SE-169e580a-3634", "SE-169e580a-2843"]
dfm6 = dfm5[dfm5["SE_Number"].isin(SE_Number)]
"""

#Sort which dry off data belongs to which lactation
dfm6=dfm5

#Convert columns to datetime
dfm6['DryOffDate'] = pd.to_datetime(dfm6['DryOffDate'])
dfm6['BirthDate'] = pd.to_datetime(dfm6['BirthDate'])
dfm6['CullDecisionDate'] = pd.to_datetime(dfm6['CullDecisionDate'])
dfm6['CalvingDate'] = pd.to_datetime(dfm6['CalvingDate'])
dfm6['upper_limit'] = pd.to_datetime(dfm6['upper_limit'])

#Set obs to NaN where values in upper_limit = culling date, ie doesn't have drying off date
dfm6.loc[dfm6["upper_limit"] == dfm6["CullDecisionDate"], "DryOffDate"] = np.nan
#Also set dryoffdate to missing when both upper_limit and culling date are missing
dfm6.loc[dfm6["upper_limit"].isna() & dfm6["CullDecisionDate"].isna(), "DryOffDate"] = pd.NA
#dfm6.to_csv("dataframe2.csv", index=False)

#Filter df for relevant lactations
dfm7 = dfm6[(dfm6['DryOffDate'] > dfm6['CalvingDate']) & (dfm6['DryOffDate'] < dfm6['upper_limit'])]
#dfm7.to_csv("dataframe2.csv", index=False)

#Also filter df to keep last lactation, ie doesn't have a drying off date yet (might also miss upper_limit)
#Keep only rows where dryoff has NaN values
dfm8 = dfm6[dfm6["DryOffDate"].isna()]
dfm8 = dfm8.drop_duplicates(subset=["SE_Number", "CalvingDate", "upper_limit"])
#dfm8.to_csv("dataframe3.csv", index=False)

#concatinate dfs
frames = [dfm7, dfm8]
dfm9 = pd.concat(frames)
dfm9 = dfm9.sort_values(by=["SE_Number", "CalvingDate", "upper_limit"])
dfm9.to_csv("dataframe.csv", index=False)

from datetime import datetime
#Get today's date for current lactations missing upper_limit
today_date = datetime.today().date()
dfm9.loc[dfm9["upper_limit"].isna() & dfm9["CullDecisionDate"].isna(), "upper_limit"] = today_date
dfm9.to_csv("dataframe.csv", index=False)

(4989, 5)
(4989, 5)


  dfm9.loc[dfm9["upper_limit"].isna() & dfm9["CullDecisionDate"].isna(), "upper_limit"] = today_date


In [5]:
#ADD LACTATION NUMBER
df3=pd.read_csv("Del_Lactation240531.csv", delimiter=';', low_memory=False)
col_keep = ["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id","LactationInfoDate","LactationNumber"]
df4=df3[col_keep]
df4 = df4.sort_values(by=["SE_Number", "LactationInfoDate"])
#df4.to_csv("dataframe2.csv")

"""
#Following cow has an error in date column
#Create a frequency table for the date column
frequency_table = df4["LactationInfoDate"].value_counts()
frequency_table.to_csv("dataframe3.csv") #2022-05,1
filt = df4["LactationInfoDate"] == "2022-05"
print(df4.loc[filt]) #[1row, 6col]

#Subset chosen cow
SE_Number = ["SE-a756bc39-1002"] #813obs
df5 = df4[df4["SE_Number"].isin(SE_Number)]
df5.to_csv("dataframe3.csv", index=False)
"""
#Check for duplicates
print(df4.shape) #2,158,630 events, 6col
df5 = df4.drop_duplicates(subset=["SE_Number","LactationNumber"])
print(df5.shape) #9,728 unique lactations, 6col

#Delete row where this cow has messed up date using boolean indexing
df6 = df5[df5["LactationInfoDate"] != "2022-05"]

#Convert LactationInfoDate to datetime
df6["LactationInfoDate"] = pd.to_datetime(df6["LactationInfoDate"])

#Merge
dfm10 = pd.merge(dfm9, df6, on=["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id"])
dfm10.to_csv("dataframe.csv", index=False)

#Filter df for relevant lactations
dfm11 = dfm10[(dfm10["LactationInfoDate"] >= dfm10["CalvingDate"]) & (dfm10["LactationInfoDate"] <= dfm10["upper_limit"])]

#For some reason today_date included time, convert column to datetime object
dfm11["upper_limit"] = pd.to_datetime(dfm11["upper_limit"])

#Drop column LactationInfoDate cuz reasons
dfm11 = dfm11.drop('LactationInfoDate', axis=1)
dfm11.to_csv("dataframe.csv")

"""
#Subset chosen cows
SE_Number = ["SE-5c06d92d-2950", "SE-5c06d92d-3010", "SE-5c06d92d-3068"]
dfm12 = dfm11[dfm11["SE_Number"].isin(SE_Number)]
"""

(2158630, 6)
(9728, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6["LactationInfoDate"] = pd.to_datetime(df6["LactationInfoDate"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfm11["upper_limit"] = pd.to_datetime(dfm11["upper_limit"])


'\n#Subset chosen cows\nSE_Number = ["SE-5c06d92d-2950", "SE-5c06d92d-3010", "SE-5c06d92d-3068"]\ndfm12 = dfm11[dfm11["SE_Number"].isin(SE_Number)]\n'

In [6]:
#ADD INSEMINATION DATA
#Load data, keep cowid, insdate, check for duplicates, sort
dfins=pd.read_csv("Del_Insemination240531.csv", delimiter=';', low_memory=False)
col_keep = ["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id","InseminationDate","Breeder"]
dfins2 = dfins[col_keep]
print(dfins2.shape) #18,775 insemination events, 6col
dfins2 = dfins2.drop_duplicates(subset=["SE_Number", "InseminationDate"])
print(dfins2.shape) #18,689 unique insemination events, 6col
dfins2 = dfins2.sort_values(by=["SE_Number", "InseminationDate"])

#MERGE WITH CALVING DATA
dfins4 = pd.merge(dfm11, dfins2, on=["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id"])

"""
#Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
dfins4 = dfins3[dfins3["SE_Number"].isin(SE_Number)]
"""
#Filter df for relevant inseminations sorted to correct lactation
dfins5 = dfins4[(dfins4["InseminationDate"] >= dfins4["CalvingDate"]) & (dfins4["InseminationDate"] <= dfins4["upper_limit"])]
#col_keep = ["SE_Number","LactationNumber","BirthDate","CalvingDate","InseminationDate", "DryOffDate", "upper_limit", "CullDecisionDate"]
#dfins6 = dfins5[col_keep]
dfins5.to_csv("dataframe.csv", index=False)

(18775, 6)
(18689, 6)


In [7]:
#PREGNANCY CHECKS
#Make next_ins to sort pregnancy checks
#To fill non-missing values and shift them up in df to a new column called next_ins
dfins6 = pd.DataFrame(dfins5, columns=["SE_Number","LactationNumber","InseminationDate", "upper_limit"])

#apply grouped shifting
arr = dfins6[["SE_Number", "InseminationDate"]].values
grouper = np.split(arr[:,1], np.unique(arr[:, 0], return_index=True)[1][1:])
shift = [np.append(np.roll(i,-1)[:-1], np.nan) for i in grouper]
new_col = np.hstack(shift)
dfins6["next_ins"] = new_col
#dfins6.to_csv("dataframe2.csv", index=False)

#only keep next_ins where falls within range
dfins7 = dfins6[(dfins6["next_ins"] >= dfins6["InseminationDate"]) & (dfins6["next_ins"] <= dfins6["upper_limit"])]
col_keep = ["SE_Number","LactationNumber","InseminationDate","next_ins"]
dfins7 = dfins7[col_keep]
#dfins7.to_csv("dataframe2.csv", index=False)

#Merge onto master df
dfins8=dfins5.join(dfins7.set_index(["SE_Number", "LactationNumber", "InseminationDate"]), on=["SE_Number","LactationNumber","InseminationDate"])
dfins8.to_csv("dataframe.csv", index=False)

"""
#Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
dfins8 = dfins8[dfins8["SE_Number"].isin(SE_Number)]
#dfins8.to_csv("dataframe3.csv", index=False)
"""

#Load pregnancy check data, check for duplicates, sort
preg=pd.read_csv("Del_PregnancyCheck240531.csv", delimiter=';', low_memory=False)
#col_keep = ["FarmName_Pseudo","SE_Number","AnimalNumber","Del_Cow_Id","PregnancyCheckDate","PregnancyCheckResult"]
col_keep = ["SE_Number","PregnancyCheckDate","PregnancyCheckResult"]
preg = preg[col_keep]
print(preg.shape) #13,315 events, 6col
preg = preg.drop_duplicates(subset=["SE_Number", "PregnancyCheckDate"])
print(preg.shape) #13,292 unique events, 6col
preg = preg.sort_values(by=["SE_Number", "PregnancyCheckDate"])

#preg.to_csv("dataframe2.csv", index=False)

#Add to subset df
col_keep = ["SE_Number","LactationNumber","InseminationDate", "next_ins", "upper_limit"]
dfins8 = dfins8[col_keep]
dfins9=dfins8.join(preg.set_index(["SE_Number"]), on=["SE_Number"])
dfins9['InseminationDate'] = pd.to_datetime(dfins9['InseminationDate'])
dfins9['PregnancyCheckDate'] = pd.to_datetime(dfins9['PregnancyCheckDate'])
dfins9['next_ins'] = pd.to_datetime(dfins9['next_ins'])
#dfins9.to_csv("dataframe3.csv", index=False)

#Sort and keep only relevant obs
dfins9['C'] = None
def filter_pregcheck(row):
    if pd.isna(row["next_ins"]):
        if (row["PregnancyCheckDate"] >= row["InseminationDate"]) and (row["PregnancyCheckDate"] <= row["upper_limit"]):
            return "Yes"
        else:
            return "No"
    if pd.notna(row["next_ins"]):
        if (row["PregnancyCheckDate"] >= row["InseminationDate"]) and (row["PregnancyCheckDate"] <= row["next_ins"]):
            return "Yes"
        else:
            return "No"
#Apply filter
dfins9['C'] = dfins9.apply(filter_pregcheck, axis=1)

dfins10 = dfins9[dfins9["C"] == "Yes"]
col_keep = ["SE_Number","LactationNumber","InseminationDate", "PregnancyCheckDate","PregnancyCheckResult"]
dfins11 = dfins10[col_keep]
#dfins11.to_csv("dataframe4.csv", index=False)

#Convert the 'InseminationDate' column from datetime64[ns] to object for merging
print(dfins11.dtypes)
dfins11['InseminationDate'] = dfins11['InseminationDate'].astype(str)

#Add to master df
dfins12=pd.read_csv("dataframe.csv")
print(dfins12.dtypes)
dfins13 = dfins12.merge(dfins11, on=["SE_Number", "LactationNumber", "InseminationDate"], how="left")
dfins13.to_csv("dataframe.csv", index=False)
"""
#Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
dfins13 = dfins13[dfins13["SE_Number"].isin(SE_Number)]

col_keep = ["SE_Number","LactationNumber","CalvingDate","upper_limit","InseminationDate","next_ins","PregnancyCheckDate"]
dfins13 = dfins13[col_keep]
dfins13.to_csv("dataframe5.csv", index=False)
"""

(13315, 3)
(13292, 3)
SE_Number                       object
LactationNumber                float64
InseminationDate        datetime64[ns]
PregnancyCheckDate      datetime64[ns]
PregnancyCheckResult            object
dtype: object
FarmName_Pseudo       object
SE_Number             object
AnimalNumber         float64
Del_Cow_Id             int64
CalvingDate           object
CalvingSireBullID     object
CE                     int64
upper_limit           object
Mother                object
Father                object
BreedName             object
BirthDate             object
CullDecisionDate      object
DryOffDate            object
LactationNumber      float64
InseminationDate      object
Breeder              float64
next_ins              object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfins11['InseminationDate'] = dfins11['InseminationDate'].astype(str)


'\n#Subset chosen cow\nSE_Number = ["SE-064c0cec-1189"]\ndfins13 = dfins13[dfins13["SE_Number"].isin(SE_Number)]\n\ncol_keep = ["SE_Number","LactationNumber","CalvingDate","upper_limit","InseminationDate","next_ins","PregnancyCheckDate"]\ndfins13 = dfins13[col_keep]\ndfins13.to_csv("dataframe5.csv", index=False)\n'

In [8]:
#ALT1: ADD MY DATA
    #Del_CowMilkYield_Common
    #Del_CowMilkYield_Other
    #Del_CowMilkYield_Robot
    #Del_Milk_Other
    #Del_Milk_Robot
#TRANSPOSE INSEMINATION DATA TO FIT MILK YIELD DATA
import pandas as pd
df=pd.read_csv("dataframe.csv")
col_keep = ["SE_Number","LactationNumber","InseminationDate"]
df = df[col_keep]
df_ins = df.drop_duplicates(subset=["SE_Number","InseminationDate"])
#df_ins.to_csv("dataframe2.csv", index=False)

#Add a helper column for the order of insemination dates within each group
df_ins['InseminationOrder'] = df_ins.groupby(['SE_Number', 'LactationNumber']).cumcount() + 1
#Pivot the DataFrame
df_pivot = df_ins.pivot_table(index=['SE_Number','LactationNumber'], columns='InseminationOrder', values='InseminationDate', aggfunc='first')
#Flatten the MultiIndex columns??
df_pivot.columns = [f'InseminationDate_{col}' for col in df_pivot.columns]
#Reset the index
df_pivot = df_pivot.reset_index()
#df_pivot.to_csv("dataframe2.csv", index=False)
#Add to df
df=pd.read_csv("dataframe.csv")
df.drop(columns=["CalvingSireBullID","CE","InseminationDate","Mother","Father","Breeder","next_ins","PregnancyCheckDate",
                 "BirthDate","CullDecisionDate","PregnancyCheckResult"], inplace=True)
df = df.drop_duplicates(subset=["SE_Number","LactationNumber"])
df_ins2 = df.merge(df_pivot, on=["SE_Number","LactationNumber"], how="left")
df_ins2.to_csv("MY.csv", index=False)

#ADD MY
dfmy=pd.read_csv("Del_CowMilkYield_Common240617.csv", delimiter=';', low_memory=False)
print(dfmy) #5,367,467 x 15col
"""
#CHECK DIST
    #LactationNumber 1-10 + NaN
    #DaysInMilk - is used, including Nan
    #SessionNumber - is used, including nan
    #TotalYield - is used, comma!
unique_values = dfmy["TotalYield"].unique()
print(unique_values)
"""
#Keep relevant col
col_keep = ["SE_Number","StartDate","StartTime","LactationNumber","DaysInMilk","SessionNumber","TotalYield"]
dfmy = dfmy[col_keep]

#Change TotalYield comma to dot
dfmy["TotalYield"] = dfmy["TotalYield"].str.replace(',', '.')
#Change to datetime
dfmy["StartDate"] = pd.to_datetime(dfmy["StartDate"])

#Check for dups and sort
print(dfmy.shape) #5,367,467 x 7col
dfmy_unique = dfmy.drop_duplicates(subset=["SE_Number","StartDate","StartTime"])
print(dfmy_unique.shape) #3,529,625 x 7col =>3,529,625obs
dfmy2 = dfmy_unique.sort_values(by=["SE_Number","StartDate","StartTime"])
"""
#Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
dfmy2 = dfmy2[dfmy2["SE_Number"].isin(SE_Number)]
dfmy2.to_csv("dataframe2.csv", index=False)
"""
#Build master df
df=pd.read_csv("MY.csv")
"""
#Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
df = df[df["SE_Number"].isin(SE_Number)]
"""
#Merge
dfmy3 = df.merge(dfmy2, on=["SE_Number"], how="left")
#dfmy3.to_csv("dataframe3.csv", index=False)

#only keep MY data where falls within lactation, ie between CalvingDate and upper_limit
dfmy4 = dfmy3[(dfmy3["StartDate"] >= dfmy3["CalvingDate"]) & (dfmy3["StartDate"] <= dfmy3["upper_limit"])]
dfmy4.loc[:, "Lactation"] = dfmy4["LactationNumber_x"]
dfmy4.drop(columns=["LactationNumber_x","LactationNumber_y","DaysInMilk"], inplace=True)

#Make DIM
dfmy4["StartDate"] = pd.to_datetime(dfmy4["StartDate"])
dfmy4["CalvingDate"] = pd.to_datetime(dfmy4["CalvingDate"])
dfmy4["DaysInMilk"] = (dfmy4["StartDate"] - dfmy4["CalvingDate"]).dt.days + 1
dfmy4.to_csv("MY.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ins['InseminationOrder'] = df_ins.groupby(['SE_Number', 'LactationNumber']).cumcount() + 1


         Unnamed: 0  Del_Cow_Id FarmName_Pseudo         SE_Number  \
0                 1       12585        a624fb9a  SE-a624fb9a-1244   
1                 2        7982        a624fb9a  SE-a624fb9a-1292   
2                 3       10815        a624fb9a  SE-a624fb9a-1297   
3                 4       10815        a624fb9a  SE-a624fb9a-1297   
4                 5        8884        a624fb9a  SE-a624fb9a-1207   
...             ...         ...             ...               ...   
5367462     5367463        7904        ad0a39f5  SE-ad0a39f5-2541   
5367463     5367464        6566        ad0a39f5  SE-ad0a39f5-2628   
5367464     5367465        3573        5c06d92d  SE-5c06d92d-3605   
5367465     5367466        3573        5c06d92d  SE-5c06d92d-3605   
5367466     5367467        3573        5c06d92d  SE-5c06d92d-3605   

         AnimalNumber   StartDate StartTime  LactationNumber  DaysInMilk  \
0                1244  2019-12-27  14:37:00              NaN         NaN   
1                12

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmy4.loc[:, "Lactation"] = dfmy4["LactationNumber_x"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmy4.drop(columns=["LactationNumber_x","LactationNumber_y","DaysInMilk"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmy4["StartDate"] = pd.to_datetime(dfmy4["StartDate"])
A value is trying to be set on a copy of a slice from a DataFr

In [9]:
#ALT 2: ADD MY
    #Del_CowMilkYield_Common
    #Del_CowMilkYield_Other
    #Del_CowMilkYield_Robot
    #Del_Milk_Other
    #Del_Milk_Robot
import pandas as pd
dfmy=pd.read_csv("Del_CowMilkYield_Common240617.csv", delimiter=';', low_memory=False)
print(dfmy) #5,367,467 x 15col

"""
#CHECK DIST
    #LactationNumber 1-10 + NaN
    #DaysInMilk - is used, including Nan
    #SessionNumber - is used, including nan
    #TotalYield - is used, comma!
unique_values = dfmy["TotalYield"].unique()
print(unique_values)
"""

#Keep relevant col
col_keep = ["SE_Number","StartDate","StartTime","LactationNumber","DaysInMilk","SessionNumber","TotalYield"]
dfmy = dfmy[col_keep]

#Change TotalYield comma to dot
dfmy["TotalYield"] = dfmy["TotalYield"].str.replace(',', '.')
#Change to datetime
dfmy["StartDate"] = pd.to_datetime(dfmy["StartDate"])

#Check for dups and sort
print(dfmy.shape) #5,367,467 x 7col
dfmy_unique = dfmy.drop_duplicates(subset=["SE_Number","StartDate","StartTime"])
print(dfmy_unique.shape) #3,529,625 x 7col =>3,529,625obs
dfmy2 = dfmy_unique.sort_values(by=["SE_Number","StartDate","StartTime"])
"""
#Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
dfmy2 = dfmy2[dfmy2["SE_Number"].isin(SE_Number)]
#save df
dfmy2.to_csv("dataframe2.csv", index=False)
"""
#Build master df
df=pd.read_csv("dataframe.csv")
col_keep = ["SE_Number","LactationNumber","CalvingDate","upper_limit"]
df = df[col_keep]
df = df.drop_duplicates(subset=["SE_Number","CalvingDate"])
"""
#Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
df = df[df["SE_Number"].isin(SE_Number)]
df.to_csv("dataframe3.csv", index=False)
"""
#Merge
dfmy3 = df.merge(dfmy2, on=["SE_Number"], how="left")
#dfmy3.to_csv("dataframe3.csv", index=False)

#only keep MY data where falls within lactation, ie between CalvingDate and upper_limit
    #where upper_limit is either next lactation, slaughter date or today's date for open records (ie ongoing lactation)
dfmy4 = dfmy3[(dfmy3["StartDate"] >= dfmy3["CalvingDate"]) & (dfmy3["StartDate"] <= dfmy3["upper_limit"])]
dfmy4.loc[:, "Lactation"] = dfmy4["LactationNumber_x"]
dfmy4.drop(columns=["LactationNumber_x","LactationNumber_y","DaysInMilk","upper_limit"], inplace=True)

#Make DIM
dfmy4["StartDate"] = pd.to_datetime(dfmy4["StartDate"])
dfmy4["CalvingDate"] = pd.to_datetime(dfmy4["CalvingDate"])
dfmy4["DaysInMilk"] = (dfmy4["StartDate"] - dfmy4["CalvingDate"]).dt.days + 1
#dfmy4.to_csv("dataframe3.csv", index=False)

#Make previous_ins
df=pd.read_csv("dataframe.csv")
"""
#Subset chosen cow
SE_Number = ["SE-064c0cec-1189"]
df = df[df["SE_Number"].isin(SE_Number)]
"""
df2 = df.drop_duplicates(subset=["SE_Number","CalvingDate","InseminationDate"])
col_keep = ["SE_Number","LactationNumber","InseminationDate"]
df2 = df2[col_keep]

df2["prev_ins"] = df2.groupby(["SE_Number","LactationNumber"])["InseminationDate"].shift(1)
df2 = df2.drop('LactationNumber', axis=1)
df3 = df.merge(df2, on=["SE_Number","InseminationDate"], how="left")
#df3.to_csv("dataframe4.csv", index=False)

#Merge rest of master df
df3["prev_ins"] = pd.to_datetime(df3["prev_ins"])
df3["CalvingDate"] = pd.to_datetime(df3["CalvingDate"])

"""
#Cuz reasons - sätt som kommentar när färdig:
col_keep = ["SE_Number","LactationNumber","CalvingDate","upper_limit","InseminationDate","DryOffDate","next_ins","prev_ins"]
df3 = df3[col_keep]
"""

dfmy5 = pd.merge(df3,dfmy4, how="left", on=["SE_Number","CalvingDate"])
dfmy5["StartDate"] = pd.to_datetime(dfmy5["StartDate"])
dfmy5["upper_limit"] = pd.to_datetime(dfmy5["upper_limit"])
dfmy5["next_ins"] = pd.to_datetime(dfmy5["next_ins"])
dfmy5["DryOffDate"] = pd.to_datetime(dfmy5["DryOffDate"])
dfmy5["InseminationDate"] = pd.to_datetime(dfmy5["InseminationDate"])
#dfmy5.to_csv("dataframe4.csv", index=False)

#Make VoluntaryWaitingPeriod (ie the interval from calving to first insemination)
    #For some lactations not really VWP due to don't have all ins data from herds, rather interval from calving to first recorded ins in database
first_observations = dfmy5.sort_values("InseminationDate").groupby(["SE_Number","LactationNumber"]).first().reset_index()
#first_observations.to_csv("dataframe2.csv", index=False)

from datetime import timedelta
first_observations["VoluntaryWaitingDate"] = first_observations["InseminationDate"] - timedelta(days=1)
col_keep = ["SE_Number","CalvingDate","VoluntaryWaitingDate"]
first_observations = first_observations[col_keep]

dfmy5 = dfmy5.merge(first_observations, on=["SE_Number","CalvingDate"], how="left")
#dfmy5.to_csv("dataframe5.csv", index=False)
print(dfmy5.dtypes)

def milk_data(row):
    #keep MY data between calving and voluntary waiting period (VWP)
    if pd.isna(row["prev_ins"]) and (row["StartDate"] >= row["CalvingDate"]) and (row["StartDate"] <= row["VoluntaryWaitingDate"]):
        return True
    #keep MY data between VWP and insdates
    if pd.notna(row["next_ins"]) and (row["StartDate"] >= row["InseminationDate"]) and (row["StartDate"] < row["next_ins"]):
        return True
    #keep MY data between insdate and upper_limit for lactation (next lact, slaughter date or today's date for open records)
    if pd.isna(row["next_ins"]) and (row["StartDate"] >= row["InseminationDate"]) and (row["StartDate"] <= row["upper_limit"]):
        return True
    return False
#Apply the function and filter
mask = dfmy5.apply(milk_data, axis=1)
df_filt = dfmy5[mask]

#Remove obs if last record is equal to next lactation, ie after dryoffdate for lactation
    #e.g. SE-064c0cec-1189, lact 6 is double recorded for lact 7 with milking date 2021-06-24, 401DIM
def dry_off(row):
    if pd.notna(row["DryOffDate"]) and (row["StartDate"] <= row["DryOffDate"]):
        return True
    if pd.isna(row["DryOffDate"]):
        return True
    return False
#Apply the function and filter
mask = df_filt.apply(dry_off, axis=1)
df_filt2 = df_filt[mask]

"""
col_keep = ["SE_Number","Lactation","CalvingDate","VoluntaryWaitingDate","InseminationDate","PregnancyCheckDate","StartDate","StartTime","next_ins","DryOffDate","upper_limit"]
df_filt2 = df_filt2[col_keep]
df_filt2.to_csv("dataframe4.csv", index=False)
"""
#Handle pregnancy checks
#Make next_pregcheck, prev_pregcheck and sort data to relevant insemination and MY data
df2 = df_filt2.drop_duplicates(subset=["SE_Number","CalvingDate","InseminationDate","PregnancyCheckDate"])
col_keep = ["SE_Number","Lactation","InseminationDate","PregnancyCheckDate"]
df2 = df2[col_keep]
#df2.to_csv("dataframe2.csv", index=False)

df2["next_pregcheck"] = df2.groupby(["SE_Number","Lactation","InseminationDate"])["PregnancyCheckDate"].shift(-1)
df2["prev_pregcheck"] = df2.groupby(["SE_Number","Lactation","InseminationDate"])["PregnancyCheckDate"].shift(1)

df3 = df_filt2.merge(df2, on=["SE_Number","Lactation","InseminationDate","PregnancyCheckDate"], how="left")
df3["next_pregcheck"] = pd.to_datetime(df3["next_pregcheck"])
df3["prev_pregcheck"] = pd.to_datetime(df3["prev_pregcheck"])
df3["next_ins"] = pd.to_datetime(df3["next_ins"])
df3["StartDate"] = pd.to_datetime(df3["StartDate"])
df3["PregnancyCheckDate"] = pd.to_datetime(df3["PregnancyCheckDate"])
#df3.to_csv("dataframe2.csv", index=False)

def preg_checks(row):
    if pd.notna(row["PregnancyCheckDate"]) and pd.notna(row["next_pregcheck"]) and (row["StartDate"] >= row["next_pregcheck"]):
        return False
    if pd.notna(row["PregnancyCheckDate"]) and pd.notna(row["prev_pregcheck"]) and (row["StartDate"] < row["PregnancyCheckDate"]):
        return False
    return True

#Apply function, filter, save
mask = df3.apply(preg_checks, axis=1)
df3_filtered = df3[mask]
df3_filtered.to_csv("dataframe.csv", index=False)

         Unnamed: 0  Del_Cow_Id FarmName_Pseudo         SE_Number  \
0                 1       12585        a624fb9a  SE-a624fb9a-1244   
1                 2        7982        a624fb9a  SE-a624fb9a-1292   
2                 3       10815        a624fb9a  SE-a624fb9a-1297   
3                 4       10815        a624fb9a  SE-a624fb9a-1297   
4                 5        8884        a624fb9a  SE-a624fb9a-1207   
...             ...         ...             ...               ...   
5367462     5367463        7904        ad0a39f5  SE-ad0a39f5-2541   
5367463     5367464        6566        ad0a39f5  SE-ad0a39f5-2628   
5367464     5367465        3573        5c06d92d  SE-5c06d92d-3605   
5367465     5367466        3573        5c06d92d  SE-5c06d92d-3605   
5367466     5367467        3573        5c06d92d  SE-5c06d92d-3605   

         AnimalNumber   StartDate StartTime  LactationNumber  DaysInMilk  \
0                1244  2019-12-27  14:37:00              NaN         NaN   
1                12

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmy4.loc[:, "Lactation"] = dfmy4["LactationNumber_x"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmy4.drop(columns=["LactationNumber_x","LactationNumber_y","DaysInMilk","upper_limit"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmy4["StartDate"] = pd.to_datetime(dfmy4["StartDate"])
A value is trying to be set on a copy of a slice

FarmName_Pseudo                 object
SE_Number                       object
AnimalNumber                   float64
Del_Cow_Id                       int64
CalvingDate             datetime64[ns]
CalvingSireBullID               object
CE                               int64
upper_limit             datetime64[ns]
Mother                          object
Father                          object
BreedName                       object
BirthDate                       object
CullDecisionDate                object
DryOffDate              datetime64[ns]
LactationNumber                float64
InseminationDate        datetime64[ns]
Breeder                        float64
next_ins                datetime64[ns]
PregnancyCheckDate              object
PregnancyCheckResult            object
prev_ins                datetime64[ns]
StartDate               datetime64[ns]
StartTime                       object
SessionNumber                  float64
TotalYield                      object
Lactation                

In [10]:
#FERTILITY TRAITS
    #NOTE: ONLY DEFINED TRAITS, TBC FOR FILTERING!!!
import pandas as pd

dfins6=pd.read_csv("dataframe.csv", low_memory=False)
col_keep = ["SE_Number","LactationNumber","CalvingDate","InseminationDate"]
dfins6 = dfins6[col_keep]
dfins6 = dfins6.drop_duplicates(subset=["SE_Number","LactationNumber","CalvingDate","InseminationDate"])

#NINS - Number of inseminations
#Group by cowid and lactation and count the number of inseminations
ins_count = dfins6.groupby(["SE_Number", 'LactationNumber']).size().reset_index(name='NINS')

#CFI - Interval from calving to first ins
#CLI - Interval from calving to last ins
#FLI - Interval from first to last ins
#Group by cow and lactation, and get the first and last inseminations
first_observations = dfins6.groupby(["SE_Number","LactationNumber"]).first().reset_index()
last_observations = dfins6.groupby(["SE_Number","LactationNumber"]).last().reset_index()

#Rename the columns of the last observations df to distinguish them
last_observations.columns = ["SE_Number","LactationNumber", "CalvingDate_last", "InseminationDate_last"]

#Concatenate first and last observations df side by side
dfins7 = pd.concat([first_observations, last_observations.iloc[:, 2:]], axis=1)

#Convert columns to datetime objects
dfins7["InseminationDate"] = pd.to_datetime(dfins7["InseminationDate"])
dfins7["InseminationDate_last"] = pd.to_datetime(dfins7["InseminationDate_last"])
dfins7["CalvingDate"] = pd.to_datetime(dfins7["CalvingDate"])
dfins7["CalvingDate_last"] = pd.to_datetime(dfins7["CalvingDate_last"])

#Calculate fertility traits
dfins7["CFI"] = (dfins7["InseminationDate"] - dfins7["CalvingDate"]).dt.days
dfins7["CLI"] = (dfins7["InseminationDate_last"] - dfins7["CalvingDate_last"]).dt.days
dfins7["FLI"] = (dfins7["InseminationDate_last"] - dfins7["InseminationDate"]).dt.days
col_keep = ["SE_Number","LactationNumber","CFI","CLI","FLI"]
dfins7 = dfins7[col_keep]

#Concatenate dfs side by side
dfins8 = pd.concat([ins_count, dfins7.iloc[:, 2:]], axis=1)
print(dfins8.shape) #3636, 6col


#GL - Gestation length, interval from last ins (ie effective ins) to calving
#dfins6.to_csv("dataframe2.csv", index=False)

#Make next_calving
dfins7 = dfins6.drop_duplicates(subset=["SE_Number","LactationNumber","CalvingDate"])
dfins7.drop(columns=["InseminationDate"], inplace=True)
dfins7["next_calving"] = dfins7.groupby(["SE_Number"])["CalvingDate"].shift(-1)
#Add to df
dfins9 = dfins6.merge(dfins7, on=["SE_Number","LactationNumber","CalvingDate"], how="left")

last_observations.rename(columns={"InseminationDate":"InseminationDate_last"}, inplace=True)
#last_observations.to_csv("dataframe6.csv", index=False)
dfins9 = dfins9.merge(last_observations, on=["SE_Number","LactationNumber"], how="left")
dfins9.drop(columns=["CalvingDate","InseminationDate","CalvingDate_last"], inplace=True)

#convert to dateformat
dfins9["InseminationDate_last"] = pd.to_datetime(dfins9["InseminationDate_last"])
dfins9["next_calving"] = pd.to_datetime(dfins9["next_calving"])
#Make GL
dfins9["GL"] = (dfins9["next_calving"] - dfins9["InseminationDate_last"]).dt.days
#Drop dups, drop unnecessary col, add to df
dfins10 = dfins9.drop_duplicates(subset=["SE_Number","LactationNumber"])
dfins10.drop(columns=["next_calving","InseminationDate_last"], inplace=True)
dfins10 = dfins8.merge(dfins10, on=["SE_Number","LactationNumber"], how="left")


#CI - Calving interval
dfins7["CalvingDate"] = pd.to_datetime(dfins7["CalvingDate"])
dfins7["next_calving"] = pd.to_datetime(dfins7["next_calving"])
dfins7["CI"] = (dfins7["next_calving"] - dfins7["CalvingDate"]).dt.days
dfins7.drop(columns=["CalvingDate","next_calving"], inplace=True)
#Add to other fertility traits
dfins10 = dfins10.merge(dfins7, on=["SE_Number", "LactationNumber"], how="left")

#ADD ALL TO MASTER DF
df=pd.read_csv("dataframe.csv")
df = df.merge(dfins10, on=["SE_Number", "LactationNumber"], how="left")
df.to_csv("dataframe.csv", index=False)




"""
#CR - conception rate!!!
"""

(3636, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfins7.drop(columns=["InseminationDate"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfins7["next_calving"] = dfins7.groupby(["SE_Number"])["CalvingDate"].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfins10.drop(columns=["next_calving","InseminationDate_last"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

'\n#CR - conception rate!!!\n'