In [17]:
'''
#Gigacow-tools# - data collection for fast/slow learner.
This scripts used for single cow data collection work.
Data Tables: gigacow_filter.csv, lactation_filter.csv, traffic_raw_filter.csv
'''

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
pd.options.mode.chained_assignment = None
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters
from sklearn.preprocessing import StandardScaler, OneHotEncoder

#access data from local directory
dataDir = Path.cwd().parent.parent/'Data/processed'
gigacow_cols = ['Gigacow_Cow_Id', 'FarmName_Pseudo', 'BreedName', 'BirthDate']
lactation_cols = ['Gigacow_Cow_Id', 'FarmName_Pseudo', 'LactationInfoDate', 'LactationNumber', 'DaysInMilk']
gigacow = pd.read_csv(dataDir/'gigacow_filter.csv', encoding='utf-8', usecols=gigacow_cols)
lactation = pd.read_csv(dataDir/'lactation_filter.csv', encoding='utf-8', usecols=lactation_cols)
traffic = pd.read_csv(dataDir/'traffic_raw_filter.csv', encoding='utf-8', index_col=False)
#check out cows with most milking events
print(traffic.Gigacow_Cow_Id.value_counts().nlargest(10))

5613    12729
4478    11601
3757    11428
6197    11250
2423    11192
4504    10688
3147    10658
5147    10576
5046    10204
478     10028
Name: Gigacow_Cow_Id, dtype: int64


In [2]:
# Select cows with sufficient data points on single lactation periods
# Try to collect cow's data that contain milking events on lactation periods 1
# fetch all milking traffic events for merging
traffic_milking = traffic.TrafficResult.str.contains('kg', regex=False)
all_list = traffic_milking.index[traffic_milking.values == True].tolist()
milking_total = traffic[traffic.index.isin(all_list)]
milking_total.TrafficEventDateTime = pd.to_datetime(milking_total.TrafficEventDateTime)
milking_total['milking_date'] = milking_total.TrafficEventDateTime.dt.date

# convert data type
milking_total.milking_date = pd.to_datetime(milking_total.milking_date)
lactation.LactationInfoDate = pd.to_datetime(lactation.LactationInfoDate)
# merge all milking events with lactation table for filtering
milking_total = milking_total.merge(lactation, how='left', left_on=['FarmName_Pseudo', 'Gigacow_Cow_Id', 'milking_date'], right_on=['FarmName_Pseudo', 'Gigacow_Cow_Id', 'LactationInfoDate'])

def lac_collect(NumLac, milking_total):
    """Generate cow list for multiple lactation periods.

    Args:
        NumLac: The number of lactaion period
        milking_total: A dataframe contains all the milkings events

    Returns:
        A list contain all the cows events with sufficient data points within the lactation periods.
    """
    cow_list = list()
    for num in range(1, NumLac+1):
        # select records that contains lactation period #num
        milking_select1 = milking_total.loc[milking_total['LactationNumber'] == num]
        milking_select1.drop_duplicates(subset=['Gigacow_Cow_Id', 'milking_date', 'LactationNumber', 'DaysInMilk'], inplace=True)
        # drop the anomaly data point
        milking_select1 = milking_select1.loc[milking_select1.DaysInMilk < 400]
        # select sufficient data points on lactation
        selected1 = milking_select1.Gigacow_Cow_Id.value_counts(ascending=True)
        selected1 = selected1.loc[(selected1.values > 150) & (selected1.values < 365)]
        selected_cow_list = selected1.index.to_list()
        if num == 1:
            cow_list = selected_cow_list
        cow_list = list(set(cow_list) & set(selected_cow_list))
        print(len(cow_list), cow_list)
    return cow_list

cow_list = lac_collect(1, milking_total)

105 [2560, 3075, 2569, 4109, 3603, 1555, 5147, 3613, 544, 550, 5160, 2090, 3638, 3657, 74, 1103, 4176, 2135, 4704, 6241, 4195, 622, 5235, 3705, 3707, 645, 2694, 5767, 3718, 2702, 151, 6300, 1181, 3744, 2724, 6316, 3762, 5301, 182, 4792, 708, 4295, 5322, 2251, 1738, 1744, 5846, 4311, 1244, 5852, 2278, 6380, 5873, 5372, 1277, 769, 1796, 2844, 6434, 4899, 3890, 1336, 2374, 6472, 4939, 5965, 855, 5465, 1905, 3446, 3447, 2423, 4988, 4990, 6018, 3979, 2956, 6039, 2460, 926, 3487, 2979, 4525, 943, 5555, 5046, 951, 4537, 961, 1985, 6086, 4038, 2510, 5076, 2517, 985, 478, 5088, 5092, 3563, 2031, 6127, 5616, 2039, 2047]


In [3]:
def countCowAge(birthDate, milkingDate):
    '''
    func: Calculate cows age based on birthDate and milkingDate
    args: 
        birthDate: cow's birth datetime
        milkingDate: milking events datetime
    return: cow age in human years(float)
    '''
    birthDate = pd.to_datetime(birthDate)
    milkingDate = pd.to_datetime(milkingDate)
    days = np.float32(np.datetime64(milkingDate, 'D') - np.datetime64(birthDate, 'D'))
    age = np.around(days/365, 2)
    return age

# select single cow from the traffic table

def data_collector(traffic, gigacow, lactation, cow_id, lacNumList):
    '''
    func: collect features from for a single cow
    args: 
        traffic: traffic data table
        gigacow: gigacow data table
        lactation: lactation data table
        cow_id: gigacow_id of the cow
        lacNumList: a list lactation period number
    return: A dataframe contains all features for a single cow on specfic lactation period
    '''

    traffic_single_cow = traffic.loc[traffic['Gigacow_Cow_Id'] == cow_id]
    traffic_single_cow.sort_values(by='TrafficEventDateTime', inplace=True)
    traffic_single_cow.index = range(len(traffic_single_cow))

    '''
        Extract Milking Event and its most recent traffic event to calculate T2-T1
        T1: Entry time into the Mjolkfalla
        T2: Entry time into the milking robot
        T2-T1: calculate time difference between T2&T1 (i.e., Time spend in Mjolkfalla/holding area)
    '''
    # locate mikling event by searching 'kg' keyword in traffic result
    # the most recent traffic event to milking event should be pre_milking event
    # need to filter out records with gate failure
    track_milking = traffic_single_cow.TrafficResult.str.contains('kg', regex=False)
    milking_index_list = track_milking.index[track_milking.values == True].tolist()
    pre_milking_index_list = [x-1 for x in milking_index_list]
    milking_traffic = traffic_single_cow[traffic_single_cow.index.isin(milking_index_list)]
    pre_milking_traffic = traffic_single_cow[traffic_single_cow.index.isin(pre_milking_index_list)]

    # drop rows that the gate failed to detect cows but have milking result
    # previous area in milking_traffic table should only be Mjolkfalla
    # previous area in pre_milking_traffic table should not be Mjolkfalla
    failed_list_1_milk = milking_traffic.index[milking_traffic['PreviousArea'] == 'Koridor till Sorteringsgrind 2'].tolist()
    failed_list_1_pre = [x-1 for x in failed_list_1_milk]
    failed_list_2_pre = pre_milking_traffic.index[pre_milking_traffic['PreviousArea'] == 'Mjolkfalla'].tolist()
    failed_list_2_milk = [x+1 for x in failed_list_2_pre]
    # traffic result in pre_milking_traffic table should contain Mjolkfalla
    track_pre_milking = pre_milking_traffic.TrafficResult.str.contains('Mjolkfalla', regex=False)
    failed_list_3_pre = track_pre_milking.index[track_pre_milking.values == False].tolist()
    failed_list_3_milk = [x+1 for x in failed_list_3_pre]

    # remove failed records based on index list
    milking_traffic_failed = failed_list_1_milk + failed_list_2_milk + failed_list_3_milk
    pre_milking_traffic_failed = failed_list_1_pre + failed_list_2_pre + failed_list_3_pre
    milking_traffic.drop(axis=0, index=milking_traffic_failed, inplace=True)
    pre_milking_traffic.drop(axis=0, index=pre_milking_traffic_failed, inplace=True)
    # concatenate two tables to track the traffic directly
    all_milking_traffic = pd.concat([milking_traffic, pre_milking_traffic])
    all_milking_traffic.sort_values(by=['TrafficEventDateTime'], inplace=True)
    #rename table columns for merging
    milking_traffic.rename(columns={"TrafficEventDateTime": "MilkingEventDateTime", "TrafficResult": "MilkProduction", "TimeInArea_totalSeconds": "RoundedSecondsTimeInArea"}, inplace=True)
    pre_milking_traffic.rename(columns={"TrafficEventDateTime": "Pre_MilkingEventDateTime", "TimeInArea_totalSeconds": "RoundedSecondsTimeInArea"}, inplace=True)
    # unify the index of two tables
    milking_traffic.index = range(len(milking_traffic))
    pre_milking_traffic.index = range(len(pre_milking_traffic))
    # inert "pre_traffic_milking" to milking traffic table
    milking_traffic.insert(5, 'Pre_MilkingEventDateTime', pre_milking_traffic['Pre_MilkingEventDateTime'])
    # calculate T2-T1
    milking_traffic.MilkingEventDateTime = pd.to_datetime(milking_traffic.MilkingEventDateTime)
    milking_traffic.Pre_MilkingEventDateTime = pd.to_datetime(milking_traffic.Pre_MilkingEventDateTime)
    milking_traffic['timeDelta_Seconds'] = (milking_traffic['MilkingEventDateTime'] - milking_traffic['Pre_MilkingEventDateTime']).dt.total_seconds()

    # extract traffic result(milk production)
    milking_traffic['MilkProduction'].replace(r"[^0-9.,]+"," ", inplace=True, regex=True)
    milking_traffic['MilkProduction'].replace(r"\s*","", inplace=True, regex=True)
    milking_traffic['MilkProduction'].replace(r"[,]+",".", inplace=True, regex=True)
    milking_traffic['MilkProduction'] = milking_traffic['MilkProduction'].astype('float64')

    # merge all the other features into milking_traffic table
    milking_traffic['MilkingDate'] = milking_traffic.MilkingEventDateTime.dt.date
    milking_traffic.MilkingDate = pd.to_datetime(milking_traffic.MilkingDate)
    lactation.LactationInfoDate = pd.to_datetime(lactation.LactationInfoDate)
    single_cow_merge = milking_traffic.merge(lactation, how='left', left_on=['FarmName_Pseudo', 'Gigacow_Cow_Id', 'MilkingDate'], right_on=['FarmName_Pseudo', 'Gigacow_Cow_Id', 'LactationInfoDate'])
    single_cow_merge = single_cow_merge.merge(gigacow, how='left', left_on=['FarmName_Pseudo', 'Gigacow_Cow_Id'], right_on=['FarmName_Pseudo', 'Gigacow_Cow_Id'])

    # drop failed data points based on RoundedSecondsTimeInArea & timeDelta_Seconds
    single_cow_merge.drop(single_cow_merge.loc[abs(single_cow_merge.timeDelta_Seconds - single_cow_merge.RoundedSecondsTimeInArea) > 300].index, inplace=True)
    single_cow_merge['TrafficDeviceName'].replace(r"[A-Za-z]+\s*","vms", inplace=True, regex=True)
    # calculate age of cows
    single_cow_merge['Age'] = single_cow_merge.apply(lambda x: countCowAge(x['BirthDate'], x['MilkingEventDateTime']), axis=1)
    single_cow_merge.drop(['BirthDate'], axis=1, inplace=True)
    single_cow_merge.dropna(inplace=True)

    # integrate multiple milking events for a single DIM
    single_cow_merge = single_cow_merge[single_cow_merge.LactationNumber.isin(lacNumList)]
    single_cow_merge.index = range(1,len(single_cow_merge)+1) 
    single_cow_merge.drop(['MilkingEventDateTime', 'Pre_MilkingEventDateTime', 'Traffic_Id', 'MilkingInterval_totalSeconds', 'RoundedSecondsTimeInArea', 'PreviousArea', 'GroupName', 'LactationInfoDate', 'TrafficDeviceName'], axis=1, inplace=True)

    # uncomment following part to get combined milking events for each DIM
    # comb_cows = single_cow_merge.groupby(by=['FarmName_Pseudo', 'Gigacow_Cow_Id', 'MilkingDate', 'LactationNumber', 'DaysInMilk', 'BreedName', 'Age'], sort=False, as_index=False).sum(['MilkProduction', 'timeDelta_Seconds'])
    # single_cow_merge_size = single_cow_merge.groupby(by=['FarmName_Pseudo', 'Gigacow_Cow_Id', 'MilkingDate', 'LactationNumber', 'DaysInMilk', 'BreedName', 'Age'], sort=False, as_index=False).size()
    # comb_cows = pd.concat([comb_cows, single_cow_merge_size['size']], axis=1, ignore_index=False)
    # comb_cows.rename(columns={"MilkProduction": "Total_MilkProduction", "timeDelta_Seconds": "Total_timeDelta_Seconds", "size": "milking_times"}, inplace=True)
    # comb_cows.index = range(1, len(comb_cows)+1)
    # return comb_cows

    single_cow_merge.rename(columns={"MilkProduction": "Total_MilkProduction", "timeDelta_Seconds": "Total_timeDelta_Seconds", "size": "milking_times"}, inplace=True)
    single_cow_merge.index = range(1, len(single_cow_merge)+1)
    return single_cow_merge


In [4]:
"""
labeling cow with problematic/normal(1/0)
"""
threshold_ratio = 0.05
Path(dataDir/'Problematic_targetCows').mkdir(parents=True, exist_ok=True)
def labeling_problematic(threshold_ratio, cow_total): 
    '''
    func: labeling problematic cows
    args: 
        threshold_ratio: threshold ratio for the abnormal event milking events
        cow_total: A dataframe contains all data points for a single cow
    return: problematic cows dataset with label
    '''
    global learner
    total_events = len(cow_total)
    abnoramal_cows = cow_total.loc[cow_total.Total_timeDelta_Seconds > 7200]
    abnoramal_ratio = len(abnoramal_cows)/total_events
    print(abnoramal_ratio)
    if abnoramal_ratio > threshold_ratio:
        problematic = 1 # problematic cow
    else:
        problematic = 0 # normal cow
    cow_total['problematic'] = problematic
    return cow_total

In [5]:
# filter out cows' record start at the middle of the lactation
filter_list = []
for id in cow_list:
    single_cow = data_collector(traffic, gigacow, lactation, id, [1])
    if single_cow.DaysInMilk.min() < 60:
        filter_list.append(id)

print("filtered: ", len(filter_list), filter_list)

filtered:  100 [2560, 3075, 2569, 4109, 3603, 1555, 5147, 3613, 544, 550, 5160, 2090, 3638, 3657, 74, 1103, 4176, 2135, 4704, 6241, 4195, 5235, 3705, 3707, 645, 2694, 5767, 3718, 2702, 151, 6300, 1181, 3744, 2724, 6316, 5301, 4792, 708, 4295, 5322, 2251, 1738, 1744, 5846, 4311, 1244, 5852, 2278, 6380, 5873, 5372, 1277, 769, 1796, 2844, 6434, 4899, 3890, 1336, 2374, 6472, 4939, 5965, 855, 5465, 1905, 3446, 3447, 2423, 4988, 4990, 6018, 3979, 2956, 6039, 2460, 926, 2979, 4525, 943, 5555, 5046, 951, 961, 1985, 6086, 4038, 2510, 5076, 2517, 985, 478, 5088, 5092, 3563, 2031, 6127, 5616, 2039, 2047]


In [None]:
from matplotlib.backends.backend_pdf import PdfPages
import warnings
pd.set_option('mode.chained_assignment', None)

""" plot the relations between timeDelta and Lactation/DIM(DaysInMilk)
        @@@ Total_timeDelta @@@
    """ 
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    with PdfPages(dataDir/'100cows_timecost_scatters_lac1_with_label_scale30000.pdf') as pdf:
        for id in filter_list:
            print("cow_id:", id)
            single_cow_merge = data_collector(traffic, gigacow, lactation, id, [1])
            single_cow_merge = labeling_problematic(0.05, single_cow_merge)
            prob = single_cow_merge.problematic.unique()[0]
            fig1 = plt.figure()
            # fig2 = plt.figure()
            if prob == 1:
                title = "Problematic_Cow_cow_id_"+ str(id)
            else:
                title = "Normal_Cow_cow_id_"+ str(id)
            fig1 = single_cow_merge.loc[single_cow_merge.LactationNumber == 1].plot(x="DaysInMilk", y="Total_timeDelta_Seconds", kind='scatter', title=title+"_Lac1", xlim=[1, 360], ylim=[0, 30000], s=2, c='b')
            # fig2 = single_cow_merge.loc[single_cow_merge.LactationNumber == 2].plot(x="DaysInMilk", y="Total_timeDelta_Seconds", kind='scatter', title=title+"_Lac2", xlim=[1, 360], ylim=[0, 10000], s=2, c='b')
            pdf.savefig(fig1.get_figure())
            # pdf.savefig(fig2.get_figure())
            plt.close()

In [8]:
pd.options.mode.chained_assignment = None

mean_totalTimeCost = 0
Path(dataDir/'Problematic_targetCows').mkdir(parents=True, exist_ok=True)
lactationNum = [1]

# save a list of cow data for abnormal cows detection problem
for i, cow_id in enumerate(filter_list):
    single_cow_merge = data_collector(traffic, gigacow, lactation, cow_id, lactationNum)
    mean_totalTimeCost += single_cow_merge.Total_timeDelta_Seconds.mean()
    single_cow_merge = labeling_problematic(threshold_ratio, single_cow_merge)
    problematic = single_cow_merge.problematic.unique()[0]
    if problematic == 1:
        print("This cow is problematic")
    single_cow_merge["id"] = i+1
    single_cow_merge.dropna(inplace=True)
    fileName = 'Problematic_targetCows/cow_' + str(i) + '.csv'
    single_cow_merge.to_csv(dataDir/fileName)
print("num of cows: ", len(cow_list))
print("Mean of total time cost: ", mean_totalTimeCost/len(cow_list))

0.004132231404958678
0.06535947712418301
This cow is problematic
0.0
0.2611683848797251
This cow is problematic
0.032362459546925564
0.0017452006980802793
0.0
0.0
0.22893772893772893
This cow is problematic
0.0899581589958159
This cow is problematic
0.004722550177095631
0.005474452554744526
0.0027397260273972603
0.09400544959128065
This cow is problematic
0.0026490066225165563
0.0020408163265306124
0.032577903682719546
0.06801736613603473
This cow is problematic
0.03292181069958848
0.014265335235378032
0.011217948717948718
0.017857142857142856
0.004645760743321719
0.007556675062972292
0.11026615969581749
This cow is problematic
0.125
This cow is problematic
0.016320474777448073
0.0021668472372697724
0.0
0.004464285714285714
0.07692307692307693
This cow is problematic
0.02621359223300971
0.12152777777777778
This cow is problematic
0.1095890410958904
This cow is problematic
0.12165450121654502
This cow is problematic
0.14165261382799327
This cow is problematic
0.1022964509394572
This cow

In [6]:
""" Data Preparation """

usecols = ['id', 'FarmName_Pseudo', 'Gigacow_Cow_Id', 'Total_MilkProduction', 'Total_timeDelta_Seconds', 'LactationNumber', 'DaysInMilk', 'BreedName', 'Age', 'MilkingDate', 'problematic']
dataDir = Path.cwd().parent.parent/'Data/processed/Problematic_targetCows/'

# integrate all the cows data into one dataset
filelist = list(Path(dataDir).glob('*.csv'))
for i, _ in enumerate(filelist):
    fileName = 'cow_' + str(i) + '.csv'
    single_cow = pd.read_csv(dataDir/fileName, encoding='utf-8', usecols=usecols)
    single_cow.sort_values(by=['MilkingDate'], inplace=True)
    if i == 0:
        cow_total = single_cow
    else:
        cow_total = pd.concat([cow_total, single_cow], axis=0, ignore_index=True)
cow_total.to_csv(dataDir.parent/"Cow_Prob_dataset_L1.csv", index=False)
cow_total

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,Total_MilkProduction,Total_timeDelta_Seconds,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,problematic,id
0,a624fb9a,2560,9.38,3176.0,2022-02-14,1.0,2.0,1,2.15,0,1
1,a624fb9a,2560,8.46,352.0,2022-02-14,1.0,2.0,1,2.15,0,1
2,a624fb9a,2560,6.68,997.0,2022-02-15,1.0,3.0,1,2.15,0,1
3,a624fb9a,2560,7.34,9274.0,2022-02-15,1.0,3.0,1,2.15,0,1
4,a624fb9a,2560,8.15,407.0,2022-02-16,1.0,4.0,1,2.15,0,1
...,...,...,...,...,...,...,...,...,...,...,...
65499,a624fb9a,2047,10.51,25.0,2022-10-05,1.0,284.0,1,2.92,0,100
65500,a624fb9a,2047,7.49,9.0,2022-10-05,1.0,284.0,1,2.92,0,100
65501,a624fb9a,2047,7.92,181.0,2022-10-06,1.0,285.0,1,2.92,0,100
65502,a624fb9a,2047,6.77,90.0,2022-10-06,1.0,285.0,1,2.92,0,100


In [10]:
#df_static_features = cow_total[['FarmName_Pseudo']]
static_cols = ['FarmName_Pseudo', 'TrafficDeviceName', 'LactationNumber', 'BreedName']
timeSeries_cols = ['Age', 'Total_MilkProduction', 'Total_timeDelta_Seconds', 'DaysInMilk']
output_col = ['problematic']

cow_label = cow_total[['id', 'problematic']].copy()
cow_timeseries = cow_total[['id', 'MilkingDate']].copy()
cow_timeseries.index = range(len(cow_timeseries))
# fetch y for feature extraction
y = cow_label.drop_duplicates(subset=['id'])
y = y["problematic"]
y.index = range(1,len(y)+1) 
print(y.value_counts())

0    68
1    32
Name: problematic, dtype: int64


In [11]:
# Prepare datasert for Tsfresh extraction
ts_extracted_dataset = cow_total[['id']].copy()
ts_extracted_dataset.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_dataset.index = range(1, len(ts_extracted_dataset)+1)
ts_processed = pd.DataFrame(cow_total[timeSeries_cols].copy())
ts_processed.index = range(0,len(ts_processed)) 
ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
ts_processed

Unnamed: 0,id,MilkingDate,Age,Total_MilkProduction,Total_timeDelta_Seconds,DaysInMilk
0,1,2022-02-14,2.15,9.38,3176.0,2.0
1,1,2022-02-14,2.15,8.46,352.0,2.0
2,1,2022-02-15,2.15,6.68,997.0,3.0
3,1,2022-02-15,2.15,7.34,9274.0,3.0
4,1,2022-02-16,2.15,8.15,407.0,4.0
...,...,...,...,...,...,...
65499,100,2022-10-05,2.92,10.51,25.0,284.0
65500,100,2022-10-05,2.92,7.49,9.0,284.0
65501,100,2022-10-06,2.92,7.92,181.0,285.0
65502,100,2022-10-06,2.92,6.77,90.0,285.0


In [26]:
settings_1 = MinimalFCParameters() 
#settings_2 = ComprehensiveFCParameters

for i, col in enumerate(timeSeries_cols):
    ts_processed = pd.DataFrame(cow_total[col].copy())
    ts_processed.index = range(0,len(ts_processed)) 
    ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
    print(ts_processed)
    # extract time series features
    extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingDate", default_fc_parameters=settings_1)
    #extracted_features.dropna(axis=1, inplace=True)
    impute(extracted_features)
    
    # select most relevant features based on relevance table
    if i == 0:
        temp = calculate_relevance_table(extracted_features, y)
        relevance_table = temp
    else:
        temp = calculate_relevance_table(extracted_features, y)
        relevance_table = pd.concat([relevance_table, temp], axis=0)
    features_filtered = select_features(extracted_features, y)
    ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)

# Select relevant features from the relevance table
relevance_table = relevance_table[relevance_table.relevant]
relevance_table.sort_values("p_value", ascending=False, inplace=True)
relevant_features_list = list(relevance_table.feature[:])
print(relevance_table)

ts_extracted_dataset_filtered = ts_extracted_dataset[relevant_features_list].copy()
ts_extracted_dataset_filtered = ts_extracted_dataset_filtered.loc[:,~ts_extracted_dataset_filtered.columns.duplicated()]
ts_extracted_dataset = ts_extracted_dataset_filtered
ts_extracted_dataset

        id MilkingDate   Age
0        1  2022-02-14  2.15
1        1  2022-02-14  2.15
2        1  2022-02-15  2.15
3        1  2022-02-15  2.15
4        1  2022-02-16  2.15
...    ...         ...   ...
65499  100  2022-10-05  2.92
65500  100  2022-10-05  2.92
65501  100  2022-10-06  2.92
65502  100  2022-10-06  2.92
65503  100  2022-10-06  2.92

[65504 rows x 3 columns]


Feature Extraction: 100%|██████████| 34/34 [00:05<00:00,  6.23it/s]


        id MilkingDate  Total_MilkProduction
0        1  2022-02-14                  9.38
1        1  2022-02-14                  8.46
2        1  2022-02-15                  6.68
3        1  2022-02-15                  7.34
4        1  2022-02-16                  8.15
...    ...         ...                   ...
65499  100  2022-10-05                 10.51
65500  100  2022-10-05                  7.49
65501  100  2022-10-06                  7.92
65502  100  2022-10-06                  6.77
65503  100  2022-10-06                  7.81

[65504 rows x 3 columns]


Feature Extraction: 100%|██████████| 34/34 [00:05<00:00,  6.53it/s]


        id MilkingDate  Total_timeDelta_Seconds
0        1  2022-02-14                   3176.0
1        1  2022-02-14                    352.0
2        1  2022-02-15                    997.0
3        1  2022-02-15                   9274.0
4        1  2022-02-16                    407.0
...    ...         ...                      ...
65499  100  2022-10-05                     25.0
65500  100  2022-10-05                      9.0
65501  100  2022-10-06                    181.0
65502  100  2022-10-06                     90.0
65503  100  2022-10-06                    635.0

[65504 rows x 3 columns]


Feature Extraction: 100%|██████████| 34/34 [00:04<00:00,  6.90it/s]


        id MilkingDate  DaysInMilk
0        1  2022-02-14         2.0
1        1  2022-02-14         2.0
2        1  2022-02-15         3.0
3        1  2022-02-15         3.0
4        1  2022-02-16         4.0
...    ...         ...         ...
65499  100  2022-10-05       284.0
65500  100  2022-10-05       284.0
65501  100  2022-10-06       285.0
65502  100  2022-10-06       285.0
65503  100  2022-10-06       285.0

[65504 rows x 3 columns]


Feature Extraction: 100%|██████████| 34/34 [00:04<00:00,  7.11it/s]


                                                                                 feature  \
feature                                                                                    
DaysInMilk__minimum                                                  DaysInMilk__minimum   
DaysInMilk__sum_values                                            DaysInMilk__sum_values   
Total_MilkProduction__minimum                              Total_MilkProduction__minimum   
Total_MilkProduction__maximum                              Total_MilkProduction__maximum   
Total_MilkProduction__absolute_maximum            Total_MilkProduction__absolute_maximum   
Age__sum_values                                                          Age__sum_values   
Total_MilkProduction__standard_deviation        Total_MilkProduction__standard_deviation   
Total_MilkProduction__variance                            Total_MilkProduction__variance   
DaysInMilk__length                                                    DaysInMilk

Unnamed: 0,DaysInMilk__minimum,DaysInMilk__sum_values,Total_MilkProduction__minimum,Total_MilkProduction__maximum,Total_MilkProduction__absolute_maximum,Age__sum_values,Total_MilkProduction__standard_deviation,Total_MilkProduction__variance,DaysInMilk__length,Total_timeDelta_Seconds__length,...,Total_MilkProduction__mean,Total_MilkProduction__median,Total_timeDelta_Seconds__sum_values,Total_timeDelta_Seconds__maximum,Total_timeDelta_Seconds__absolute_maximum,Total_timeDelta_Seconds__median,Total_timeDelta_Seconds__mean,Total_timeDelta_Seconds__standard_deviation,Total_timeDelta_Seconds__root_mean_square,Total_timeDelta_Seconds__variance
1,2.0,76755.0,0.54,16.23,16.23,1763.74,2.082052,4.334941,726.0,726.0,...,8.920055,8.900,557837.0,9274.0,9274.0,330.0,768.370523,1146.471180,1380.141090,1.314396e+06
2,14.0,93071.0,1.14,15.38,15.38,1391.84,2.117326,4.483070,612.0,612.0,...,9.213676,9.080,1420084.0,54718.0,54718.0,845.0,2320.398693,4919.058491,5438.877323,2.419714e+07
3,11.0,131892.0,1.09,16.44,16.44,1867.81,1.609935,2.591890,781.0,781.0,...,9.643534,9.440,251817.0,3621.0,3621.0,211.0,322.428937,421.207932,530.449377,1.774161e+05
4,6.0,46592.0,3.01,20.60,20.60,692.09,2.544574,6.474859,291.0,291.0,...,11.029278,10.860,1659808.0,35048.0,35048.0,2841.0,5703.807560,7055.432846,9072.626595,4.977913e+07
5,5.0,95895.0,0.89,20.68,20.68,956.95,2.757360,7.603035,309.0,309.0,...,7.673625,7.260,562325.0,13210.0,13210.0,957.0,1819.822006,2067.507172,2754.330779,4.274586e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,37.0,88335.0,0.73,17.32,17.32,883.48,2.289577,5.242164,296.0,296.0,...,8.022399,7.595,309840.0,9727.0,9727.0,603.5,1046.756757,1234.991862,1618.920816,1.525205e+06
97,3.0,45246.0,0.68,18.30,18.30,1192.13,2.198376,4.832859,540.0,540.0,...,10.076907,9.690,670619.0,12242.0,12242.0,427.0,1241.887037,1816.919779,2200.790970,3.301197e+06
98,1.0,109591.0,0.35,19.27,19.27,1899.73,2.382444,5.676040,790.0,790.0,...,10.056608,9.920,1271788.0,14305.0,14305.0,582.5,1609.858228,2183.442151,2712.759322,4.767420e+06
99,10.0,112748.0,5.98,22.38,22.38,1612.68,2.519919,6.349992,705.0,705.0,...,11.380128,10.880,926222.0,22967.0,22967.0,685.0,1313.790071,1903.702103,2313.033949,3.624082e+06


In [27]:
# Construct extracted training data
ts_extracted_id = cow_total[['id']].copy()
ts_extracted_id.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_id.index = range(1, len(ts_extracted_id)+1)
ts_extracted_dataset = pd.concat([ts_extracted_id, ts_extracted_dataset], axis=1)

ts_extracted_features = ts_extracted_dataset.iloc[:, 1:len(ts_extracted_dataset.columns)].copy()
# normalize numerical features
ts_extracted_cols = ts_extracted_features.columns
scaler_std = StandardScaler()
ts_std = scaler_std.fit_transform(ts_extracted_features)
# transform standard data into dataframe
ts_extracted_processed = pd.DataFrame(ts_std, columns=ts_extracted_cols)
ts_extracted_processed.index = range(1,len(ts_extracted_processed)+1)
# append id col to the dataframe
ts_extracted_processed = pd.concat([ts_extracted_dataset['id'], ts_extracted_processed], axis=1)

# One-Hot encoding categorical feature - BreedName
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
cow_breed = cow_total[['id', 'BreedName']].copy()
cow_breed.drop_duplicates(subset=['id'], inplace=True)
cat = ohe.fit_transform(np.array(cow_breed['BreedName']).reshape(-1, 1))
col_names = ohe.get_feature_names_out(['BreedName'])
cat_breed = pd.DataFrame(cat, columns=col_names)
cat_breed.index = range(1,len(cow_breed)+1)

# append features on extracted dataset
ts_dataset = pd.concat([ts_extracted_processed, cat_breed], axis=1)
ts_dataset = pd.concat([ts_dataset, y], axis=1)
ts_dataset.to_csv(dataDir/"problematic_100cows_7200s_5percent.csv", index=False)
ts_dataset

Unnamed: 0,id,DaysInMilk__minimum,DaysInMilk__sum_values,Total_MilkProduction__minimum,Total_MilkProduction__maximum,Total_MilkProduction__absolute_maximum,Age__sum_values,Total_MilkProduction__standard_deviation,Total_MilkProduction__variance,DaysInMilk__length,...,Total_timeDelta_Seconds__median,Total_timeDelta_Seconds__mean,Total_timeDelta_Seconds__standard_deviation,Total_timeDelta_Seconds__root_mean_square,Total_timeDelta_Seconds__variance,BreedName_1,BreedName_2,BreedName_4,BreedName_99,problematic
1,1,-0.564442,-0.366224,-0.519649,-0.598940,-0.598940,0.179023,-0.744216,-0.719037,0.303051,...,-0.710248,-0.781338,-0.804922,-0.805043,-0.653336,1.0,0.0,0.0,0.0,0
2,2,0.508982,-0.173549,-0.150318,-0.827626,-0.827626,-0.367402,-0.690739,-0.679881,-0.183812,...,-0.175317,0.274075,1.356482,0.986734,1.160595,0.0,1.0,0.0,0.0,1
3,3,0.240626,0.284888,-0.181095,-0.542442,-0.542442,0.331931,-1.459976,-1.179790,0.537940,...,-0.833854,-1.084588,-1.220442,-1.180150,-0.743465,1.0,0.0,0.0,0.0,0
4,4,-0.206634,-0.722418,1.000764,0.576771,0.576771,-1.395532,-0.043003,-0.153377,-1.554714,...,1.897930,2.574865,2.580461,2.590896,3.188498,0.0,0.0,1.0,0.0,1
5,5,-0.296086,-0.140200,-0.304206,0.598295,0.598295,-1.006378,0.279594,0.144842,-1.477841,...,-0.058983,-0.066328,-0.277239,-0.198391,-0.418680,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,96,2.566380,-0.229476,-0.402694,-0.305685,-0.305685,-1.114326,-0.429595,-0.479224,-1.533361,...,-0.426163,-0.592029,-0.754206,-0.699631,-0.636625,0.0,1.0,0.0,0.0,0
97,97,-0.474990,-0.738313,-0.433471,-0.042024,-0.042024,-0.660833,-0.567861,-0.587419,-0.491304,...,-0.609494,-0.459336,-0.420806,-0.442758,-0.495841,1.0,0.0,0.0,0.0,0
98,98,-0.653895,0.021536,-0.636603,0.218946,0.218946,0.378831,-0.288803,-0.364535,0.576377,...,-0.447976,-0.209108,-0.210817,-0.216743,-0.379612,0.0,1.0,0.0,0.0,0
99,99,0.151174,0.058817,2.828952,1.055666,1.055666,-0.042927,-0.080382,-0.186384,0.213365,...,-0.341509,-0.410441,-0.371087,-0.393207,-0.470246,1.0,0.0,0.0,0.0,0


In [34]:
"""Implementation on Gaussian Process Classifier"""

import numpy as np
from pathlib import Path
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, log_loss
pd.options.mode.chained_assignment = None

from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel, ExpSineSquared
grid = dict()
grid['kernel'] = [1*RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)), 
                  1*DotProduct(), 1*Matern(),  1*RationalQuadratic(),
                  1*Matern()+1*WhiteKernel(noise_level=0.5),
                  1*ExpSineSquared(),
                  1.0 * ExpSineSquared(length_scale=1.0, periodicity=3.0, length_scale_bounds=(0.1, 10.0), periodicity_bounds=(1.0, 10.0)),
                  1*Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=1.5)+1*WhiteKernel(noise_level=0.5),
                ]


dataDir = Path.cwd().parent.parent/'Data/processed'
ts_dataset = pd.read_csv(dataDir/"problematic_100cows_7200s_5percent.csv", index_col="id")

In [51]:
X = ts_dataset.iloc[:, 0:len(ts_dataset.columns)-1].copy()
y = pd.DataFrame(ts_dataset.iloc[:, -1])
# split dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=86)
model = GaussianProcessClassifier()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=5)
# exhausive search over different kernels
search = GridSearchCV(estimator=model, param_grid=grid, scoring='accuracy', cv=cv, n_jobs=-1)
# fit model into the dataset
result = search.fit(X, y)

print("Mean cross-validated score of the best_estimator: ", result.best_score_)
print("Best estimator parameters: ", result.best_params_)

means = result.cv_results_['mean_test_score']
params = result.cv_results_['params']
for mean, param in zip(means, params):
    print("Accuracy %.3f with: %r" % (mean, param))

50 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Toolbox\python\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Toolbox\python\lib\site-packages\sklearn\gaussian_process\_gpc.py", line 715, in fit
    self.base_estimator_.fit(X, y)
  File "d:\Toolbox\python\lib\site-packages\sklearn\gaussian_process\_gpc.py", line 224, in fit
    self._constrained_optimization(
  File "d:\Toolbox\python\lib\site-packages\sklearn\gaussian_process\_gpc.py", line 469, in _constrained_optimization
    opt_res = scipy.optimize.minimize(
  F

Mean cross-validated score of the best_estimator:  0.978
Best estimator parameters:  {'kernel': 1**2 * Matern(length_scale=1, nu=1.5) + 1**2 * WhiteKernel(noise_level=0.5)}
Accuracy 0.974 with: {'kernel': 1**2 * RBF(length_scale=1)}
Accuracy 0.972 with: {'kernel': 1**2 * DotProduct(sigma_0=1)}
Accuracy 0.974 with: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5)}
Accuracy 0.974 with: {'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1)}
Accuracy 0.974 with: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5) + 1**2 * WhiteKernel(noise_level=0.5)}
Accuracy nan with: {'kernel': 1**2 * ExpSineSquared(length_scale=1, periodicity=1)}
Accuracy nan with: {'kernel': 1**2 * ExpSineSquared(length_scale=1, periodicity=3)}
Accuracy 0.978 with: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5) + 1**2 * WhiteKernel(noise_level=0.5)}




In [52]:
search.best_params_

{'kernel': 1**2 * Matern(length_scale=1, nu=1.5) + 1**2 * WhiteKernel(noise_level=0.5)}

In [53]:
y_train.value_counts()

problematic
0              53
1              22
dtype: int64

In [54]:
y_test.value_counts()

problematic
0              15
1              10
dtype: int64

In [55]:
# best_kernel = 1*DotProduct()
# best_kernel = 1**2 * RationalQuadratic(alpha=1, length_scale=1)
best_kernel = 1**2 * Matern(length_scale=1, nu=1.5) + 1**2 * WhiteKernel(noise_level=0.5)
# best_kernel = 1**2 * Matern(length_scale=1, nu=1.5)
# best_kernel = 1**2 * RBF(length_scale=1)
model = GaussianProcessClassifier(kernel=best_kernel, random_state=30, n_jobs=-1)
model.fit(X_train, y_train.values.ravel())
print("Mean accuracy on training data: ", model.score(X_train, y_train))
print("Prediction on test data: ", model.predict(X_test))
pred_test = model.predict(X_test)
print("Prediction accuracy on test data: ", accuracy_score(y_test, pred_test))

Mean accuracy on training data:  1.0
Prediction on test data:  [1 1 1 0 1 0 1 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0]
Prediction accuracy on test data:  1.0


In [56]:
prob_list = model.predict_proba(X_test)
prob_list

array([[0.1008989 , 0.8991011 ],
       [0.07287753, 0.92712247],
       [0.31705844, 0.68294156],
       [0.88830167, 0.11169833],
       [0.05632446, 0.94367554],
       [0.96845917, 0.03154083],
       [0.0700798 , 0.9299202 ],
       [0.79918891, 0.20081109],
       [0.0738107 , 0.9261893 ],
       [0.08402116, 0.91597884],
       [0.86272997, 0.13727003],
       [0.88470098, 0.11529902],
       [0.10429109, 0.89570891],
       [0.13388823, 0.86611177],
       [0.86975656, 0.13024344],
       [0.87194862, 0.12805138],
       [0.90707336, 0.09292664],
       [0.82480743, 0.17519257],
       [0.22735236, 0.77264764],
       [0.89944822, 0.10055178],
       [0.93381763, 0.06618237],
       [0.83860234, 0.16139766],
       [0.93713469, 0.06286531],
       [0.93404599, 0.06595401],
       [0.96655879, 0.03344121]])