In [1]:
import os
from pathlib import Path
from matplotlib.style import use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from tsfresh.feature_extraction.settings import from_columns


# Folder holding the per-lactation-period sub-folders of per-cow CSV files,
# resolved relative to the notebook's working directory.
dataDir = Path.cwd().parent.parent.parent/'Data/processed/memory_targetCows/'

# Columns loaded from every per-cow CSV file.
usecols = [
    'id',
    'FarmName_Pseudo',
    'Gigacow_Cow_Id',
    'Total_MilkProduction',
    'Total_timeDelta_Seconds',
    'LactationNumber',
    'DaysInMilk',
    'BreedName',
    'Age',
    'milking_times',
    'MilkingDate',
]

In [5]:
# labeling cow with good/bad memory(1/0)
def labeling_memory(threshold_percentage, cow_l1, cow_l2, anomaly_threshold=0.1):
    '''
    func: labeling memory dataset, based on how much the mean time cost per
          milking event drops from lactation period 1 to lactation period 2
    args:
        threshold_percentage: threshold for the ratio of milking time cost reduction;
            a reduction strictly above it is labelled as good memory
        cow_l1: A dataframe contains all data points for a single cow on lactation period 1
        cow_l2: A dataframe contains all data points for a single cow on lactation period 2
        anomaly_threshold: a reduction ratio strictly below this value marks the cow
            as an anomaly (defaults to 0.1, the value that used to be hard-coded)
    return: 1 (good memory), 0 (bad memory) or 2 (anomaly)
    '''
    # total number of milking events per lactation period
    totalEvents_1 = cow_l1.milking_times.sum()
    totalEvents_2 = cow_l2.milking_times.sum()

    # Without any milking event the mean time cost is undefined. Dividing anyway
    # would yield NaN/inf, which fails both comparisons below and would silently
    # be labelled as "bad"; report the cow as an anomaly instead.
    if totalEvents_1 == 0 or totalEvents_2 == 0:
        print(cow_l1.FarmName_Pseudo.unique())
        print("no milking events")
        return 2

    # mean time cost per milking event in each lactation period
    mean_timeCost_1 = cow_l1.Total_timeDelta_Seconds.sum()/totalEvents_1
    mean_timeCost_2 = cow_l2.Total_timeDelta_Seconds.sum()/totalEvents_2

    # The reduction is expressed relative to period 1, so a zero mean there
    # leaves the ratio undefined as well.
    if mean_timeCost_1 == 0:
        print(cow_l1.FarmName_Pseudo.unique())
        print("zero time cost in lactation period 1")
        return 2

    # relative reduction of the mean time cost from period 1 to period 2
    time_diff = (mean_timeCost_1-mean_timeCost_2)/mean_timeCost_1

    # anomalies detection
    if time_diff < anomaly_threshold:
        print(cow_l1.FarmName_Pseudo.unique())
        print(time_diff)
        return 2

    print(time_diff)
    if time_diff > threshold_percentage:
        print("good")
        return 1 # good memory
    print("bad")
    return 0 # bad memory

In [4]:
# NOTE: this whole cell is wrapped in a string literal, so none of the code below runs.
# If enabled, it would concatenate the per-cow CSV files of each lactation period
# (read as cow_0.csv, cow_1.csv, ... from dataDir/<period>) and write the result to
# dataDir/cow_total/cow_total_<period>.csv.
'''
periods = [1, 2]

# integrate all the cows data into one dataset
for p in periods:
    filelist = list(Path(dataDir/str(p)).glob('*.csv'))
    for i, _ in enumerate(filelist):
        fileName = 'cow_' + str(i) + '.csv'
        single_cow = pd.read_csv(dataDir/str(p)/fileName, encoding='utf-8', usecols=usecols)
        single_cow.sort_values(by=['MilkingDate'], inplace=True)
        if i == 0:
            cow_total = single_cow
        else:
            cow_total = pd.concat([cow_total, single_cow], axis=0, ignore_index=True)
    fileName = 'cow_total_' + str(p) + '.csv'
    Path(dataDir/'cow_total/').mkdir(parents=True, exist_ok=True)
    cow_total.to_csv(dataDir/'cow_total'/fileName, index=False)
'''

In [37]:
# periods = os.listdir(dataDir)
periods = [1, 2]

# Reduction ratio of the mean milking time cost above which a cow is labelled "good memory".
memory_threshold = 0.56

# The number of CSV files in period 1 determines how many cows are processed.
# Files are then read by name (cow_0.csv, cow_1.csv, ...), and the same file
# name must exist in both the '1' and the '2' folder.
filelist = list(Path(dataDir/'1').glob('*.csv'))

# Per-cow frames are collected in lists and concatenated once after the loop.
# This keeps the work linear and, unlike initialising the totals only when
# i == 0, still works when the first cow is skipped as an anomaly.
labelled_frames = []
period_1_frames = []
period_2_frames = []

# integrate all the cows data into one dataset
for i, _ in enumerate(filelist):
    fileName = 'cow_' + str(i) + '.csv'
    single_cow_1 = pd.read_csv(dataDir/'1'/fileName, encoding='utf-8', usecols=usecols)
    single_cow_2 = pd.read_csv(dataDir/'2'/fileName, encoding='utf-8', usecols=usecols)
    single_cow_1.sort_values(by=['MilkingDate'], inplace=True)
    single_cow_2.sort_values(by=['MilkingDate'], inplace=True)
    print(i)
    label = labeling_memory(memory_threshold, single_cow_1, single_cow_2)
    # skip anomalies
    if label == 2:
        continue
    single_cow = pd.concat([single_cow_1, single_cow_2], axis=0, ignore_index=True)
    single_cow['label'] = label
    labelled_frames.append(single_cow)
    period_1_frames.append(single_cow_1)
    period_2_frames.append(single_cow_2)

if not labelled_frames:
    raise ValueError("no cow left after skipping anomalies; nothing to write")

cow_total = pd.concat(labelled_frames, axis=0, ignore_index=True)
cow_total_1 = pd.concat(period_1_frames, axis=0, ignore_index=True)
cow_total_2 = pd.concat(period_2_frames, axis=0, ignore_index=True)

fileName = 'Cow_Memory_dataset_L1_L2.csv'
Path(dataDir/'cow_total/').mkdir(parents=True, exist_ok=True)
cow_total.to_csv(dataDir/'cow_total'/fileName, index=False)
cow_total_1.to_csv(dataDir/'cow_total/Cow_Memory_dataset_L1.csv', index=False)
cow_total_2.to_csv(dataDir/'cow_total/Cow_Memory_dataset_L2.csv', index=False)

0
0.20359552642520393
bad
1
0.6739063177668262
good
2
0.5296885142107118
bad
3
0.672588427516125
good
4
0.6853469323339527
good
5
0.7007858717829946
good
6
0.554003879659926
bad
7
0.47329586008694424
bad
8
0.4854409250262452
bad
9
0.21299886477419697
bad
10
0.24468990053898731
bad
11
0.6011589584037812
good
12
0.5484093381343471
bad
13
0.4926624860581385
bad
14
0.8026352481873954
good
15
0.358188736516344
bad
16
0.39916173464069155
bad
17
0.81094523547455
good
18
0.6714218546956363
good
19
0.7417756204979071
good
20
0.5408892534573655
bad
21
0.6378422397499658
good
22
0.40071210243093236
bad
23
0.7887735555091778
good
24
0.7867464394182372
good
25
0.4390578845490215
bad
26
0.5214979896667522
bad
27
0.48369125353055814
bad
28
0.41985847285470845
bad
29
0.37021197973935144
bad
30
0.7532111163189767
good
31
0.5084688777750369
bad


In [9]:
# Display the combined dataset: both lactation periods of every kept cow, plus its label.
cow_total

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id,label
0,a624fb9a,769,2021-09-09,1.0,1.0,2,1.96,9.48,5191.0,2,1,0
1,a624fb9a,769,2021-09-10,1.0,2.0,2,1.96,14.49,9981.0,2,1,0
2,a624fb9a,769,2021-09-11,1.0,3.0,2,1.96,14.93,3531.0,2,1,0
3,a624fb9a,769,2021-09-12,1.0,4.0,2,1.97,14.62,8567.0,2,1,0
4,a624fb9a,769,2021-09-13,1.0,5.0,2,1.97,13.38,7612.0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3343,a624fb9a,3707,2021-09-26,2.0,55.0,1,3.06,37.49,620.0,3,32,0
3344,a624fb9a,3707,2021-09-27,2.0,56.0,1,3.06,48.64,1425.0,4,32,0
3345,a624fb9a,3707,2021-09-28,2.0,57.0,1,3.06,37.54,6408.0,3,32,0
3346,a624fb9a,3707,2021-09-29,2.0,58.0,1,3.07,32.58,2759.0,3,32,0


In [11]:
# Display the lactation period 1 rows of the kept cows (no label column).
cow_total_1

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id
0,a624fb9a,769,2021-09-09,1.0,1.0,2,1.96,9.48,5191.0,2,1
1,a624fb9a,769,2021-09-10,1.0,2.0,2,1.96,14.49,9981.0,2,1
2,a624fb9a,769,2021-09-11,1.0,3.0,2,1.96,14.93,3531.0,2,1
3,a624fb9a,769,2021-09-12,1.0,4.0,2,1.97,14.62,8567.0,2,1
4,a624fb9a,769,2021-09-13,1.0,5.0,2,1.97,13.38,7612.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
1586,a624fb9a,3707,2020-11-04,1.0,55.0,1,2.16,46.06,4030.0,5,32
1587,a624fb9a,3707,2020-11-05,1.0,56.0,1,2.17,44.03,3565.0,4,32
1588,a624fb9a,3707,2020-11-06,1.0,57.0,1,2.17,35.73,5023.0,4,32
1589,a624fb9a,3707,2020-11-07,1.0,58.0,1,2.17,37.62,2102.0,4,32


In [12]:
# Display the lactation period 2 rows of the kept cows (no label column).
cow_total_2

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id
0,a624fb9a,769,2022-07-29,2.0,1.0,2,2.84,15.19,1923.0,3,1
1,a624fb9a,769,2022-07-30,2.0,2.0,2,2.85,25.94,3894.0,3,1
2,a624fb9a,769,2022-07-31,2.0,3.0,2,2.85,25.91,2377.0,3,1
3,a624fb9a,769,2022-08-01,2.0,4.0,2,2.85,30.19,3898.0,3,1
4,a624fb9a,769,2022-08-02,2.0,5.0,2,2.85,30.78,3485.0,3,1
...,...,...,...,...,...,...,...,...,...,...,...
1752,a624fb9a,3707,2021-09-26,2.0,55.0,1,3.06,37.49,620.0,3,32
1753,a624fb9a,3707,2021-09-27,2.0,56.0,1,3.06,48.64,1425.0,4,32
1754,a624fb9a,3707,2021-09-28,2.0,57.0,1,3.06,37.54,6408.0,3,32
1755,a624fb9a,3707,2021-09-29,2.0,58.0,1,3.07,32.58,2759.0,3,32


In [19]:
# Add, to each of the three datasets, the time cost per milking event of every row
# (the row's total time delta divided by its number of milkings).
for frame in (cow_total_1, cow_total_2, cow_total):
    frame["mean_Total_timeDelta_Seconds"] = frame["Total_timeDelta_Seconds"] / frame["milking_times"]
cow_total

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id,label,mean_Total_timeDelta_Seconds
0,a624fb9a,769,2021-09-09,1.0,1.0,2,1.96,9.48,5191.0,2,1,0,2595.500000
1,a624fb9a,769,2021-09-10,1.0,2.0,2,1.96,14.49,9981.0,2,1,0,4990.500000
2,a624fb9a,769,2021-09-11,1.0,3.0,2,1.96,14.93,3531.0,2,1,0,1765.500000
3,a624fb9a,769,2021-09-12,1.0,4.0,2,1.97,14.62,8567.0,2,1,0,4283.500000
4,a624fb9a,769,2021-09-13,1.0,5.0,2,1.97,13.38,7612.0,2,1,0,3806.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3343,a624fb9a,3707,2021-09-26,2.0,55.0,1,3.06,37.49,620.0,3,32,0,206.666667
3344,a624fb9a,3707,2021-09-27,2.0,56.0,1,3.06,48.64,1425.0,4,32,0,356.250000
3345,a624fb9a,3707,2021-09-28,2.0,57.0,1,3.06,37.54,6408.0,3,32,0,2136.000000
3346,a624fb9a,3707,2021-09-29,2.0,58.0,1,3.07,32.58,2759.0,3,32,0,919.666667


In [27]:
# Non time-series columns.
# NOTE(review): 'TrafficDeviceName' is not among the columns loaded via `usecols` -
# confirm it exists before selecting with this list.
static_cols = [
    'FarmName_Pseudo',
    'TrafficDeviceName',
    'LactationNumber',
    'BreedName',
]

# Columns passed one by one to the time-series feature extraction. The per-event mean
# 'mean_Total_timeDelta_Seconds' is used here in place of the raw 'Total_timeDelta_Seconds'.
timeSeries_cols = [
    'Age',
    'Total_MilkProduction',
    'mean_Total_timeDelta_Seconds',
    'DaysInMilk',
    'milking_times',
]

# Target column.
output_col = ['label']

In [28]:
cow_label = cow_total.loc[:, ['id', 'label']].copy()
# fetch y for feature extraction: the label found on the first row of every cow id
first_row_of_id = ~cow_label['id'].duplicated()
y = cow_label.loc[first_row_of_id, 'label']
# Re-index by position (1..n), in order of first appearance of each id.
y.index = range(1, len(y) + 1)
y

1     0
2     1
3     0
4     1
5     1
6     1
7     1
8     0
9     0
10    0
11    0
12    1
13    0
14    0
15    1
16    0
17    0
18    1
19    1
20    1
21    0
22    1
23    0
24    1
25    1
26    0
27    0
28    0
29    0
30    0
31    1
32    0
Name: label, dtype: int64

In [29]:
y.value_counts()

0    18
1    14
Name: label, dtype: int64

In [17]:
# using the relevant feature names list from learner problem
learner_csv = dataDir.parent/'learner_118_new_meanTimeCost.csv'
ts_learner_dataset = pd.read_csv(learner_csv, encoding='utf-8', index_col=False)

# Keep the columns between the first one and the last five.
# NOTE(review): presumably the first column is an id and the last five are
# non-tsfresh columns (categoricals / label) - confirm against the CSV.
n_learner_cols = len(ts_learner_dataset.columns)
ts_learner_features = ts_learner_dataset.iloc[:, 1:(n_learner_cols - 5)].copy()

# Rebuild the tsfresh extraction settings from those feature column names.
para_dict = from_columns(ts_learner_features.columns)
para_dict

{'mean_Total_timeDelta_Seconds': {'agg_autocorrelation': [{'f_agg': 'mean',
    'maxlag': 40}],
  'fft_coefficient': [{'attr': 'real', 'coeff': 4},
   {'attr': 'imag', 'coeff': 62},
   {'attr': 'real', 'coeff': 42},
   {'attr': 'real', 'coeff': 47},
   {'attr': 'real', 'coeff': 45},
   {'attr': 'imag', 'coeff': 12},
   {'attr': 'imag', 'coeff': 7},
   {'attr': 'imag', 'coeff': 14},
   {'attr': 'angle', 'coeff': 1},
   {'attr': 'real', 'coeff': 1},
   {'attr': 'imag', 'coeff': 9},
   {'attr': 'imag', 'coeff': 6},
   {'attr': 'real', 'coeff': 9}],
  'fourier_entropy': [{'bins': 3}, {'bins': 5}],
  'augmented_dickey_fuller': [{'attr': 'usedlag', 'autolag': 'AIC'},
   {'attr': 'pvalue', 'autolag': 'AIC'},
   {'attr': 'teststat', 'autolag': 'AIC'}],
  'cwt_coefficients': [{'coeff': 11, 'w': 2, 'widths': (2, 5, 10, 20)},
   {'coeff': 8, 'w': 5, 'widths': (2, 5, 10, 20)}],
  'count_below_mean': None,
  'change_quantiles': [{'f_agg': 'mean',
    'isabs': False,
    'qh': 1.0,
    'ql': 0.2}],


In [31]:
# extract features from cow dataset in lactation period 1

# One row per cow id, indexed by that id; extracted features are appended to it column-wise.
ts_extracted_dataset_1 = (
    cow_total_1[['id']]
    .drop_duplicates(subset=['id'])
    .set_index('id', drop=False)
)

# id / date pairs shared by every single-column time series built below.
cow_timeseries = cow_total_1[['id', 'MilkingDate']].reset_index(drop=True)

for col in timeSeries_cols:
    # id, MilkingDate and one value column, aligned on a fresh 0..n-1 index
    value_column = cow_total_1[col].reset_index(drop=True)
    ts_processed = pd.concat([cow_timeseries, value_column], axis=1)
    print(ts_processed)
    # extract time series features
    extracted_dataset = extract_features(
        ts_processed,
        column_id="id",
        column_sort="MilkingDate",
        kind_to_fc_parameters=para_dict,
    )
    # The return value is not used: this relies on impute() modifying the frame in place.
    impute(extracted_dataset)
    ts_extracted_dataset_1 = pd.concat([ts_extracted_dataset_1, extracted_dataset], axis=1)
ts_extracted_dataset_1

      id MilkingDate   Age
0      1  2021-09-09  1.96
1      1  2021-09-10  1.96
2      1  2021-09-11  1.96
3      1  2021-09-12  1.97
4      1  2021-09-13  1.97
...   ..         ...   ...
1586  32  2020-11-04  2.16
1587  32  2020-11-05  2.17
1588  32  2020-11-06  2.17
1589  32  2020-11-07  2.17
1590  32  2020-11-08  2.18

[1591 rows x 3 columns]


Feature Extraction: 100%|██████████| 32/32 [00:04<00:00,  6.52it/s]


      id MilkingDate  Total_MilkProduction
0      1  2021-09-09                  9.48
1      1  2021-09-10                 14.49
2      1  2021-09-11                 14.93
3      1  2021-09-12                 14.62
4      1  2021-09-13                 13.38
...   ..         ...                   ...
1586  32  2020-11-04                 46.06
1587  32  2020-11-05                 44.03
1588  32  2020-11-06                 35.73
1589  32  2020-11-07                 37.62
1590  32  2020-11-08                 27.47

[1591 rows x 3 columns]


Feature Extraction: 100%|██████████| 32/32 [00:05<00:00,  6.39it/s]
 'Total_MilkProduction__fft_coefficient__attr_"abs"__coeff_40'
 'Total_MilkProduction__fft_coefficient__attr_"abs"__coeff_49'
 'Total_MilkProduction__fft_coefficient__attr_"abs"__coeff_52'] did not have any finite values. Filling with zeros.


      id MilkingDate  mean_Total_timeDelta_Seconds
0      1  2021-09-09                   2595.500000
1      1  2021-09-10                   4990.500000
2      1  2021-09-11                   1765.500000
3      1  2021-09-12                   4283.500000
4      1  2021-09-13                   3806.000000
...   ..         ...                           ...
1586  32  2020-11-04                    806.000000
1587  32  2020-11-05                    891.250000
1588  32  2020-11-06                   1255.750000
1589  32  2020-11-07                    525.500000
1590  32  2020-11-08                    193.666667

[1591 rows x 3 columns]


Feature Extraction: 100%|██████████| 32/32 [00:05<00:00,  5.76it/s]
 'mean_Total_timeDelta_Seconds__fft_coefficient__attr_"real"__coeff_42'
 'mean_Total_timeDelta_Seconds__fft_coefficient__attr_"real"__coeff_47'
 'mean_Total_timeDelta_Seconds__fft_coefficient__attr_"real"__coeff_45'] did not have any finite values. Filling with zeros.


      id MilkingDate  DaysInMilk
0      1  2021-09-09         1.0
1      1  2021-09-10         2.0
2      1  2021-09-11         3.0
3      1  2021-09-12         4.0
4      1  2021-09-13         5.0
...   ..         ...         ...
1586  32  2020-11-04        55.0
1587  32  2020-11-05        56.0
1588  32  2020-11-06        57.0
1589  32  2020-11-07        58.0
1590  32  2020-11-08        59.0

[1591 rows x 3 columns]


Feature Extraction: 100%|██████████| 32/32 [00:04<00:00,  6.57it/s]


      id MilkingDate  milking_times
0      1  2021-09-09              2
1      1  2021-09-10              2
2      1  2021-09-11              2
3      1  2021-09-12              2
4      1  2021-09-13              2
...   ..         ...            ...
1586  32  2020-11-04              5
1587  32  2020-11-05              4
1588  32  2020-11-06              4
1589  32  2020-11-07              4
1590  32  2020-11-08              3

[1591 rows x 3 columns]


Feature Extraction: 100%|██████████| 32/32 [00:05<00:00,  6.32it/s]


Unnamed: 0,id,"Total_MilkProduction__cwt_coefficients__coeff_1__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_12__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_12__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_10__w_10__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_13__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_7__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__fft_coefficient__attr_""imag""__coeff_12",...,milking_times__count_below_mean,milking_times__large_standard_deviation__r_0.30000000000000004,milking_times__large_standard_deviation__r_0.2,milking_times__energy_ratio_by_chunks__num_segments_10__segment_focus_6,milking_times__energy_ratio_by_chunks__num_segments_10__segment_focus_0,milking_times__absolute_sum_of_changes,milking_times__number_cwt_peaks__n_1,"milking_times__fft_aggregated__aggtype_""skew""","milking_times__fft_aggregated__aggtype_""variance""",milking_times__range_count__max_1000000000000.0__min_0
1,1,-6.688095,-3.108722,90.839003,10.721901,28.891505,94.492562,10.973665,11.641196,-29.814081,...,14.0,0.0,1.0,0.12963,0.044444,30.0,10.0,1.22951,80.848615,59.0
2,2,-2.414862,-7.383614,130.008542,12.69276,53.475083,134.576209,20.899686,19.268866,77.985701,...,31.0,0.0,1.0,0.089501,0.072289,31.0,10.0,1.394746,55.145015,50.0
3,3,6.307668,-4.469439,89.463396,4.622078,35.075586,92.996843,7.734775,5.723686,80.506731,...,23.0,0.0,1.0,0.153846,0.081197,36.0,6.0,0.811231,45.804487,41.0
4,4,-4.572472,-4.228776,99.505886,6.732709,33.241449,103.025427,12.548463,11.544853,-15.296916,...,32.0,0.0,1.0,0.12069,0.07931,37.0,6.0,1.006562,72.074412,51.0
5,5,-3.014259,-10.103797,111.763304,8.776363,39.517589,116.066254,23.292382,21.082155,-10.947864,...,45.0,0.0,0.0,0.077477,0.068468,38.0,9.0,1.244066,77.045142,57.0
6,6,-2.697365,0.158948,123.230787,6.165378,38.411897,128.111092,21.970785,19.771146,38.056509,...,23.0,0.0,1.0,0.158235,0.049516,44.0,11.0,1.392851,68.219516,58.0
7,7,11.756431,-6.883696,146.612477,3.336683,89.944248,150.163735,31.678231,25.05381,-2.768299,...,25.0,1.0,1.0,0.079452,0.090411,25.0,7.0,1.325726,27.255339,35.0
8,8,-7.419647,-2.135965,103.212259,11.302046,34.954923,107.370173,18.022253,17.996539,64.24053,...,34.0,1.0,1.0,0.123308,0.058647,32.0,10.0,1.370728,75.946996,58.0
9,9,-5.999017,-4.698493,93.487448,9.035393,28.782643,97.812118,15.377276,15.107496,-3.600713,...,28.0,1.0,1.0,0.106838,0.051282,39.0,10.0,1.387074,87.251481,58.0
10,10,8.15875,-0.120864,120.794858,3.562332,58.192652,123.567198,20.983385,17.207758,-69.644091,...,31.0,0.0,1.0,0.085714,0.122449,27.0,10.0,1.160281,50.8823,45.0


In [32]:
# extract features from cow dataset in lactation period 2

# One row per cow id, indexed by that id; extracted features are appended to it column-wise.
ts_extracted_dataset_2 = (
    cow_total_2[['id']]
    .drop_duplicates(subset=['id'])
    .set_index('id', drop=False)
)

# id / date pairs shared by every single-column time series built below.
cow_timeseries = cow_total_2[['id', 'MilkingDate']].reset_index(drop=True)

for col in timeSeries_cols:
    # id, MilkingDate and one value column, aligned on a fresh 0..n-1 index
    value_column = cow_total_2[col].reset_index(drop=True)
    ts_processed = pd.concat([cow_timeseries, value_column], axis=1)
    print(ts_processed)
    # extract time series features
    extracted_dataset = extract_features(
        ts_processed,
        column_id="id",
        column_sort="MilkingDate",
        kind_to_fc_parameters=para_dict,
    )
    # The return value is not used: this relies on impute() modifying the frame in place.
    impute(extracted_dataset)
    ts_extracted_dataset_2 = pd.concat([ts_extracted_dataset_2, extracted_dataset], axis=1)
ts_extracted_dataset_2

      id MilkingDate   Age
0      1  2022-07-29  2.84
1      1  2022-07-30  2.85
2      1  2022-07-31  2.85
3      1  2022-08-01  2.85
4      1  2022-08-02  2.85
...   ..         ...   ...
1752  32  2021-09-26  3.06
1753  32  2021-09-27  3.06
1754  32  2021-09-28  3.06
1755  32  2021-09-29  3.07
1756  32  2021-09-30  3.07

[1757 rows x 3 columns]


Feature Extraction: 100%|██████████| 32/32 [00:05<00:00,  6.34it/s]


      id MilkingDate  Total_MilkProduction
0      1  2022-07-29                 15.19
1      1  2022-07-30                 25.94
2      1  2022-07-31                 25.91
3      1  2022-08-01                 30.19
4      1  2022-08-02                 30.78
...   ..         ...                   ...
1752  32  2021-09-26                 37.49
1753  32  2021-09-27                 48.64
1754  32  2021-09-28                 37.54
1755  32  2021-09-29                 32.58
1756  32  2021-09-30                 30.89

[1757 rows x 3 columns]


Feature Extraction: 100%|██████████| 32/32 [00:04<00:00,  6.66it/s]
 'Total_MilkProduction__fft_coefficient__attr_"abs"__coeff_40'
 'Total_MilkProduction__fft_coefficient__attr_"abs"__coeff_49'
 'Total_MilkProduction__fft_coefficient__attr_"abs"__coeff_52'] did not have any finite values. Filling with zeros.


      id MilkingDate  mean_Total_timeDelta_Seconds
0      1  2022-07-29                    641.000000
1      1  2022-07-30                   1298.000000
2      1  2022-07-31                    792.333333
3      1  2022-08-01                   1299.333333
4      1  2022-08-02                   1161.666667
...   ..         ...                           ...
1752  32  2021-09-26                    206.666667
1753  32  2021-09-27                    356.250000
1754  32  2021-09-28                   2136.000000
1755  32  2021-09-29                    919.666667
1756  32  2021-09-30                    595.333333

[1757 rows x 3 columns]


Feature Extraction: 100%|██████████| 32/32 [00:05<00:00,  6.14it/s]
 'mean_Total_timeDelta_Seconds__fft_coefficient__attr_"real"__coeff_42'
 'mean_Total_timeDelta_Seconds__fft_coefficient__attr_"real"__coeff_47'
 'mean_Total_timeDelta_Seconds__fft_coefficient__attr_"real"__coeff_45'] did not have any finite values. Filling with zeros.


      id MilkingDate  DaysInMilk
0      1  2022-07-29         1.0
1      1  2022-07-30         2.0
2      1  2022-07-31         3.0
3      1  2022-08-01         4.0
4      1  2022-08-02         5.0
...   ..         ...         ...
1752  32  2021-09-26        55.0
1753  32  2021-09-27        56.0
1754  32  2021-09-28        57.0
1755  32  2021-09-29        58.0
1756  32  2021-09-30        59.0

[1757 rows x 3 columns]


Feature Extraction: 100%|██████████| 32/32 [00:04<00:00,  6.58it/s]


      id MilkingDate  milking_times
0      1  2022-07-29              3
1      1  2022-07-30              3
2      1  2022-07-31              3
3      1  2022-08-01              3
4      1  2022-08-02              3
...   ..         ...            ...
1752  32  2021-09-26              3
1753  32  2021-09-27              4
1754  32  2021-09-28              3
1755  32  2021-09-29              3
1756  32  2021-09-30              3

[1757 rows x 3 columns]


Feature Extraction: 100%|██████████| 32/32 [00:04<00:00,  6.53it/s]


Unnamed: 0,id,"Total_MilkProduction__cwt_coefficients__coeff_1__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_12__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_12__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_10__w_10__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_13__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_7__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__fft_coefficient__attr_""imag""__coeff_12",...,milking_times__count_below_mean,milking_times__large_standard_deviation__r_0.30000000000000004,milking_times__large_standard_deviation__r_0.2,milking_times__energy_ratio_by_chunks__num_segments_10__segment_focus_6,milking_times__energy_ratio_by_chunks__num_segments_10__segment_focus_0,milking_times__absolute_sum_of_changes,milking_times__number_cwt_peaks__n_1,"milking_times__fft_aggregated__aggtype_""skew""","milking_times__fft_aggregated__aggtype_""variance""",milking_times__range_count__max_1000000000000.0__min_0
1,1,-6.00323,-8.999119,142.782251,17.384753,59.447279,146.981571,29.050555,27.756513,59.603443,...,46.0,0.0,0.0,0.112583,0.089404,34.0,10.0,1.218985,94.94812,59.0
2,2,9.497866,-1.035383,140.280514,9.429442,54.877905,144.642366,28.943013,24.389246,-12.722239,...,38.0,0.0,1.0,0.097372,0.137558,33.0,10.0,1.305938,84.544144,58.0
3,3,-0.840241,-8.187381,131.408537,9.257838,39.723544,136.842673,29.764489,27.121531,38.513409,...,31.0,0.0,0.0,0.083333,0.094444,31.0,9.0,1.125808,73.401261,56.0
4,4,-9.564919,-8.849292,88.275006,11.566152,27.972541,92.045206,12.392694,12.789497,-26.520777,...,21.0,0.0,1.0,0.100613,0.092025,34.0,11.0,1.501814,74.967874,59.0
5,5,-5.236483,-9.243856,117.419583,11.203768,74.405838,122.067103,24.574386,23.326249,6.494161,...,6.0,0.0,0.0,0.07971,0.083333,18.0,6.0,1.302221,25.648095,32.0
6,6,-5.011488,-9.626998,145.825113,12.73566,56.103173,151.247309,31.906026,29.224261,46.206359,...,34.0,0.0,0.0,0.096454,0.079433,44.0,12.0,1.210954,92.262252,59.0
7,7,13.140907,1.052165,149.441741,5.043048,61.028193,153.30602,39.891506,33.363171,28.8201,...,10.0,0.0,1.0,0.090909,0.12616,41.0,12.0,1.031354,101.122856,58.0
8,8,-8.679412,-11.927271,113.909987,7.938672,38.013664,119.042893,26.112821,24.730831,25.868553,...,27.0,0.0,0.0,0.11245,0.080321,57.0,12.0,0.978095,104.23468,59.0
9,9,-5.370892,-4.186659,162.660177,19.131737,59.239259,168.504426,33.144014,32.249846,27.521805,...,47.0,0.0,1.0,0.090355,0.092386,33.0,11.0,1.597082,74.743453,58.0
10,10,1.668407,-2.113594,153.519601,9.800962,57.468737,159.28814,29.18518,25.637096,-32.498378,...,12.0,0.0,1.0,0.124211,0.107368,38.0,10.0,1.111357,79.192393,54.0


In [33]:
# Put the lactation period 1 and period 2 feature tables side by side (one row per cow).
# The period 2 'id' column is removed so that only one 'id' column remains.
# errors='ignore' keeps the cell re-runnable: on a second run 'id' is already gone.
ts_extracted_dataset_2.drop(['id'], axis=1, inplace=True, errors='ignore')
ts_extracted_dataset = pd.concat([ts_extracted_dataset_1, ts_extracted_dataset_2], axis=1)

# Remove the columns that hold a single distinct value.
# Both periods contribute identically named feature columns, so the constant columns
# are removed by position: dropping by label would also discard the same-named,
# non-constant column coming from the other period.
nunique = ts_extracted_dataset.nunique()
cols_to_drop = nunique[nunique == 1].index
is_constant = (nunique == 1).to_numpy()
ts_extracted_dataset = ts_extracted_dataset.loc[:, ~is_constant].copy()
ts_extracted_dataset

Unnamed: 0,id,"Total_MilkProduction__cwt_coefficients__coeff_1__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_12__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_12__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_10__w_10__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_13__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_7__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__fft_coefficient__attr_""imag""__coeff_12",...,milking_times__count_below_mean,milking_times__large_standard_deviation__r_0.30000000000000004,milking_times__large_standard_deviation__r_0.2,milking_times__energy_ratio_by_chunks__num_segments_10__segment_focus_6,milking_times__energy_ratio_by_chunks__num_segments_10__segment_focus_0,milking_times__absolute_sum_of_changes,milking_times__number_cwt_peaks__n_1,"milking_times__fft_aggregated__aggtype_""skew""","milking_times__fft_aggregated__aggtype_""variance""",milking_times__range_count__max_1000000000000.0__min_0
1,1,-6.688095,-3.108722,90.839003,10.721901,28.891505,94.492562,10.973665,11.641196,-29.814081,...,46.0,0.0,0.0,0.112583,0.089404,34.0,10.0,1.218985,94.94812,59.0
2,2,-2.414862,-7.383614,130.008542,12.69276,53.475083,134.576209,20.899686,19.268866,77.985701,...,38.0,0.0,1.0,0.097372,0.137558,33.0,10.0,1.305938,84.544144,58.0
3,3,6.307668,-4.469439,89.463396,4.622078,35.075586,92.996843,7.734775,5.723686,80.506731,...,31.0,0.0,0.0,0.083333,0.094444,31.0,9.0,1.125808,73.401261,56.0
4,4,-4.572472,-4.228776,99.505886,6.732709,33.241449,103.025427,12.548463,11.544853,-15.296916,...,21.0,0.0,1.0,0.100613,0.092025,34.0,11.0,1.501814,74.967874,59.0
5,5,-3.014259,-10.103797,111.763304,8.776363,39.517589,116.066254,23.292382,21.082155,-10.947864,...,6.0,0.0,0.0,0.07971,0.083333,18.0,6.0,1.302221,25.648095,32.0
6,6,-2.697365,0.158948,123.230787,6.165378,38.411897,128.111092,21.970785,19.771146,38.056509,...,34.0,0.0,0.0,0.096454,0.079433,44.0,12.0,1.210954,92.262252,59.0
7,7,11.756431,-6.883696,146.612477,3.336683,89.944248,150.163735,31.678231,25.05381,-2.768299,...,10.0,0.0,1.0,0.090909,0.12616,41.0,12.0,1.031354,101.122856,58.0
8,8,-7.419647,-2.135965,103.212259,11.302046,34.954923,107.370173,18.022253,17.996539,64.24053,...,27.0,0.0,0.0,0.11245,0.080321,57.0,12.0,0.978095,104.23468,59.0
9,9,-5.999017,-4.698493,93.487448,9.035393,28.782643,97.812118,15.377276,15.107496,-3.600713,...,47.0,0.0,1.0,0.090355,0.092386,33.0,11.0,1.597082,74.743453,58.0
10,10,8.15875,-0.120864,120.794858,3.562332,58.192652,123.567198,20.983385,17.207758,-69.644091,...,12.0,0.0,1.0,0.124211,0.107368,38.0,10.0,1.111357,79.192393,54.0


In [34]:
# Re-index the merged feature table by position (1..n).
n_cows = len(ts_extracted_dataset)
ts_extracted_dataset.index = range(1, n_cows + 1)

# Every column except the leading 'id' column.
ts_extracted_features = ts_extracted_dataset.iloc[:, 1:].copy()

# normalize numerical features
ts_extracted_cols = ts_extracted_features.columns
scaler_std = StandardScaler()
ts_std = scaler_std.fit_transform(ts_extracted_features)

# transform standard data into dataframe, carrying the same 1..n index
ts_extracted_processed = pd.DataFrame(
    ts_std,
    columns=ts_extracted_cols,
    index=range(1, len(ts_std) + 1),
)

# append id col to the dataframe
ts_extracted_processed = pd.concat([ts_extracted_dataset['id'], ts_extracted_processed], axis=1)
ts_extracted_processed

Unnamed: 0,id,"Total_MilkProduction__cwt_coefficients__coeff_1__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_12__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_12__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_10__w_10__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_13__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_7__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__fft_coefficient__attr_""imag""__coeff_12",...,milking_times__count_below_mean,milking_times__large_standard_deviation__r_0.30000000000000004,milking_times__large_standard_deviation__r_0.2,milking_times__energy_ratio_by_chunks__num_segments_10__segment_focus_6,milking_times__energy_ratio_by_chunks__num_segments_10__segment_focus_0,milking_times__absolute_sum_of_changes,milking_times__number_cwt_peaks__n_1,"milking_times__fft_aggregated__aggtype_""skew""","milking_times__fft_aggregated__aggtype_""variance""",milking_times__range_count__max_1000000000000.0__min_0
1,1,-1.015706,0.202499,-0.538062,0.600443,-0.69881,-0.528962,-1.066386,-0.848579,-1.242208,...,1.875952,-0.179605,-1.290994,0.276562,-0.448776,-0.491792,0.031513,0.125394,0.74867,0.458887
2,2,-0.370088,-0.74293,1.263955,1.009445,0.560211,1.278024,0.138514,0.190373,2.100237,...,1.097599,-0.179605,0.774597,-0.18462,1.701128,-0.586027,0.031513,0.508881,0.25506,0.346793
3,3,0.947747,-0.098435,-0.601347,-0.665421,-0.382099,-0.596389,-1.459548,-1.654593,2.178404,...,0.41654,-0.179605,-1.290994,-0.610293,-0.223737,-0.774498,-0.47269,-0.285542,-0.273607,0.122603
4,4,-0.696069,-0.04521,-0.139337,-0.227413,-0.476032,-0.144297,-0.875224,-0.861702,-0.792088,...,-0.556401,-0.179605,0.774597,-0.086352,-0.331777,-0.491792,0.535715,1.372743,-0.19928,0.458887
5,5,-0.460647,-1.344521,0.424572,0.196696,-0.154607,0.443588,0.428958,0.437358,-0.657242,...,-2.015812,-0.179605,-1.290994,-0.72015,-0.719809,-1.99956,-1.985298,0.492485,-2.539226,-2.567666
6,6,-0.41277,0.925172,0.95214,-0.345148,-0.211234,0.986574,0.268532,0.258788,0.862191,...,0.708423,-0.179605,-1.290994,-0.212472,-0.893962,0.450564,1.039918,0.089975,0.621241,0.458887
7,7,1.770969,-0.632369,2.027828,-0.932173,2.427939,1.980716,1.446899,0.978331,-0.403626,...,-1.626636,-0.179605,0.774597,-0.380593,1.19223,0.167857,1.039918,-0.702109,1.041626,0.346793
8,8,-1.126232,0.417633,0.031177,0.720838,-0.388279,0.051566,-0.210772,0.017072,1.674054,...,0.027364,-0.179605,-1.290994,0.27253,-0.854286,1.675625,1.039918,-0.936993,1.189265,0.458887
9,9,-0.911597,-0.149092,-0.416218,0.250451,-0.704385,-0.379315,-0.53184,-0.37644,-0.429435,...,1.973246,-0.179605,0.774597,-0.397384,-0.315649,-0.586027,0.535715,1.792902,-0.209928,0.346793
10,10,1.227416,0.863289,0.840074,-0.885345,0.801816,0.781734,0.148674,-0.090367,-2.47718,...,-1.432048,-0.179605,0.774597,0.62912,0.353272,-0.11485,0.031513,-0.349275,0.001149,-0.101586


In [35]:
# add one-hot encoded categorical features
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old name, so try the new spelling first and fall back
# only when the installed version rejects it. Either way `ohe` returns a dense
# array.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only accepts `sparse`
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
# One (id, BreedName) row per id: drop_duplicates keeps the first occurrence.
cow_breed = cow_total[['id', 'BreedName']].drop_duplicates(subset=['id'])
# The encoder expects a 2-D input, hence the reshape to a single column.
cat = ohe.fit_transform(cow_breed['BreedName'].to_numpy().reshape(-1, 1))
col_names = ohe.get_feature_names_out(['BreedName'])
cat_breed = pd.DataFrame(cat, columns=col_names)
# NOTE(review): rows are labelled 1..N by position, not by the `id` column.
# This presumably matches the index of the frames it is concatenated with
# later; confirm that ids are consecutive and appear in ascending order.
cat_breed.index = range(1, len(cow_breed) + 1)
cat_breed

Unnamed: 0,BreedName_1,BreedName_2,BreedName_4,BreedName_99
1,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0
6,0.0,1.0,0.0,0.0
7,0.0,1.0,0.0,0.0
8,1.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0
10,0.0,1.0,0.0,0.0


In [36]:
# Build the final table column-wise: extracted time-series features first,
# then the one-hot breed columns, then `y`.
features_with_breed = pd.concat([ts_extracted_processed, cat_breed], axis=1)
ts_dataset = pd.concat([features_with_breed, y], axis=1)
# Write the table one directory above dataDir; the index is not saved.
output_csv = dataDir.parent/"memory_32_meanCost_188.csv"
ts_dataset.to_csv(output_csv, index=False)
ts_dataset

Unnamed: 0,id,"Total_MilkProduction__cwt_coefficients__coeff_1__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_12__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_12__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_10__w_10__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_13__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_7__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)","Total_MilkProduction__fft_coefficient__attr_""imag""__coeff_12",...,milking_times__absolute_sum_of_changes,milking_times__number_cwt_peaks__n_1,"milking_times__fft_aggregated__aggtype_""skew""","milking_times__fft_aggregated__aggtype_""variance""",milking_times__range_count__max_1000000000000.0__min_0,BreedName_1,BreedName_2,BreedName_4,BreedName_99,label
1,1,-1.015706,0.202499,-0.538062,0.600443,-0.69881,-0.528962,-1.066386,-0.848579,-1.242208,...,-0.491792,0.031513,0.125394,0.74867,0.458887,0.0,1.0,0.0,0.0,0
2,2,-0.370088,-0.74293,1.263955,1.009445,0.560211,1.278024,0.138514,0.190373,2.100237,...,-0.586027,0.031513,0.508881,0.25506,0.346793,1.0,0.0,0.0,0.0,1
3,3,0.947747,-0.098435,-0.601347,-0.665421,-0.382099,-0.596389,-1.459548,-1.654593,2.178404,...,-0.774498,-0.47269,-0.285542,-0.273607,0.122603,1.0,0.0,0.0,0.0,0
4,4,-0.696069,-0.04521,-0.139337,-0.227413,-0.476032,-0.144297,-0.875224,-0.861702,-0.792088,...,-0.491792,0.535715,1.372743,-0.19928,0.458887,1.0,0.0,0.0,0.0,1
5,5,-0.460647,-1.344521,0.424572,0.196696,-0.154607,0.443588,0.428958,0.437358,-0.657242,...,-1.99956,-1.985298,0.492485,-2.539226,-2.567666,1.0,0.0,0.0,0.0,1
6,6,-0.41277,0.925172,0.95214,-0.345148,-0.211234,0.986574,0.268532,0.258788,0.862191,...,0.450564,1.039918,0.089975,0.621241,0.458887,0.0,1.0,0.0,0.0,1
7,7,1.770969,-0.632369,2.027828,-0.932173,2.427939,1.980716,1.446899,0.978331,-0.403626,...,0.167857,1.039918,-0.702109,1.041626,0.346793,0.0,1.0,0.0,0.0,1
8,8,-1.126232,0.417633,0.031177,0.720838,-0.388279,0.051566,-0.210772,0.017072,1.674054,...,1.675625,1.039918,-0.936993,1.189265,0.458887,1.0,0.0,0.0,0.0,0
9,9,-0.911597,-0.149092,-0.416218,0.250451,-0.704385,-0.379315,-0.53184,-0.37644,-0.429435,...,-0.586027,0.535715,1.792902,-0.209928,0.346793,1.0,0.0,0.0,0.0,0
10,10,1.227416,0.863289,0.840074,-0.885345,0.801816,0.781734,0.148674,-0.090367,-2.47718,...,-0.11485,0.031513,-0.349275,0.001149,-0.101586,0.0,1.0,0.0,0.0,0


# Distinct BreedName values present in cow_total.
cow_total['BreedName'].unique()

In [None]:
# Disabled experiment: the whole cell is one triple-quoted string literal, so
# none of the statements inside it execute. The text sketches an alternative
# pipeline: extract features with EfficientFCParameters, impute, select
# features against `y`, and append the result to `ts_extracted_dataset`.
# NOTE(review): it refers to cow_total_1, timeSeries_cols and cow_timeseries,
# which are not defined in the visible part of the notebook; confirm they
# exist before re-enabling.
'''
from tsfresh.feature_extraction import extract_features, EfficientFCParameters

ts_processed = pd.DataFrame(cow_total_1[timeSeries_cols].copy())
ts_processed.index = range(0,len(ts_processed)) 
ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
ts_processed

extracted_dataset = extract_features(ts_processed, column_id="id", column_sort="MilkingDate", default_fc_parameters=EfficientFCParameters())
impute(extracted_dataset)
#extracted_dataset.index = range(1,len(extracted_dataset)+1)
features_filtered = select_features(extracted_dataset, y)
ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)
ts_extracted_dataset
'''