In [13]:
import os
from pathlib import Path
from matplotlib.style import use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from tsfresh.feature_extraction.settings import from_columns


# Folder with the per-cow CSV files, located three directory levels above the
# current working directory.
dataDir = Path.cwd().parent.parent.parent.joinpath('Data/processed/memory_targetCows/')

# Columns read from every per-cow CSV file.
usecols = [
    'id',
    'FarmName_Pseudo',
    'Gigacow_Cow_Id',
    'Total_MilkProduction',
    'Total_timeDelta_Seconds',
    'LactationNumber',
    'DaysInMilk',
    'BreedName',
    'Age',
    'milking_times',
    'MilkingDate',
]

In [15]:
# labeling cow with good/bad memory(1/0)
def labeling_memory(threshold_percentage, cow_l1, cow_l2, anomaly_threshold=0.1):
    '''
    func: label one cow's memory from the relative reduction of its mean time
          cost per milking event between lactation period 1 and period 2
    args:
        threshold_percentage: a relative reduction strictly above this ratio is labeled good memory
        cow_l1: A dataframe contains all data points for a single cow on lactation period 1
        cow_l2: A dataframe contains all data points for a single cow on lactation period 2
        anomaly_threshold: a relative reduction below this ratio, or one that cannot be
            computed (no milking events / zero time cost in period 1), is flagged as an
            anomaly; defaults to 0.1
    return: int label - 1 (good memory), 0 (bad memory) or 2 (anomaly)
    '''
    def _mean_time_cost(cow):
        # mean time cost per milking event; NaN when no milking event is recorded
        total_events = cow.milking_times.sum()
        if total_events == 0:
            return float('nan')
        return cow.Total_timeDelta_Seconds.sum() / total_events

    mean_timeCost_1 = _mean_time_cost(cow_l1)  # lactation period 1
    mean_timeCost_2 = _mean_time_cost(cow_l2)  # lactation period 2

    # relative reduction of the mean time cost from period 1 to period 2
    if mean_timeCost_1 == 0:
        time_diff = float('nan')
    else:
        time_diff = (mean_timeCost_1 - mean_timeCost_2) / mean_timeCost_1

    # anomalies detection: an undefined ratio must not fall through to the
    # "bad memory" branch, because every comparison against NaN is False
    if not np.isfinite(time_diff) or time_diff < anomaly_threshold:
        print(cow_l1.FarmName_Pseudo.unique())
        print(time_diff)
        return 2
    print(time_diff)
    if time_diff > threshold_percentage:
        print("good")
        return 1  # good memory
    print("bad")
    return 0  # bad memory

In [4]:
# NOTE: disabled cell - the code below sits inside a bare string literal, so it is
# never executed. It would merge the per-cow CSV files of each lactation period
# into a single cow_total_<period>.csv file.
'''
periods = [1, 2]

# integrate all the cows data into one dataset
for p in periods:
    filelist = list(Path(dataDir/str(p)).glob('*.csv'))
    for i, _ in enumerate(filelist):
        fileName = 'cow_' + str(i) + '.csv'
        single_cow = pd.read_csv(dataDir/str(p)/fileName, encoding='utf-8', usecols=usecols)
        single_cow.sort_values(by=['MilkingDate'], inplace=True)
        if i == 0:
            cow_total = single_cow
        else:
            cow_total = pd.concat([cow_total, single_cow], axis=0, ignore_index=True)
    fileName = 'cow_total_' + str(p) + '.csv'
    Path(dataDir/'cow_total/').mkdir(parents=True, exist_ok=True)
    cow_total.to_csv(dataDir/'cow_total'/fileName, index=False)
'''

In [16]:
# periods = os.listdir(dataDir)
periods = [1, 2]

# the number of CSV files in period folder '1' defines how many cows are processed;
# files are expected to be named cow_0.csv, cow_1.csv, ... in both period folders
filelist = list(Path(dataDir/'1').glob('*.csv'))

# integrate all the cows data into one dataset
# Frames are collected in lists and concatenated once after the loop. Seeding the
# totals only when i == 0 left them undefined whenever cow 0 was skipped as an
# anomaly, and concatenating inside the loop re-copied all previous rows each time.
labeled_frames = []
period_1_frames = []
period_2_frames = []
for i, _ in enumerate(filelist):
    fileName = 'cow_' + str(i) + '.csv'
    single_cow_1 = pd.read_csv(dataDir/'1'/fileName, encoding='utf-8', usecols=usecols)
    single_cow_2 = pd.read_csv(dataDir/'2'/fileName, encoding='utf-8', usecols=usecols)
    single_cow_1 = single_cow_1.sort_values(by=['MilkingDate'])
    single_cow_2 = single_cow_2.sort_values(by=['MilkingDate'])
    print(i)
    # 0.65 is the good-memory threshold passed to labeling_memory
    label = labeling_memory(0.65, single_cow_1, single_cow_2)
    # skip anomalies
    if label == 2:
        continue
    single_cow = pd.concat([single_cow_1, single_cow_2], axis=0, ignore_index=True)
    single_cow['label'] = label
    labeled_frames.append(single_cow)
    period_1_frames.append(single_cow_1)
    period_2_frames.append(single_cow_2)

if not labeled_frames:
    raise ValueError('no cow data left to merge: no input files found or every cow was flagged as an anomaly')

cow_total = pd.concat(labeled_frames, axis=0, ignore_index=True)
cow_total_1 = pd.concat(period_1_frames, axis=0, ignore_index=True)
cow_total_2 = pd.concat(period_2_frames, axis=0, ignore_index=True)

# persist the merged datasets (both periods with label, and each period on its own)
fileName = 'Cow_Memory_dataset_L1_L2.csv'
Path(dataDir/'cow_total/').mkdir(parents=True, exist_ok=True)
cow_total.to_csv(dataDir/'cow_total'/fileName, index=False)
cow_total_1.to_csv(dataDir/'cow_total/Cow_Memory_dataset_L1.csv', index=False)
cow_total_2.to_csv(dataDir/'cow_total/Cow_Memory_dataset_L2.csv', index=False)

0
0.1321973542312047
bad
1
0.6682159152012692
good
2
0.6687787308106531
good
3
0.6926361535649901
good
4
0.7661651953328444
good
5
0.35323895701425423
bad
6
0.4673379381314594
bad
7
0.5252242895345633
bad
8
0.623927318327258
bad
9
0.456035406089327
bad
10
0.767580272488248
good
11
0.48177181428952587
bad
12
0.39461618750444
bad
13
0.8002926495738242
good
14
0.8030236072979323
good
15
0.5795623719357742
bad
16
0.5089241079730903
bad
17
0.7971960443293966
good
18
0.7694151772976306
good
19
0.4875436580850392
bad
20
0.5436826892630562
bad
21
0.440723191939797
bad
22
0.41610662065831605
bad
23
0.26950983213429264
bad


In [17]:
# Display the merged dataset of both lactation periods, including the label column.
cow_total

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id,label
0,a624fb9a,769,2021-09-09,1.0,1.0,2,1.96,9.48,5191.0,2,1,0
1,a624fb9a,769,2021-09-10,1.0,2.0,2,1.96,14.49,9981.0,2,1,0
2,a624fb9a,769,2021-09-11,1.0,3.0,2,1.96,14.93,3531.0,2,1,0
3,a624fb9a,769,2021-09-12,1.0,4.0,2,1.97,14.62,8567.0,2,1,0
4,a624fb9a,769,2021-09-13,1.0,5.0,2,1.97,13.38,7612.0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1717,a624fb9a,3447,2022-07-19,2.0,35.0,2,3.02,39.44,3924.0,3,24,0
1718,a624fb9a,3447,2022-07-20,2.0,36.0,2,3.02,37.27,3086.0,3,24,0
1719,a624fb9a,3447,2022-07-21,2.0,37.0,2,3.02,40.61,3807.0,3,24,0
1720,a624fb9a,3447,2022-07-22,2.0,38.0,2,3.03,38.51,1476.0,3,24,0


In [18]:
# Display the merged lactation period 1 rows of all kept cows.
cow_total_1

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id
0,a624fb9a,769,2021-09-09,1.0,1.0,2,1.96,9.48,5191.0,2,1
1,a624fb9a,769,2021-09-10,1.0,2.0,2,1.96,14.49,9981.0,2,1
2,a624fb9a,769,2021-09-11,1.0,3.0,2,1.96,14.93,3531.0,2,1
3,a624fb9a,769,2021-09-12,1.0,4.0,2,1.97,14.62,8567.0,2,1
4,a624fb9a,769,2021-09-13,1.0,5.0,2,1.97,13.38,7612.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
846,a624fb9a,3447,2021-08-07,1.0,35.0,2,2.07,32.18,13518.0,4,24
847,a624fb9a,3447,2021-08-08,1.0,36.0,2,2.07,40.27,2634.0,5,24
848,a624fb9a,3447,2021-08-09,1.0,37.0,2,2.08,30.49,3499.0,3,24
849,a624fb9a,3447,2021-08-10,1.0,38.0,2,2.08,27.60,2655.0,3,24


In [19]:
# Display the merged lactation period 2 rows of all kept cows.
cow_total_2

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id
0,a624fb9a,769,2022-07-29,2.0,1.0,2,2.84,15.19,1923.0,3,1
1,a624fb9a,769,2022-07-30,2.0,2.0,2,2.85,25.94,3894.0,3,1
2,a624fb9a,769,2022-07-31,2.0,3.0,2,2.85,25.91,2377.0,3,1
3,a624fb9a,769,2022-08-01,2.0,4.0,2,2.85,30.19,3898.0,3,1
4,a624fb9a,769,2022-08-02,2.0,5.0,2,2.85,30.78,3485.0,3,1
...,...,...,...,...,...,...,...,...,...,...,...
866,a624fb9a,3447,2022-07-19,2.0,35.0,2,3.02,39.44,3924.0,3,24
867,a624fb9a,3447,2022-07-20,2.0,36.0,2,3.02,37.27,3086.0,3,24
868,a624fb9a,3447,2022-07-21,2.0,37.0,2,3.02,40.61,3807.0,3,24
869,a624fb9a,3447,2022-07-22,2.0,38.0,2,3.03,38.51,1476.0,3,24


In [20]:
# Row-wise mean time cost of one milking event: total seconds divided by the
# number of milking events, stored as a new column in each of the three frames.
cow_total_1["mean_Total_timeDelta_Seconds"] = cow_total_1["Total_timeDelta_Seconds"].div(cow_total_1["milking_times"])
cow_total_2["mean_Total_timeDelta_Seconds"] = cow_total_2["Total_timeDelta_Seconds"].div(cow_total_2["milking_times"])
cow_total["mean_Total_timeDelta_Seconds"] = cow_total["Total_timeDelta_Seconds"].div(cow_total["milking_times"])
cow_total

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id,label,mean_Total_timeDelta_Seconds
0,a624fb9a,769,2021-09-09,1.0,1.0,2,1.96,9.48,5191.0,2,1,0,2595.500000
1,a624fb9a,769,2021-09-10,1.0,2.0,2,1.96,14.49,9981.0,2,1,0,4990.500000
2,a624fb9a,769,2021-09-11,1.0,3.0,2,1.96,14.93,3531.0,2,1,0,1765.500000
3,a624fb9a,769,2021-09-12,1.0,4.0,2,1.97,14.62,8567.0,2,1,0,4283.500000
4,a624fb9a,769,2021-09-13,1.0,5.0,2,1.97,13.38,7612.0,2,1,0,3806.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1717,a624fb9a,3447,2022-07-19,2.0,35.0,2,3.02,39.44,3924.0,3,24,0,1308.000000
1718,a624fb9a,3447,2022-07-20,2.0,36.0,2,3.02,37.27,3086.0,3,24,0,1028.666667
1719,a624fb9a,3447,2022-07-21,2.0,37.0,2,3.02,40.61,3807.0,3,24,0,1269.000000
1720,a624fb9a,3447,2022-07-22,2.0,38.0,2,3.03,38.51,1476.0,3,24,0,492.000000


In [21]:
# Column groups: static descriptors, time series signals and the target column.
static_cols = [
    'FarmName_Pseudo',
    'TrafficDeviceName',
    'LactationNumber',
    'BreedName',
]
# The time series signals use the per-event mean 'mean_Total_timeDelta_Seconds'
# rather than the raw 'Total_timeDelta_Seconds' total.
timeSeries_cols = [
    'Age',
    'Total_MilkProduction',
    'mean_Total_timeDelta_Seconds',
    'DaysInMilk',
    'milking_times',
]
output_col = ['label']

In [22]:
# fetch y for feature extraction: one label per cow (first row of every id),
# re-indexed 1..n
cow_label = cow_total.loc[:, ['id', 'label']].copy()
y = cow_label.drop_duplicates(subset=['id'])["label"]
y.index = range(1, len(y) + 1)
y

1     0
2     1
3     1
4     1
5     1
6     0
7     0
8     0
9     0
10    0
11    1
12    0
13    0
14    1
15    1
16    0
17    0
18    1
19    1
20    0
21    0
22    0
23    0
24    0
Name: label, dtype: int64

In [23]:
# Count how many cows carry each label value.
y.value_counts()

0    15
1     9
Name: label, dtype: int64

In [26]:
# using the relevant feature names list from learner problem
ts_learner_dataset = pd.read_csv(dataDir.parent/'learner_118_new_meanTimeCost_minPara.csv', encoding='utf-8', index_col=False)
ts_learner_features = ts_learner_dataset.iloc[:, 1:(len(ts_learner_dataset.columns)-5)].copy()
para_dict = from_columns(ts_learner_features.columns)
para_dict

{'DaysInMilk': {'sum_values': None, 'minimum': None, 'length': None},
 'Age': {'sum_values': None, 'length': None},
 'mean_Total_timeDelta_Seconds': {'length': None,
  'minimum': None,
  'sum_values': None,
  'maximum': None,
  'absolute_maximum': None,
  'standard_deviation': None,
  'variance': None,
  'median': None,
  'root_mean_square': None,
  'mean': None},
 'milking_times': {'length': None,
  'variance': None,
  'standard_deviation': None,
  'median': None,
  'sum_values': None,
  'absolute_maximum': None,
  'maximum': None,
  'mean': None,
  'root_mean_square': None},
 'Total_MilkProduction': {'length': None, 'sum_values': None}}

In [27]:
# extract features from cow dataset in lactation period 1

# one row per cow, indexed by id; each extracted feature block is attached column-wise
ts_extracted_dataset_1 = (
    cow_total_1[['id']]
    .drop_duplicates(subset=['id'])
    .set_index('id', drop=False)
)

# id / MilkingDate pair shared by every single-signal frame handed to tsfresh
cow_timeseries = cow_total_1[['id', 'MilkingDate']].reset_index(drop=True)

for col in timeSeries_cols:
    # frame with exactly three columns: id, MilkingDate and the current signal
    ts_processed = pd.concat(
        [cow_timeseries, cow_total_1[[col]].reset_index(drop=True)],
        axis=1,
    )
    print(ts_processed)
    # extract time series features
    extracted_dataset = extract_features(
        ts_processed,
        column_id="id",
        column_sort="MilkingDate",
        kind_to_fc_parameters=para_dict,
    )
    # the return value is ignored: the frame is imputed in place
    impute(extracted_dataset)
    ts_extracted_dataset_1 = pd.concat([ts_extracted_dataset_1, extracted_dataset], axis=1)
ts_extracted_dataset_1

     id MilkingDate   Age
0     1  2021-09-09  1.96
1     1  2021-09-10  1.96
2     1  2021-09-11  1.96
3     1  2021-09-12  1.97
4     1  2021-09-13  1.97
..   ..         ...   ...
846  24  2021-08-07  2.07
847  24  2021-08-08  2.07
848  24  2021-08-09  2.08
849  24  2021-08-10  2.08
850  24  2021-08-11  2.08

[851 rows x 3 columns]


Feature Extraction: 100%|██████████| 24/24 [00:05<00:00,  4.30it/s]


     id MilkingDate  Total_MilkProduction
0     1  2021-09-09                  9.48
1     1  2021-09-10                 14.49
2     1  2021-09-11                 14.93
3     1  2021-09-12                 14.62
4     1  2021-09-13                 13.38
..   ..         ...                   ...
846  24  2021-08-07                 32.18
847  24  2021-08-08                 40.27
848  24  2021-08-09                 30.49
849  24  2021-08-10                 27.60
850  24  2021-08-11                 31.65

[851 rows x 3 columns]


Feature Extraction: 100%|██████████| 24/24 [00:06<00:00,  3.74it/s]


     id MilkingDate  mean_Total_timeDelta_Seconds
0     1  2021-09-09                   2595.500000
1     1  2021-09-10                   4990.500000
2     1  2021-09-11                   1765.500000
3     1  2021-09-12                   4283.500000
4     1  2021-09-13                   3806.000000
..   ..         ...                           ...
846  24  2021-08-07                   3379.500000
847  24  2021-08-08                    526.800000
848  24  2021-08-09                   1166.333333
849  24  2021-08-10                    885.000000
850  24  2021-08-11                    800.333333

[851 rows x 3 columns]


Feature Extraction: 100%|██████████| 24/24 [00:05<00:00,  4.44it/s]


     id MilkingDate  DaysInMilk
0     1  2021-09-09         1.0
1     1  2021-09-10         2.0
2     1  2021-09-11         3.0
3     1  2021-09-12         4.0
4     1  2021-09-13         5.0
..   ..         ...         ...
846  24  2021-08-07        35.0
847  24  2021-08-08        36.0
848  24  2021-08-09        37.0
849  24  2021-08-10        38.0
850  24  2021-08-11        39.0

[851 rows x 3 columns]


Feature Extraction: 100%|██████████| 24/24 [00:05<00:00,  4.25it/s]


     id MilkingDate  milking_times
0     1  2021-09-09              2
1     1  2021-09-10              2
2     1  2021-09-11              2
3     1  2021-09-12              2
4     1  2021-09-13              2
..   ..         ...            ...
846  24  2021-08-07              4
847  24  2021-08-08              5
848  24  2021-08-09              3
849  24  2021-08-10              3
850  24  2021-08-11              3

[851 rows x 3 columns]


Feature Extraction: 100%|██████████| 24/24 [00:05<00:00,  4.73it/s]


Unnamed: 0,id,Age__sum_values,Age__length,Total_MilkProduction__length,Total_MilkProduction__sum_values,mean_Total_timeDelta_Seconds__length,mean_Total_timeDelta_Seconds__minimum,mean_Total_timeDelta_Seconds__sum_values,mean_Total_timeDelta_Seconds__maximum,mean_Total_timeDelta_Seconds__absolute_maximum,...,DaysInMilk__length,milking_times__length,milking_times__variance,milking_times__standard_deviation,milking_times__median,milking_times__sum_values,milking_times__absolute_maximum,milking_times__maximum,milking_times__mean,milking_times__root_mean_square
1,1,78.42,39.0,39.0,973.3,39.0,28.0,59839.583333,4990.5,4990.5,...,39.0,39.0,0.407627,0.638456,3.0,115.0,4.0,4.0,2.948718,3.017046
2,2,66.99,33.0,33.0,1039.97,33.0,194.75,36403.816667,3049.0,3049.0,...,33.0,33.0,0.550964,0.74227,3.0,114.0,5.0,5.0,3.454545,3.533391
3,3,65.0,32.0,32.0,790.29,32.0,8.0,35030.5,5912.0,5912.0,...,32.0,32.0,0.389648,0.624218,2.0,73.0,3.0,3.0,2.28125,2.365111
4,4,76.84,37.0,37.0,1094.35,37.0,762.0,114767.733333,19565.5,19565.5,...,37.0,37.0,0.528853,0.727223,3.0,115.0,5.0,5.0,3.108108,3.192051
5,5,75.01,38.0,38.0,1299.12,38.0,137.666667,83822.566667,5003.333333,5003.333333,...,38.0,38.0,0.554709,0.744788,3.0,129.0,5.0,5.0,3.394737,3.475478
6,6,72.62,38.0,38.0,1094.3,38.0,61.0,29150.166667,3448.75,3448.75,...,38.0,38.0,0.404432,0.63595,3.0,124.0,4.0,4.0,3.263158,3.32455
7,7,78.65,38.0,38.0,1008.32,38.0,59.333333,44155.416667,3918.0,3918.0,...,38.0,38.0,0.521468,0.722128,3.0,125.0,4.0,4.0,3.289474,3.367804
8,8,72.83,38.0,38.0,1450.71,38.0,146.0,37920.35,3175.666667,3175.666667,...,38.0,38.0,0.684211,0.82717,4.0,152.0,5.0,5.0,4.0,4.084631
9,9,78.1,39.0,39.0,810.23,39.0,102.0,49201.416667,4386.5,4386.5,...,39.0,39.0,0.536489,0.732454,3.0,108.0,4.0,4.0,2.769231,2.864459
10,10,76.93,39.0,39.0,1108.97,39.0,65.0,71745.966667,5503.5,5503.5,...,39.0,39.0,0.959895,0.979742,4.0,146.0,6.0,6.0,3.74359,3.869672


In [28]:
# extract features from cow dataset in lactation period 2

# one row per cow, indexed by id; each extracted feature block is attached column-wise
ts_extracted_dataset_2 = (
    cow_total_2[['id']]
    .drop_duplicates(subset=['id'])
    .set_index('id', drop=False)
)

# id / MilkingDate pair shared by every single-signal frame handed to tsfresh
cow_timeseries = cow_total_2[['id', 'MilkingDate']].reset_index(drop=True)

for col in timeSeries_cols:
    # frame with exactly three columns: id, MilkingDate and the current signal
    ts_processed = pd.concat(
        [cow_timeseries, cow_total_2[[col]].reset_index(drop=True)],
        axis=1,
    )
    print(ts_processed)
    # extract time series features
    extracted_dataset = extract_features(
        ts_processed,
        column_id="id",
        column_sort="MilkingDate",
        kind_to_fc_parameters=para_dict,
    )
    # the return value is ignored: the frame is imputed in place
    impute(extracted_dataset)
    ts_extracted_dataset_2 = pd.concat([ts_extracted_dataset_2, extracted_dataset], axis=1)
ts_extracted_dataset_2

     id MilkingDate   Age
0     1  2022-07-29  2.84
1     1  2022-07-30  2.85
2     1  2022-07-31  2.85
3     1  2022-08-01  2.85
4     1  2022-08-02  2.85
..   ..         ...   ...
866  24  2022-07-19  3.02
867  24  2022-07-20  3.02
868  24  2022-07-21  3.02
869  24  2022-07-22  3.03
870  24  2022-07-23  3.03

[871 rows x 3 columns]


Feature Extraction: 100%|██████████| 24/24 [00:05<00:00,  4.70it/s]


     id MilkingDate  Total_MilkProduction
0     1  2022-07-29                 15.19
1     1  2022-07-30                 25.94
2     1  2022-07-31                 25.91
3     1  2022-08-01                 30.19
4     1  2022-08-02                 30.78
..   ..         ...                   ...
866  24  2022-07-19                 39.44
867  24  2022-07-20                 37.27
868  24  2022-07-21                 40.61
869  24  2022-07-22                 38.51
870  24  2022-07-23                 49.29

[871 rows x 3 columns]


Feature Extraction: 100%|██████████| 24/24 [00:05<00:00,  4.69it/s]


     id MilkingDate  mean_Total_timeDelta_Seconds
0     1  2022-07-29                    641.000000
1     1  2022-07-30                   1298.000000
2     1  2022-07-31                    792.333333
3     1  2022-08-01                   1299.333333
4     1  2022-08-02                   1161.666667
..   ..         ...                           ...
866  24  2022-07-19                   1308.000000
867  24  2022-07-20                   1028.666667
868  24  2022-07-21                   1269.000000
869  24  2022-07-22                    492.000000
870  24  2022-07-23                    202.000000

[871 rows x 3 columns]


Feature Extraction: 100%|██████████| 24/24 [00:04<00:00,  5.06it/s]


     id MilkingDate  DaysInMilk
0     1  2022-07-29         1.0
1     1  2022-07-30         2.0
2     1  2022-07-31         3.0
3     1  2022-08-01         4.0
4     1  2022-08-02         5.0
..   ..         ...         ...
866  24  2022-07-19        35.0
867  24  2022-07-20        36.0
868  24  2022-07-21        37.0
869  24  2022-07-22        38.0
870  24  2022-07-23        39.0

[871 rows x 3 columns]


Feature Extraction: 100%|██████████| 24/24 [00:04<00:00,  4.97it/s]


     id MilkingDate  milking_times
0     1  2022-07-29              3
1     1  2022-07-30              3
2     1  2022-07-31              3
3     1  2022-08-01              3
4     1  2022-08-02              3
..   ..         ...            ...
866  24  2022-07-19              3
867  24  2022-07-20              3
868  24  2022-07-21              3
869  24  2022-07-22              3
870  24  2022-07-23              4

[871 rows x 3 columns]


Feature Extraction: 100%|██████████| 24/24 [00:05<00:00,  4.63it/s]


Unnamed: 0,id,Age__sum_values,Age__length,Total_MilkProduction__length,Total_MilkProduction__sum_values,mean_Total_timeDelta_Seconds__length,mean_Total_timeDelta_Seconds__minimum,mean_Total_timeDelta_Seconds__sum_values,mean_Total_timeDelta_Seconds__maximum,mean_Total_timeDelta_Seconds__absolute_maximum,...,DaysInMilk__length,milking_times__length,milking_times__variance,milking_times__standard_deviation,milking_times__median,milking_times__sum_values,milking_times__absolute_maximum,milking_times__maximum,milking_times__mean,milking_times__root_mean_square
1,1,112.94,39.0,39.0,1496.72,39.0,12.0,46710.833333,3100.333333,3100.333333,...,39.0,39.0,0.335306,0.579056,3.0,123.0,4.0,4.0,3.153846,3.206564
2,2,119.16,38.0,38.0,1417.57,38.0,8.333333,13667.166667,1737.0,1737.0,...,38.0,38.0,0.382964,0.618841,3.0,127.0,4.0,4.0,3.342105,3.398916
3,3,114.98,39.0,39.0,960.44,39.0,56.5,13065.816667,1510.75,1510.75,...,39.0,39.0,0.287968,0.536627,4.0,141.0,5.0,5.0,3.615385,3.654993
4,4,98.67,32.0,32.0,939.96,32.0,312.0,27184.416667,1966.333333,1966.333333,...,32.0,32.0,0.359375,0.599479,3.0,92.0,4.0,4.0,2.875,2.936835
5,5,113.05,39.0,39.0,1527.63,39.0,7.25,19828.5,1478.333333,1478.333333,...,39.0,39.0,0.324786,0.5699,3.0,130.0,4.0,4.0,3.333333,3.3817
6,6,109.1,39.0,39.0,1243.44,39.0,56.5,22959.233333,2893.0,2893.0,...,39.0,39.0,0.646943,0.804328,3.0,132.0,5.0,5.0,3.384615,3.478874
7,7,125.4,38.0,38.0,1673.73,38.0,85.75,23089.066667,1628.4,1628.4,...,38.0,38.0,0.304709,0.552005,4.0,156.0,5.0,5.0,4.105263,4.142209
8,8,109.78,38.0,38.0,1868.93,38.0,18.5,18054.8,2289.666667,2289.666667,...,38.0,38.0,0.479224,0.69226,4.0,164.0,6.0,6.0,4.315789,4.370957
9,9,116.25,39.0,39.0,1169.23,39.0,64.0,18139.6,1215.25,1215.25,...,39.0,39.0,0.486522,0.697511,3.0,131.0,5.0,5.0,3.358974,3.430631
10,10,111.56,38.0,38.0,1615.86,38.0,120.333333,41759.366667,4075.0,4075.0,...,38.0,38.0,0.784626,0.885791,4.0,141.0,6.0,6.0,3.710526,3.814791


In [29]:
# Join the period 1 and period 2 feature blocks side by side (rows align on the index).
# The 'id' column of period 2 is dropped on a temporary copy so the cell can be re-run:
# the former in-place drop raised KeyError on a second execution.
ts_extracted_dataset = pd.concat(
    [ts_extracted_dataset_1, ts_extracted_dataset_2.drop(columns=['id'], errors='ignore')],
    axis=1,
)
# Both periods use the same feature names, so the joined frame carries duplicate
# column labels. Constant columns are therefore removed by position: dropping by
# label would also remove the same-named, non-constant column of the other period.
nunique = ts_extracted_dataset.nunique()
cols_to_drop = nunique[nunique == 1].index
ts_extracted_dataset = ts_extracted_dataset.loc[:, (nunique != 1).to_numpy()].copy()
ts_extracted_dataset

Unnamed: 0,id,Age__sum_values,Age__length,Total_MilkProduction__length,Total_MilkProduction__sum_values,mean_Total_timeDelta_Seconds__length,mean_Total_timeDelta_Seconds__minimum,mean_Total_timeDelta_Seconds__sum_values,mean_Total_timeDelta_Seconds__maximum,mean_Total_timeDelta_Seconds__absolute_maximum,...,DaysInMilk__length,milking_times__length,milking_times__variance,milking_times__standard_deviation,milking_times__median,milking_times__sum_values,milking_times__absolute_maximum,milking_times__maximum,milking_times__mean,milking_times__root_mean_square
1,1,78.42,39.0,39.0,973.3,39.0,28.0,59839.583333,4990.5,4990.5,...,39.0,39.0,0.335306,0.579056,3.0,123.0,4.0,4.0,3.153846,3.206564
2,2,66.99,33.0,33.0,1039.97,33.0,194.75,36403.816667,3049.0,3049.0,...,38.0,38.0,0.382964,0.618841,3.0,127.0,4.0,4.0,3.342105,3.398916
3,3,65.0,32.0,32.0,790.29,32.0,8.0,35030.5,5912.0,5912.0,...,39.0,39.0,0.287968,0.536627,4.0,141.0,5.0,5.0,3.615385,3.654993
4,4,76.84,37.0,37.0,1094.35,37.0,762.0,114767.733333,19565.5,19565.5,...,32.0,32.0,0.359375,0.599479,3.0,92.0,4.0,4.0,2.875,2.936835
5,5,75.01,38.0,38.0,1299.12,38.0,137.666667,83822.566667,5003.333333,5003.333333,...,39.0,39.0,0.324786,0.5699,3.0,130.0,4.0,4.0,3.333333,3.3817
6,6,72.62,38.0,38.0,1094.3,38.0,61.0,29150.166667,3448.75,3448.75,...,39.0,39.0,0.646943,0.804328,3.0,132.0,5.0,5.0,3.384615,3.478874
7,7,78.65,38.0,38.0,1008.32,38.0,59.333333,44155.416667,3918.0,3918.0,...,38.0,38.0,0.304709,0.552005,4.0,156.0,5.0,5.0,4.105263,4.142209
8,8,72.83,38.0,38.0,1450.71,38.0,146.0,37920.35,3175.666667,3175.666667,...,38.0,38.0,0.479224,0.69226,4.0,164.0,6.0,6.0,4.315789,4.370957
9,9,78.1,39.0,39.0,810.23,39.0,102.0,49201.416667,4386.5,4386.5,...,39.0,39.0,0.486522,0.697511,3.0,131.0,5.0,5.0,3.358974,3.430631
10,10,76.93,39.0,39.0,1108.97,39.0,65.0,71745.966667,5503.5,5503.5,...,38.0,38.0,0.784626,0.885791,4.0,141.0,6.0,6.0,3.710526,3.814791


In [30]:
# number the rows 1..n
ts_extracted_dataset.index = range(1, len(ts_extracted_dataset)+1)
# every column after the first one is treated as a feature
ts_extracted_features = ts_extracted_dataset.iloc[:, 1:].copy()
ts_extracted_cols = ts_extracted_features.columns

# normalize numerical features
# NOTE(review): the scaler is fitted on all rows - confirm this is intended if the
# data is split into train/test sets later
scaler_std = StandardScaler()
ts_std = scaler_std.fit_transform(ts_extracted_features)

# transform standard data into dataframe, keeping the 1..n row numbering
ts_extracted_processed = pd.DataFrame(
    ts_std,
    columns=ts_extracted_cols,
    index=range(1, len(ts_std)+1),
)
# append id col to the dataframe
ts_extracted_processed = pd.concat([ts_extracted_dataset['id'], ts_extracted_processed], axis=1)
ts_extracted_processed

Unnamed: 0,id,Age__sum_values,Age__length,Total_MilkProduction__length,Total_MilkProduction__sum_values,mean_Total_timeDelta_Seconds__length,mean_Total_timeDelta_Seconds__minimum,mean_Total_timeDelta_Seconds__sum_values,mean_Total_timeDelta_Seconds__maximum,mean_Total_timeDelta_Seconds__absolute_maximum,...,DaysInMilk__length,milking_times__length,milking_times__variance,milking_times__standard_deviation,milking_times__median,milking_times__sum_values,milking_times__absolute_maximum,milking_times__maximum,milking_times__mean,milking_times__root_mean_square
1,1,0.246876,0.507691,0.507691,0.121727,0.507691,-0.388758,-0.188705,-0.402794,-0.402794,...,0.461388,0.461388,-0.58828,-0.621055,-0.101015,0.11647,-0.676123,-0.676123,-0.055808,-0.101937
2,2,-0.463978,-0.352397,-0.352397,0.367822,-0.352397,-0.168235,-0.608693,-0.726688,-0.726688,...,0.291029,0.291029,-0.426198,-0.39862,-0.101015,0.234166,-0.676123,-0.676123,0.211915,0.179959
3,3,-0.58774,-0.495746,-0.495746,-0.553809,-0.495746,-0.415207,-0.633304,-0.249063,-0.249063,...,0.461388,0.461388,-0.74927,-0.858273,1.111168,0.646102,0.338062,0.338062,0.600546,0.555244
4,4,0.148613,0.220995,0.220995,0.568552,0.220995,0.581941,0.795652,2.028707,2.028707,...,-0.731122,-0.731122,-0.506422,-0.506871,-0.101015,-0.795674,-0.676123,-0.676123,-0.452355,-0.497229
5,5,0.034802,0.364343,0.364343,1.32441,0.364343,-0.243726,0.241089,-0.400653,-0.400653,...,0.461388,0.461388,-0.624055,-0.672244,-0.101015,0.322438,-0.676123,-0.676123,0.199441,0.154729
6,6,-0.113837,0.364343,0.364343,0.568368,0.364343,-0.345116,-0.738685,-0.659999,-0.659999,...,0.461388,0.461388,0.471575,0.638425,-0.101015,0.381286,0.338062,0.338062,0.272369,0.297138
7,7,0.261181,0.364343,0.364343,0.250994,0.364343,-0.34732,-0.469778,-0.581716,-0.581716,...,0.291029,0.291029,-0.692337,-0.772297,1.111168,1.087461,0.338062,0.338062,1.297202,1.269267
8,8,-0.100777,0.364343,0.364343,1.883966,0.364343,-0.232706,-0.581516,-0.705557,-0.705557,...,0.291029,0.291029,-0.098823,0.011864,1.111168,1.322853,1.352247,1.352247,1.596592,1.604501
9,9,0.226975,0.507691,0.507691,-0.480205,0.507691,-0.290895,-0.37935,-0.503557,-0.503557,...,0.461388,0.461388,-0.074004,0.041222,-0.101015,0.351862,0.338062,0.338062,0.235905,0.226438
10,10,0.15421,0.507691,0.507691,0.622519,0.507691,-0.339826,0.024667,-0.317212,-0.317212,...,0.291029,0.291029,0.939825,1.093884,1.111168,0.646102,1.352247,1.352247,0.735847,0.789431


In [31]:
# add one-hot encoded categorical features
# The encoder keeps its default sparse output and the result is densified with
# .toarray(): the `sparse=False` keyword was renamed to `sparse_output` in newer
# scikit-learn releases and later removed, so passing it fails there.
ohe = OneHotEncoder(handle_unknown='ignore')
# one BreedName value per cow (first row of every id)
cow_breed = cow_total[['id', 'BreedName']].drop_duplicates(subset=['id'])
cat = ohe.fit_transform(np.array(cow_breed['BreedName']).reshape(-1, 1)).toarray()
col_names = ohe.get_feature_names_out(['BreedName'])
cat_breed = pd.DataFrame(cat, columns=col_names)
# number the rows 1..n
cat_breed.index = range(1,len(cow_breed)+1)
cat_breed

Unnamed: 0,BreedName_1,BreedName_2,BreedName_4,BreedName_99
1,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0
5,0.0,1.0,0.0,0.0
6,1.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0
9,1.0,0.0,0.0,0.0
10,0.0,1.0,0.0,0.0


In [32]:
# final dataset: scaled time series features, one-hot breed columns and the label,
# joined column-wise on the shared row index, then written to CSV
ts_dataset = pd.concat([ts_extracted_processed, cat_breed, y], axis=1)
ts_dataset.to_csv(
    dataDir.parent/"memory_24_meanCost_minPara_65%.csv",
    index=False,
)
ts_dataset

Unnamed: 0,id,Age__sum_values,Age__length,Total_MilkProduction__length,Total_MilkProduction__sum_values,mean_Total_timeDelta_Seconds__length,mean_Total_timeDelta_Seconds__minimum,mean_Total_timeDelta_Seconds__sum_values,mean_Total_timeDelta_Seconds__maximum,mean_Total_timeDelta_Seconds__absolute_maximum,...,milking_times__sum_values,milking_times__absolute_maximum,milking_times__maximum,milking_times__mean,milking_times__root_mean_square,BreedName_1,BreedName_2,BreedName_4,BreedName_99,label
1,1,0.246876,0.507691,0.507691,0.121727,0.507691,-0.388758,-0.188705,-0.402794,-0.402794,...,0.11647,-0.676123,-0.676123,-0.055808,-0.101937,0.0,1.0,0.0,0.0,0
2,2,-0.463978,-0.352397,-0.352397,0.367822,-0.352397,-0.168235,-0.608693,-0.726688,-0.726688,...,0.234166,-0.676123,-0.676123,0.211915,0.179959,1.0,0.0,0.0,0.0,1
3,3,-0.58774,-0.495746,-0.495746,-0.553809,-0.495746,-0.415207,-0.633304,-0.249063,-0.249063,...,0.646102,0.338062,0.338062,0.600546,0.555244,1.0,0.0,0.0,0.0,1
4,4,0.148613,0.220995,0.220995,0.568552,0.220995,0.581941,0.795652,2.028707,2.028707,...,-0.795674,-0.676123,-0.676123,-0.452355,-0.497229,1.0,0.0,0.0,0.0,1
5,5,0.034802,0.364343,0.364343,1.32441,0.364343,-0.243726,0.241089,-0.400653,-0.400653,...,0.322438,-0.676123,-0.676123,0.199441,0.154729,0.0,1.0,0.0,0.0,1
6,6,-0.113837,0.364343,0.364343,0.568368,0.364343,-0.345116,-0.738685,-0.659999,-0.659999,...,0.381286,0.338062,0.338062,0.272369,0.297138,1.0,0.0,0.0,0.0,0
7,7,0.261181,0.364343,0.364343,0.250994,0.364343,-0.34732,-0.469778,-0.581716,-0.581716,...,1.087461,0.338062,0.338062,1.297202,1.269267,1.0,0.0,0.0,0.0,0
8,8,-0.100777,0.364343,0.364343,1.883966,0.364343,-0.232706,-0.581516,-0.705557,-0.705557,...,1.322853,1.352247,1.352247,1.596592,1.604501,0.0,1.0,0.0,0.0,0
9,9,0.226975,0.507691,0.507691,-0.480205,0.507691,-0.290895,-0.37935,-0.503557,-0.503557,...,0.351862,0.338062,0.338062,0.235905,0.226438,1.0,0.0,0.0,0.0,0
10,10,0.15421,0.507691,0.507691,0.622519,0.507691,-0.339826,0.024667,-0.317212,-0.317212,...,0.646102,1.352247,1.352247,0.735847,0.789431,0.0,1.0,0.0,0.0,0


# List the distinct BreedName values present in the merged dataset.
cow_total.BreedName.unique()

In [None]:
# NOTE: disabled cell - the code below sits inside a bare string literal, so it is
# never executed. It would extract EfficientFCParameters features from the
# lactation period 1 data and filter them against y with select_features.
'''
from tsfresh.feature_extraction import extract_features, EfficientFCParameters

ts_processed = pd.DataFrame(cow_total_1[timeSeries_cols].copy())
ts_processed.index = range(0,len(ts_processed)) 
ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
ts_processed

extracted_dataset = extract_features(ts_processed, column_id="id", column_sort="MilkingDate", default_fc_parameters=EfficientFCParameters())
impute(extracted_dataset)
#extracted_dataset.index = range(1,len(extracted_dataset)+1)
features_filtered = select_features(extracted_dataset, y)
ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)
ts_extracted_dataset
'''