In [41]:
import os
from pathlib import Path
from matplotlib.style import use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters


dataDir = Path.cwd().parent.parent.parent/'Data/processed/memory_targetCows/'
usecols = ['id', 'MilkingEventDateTime', 'FarmName_Pseudo', 'TrafficDeviceName', 'MilkProduction', 'timeDelta_Seconds', 'LactationNumber', 'DaysInMilk', 'BreedName', 'Age', 'label']

In [42]:
# periods = os.listdir(dataDir)
# periods
# for p in periods:
#     print(p)

# integrate all the cows data into one dataset
filelist = list(Path(dataDir).glob('*.csv'))
for i, _ in enumerate(filelist):
    fileName = 'cow_' + str(i) + '.csv'
    single_cow = pd.read_csv(dataDir/fileName, encoding='utf-8', usecols=usecols)
    single_cow.sort_values(by=['MilkingEventDateTime'], inplace=True)
    print(single_cow.timeDelta_Seconds.mean())
    if i == 0:
        cow_total = single_cow
    else:
        cow_total = pd.concat([cow_total, single_cow], axis=0, ignore_index=True)
# fileName = 'cow_total_' + str(p) + '.csv'
# Path(dataDir/'cow_total/').mkdir(parents=True, exist_ok=True)
# cow_total.to_csv(dataDir/'cow_total'/fileName, index=False)


742.255995203837
655.4297376093294
287.84170112226815
534.8969866853539
630.3773324118866
512.1685534591195
597.5614275909403
929.9478402607987
534.5014731879788
480.96941323345817
1116.654131618011
727.4478092783505
788.1771177117712


In [43]:
cow_total

Unnamed: 0,FarmName_Pseudo,MilkingEventDateTime,TrafficDeviceName,MilkProduction,timeDelta_Seconds,LactationNumber,DaysInMilk,BreedName,Age,label,id
0,a624fb9a,2020-10-21 19:06:33,vms2,3.38,210.0,1.0,1.0,1,1.91,1,1
1,a624fb9a,2020-10-22 05:52:24,vms2,6.53,1686.0,1.0,2.0,1,1.91,1,1
2,a624fb9a,2020-10-22 15:58:42,vms1,5.00,1304.0,1.0,2.0,1,1.91,1,1
3,a624fb9a,2020-10-23 02:06:03,vms2,7.16,1339.0,1.0,3.0,1,1.92,1,1
4,a624fb9a,2020-10-23 13:28:20,vms2,8.13,1494.0,1.0,3.0,1,1.92,1,1
...,...,...,...,...,...,...,...,...,...,...,...
20909,a624fb9a,2022-08-22 00:40:37,vms2,7.83,276.0,2.0,339.0,2,3.92,1,13
20910,a624fb9a,2022-08-22 13:31:20,vms1,13.30,4594.0,2.0,339.0,2,3.92,1,13
20911,a624fb9a,2022-08-22 22:08:42,vms2,8.19,538.0,2.0,339.0,2,3.92,1,13
20912,a624fb9a,2022-08-23 07:34:52,vms1,9.51,107.0,2.0,340.0,2,3.92,1,13


In [44]:
# df_static_features = cow_total[['FarmName_Pseudo']]
static_cols = ['Age', 'FarmName_Pseudo', 'TrafficDeviceName', 'LactationNumber', 'BreedName', 'DaysInMilk']
timeSeries_cols = ['MilkProduction', 'timeDelta_Seconds']
output_col = ['label']

In [45]:
cow_label = cow_total[['id', 'label']].copy()
# fetch y for feature extraction
y = cow_label.drop_duplicates(subset=['id'])
y = y["label"]
y.index = range(1,len(y)+1) 

In [46]:
ts_extracted_dataset = cow_total[['id']].copy()
ts_extracted_dataset.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_dataset.index = range(1, len(ts_extracted_dataset)+1)

cow_timeseries = cow_total[['id', 'MilkingEventDateTime']].copy()
cow_timeseries.index = range(len(cow_timeseries))

In [47]:
ts_processed = pd.DataFrame(cow_total[timeSeries_cols].copy())
ts_processed.index = range(0,len(ts_processed)) 
ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
print(ts_processed)
# extract time series features
extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingEventDateTime")
#extracted_features.dropna(axis=1, inplace=True)
impute(extracted_features)
features_filtered = select_features(extracted_features, y)
ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)

       id MilkingEventDateTime  MilkProduction  timeDelta_Seconds
0       1  2020-10-21 19:06:33            3.38              210.0
1       1  2020-10-22 05:52:24            6.53             1686.0
2       1  2020-10-22 15:58:42            5.00             1304.0
3       1  2020-10-23 02:06:03            7.16             1339.0
4       1  2020-10-23 13:28:20            8.13             1494.0
...    ..                  ...             ...                ...
20909  13  2022-08-22 00:40:37            7.83              276.0
20910  13  2022-08-22 13:31:20           13.30             4594.0
20911  13  2022-08-22 22:08:42            8.19              538.0
20912  13  2022-08-23 07:34:52            9.51              107.0
20913  13  2022-08-23 16:54:47            8.41              632.0

[20914 rows x 4 columns]


Feature Extraction: 100%|██████████| 26/26 [00:17<00:00,  1.52it/s]
 'timeDelta_Seconds__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


In [49]:
cow_timeseries = cow_total[['id', 'MilkingEventDateTime']].copy()
# cow_timeseries.sort_values(by="MilkingEventDateTime", inplace=True)
cow_timeseries.index = range(len(cow_timeseries))

for col in timeSeries_cols:
    ts_processed = pd.DataFrame(cow_total[col].copy())
    ts_processed.index = range(0,len(ts_processed)) 
    ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
    print(ts_processed)
    # extract time series features
    extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingEventDateTime")
    #extracted_features.dropna(axis=1, inplace=True)
    impute(extracted_features)
    features_filtered = select_features(extracted_features, y)
    ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)


       id MilkingEventDateTime  MilkProduction
0       1  2020-10-21 19:06:33            3.38
1       1  2020-10-22 05:52:24            6.53
2       1  2020-10-22 15:58:42            5.00
3       1  2020-10-23 02:06:03            7.16
4       1  2020-10-23 13:28:20            8.13
...    ..                  ...             ...
20909  13  2022-08-22 00:40:37            7.83
20910  13  2022-08-22 13:31:20           13.30
20911  13  2022-08-22 22:08:42            8.19
20912  13  2022-08-23 07:34:52            9.51
20913  13  2022-08-23 16:54:47            8.41

[20914 rows x 3 columns]


Feature Extraction: 100%|██████████| 13/13 [00:12<00:00,  1.08it/s]


       id MilkingEventDateTime  timeDelta_Seconds
0       1  2020-10-21 19:06:33              210.0
1       1  2020-10-22 05:52:24             1686.0
2       1  2020-10-22 15:58:42             1304.0
3       1  2020-10-23 02:06:03             1339.0
4       1  2020-10-23 13:28:20             1494.0
...    ..                  ...                ...
20909  13  2022-08-22 00:40:37              276.0
20910  13  2022-08-22 13:31:20             4594.0
20911  13  2022-08-22 22:08:42              538.0
20912  13  2022-08-23 07:34:52              107.0
20913  13  2022-08-23 16:54:47              632.0

[20914 rows x 3 columns]


Feature Extraction: 100%|██████████| 13/13 [00:12<00:00,  1.05it/s]


In [51]:
ts_processed.to_csv(dataDir/'ts_processed.csv', index=False)

In [52]:
ts_extracted_dataset

Unnamed: 0,id
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9
10,10


In [None]:
# @deprecated
# periods = [1, 2]

# for i in periods:
#     cow_total_i = cow_total.loc[cow_total.LactationNumber == i].copy()
#     cow_total_i.sort_values(by="MilkingEventDateTime", inplace=True)
#     cow_timeseries = cow_total[['id', 'MilkingEventDateTime']].loc[cow_total.LactationNumber == i].copy()
#     cow_timeseries.sort_values(by="MilkingEventDateTime", inplace=True)
#     cow_timeseries.index = range(len(cow_timeseries))
#     for col in timeSeries_cols:
#         ts_processed = pd.DataFrame(cow_total[col].loc[cow_total.LactationNumber == i]).copy()
#         ts_processed.index = range(0,len(ts_processed)) 
#         ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
#         #print(ts_processed)
#         # extract time series features
#         extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingEventDateTime")
#         print(extracted_features)
#         # extracted_features.dropna(axis=1, inplace=True)
#         impute(extracted_features)
#         features_filtered = select_features(extracted_features, y)
#         ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)


In [32]:
features_filtered

1
2
3
4
5
6
7
8
9
10
11


In [25]:
ts_extracted_features = ts_extracted_dataset.iloc[:, 1:len(ts_extracted_dataset.columns)].copy()
# normalize numerical features
ts_extracted_cols = ts_extracted_features.columns
scaler_std = StandardScaler()
ts_std = scaler_std.fit_transform(ts_extracted_features)
# transform standard data into dataframe
ts_extracted_processed = pd.DataFrame(ts_std, columns=ts_extracted_cols)
ts_extracted_processed.index = range(1,len(ts_extracted_processed)+1)
# append id col to the dataframe
ts_extracted_processed = pd.concat([ts_extracted_dataset['id'], ts_extracted_processed], axis=1)

ValueError: at least one array or dtype is required

In [None]:
ts_extracted_processed

cow_total.BreedName.unique()

In [3]:
cow_total

Unnamed: 0,FarmName_Pseudo,MilkingEventDateTime,TrafficDeviceName,MilkProduction,timeDelta_Seconds,LactationNumber,DaysInMilk,BreedName,Age,label,id
0,a624fb9a,2020-10-10 03:46:46,vms1,7.49,748.0,1.0,2.0,1,1.99,1,1
1,a624fb9a,2020-10-10 16:13:02,vms1,10.21,233.0,1.0,2.0,1,1.99,1,1
2,a624fb9a,2020-10-11 05:09:29,vms2,10.34,1372.0,1.0,3.0,1,1.99,1,1
3,a624fb9a,2020-10-11 19:26:21,vms1,6.80,484.0,1.0,3.0,1,1.99,1,1
4,a624fb9a,2020-10-12 02:49:54,vms1,7.12,4537.0,1.0,4.0,1,1.99,1,1
...,...,...,...,...,...,...,...,...,...,...,...
1813,a624fb9a,2022-08-22 00:40:37,vms2,7.83,276.0,2.0,339.0,2,3.92,1,22
1814,a624fb9a,2022-08-22 13:31:20,vms1,13.30,4594.0,2.0,339.0,2,3.92,1,22
1815,a624fb9a,2022-08-22 22:08:42,vms2,8.19,538.0,2.0,339.0,2,3.92,1,22
1816,a624fb9a,2022-08-23 07:34:52,vms1,9.51,107.0,2.0,340.0,2,3.92,1,22
