In [1]:
import os
from pathlib import Path
from matplotlib.style import use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters


# Root folder of the processed cow data, resolved three levels above the
# notebook's current working directory.
dataDir = Path.cwd().parent.parent.parent / 'Data/processed/memory_targetCows/'

# Columns requested from every CSV that is read with pd.read_csv in this notebook.
usecols = [
    'id',
    'MilkingEventDateTime',
    'FarmName_Pseudo',
    'TrafficDeviceName',
    'MilkProduction',
    'timeDelta_Seconds',
    'LactationNumber',
    'DaysInMilk',
    'BreedName',
    'Age',
    'label',
]

In [3]:
# Period folders are the sub-directories of dataDir. The 'cow_total' output
# folder written below lives in the same place, so it is excluded explicitly;
# otherwise re-running this cell would treat its own results as another period.
output_dir = dataDir/'cow_total'
periods = sorted(
    p for p in os.listdir(dataDir)
    if (dataDir/p).is_dir() and p != output_dir.name
)
for p in periods:
    print(p)

output_dir.mkdir(parents=True, exist_ok=True)

# integrate all the cows data into one dataset (one output file per period)
for p in periods:
    filelist = list(Path(dataDir/str(p)).glob('*.csv'))
    cow_frames = []
    # The file names are rebuilt as cow_0.csv ... cow_<n-1>.csv rather than taken
    # from glob(), so the cows are read in numeric order.
    for i in range(len(filelist)):
        fileName = 'cow_' + str(i) + '.csv'
        single_cow = pd.read_csv(dataDir/str(p)/fileName, encoding='utf-8', usecols=usecols)
        single_cow = single_cow.sort_values(by=['MilkingEventDateTime'])
        print(single_cow.timeDelta_Seconds.mean())
        cow_frames.append(single_cow)
    if not cow_frames:
        # Nothing to merge: skip instead of writing a stale or undefined cow_total.
        print('no csv files found for period ' + str(p))
        continue
    # Concatenate once per period instead of growing the frame inside the loop.
    cow_total = pd.concat(cow_frames, axis=0, ignore_index=True)
    fileName = 'cow_total_' + str(p) + '.csv'
    cow_total.to_csv(output_dir/fileName, index=False)


1
2
829.011917659805
322.4289372599232
800.7993019197207
1823.434262948207
1476.7307692307693
761.6274509803922
653.2897657213317
1463.2184466019417
933.7167325428195
601.0283353010626
832.6587030716723
1182.4926253687315
1039.6613924050632
1217.2872570194384
765.9821428571429
1224.2215189873418
847.2289281997919
918.6905660377358
669.6450331125828
1244.0983810709838
603.2755102040817
947.7948717948718
1022.0250896057348
2261.590778097983
2952.826666666667
629.7751277683135
1077.136638452237
866.7722543352601
461.0262812089356
690.5282555282555
924.180790960452
1065.5211864406779
1160.4178712220762
911.9521410579345
453.1363636363636
258.22258771929825
465.82466063348414
1744.8197424892703
850.4590792838875
475.5327868852459
304.2920353982301
756.4510595358224
530.0226986128625
500.0518292682927
465.3905109489051
1888.476595744681
1308.1762711864408
755.8141361256545
360.5923632610939
1988.9856321839081
505.7655986509275
654.5029736618521
383.5059523809524
867.6142595978063
365.8
807.2

In [4]:
periods = os.listdir(dataDir)
for p in periods:
    print(p)

filelist1 = list(Path(dataDir/'1').glob('*.csv'))
filelist2 = list(Path(dataDir/'2').glob('*.csv'))

# Every cow needs a file in both period folders. Fail early with a clear
# message instead of depending on a file list left over from another cell.
if not filelist1:
    raise FileNotFoundError('no cow csv files found in ' + str(dataDir/'1'))
if len(filelist1) != len(filelist2):
    raise ValueError(
        'period folders contain a different number of csv files: '
        + str(len(filelist1)) + ' vs ' + str(len(filelist2))
    )

# integrate all the cows data into one dataset
cow_frames = []
for i in range(len(filelist1)):
    # The file names are rebuilt as cow_<i>.csv so that the same cow is read
    # from both period folders.
    fileName = 'cow_' + str(i) + '.csv'
    single_cow_1 = pd.read_csv(dataDir/'1'/fileName, encoding='utf-8', usecols=usecols)
    single_cow_2 = pd.read_csv(dataDir/'2'/fileName, encoding='utf-8', usecols=usecols)
    single_cow_1 = single_cow_1.sort_values(by=['MilkingEventDateTime'])
    single_cow_2 = single_cow_2.sort_values(by=['MilkingEventDateTime'])
    # Period 1 rows first, then period 2 rows, for each cow.
    single_cow = pd.concat([single_cow_1, single_cow_2], axis=0, ignore_index=True)
    cow_frames.append(single_cow)

# Concatenate once instead of growing the frame inside the loop.
cow_total = pd.concat(cow_frames, axis=0, ignore_index=True)
fileName = 'cow_total' + '.csv'
Path(dataDir/'cow_total/').mkdir(parents=True, exist_ok=True)
cow_total.to_csv(dataDir/'cow_total'/fileName, index=False)

1
2
cow_total


In [5]:
# List what is currently stored in the aggregate folder.
total_files = os.listdir(dataDir/"cow_total")
# Not the last expression of the cell, so this line displays nothing.
total_files

# Load the two per-period aggregate files with the shared column selection.
cow_total_1 = pd.read_csv(
    dataDir/"cow_total/cow_total_1.csv",
    encoding='utf-8',
    usecols=usecols,
)
cow_total_2 = pd.read_csv(
    dataDir/"cow_total/cow_total_2.csv",
    encoding='utf-8',
    usecols=usecols,
)

In [6]:
# Column groups used when assembling the modelling dataset.
# NOTE(review): 'Age' is listed both as a static and as a time-series column.
static_cols = [
    'Age',
    'FarmName_Pseudo',
    'TrafficDeviceName',
    'BreedName',
]
timeSeries_cols = [
    'Age',
    'MilkProduction',
    'timeDelta_Seconds',
    'DaysInMilk',
]
output_col = ['label']

In [11]:
# One label per cow: keep the first row of every id and take its 'label' value.
cow_label = cow_total.loc[:, ['id', 'label']].copy()
y = cow_label.drop_duplicates(subset=['id'])["label"]
# Relabel the rows 1..n in order of first appearance.
# NOTE(review): this only lines up with the cow ids if the ids themselves are
# 1..n in that order - TODO confirm.
y.index = range(1, len(y) + 1)
y

1     1
2     0
3     0
4     0
5     0
6     0
7     1
8     1
9     0
10    0
11    0
12    0
13    0
14    0
15    1
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    1
26    1
27    1
28    1
29    0
30    0
31    1
32    0
33    1
34    0
Name: label, dtype: int64

In [59]:
# Hard-coded label vector for 20 cows, indexed 1..20.
# NOTE(review): this rebinds `y`. When the notebook is run top to bottom it
# replaces the labels derived from cow_total in the previous cell - confirm
# that this is intended before re-running the feature selection.
y = pd.Series(
    [1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1],
    index=range(1, 21),
)
y

1     1
2     1
3     0
4     1
5     1
6     1
7     0
8     1
9     0
10    0
11    0
12    1
13    0
14    1
15    1
16    1
17    1
18    0
19    1
20    1
dtype: int64

In [12]:
# One row per cow id, labelled 1..n; extracted features are appended to it later.
ts_extracted_dataset = cow_total[['id']].drop_duplicates(subset=['id'])
ts_extracted_dataset.index = range(1, len(ts_extracted_dataset) + 1)

# id + timestamp for every row of cow_total, re-indexed 0..m-1.
cow_timeseries = cow_total[['id', 'MilkingEventDateTime']].reset_index(drop=True)

In [13]:
# Display the id / MilkingEventDateTime frame.
cow_timeseries

Unnamed: 0,id,MilkingEventDateTime
0,1,2020-10-10 03:46:46
1,1,2020-10-10 16:13:02
2,1,2020-10-11 05:09:29
3,1,2020-10-11 19:26:21
4,1,2020-10-12 02:49:54
...,...,...
41200,34,2021-10-31 20:33:26
41201,34,2021-11-01 08:44:03
41202,34,2021-11-01 19:17:07
41203,34,2021-11-02 09:04:59


In [26]:
# Display the list of columns that are treated as time series.
timeSeries_cols

['Age', 'MilkProduction', 'timeDelta_Seconds', 'DaysInMilk']

In [14]:
# Extract tsfresh features for one measurement column at a time and keep only
# the features that select_features judges relevant for the labels in `y`.
selected_feature_blocks = []
for col in timeSeries_cols:
    # Input for tsfresh: cow id, sort key and a single value column. The values
    # are attached by position; cow_timeseries has one row per row of cow_total.
    ts_processed = cow_timeseries.copy()
    ts_processed[col] = cow_total[col].values
    print(ts_processed)
    # extract time series features
    extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingEventDateTime")
    # impute() replaces NaN/inf values in place so that select_features accepts the frame.
    impute(extracted_features)
    # NOTE(review): the index of `y` has to match the cow ids in
    # extracted_features.index - confirm that `y` was built from cow_total.
    features_filtered = select_features(extracted_features, y)
    selected_feature_blocks.append(features_filtered)

# Rebuild the result from the 'id' column only, so that re-running this cell
# does not append the same feature columns a second time.
ts_extracted_dataset = pd.concat(
    [ts_extracted_dataset[['id']]] + selected_feature_blocks,
    axis=1,
)

       id MilkingEventDateTime   Age
0       1  2020-10-10 03:46:46  1.99
1       1  2020-10-10 16:13:02  1.99
2       1  2020-10-11 05:09:29  1.99
3       1  2020-10-11 19:26:21  1.99
4       1  2020-10-12 02:49:54  1.99
...    ..                  ...   ...
41200  34  2021-10-31 20:33:26  3.15
41201  34  2021-11-01 08:44:03  3.16
41202  34  2021-11-01 19:17:07  3.16
41203  34  2021-11-02 09:04:59  3.16
41204  34  2021-11-02 19:42:59  3.16

[41205 rows x 3 columns]


Feature Extraction: 100%|██████████| 34/34 [00:14<00:00,  2.38it/s]


       id MilkingEventDateTime  MilkProduction
0       1  2020-10-10 03:46:46            7.49
1       1  2020-10-10 16:13:02           10.21
2       1  2020-10-11 05:09:29           10.34
3       1  2020-10-11 19:26:21            6.80
4       1  2020-10-12 02:49:54            7.12
...    ..                  ...             ...
41200  34  2021-10-31 20:33:26           11.67
41201  34  2021-11-01 08:44:03           10.82
41202  34  2021-11-01 19:17:07           12.90
41203  34  2021-11-02 09:04:59           14.60
41204  34  2021-11-02 19:42:59           11.04

[41205 rows x 3 columns]


Feature Extraction: 100%|██████████| 34/34 [00:15<00:00,  2.15it/s]


       id MilkingEventDateTime  timeDelta_Seconds
0       1  2020-10-10 03:46:46              748.0
1       1  2020-10-10 16:13:02              233.0
2       1  2020-10-11 05:09:29             1372.0
3       1  2020-10-11 19:26:21              484.0
4       1  2020-10-12 02:49:54             4537.0
...    ..                  ...                ...
41200  34  2021-10-31 20:33:26              142.0
41201  34  2021-11-01 08:44:03              293.0
41202  34  2021-11-01 19:17:07               12.0
41203  34  2021-11-02 09:04:59              201.0
41204  34  2021-11-02 19:42:59              308.0

[41205 rows x 3 columns]


Feature Extraction: 100%|██████████| 34/34 [00:15<00:00,  2.18it/s]


       id MilkingEventDateTime  DaysInMilk
0       1  2020-10-10 03:46:46         2.0
1       1  2020-10-10 16:13:02         2.0
2       1  2020-10-11 05:09:29         3.0
3       1  2020-10-11 19:26:21         3.0
4       1  2020-10-12 02:49:54         4.0
...    ..                  ...         ...
41200  34  2021-10-31 20:33:26        90.0
41201  34  2021-11-01 08:44:03        91.0
41202  34  2021-11-01 19:17:07        91.0
41203  34  2021-11-02 09:04:59        92.0
41204  34  2021-11-02 19:42:59        92.0

[41205 rows x 3 columns]


Feature Extraction: 100%|██████████| 34/34 [00:15<00:00,  2.17it/s]


In [20]:
# Display the cow ids together with the selected tsfresh features.
ts_extracted_dataset

Unnamed: 0,id,"timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""mean""","timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""max""",timeDelta_Seconds__index_mass_quantile__q_0.7,timeDelta_Seconds__index_mass_quantile__q_0.6,"timeDelta_Seconds__linear_trend__attr_""rvalue""","timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_10__f_agg_""max""","timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_10__f_agg_""mean""","timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""var""","timeDelta_Seconds__linear_trend__attr_""pvalue"""
1,1,-0.382086,-0.363548,0.577843,0.468805,-0.178699,-0.41659,-0.484212,-0.263355,8.981192e-14
2,2,-0.156056,-0.130979,0.659776,0.544595,-0.080499,-0.173264,-0.195498,-0.075003,0.0009159099
3,3,-0.246199,-0.193205,0.66232,0.499657,-0.148063,-0.220317,-0.281835,-0.083903,1.361659e-08
4,4,-0.215251,-0.212903,0.661157,0.572314,-0.104026,-0.239556,-0.301066,-0.185719,0.02208761
5,5,-0.273132,-0.253372,0.659596,0.545455,-0.13021,-0.317126,-0.358059,-0.245931,3.968778e-05
6,6,-0.264808,-0.166707,0.619477,0.48963,-0.124907,-0.158504,-0.344795,-0.064998,3.029218e-05
7,7,-0.429203,-0.392117,0.517478,0.426966,-0.251508,-0.46994,-0.478889,-0.29994,1.555982e-24
8,8,-0.343427,-0.354665,0.583869,0.463137,-0.174787,-0.454624,-0.408606,-0.32437,2.496625e-15
9,9,-0.350201,-0.340785,0.585052,0.472294,-0.179189,-0.371803,-0.397392,-0.24499,1.15061e-12
10,10,-0.299511,-0.264415,0.649362,0.473191,-0.140944,-0.33204,-0.37005,-0.249887,1.232071e-06


In [51]:
# ts_processed is reassigned on every pass of the feature-extraction loop, so
# this writes only the frame for the last column in timeSeries_cols.
ts_processed.to_csv(dataDir/'ts_processed.csv', index=False)

In [22]:
# Every column after the first one is passed to the scaler.
ts_extracted_features = ts_extracted_dataset.iloc[:, 1:].copy()
ts_extracted_cols = ts_extracted_features.columns

# normalize numerical features
scaler_std = StandardScaler()
ts_std = scaler_std.fit_transform(ts_extracted_features)

# Put the scaled array back into a DataFrame, labelled 1..n.
ts_extracted_processed = pd.DataFrame(
    ts_std,
    columns=ts_extracted_cols,
    index=range(1, len(ts_std) + 1),
)
# Put the id column back in front of the scaled features.
ts_extracted_processed = pd.concat(
    [ts_extracted_dataset['id'], ts_extracted_processed],
    axis=1,
)

In [23]:
# Display the standardised feature table.
ts_extracted_processed

Unnamed: 0,id,"timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""mean""","timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""max""",timeDelta_Seconds__index_mass_quantile__q_0.7,timeDelta_Seconds__index_mass_quantile__q_0.6,"timeDelta_Seconds__linear_trend__attr_""rvalue""","timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_10__f_agg_""max""","timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_10__f_agg_""mean""","timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""var""","timeDelta_Seconds__linear_trend__attr_""pvalue"""
1,1,-0.619074,-0.684764,-0.575428,-0.575104,-0.452449,-0.658722,-0.675563,-0.596749,-0.295091
2,2,0.35193,0.401458,0.584897,0.400736,0.323225,0.323284,0.356143,0.494041,-0.286178
3,3,-0.035318,0.110828,0.620929,-0.177868,-0.210457,0.133389,0.047622,0.442498,-0.29509
4,4,0.097633,0.018827,0.604462,0.757626,0.137392,0.055746,-0.021101,-0.147144,-0.080166
5,5,-0.151017,-0.170183,0.582354,0.411798,-0.069436,-0.257311,-0.224762,-0.495844,-0.294704
6,6,-0.11526,0.234589,0.014194,-0.306964,-0.027547,0.382853,-0.177365,0.55198,-0.294796
7,7,-0.821487,-0.818197,-1.430301,-1.113792,-1.027558,-0.874033,-0.656543,-0.80862,-0.295091
8,8,-0.452999,-0.643274,-0.490077,-0.648077,-0.42155,-0.81222,-0.40539,-0.950098,-0.295091
9,9,-0.482099,-0.578447,-0.473335,-0.530179,-0.456318,-0.477975,-0.365316,-0.490395,-0.295091
10,10,-0.26434,-0.221762,0.437418,-0.518621,-0.154224,-0.3175,-0.26761,-0.518754,-0.295079


In [24]:
# add one-hot encoded categorical features
# The `sparse` keyword was replaced by `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the current keyword first and fall back for older
# installations, so the encoder returns a dense array either way.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# One breed value per cow: the first row of each id.
cow_breed = cow_total[['id', 'BreedName']].drop_duplicates(subset=['id'])
# The encoder expects a 2-D input, hence the reshape to a single column.
cat = ohe.fit_transform(np.array(cow_breed['BreedName']).reshape(-1, 1))
col_names = ohe.get_feature_names_out(['BreedName'])
cat_breed = pd.DataFrame(cat, columns=col_names)
# Label the rows 1..n.
cat_breed.index = range(1, len(cow_breed) + 1)
cat_breed

Unnamed: 0,BreedName_1,BreedName_2,BreedName_4,BreedName_99
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0
5,0.0,1.0,0.0,0.0
6,1.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0
9,0.0,1.0,0.0,0.0
10,1.0,0.0,0.0,0.0


In [25]:
# Join the scaled features, the one-hot breed columns and the labels
# column-wise (aligned on the row index), then export the result.
ts_dataset = pd.concat(
    [
        pd.concat([ts_extracted_processed, cat_breed], axis=1),
        y,
    ],
    axis=1,
)
ts_dataset.to_csv(
    dataDir.parent/"memory_dataset_34_with_cat.csv",
    index=False,
)
ts_dataset

Unnamed: 0,id,"timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""mean""","timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""max""",timeDelta_Seconds__index_mass_quantile__q_0.7,timeDelta_Seconds__index_mass_quantile__q_0.6,"timeDelta_Seconds__linear_trend__attr_""rvalue""","timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_10__f_agg_""max""","timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_10__f_agg_""mean""","timeDelta_Seconds__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""var""","timeDelta_Seconds__linear_trend__attr_""pvalue""",BreedName_1,BreedName_2,BreedName_4,BreedName_99,label
1,1,-0.619074,-0.684764,-0.575428,-0.575104,-0.452449,-0.658722,-0.675563,-0.596749,-0.295091,1.0,0.0,0.0,0.0,1
2,2,0.35193,0.401458,0.584897,0.400736,0.323225,0.323284,0.356143,0.494041,-0.286178,1.0,0.0,0.0,0.0,0
3,3,-0.035318,0.110828,0.620929,-0.177868,-0.210457,0.133389,0.047622,0.442498,-0.29509,0.0,1.0,0.0,0.0,0
4,4,0.097633,0.018827,0.604462,0.757626,0.137392,0.055746,-0.021101,-0.147144,-0.080166,0.0,1.0,0.0,0.0,0
5,5,-0.151017,-0.170183,0.582354,0.411798,-0.069436,-0.257311,-0.224762,-0.495844,-0.294704,0.0,1.0,0.0,0.0,0
6,6,-0.11526,0.234589,0.014194,-0.306964,-0.027547,0.382853,-0.177365,0.55198,-0.294796,1.0,0.0,0.0,0.0,0
7,7,-0.821487,-0.818197,-1.430301,-1.113792,-1.027558,-0.874033,-0.656543,-0.80862,-0.295091,1.0,0.0,0.0,0.0,1
8,8,-0.452999,-0.643274,-0.490077,-0.648077,-0.42155,-0.81222,-0.40539,-0.950098,-0.295091,0.0,1.0,0.0,0.0,1
9,9,-0.482099,-0.578447,-0.473335,-0.530179,-0.456318,-0.477975,-0.365316,-0.490395,-0.295091,0.0,1.0,0.0,0.0,0
10,10,-0.26434,-0.221762,0.437418,-0.518621,-0.154224,-0.3175,-0.26761,-0.518754,-0.295079,1.0,0.0,0.0,0.0,0


# List the distinct BreedName values present in cow_total.
cow_total.BreedName.unique()

In [3]:
# Display cow_total.
# NOTE(review): the recorded output under this cell has far fewer rows and ids
# than the earlier displays derived from cow_total, and the cell's execution
# counter is out of order - presumably a leftover from another run; re-run to confirm.
cow_total

Unnamed: 0,FarmName_Pseudo,MilkingEventDateTime,TrafficDeviceName,MilkProduction,timeDelta_Seconds,LactationNumber,DaysInMilk,BreedName,Age,label,id
0,a624fb9a,2020-10-10 03:46:46,vms1,7.49,748.0,1.0,2.0,1,1.99,1,1
1,a624fb9a,2020-10-10 16:13:02,vms1,10.21,233.0,1.0,2.0,1,1.99,1,1
2,a624fb9a,2020-10-11 05:09:29,vms2,10.34,1372.0,1.0,3.0,1,1.99,1,1
3,a624fb9a,2020-10-11 19:26:21,vms1,6.80,484.0,1.0,3.0,1,1.99,1,1
4,a624fb9a,2020-10-12 02:49:54,vms1,7.12,4537.0,1.0,4.0,1,1.99,1,1
...,...,...,...,...,...,...,...,...,...,...,...
1813,a624fb9a,2022-08-22 00:40:37,vms2,7.83,276.0,2.0,339.0,2,3.92,1,22
1814,a624fb9a,2022-08-22 13:31:20,vms1,13.30,4594.0,2.0,339.0,2,3.92,1,22
1815,a624fb9a,2022-08-22 22:08:42,vms2,8.19,538.0,2.0,339.0,2,3.92,1,22
1816,a624fb9a,2022-08-23 07:34:52,vms1,9.51,107.0,2.0,340.0,2,3.92,1,22
