In [40]:
import os
from pathlib import Path
from matplotlib.style import use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters


dataDir = Path.cwd().parent.parent.parent/'Data/processed/memory_targetCows/'
usecols = ['id', 'MilkingEventDateTime', 'FarmName_Pseudo', 'TrafficDeviceName', 'MilkProduction', 'timeDelta_Seconds', 'LactationNumber', 'DaysInMilk', 'BreedName', 'Age', 'label']

In [41]:
periods = os.listdir(dataDir)
periods
for p in periods:
    print(p)

1
2


In [42]:
# integrate all the cows data into one dataset
periods = os.listdir(dataDir)
for p in periods:
    filelist = list(Path(dataDir/str(p)).glob('*.csv'))
    for i, _ in enumerate(filelist):
        fileName = 'cow_' + str(i) + '.csv'
        single_cow = pd.read_csv(dataDir/str(p)/fileName, encoding='utf-8', usecols=usecols)
        single_cow.sort_values(by=['MilkingEventDateTime'], inplace=True)
        if i == 0:
            cow_total = single_cow
        cow_total = pd.concat([cow_total, single_cow])
    fileName = 'cow_total_' + str(p) + '.csv'
    print(fileName)
    Path(dataDir/'cow_total/').mkdir(parents=True, exist_ok=True)
    
    cow_total.to_csv(dataDir/'cow_total'/fileName, index=False)

cow_total_1.csv
cow_total_2.csv


In [43]:
cow_total_1 = pd.read_csv(dataDir/'cow_total/cow_total_1.csv', encoding='utf-8', index_col=False)
cow_total_2 = pd.read_csv(dataDir/'cow_total/cow_total_2.csv', encoding='utf-8', index_col=False)

In [44]:
cow_total_1

Unnamed: 0,FarmName_Pseudo,MilkingEventDateTime,TrafficDeviceName,MilkProduction,timeDelta_Seconds,LactationNumber,DaysInMilk,BreedName,Age,label,id
0,a624fb9a,2020-10-10 03:46:46,vms1,7.49,748.0,1.0,2.0,1,1.99,1,1
1,a624fb9a,2020-10-10 16:13:02,vms1,10.21,233.0,1.0,2.0,1,1.99,1,1
2,a624fb9a,2020-10-11 05:09:29,vms2,10.34,1372.0,1.0,3.0,1,1.99,1,1
3,a624fb9a,2020-10-11 19:26:21,vms1,6.80,484.0,1.0,3.0,1,1.99,1,1
4,a624fb9a,2020-10-12 02:49:54,vms1,7.12,4537.0,1.0,4.0,1,1.99,1,1
...,...,...,...,...,...,...,...,...,...,...,...
16744,a624fb9a,2021-07-17 16:59:12,vms2,7.63,6.0,1.0,297.0,2,2.82,1,22
16745,a624fb9a,2021-07-18 05:52:00,vms2,7.58,49.0,1.0,298.0,2,2.82,1,22
16746,a624fb9a,2021-07-18 20:20:32,vms2,8.03,516.0,1.0,298.0,2,2.82,1,22
16747,a624fb9a,2021-07-19 11:42:19,vms2,8.64,1516.0,1.0,299.0,2,2.82,1,22


In [45]:
# df_static_features = cow_total[['FarmName_Pseudo']]
static_cols = ['Age', 'FarmName_Pseudo', 'TrafficDeviceName', 'LactationNumber', 'BreedName', 'DaysInMilk']
timeSeries_cols = ['MilkProduction', 'timeDelta_Seconds']
output_col = ['label']

In [48]:
cow_label = cow_total_1[['id', 'label']].copy()
# fetch y for feature extraction
y = cow_label.drop_duplicates(subset=['id'])
y = y["label"]
y.index = range(1,len(y)+1) 

In [47]:
ts_extracted_dataset = cow_total_1[['id']].copy()
ts_extracted_dataset.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_dataset.index = range(1, len(ts_extracted_dataset)+1)

In [51]:
cow_timeseries = cow_total_1[['id', 'MilkingEventDateTime']].copy()
cow_timeseries.index = range(len(cow_timeseries))

ts_processed = pd.DataFrame(cow_total_1["timeDelta_Seconds"].copy())
ts_processed.index = range(0,len(ts_processed))
ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingEventDateTime")
impute(extracted_features)
features_filtered = select_features(extracted_features, y)
ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)


Feature Extraction: 100%|██████████| 22/22 [00:31<00:00,  1.44s/it]


In [None]:
cow_timeseries = cow_total_2[['id', 'MilkingEventDateTime']].copy()
cow_timeseries.index = range(len(cow_timeseries))

ts_processed = pd.DataFrame(cow_total_1["timeDelta_Seconds"].copy())
ts_processed.index = range(0,len(ts_processed))
ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingEventDateTime")
impute(extracted_features)
features_filtered = select_features(extracted_features, y)
ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)


In [63]:
ts_processed.to_csv('C:/Users/Savior_Hn/Desktop/ts_pro.csv', index=False)

In [1]:
relevance_table

NameError: name 'relevance_table' is not defined

In [10]:
cow_timeseries = cow_total[['id', 'MilkingEventDateTime']].loc[cow_total.LactationNumber == 1].copy()
cow_timeseries.sort_values(by="MilkingEventDateTime", inplace=True)
cow_timeseries.index = range(len(cow_timeseries))

for col in timeSeries_cols:
    ts_processed = pd.DataFrame(cow_total_1[col].copy())
    ts_processed.index = range(0,len(ts_processed)) 
    ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
    print(ts_processed)
    # extract time series features
    extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingEventDateTime")
    #extracted_features.dropna(axis=1, inplace=True)
    impute(extracted_features)
    features_filtered = select_features(extracted_features, y)
    ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)


       id MilkingEventDateTime  MilkProduction
0       3  2020-08-17 05:49:08            7.10
1       7  2020-08-17 08:38:47            8.49
2       3  2020-08-17 13:03:21            7.29
3       8  2020-08-17 14:19:46           15.90
4       3  2020-08-17 19:51:27            7.04
...    ..                  ...             ...
10744   6  2021-11-27 21:36:40            8.56
10745   6  2021-11-28 11:28:46            8.92
10746   6  2021-11-28 23:05:36            7.73
10747   6  2022-02-03 18:19:53            3.87
10748   6  2022-02-03 23:59:16            4.17

[10749 rows x 3 columns]


Feature Extraction: 100%|██████████| 13/13 [00:07<00:00,  1.83it/s]


       id MilkingEventDateTime  timeDelta_Seconds
0       3  2020-08-17 05:49:08              621.0
1       7  2020-08-17 08:38:47             4432.0
2       3  2020-08-17 13:03:21             1075.0
3       8  2020-08-17 14:19:46             2186.0
4       3  2020-08-17 19:51:27             1209.0
...    ..                  ...                ...
10744   6  2021-11-27 21:36:40              107.0
10745   6  2021-11-28 11:28:46              309.0
10746   6  2021-11-28 23:05:36              203.0
10747   6  2022-02-03 18:19:53              264.0
10748   6  2022-02-03 23:59:16               10.0

[10749 rows x 3 columns]


Feature Extraction: 100%|██████████| 13/13 [00:07<00:00,  1.77it/s]


In [12]:
ts_extracted_dataset

Unnamed: 0,id
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9
10,10


In [33]:
periods = [1, 2]

for i in periods:
    cow_total_i = cow_total.loc[cow_total.LactationNumber == i].copy()
    cow_total_i.sort_values(by="MilkingEventDateTime", inplace=True)
    cow_timeseries = cow_total[['id', 'MilkingEventDateTime']].loc[cow_total.LactationNumber == i].copy()
    cow_timeseries.sort_values(by="MilkingEventDateTime", inplace=True)
    cow_timeseries.index = range(len(cow_timeseries))
    for col in timeSeries_cols:
        ts_processed = pd.DataFrame(cow_total[col].loc[cow_total.LactationNumber == i]).copy()
        ts_processed.index = range(0,len(ts_processed)) 
        ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
        #print(ts_processed)
        # extract time series features
        extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingEventDateTime")
        print(extracted_features)
        # extracted_features.dropna(axis=1, inplace=True)
        impute(extracted_features)
        features_filtered = select_features(extracted_features, y)
        ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)


Feature Extraction: 100%|██████████| 22/22 [00:08<00:00,  2.67it/s]


    MilkProduction__variance_larger_than_standard_deviation  \
1                                                 1.0         
2                                                 1.0         
3                                                 1.0         
4                                                 1.0         
5                                                 1.0         
6                                                 1.0         
7                                                 1.0         
8                                                 1.0         
9                                                 1.0         
10                                                1.0         
11                                                1.0         
12                                                1.0         
13                                                1.0         
14                                                1.0         
15                                                1.0  

Feature Extraction: 100%|██████████| 22/22 [00:07<00:00,  2.84it/s]


    timeDelta_Seconds__variance_larger_than_standard_deviation  \
1                                                 1.0            
2                                                 1.0            
3                                                 1.0            
4                                                 1.0            
5                                                 1.0            
6                                                 1.0            
7                                                 1.0            
8                                                 1.0            
9                                                 1.0            
10                                                1.0            
11                                                1.0            
12                                                1.0            
13                                                1.0            
14                                                1.0            
15        

Feature Extraction: 100%|██████████| 22/22 [00:07<00:00,  2.91it/s]


    MilkProduction__variance_larger_than_standard_deviation  \
1                                                 1.0         
2                                                 1.0         
3                                                 1.0         
4                                                 1.0         
5                                                 1.0         
6                                                 1.0         
7                                                 1.0         
8                                                 1.0         
9                                                 1.0         
10                                                1.0         
11                                                1.0         
12                                                1.0         
13                                                1.0         
14                                                1.0         
15                                                1.0  

Feature Extraction: 100%|██████████| 22/22 [00:07<00:00,  2.90it/s]


    timeDelta_Seconds__variance_larger_than_standard_deviation  \
1                                                 1.0            
2                                                 1.0            
3                                                 1.0            
4                                                 1.0            
5                                                 1.0            
6                                                 1.0            
7                                                 1.0            
8                                                 1.0            
9                                                 1.0            
10                                                1.0            
11                                                1.0            
12                                                1.0            
13                                                1.0            
14                                                1.0            
15        

In [32]:
features_filtered

1
2
3
4
5
6
7
8
9
10
11


In [25]:
ts_extracted_features = ts_extracted_dataset.iloc[:, 1:len(ts_extracted_dataset.columns)].copy()
# normalize numerical features
ts_extracted_cols = ts_extracted_features.columns
scaler_std = StandardScaler()
ts_std = scaler_std.fit_transform(ts_extracted_features)
# transform standard data into dataframe
ts_extracted_processed = pd.DataFrame(ts_std, columns=ts_extracted_cols)
ts_extracted_processed.index = range(1,len(ts_extracted_processed)+1)
# append id col to the dataframe
ts_extracted_processed = pd.concat([ts_extracted_dataset['id'], ts_extracted_processed], axis=1)

ValueError: at least one array or dtype is required

In [None]:
ts_extracted_processed

cow_total.BreedName.unique()

In [3]:
cow_total

Unnamed: 0,FarmName_Pseudo,MilkingEventDateTime,TrafficDeviceName,MilkProduction,timeDelta_Seconds,LactationNumber,DaysInMilk,BreedName,Age,label,id
0,a624fb9a,2020-10-10 03:46:46,vms1,7.49,748.0,1.0,2.0,1,1.99,1,1
1,a624fb9a,2020-10-10 16:13:02,vms1,10.21,233.0,1.0,2.0,1,1.99,1,1
2,a624fb9a,2020-10-11 05:09:29,vms2,10.34,1372.0,1.0,3.0,1,1.99,1,1
3,a624fb9a,2020-10-11 19:26:21,vms1,6.80,484.0,1.0,3.0,1,1.99,1,1
4,a624fb9a,2020-10-12 02:49:54,vms1,7.12,4537.0,1.0,4.0,1,1.99,1,1
...,...,...,...,...,...,...,...,...,...,...,...
1813,a624fb9a,2022-08-22 00:40:37,vms2,7.83,276.0,2.0,339.0,2,3.92,1,22
1814,a624fb9a,2022-08-22 13:31:20,vms1,13.30,4594.0,2.0,339.0,2,3.92,1,22
1815,a624fb9a,2022-08-22 22:08:42,vms2,8.19,538.0,2.0,339.0,2,3.92,1,22
1816,a624fb9a,2022-08-23 07:34:52,vms1,9.51,107.0,2.0,340.0,2,3.92,1,22
