In [5]:
from pathlib import Path
from matplotlib.style import use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters


import torch
import torch.nn as nn

dataDir = Path.cwd().parent.parent.parent/'Data/processed/ts_targetCows/'
usecols = ['id', 'MilkingEventDateTime', 'FarmName_Pseudo', 'TrafficDeviceName', 'MilkProduction', 'timeDelta_Seconds', 'LactationNumber', 'DaysInMilk', 'BreedName', 'Age']
threshold_time = 800

# labeling cow with good/bad learner(0/1)
def labeling_data(threshold_time, cow_total):
    global learner
    timeCost = cow_total.timeDelta_Seconds.mean()
    if timeCost > threshold_time:
        learner = 1 # good learner
    else:
        learner = 0 # bad learner
    cow_total['label'] = learner
    return cow_total

In [6]:
# integrate all the cows data into one dataset
filelist = list(Path(dataDir).glob('*.csv'))
for i, _ in enumerate(filelist):
    fileName = 'cow_' + str(i) + '.csv'
    single_cow = pd.read_csv(dataDir/fileName, encoding='utf-8', usecols=usecols)
    single_cow = labeling_data(threshold_time, single_cow)
    single_cow.sort_values(by=['MilkingEventDateTime'], inplace=True)
    if i == 0:
        cow_total = single_cow
    cow_total = pd.concat([cow_total, single_cow])

In [7]:
#df_static_features = cow_total[['FarmName_Pseudo']]
static_cols = ['FarmName_Pseudo', 'TrafficDeviceName', 'LactationNumber', 'BreedName']
timeSeries_cols = ['Age', 'MilkProduction', 'timeDelta_Seconds', 'DaysInMilk']
output_col = ['label']


In [8]:
cow_label = cow_total[['id', 'label']].copy()
cow_timeseries = cow_total[['id', 'MilkingEventDateTime']].copy()
cow_timeseries.index = range(len(cow_timeseries))
# fetch y for feature extraction
y = cow_label.drop_duplicates(subset=['id'])
y = y["label"]
y.index = range(1,len(y)+1) 

In [9]:
ts_extracted_dataset = cow_total[['id']].copy()
ts_extracted_dataset.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_dataset.index = range(1, len(ts_extracted_dataset)+1)

In [10]:
cow_timeseries

Unnamed: 0,id,MilkingEventDateTime
0,1,2021-07-05 11:02:14
1,1,2021-07-05 20:52:47
2,1,2021-07-06 06:41:09
3,1,2021-07-06 16:25:11
4,1,2021-07-07 01:39:16
...,...,...
13983,16,2022-08-06 06:38:20
13984,16,2022-08-06 14:50:23
13985,16,2022-08-07 01:52:24
13986,16,2022-08-07 11:38:29


In [11]:
for col in timeSeries_cols:
    ts_processed = pd.DataFrame(cow_total[col].copy())
    # normalize numerical features
    scaler_std = StandardScaler()
    ts_processed = pd.DataFrame(scaler_std.fit_transform(ts_processed))
    ts_processed.rename(columns={0: col}, inplace=True)
    print(ts_processed)
    ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
    # extract time series features
    extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingEventDateTime")
    #extracted_features.dropna(axis=1, inplace=True)
    impute(extracted_features)
    features_filtered = select_features(extracted_features, y)
    ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)


            Age
0     -1.682210
1     -1.682210
2     -1.682210
3     -1.682210
4     -1.654042
...         ...
13983  2.571260
13984  2.571260
13985  2.571260
13986  2.571260
13987  2.571260

[13988 rows x 1 columns]


Feature Extraction: 100%|██████████| 16/16 [00:07<00:00,  2.11it/s]


       MilkProduction
0           -0.219742
1           -0.808082
2           -1.039215
3           -0.959369
4           -2.224299
...               ...
13983       -0.261767
13984       -1.413231
13985        0.162678
13986       -0.677807
13987       -0.156706

[13988 rows x 1 columns]


Feature Extraction: 100%|██████████| 16/16 [00:07<00:00,  2.14it/s]


       timeDelta_Seconds
0               4.180366
1              -0.555360
2              -0.313789
3              -0.583145
4              -0.437276
...                  ...
13983          -0.505965
13984          -0.528347
13985           0.303646
13986           0.210259
13987           1.391875

[13988 rows x 1 columns]


Feature Extraction: 100%|██████████| 16/16 [00:07<00:00,  2.11it/s]


       DaysInMilk
0       -1.670539
1       -1.670539
2       -1.659383
3       -1.659383
4       -1.648226
...           ...
13983    2.133740
13984    2.133740
13985    2.144896
13986    2.144896
13987    2.144896

[13988 rows x 1 columns]


Feature Extraction: 100%|██████████| 16/16 [00:07<00:00,  2.16it/s]


In [12]:
ts_extracted_dataset

Unnamed: 0,id,timeDelta_Seconds__sum_values,timeDelta_Seconds__ar_coefficient__coeff_0__k_10,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.2",...,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.6","timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_0",timeDelta_Seconds__count_above__t_0,timeDelta_Seconds__count_below__t_0,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.4",timeDelta_Seconds__quantile__q_0.7,"timeDelta_Seconds__fft_coefficient__attr_""angle""__coeff_0",timeDelta_Seconds__mean_abs_change,timeDelta_Seconds__mean,timeDelta_Seconds__variation_coefficient
1,1,-258.270455,-0.059432,0.028329,0.083289,0.021393,0.331732,0.003371,0.026488,0.07955,...,0.002871,-258.270455,0.200708,0.799292,0.013697,-0.22619,180.0,0.331732,-0.152462,-5.832477
2,2,-65.81825,-0.052343,0.092498,0.217239,0.045306,0.769818,0.005436,0.099213,0.236574,...,0.015327,-65.81825,0.261468,0.738532,0.074653,-0.122847,180.0,0.769818,-0.07548,-12.314639
3,3,222.515677,0.224628,0.243725,0.39394,0.088553,1.092149,0.015533,0.202925,0.358721,...,0.029066,222.515677,0.436019,0.563981,0.153784,0.362611,0.0,1.092149,0.351526,3.822593
4,4,191.531087,0.421148,0.323197,0.45469,0.117113,1.072078,0.015992,0.253044,0.403896,...,0.047433,191.531087,0.470054,0.529946,0.193257,0.45569,0.0,1.072078,0.347606,3.096461
5,5,-15.281514,-0.015041,0.080499,0.22583,0.029538,0.68012,0.004516,0.062471,0.199104,...,0.009337,-15.281514,0.258065,0.741935,0.051094,-0.074841,180.0,0.68012,-0.016998,-50.369007
6,6,79.167425,0.044996,0.168901,0.317342,0.068374,0.871197,0.011733,0.170684,0.331177,...,0.025639,79.167425,0.353096,0.646904,0.13374,0.125825,0.0,0.871197,0.104305,10.689828
7,7,-127.520034,-0.106161,0.087597,0.220981,0.038765,0.643985,0.006356,0.089296,0.234857,...,0.013695,-127.520034,0.247845,0.752155,0.061424,-0.103166,180.0,0.643985,-0.137414,-5.414806
8,8,276.115971,0.167051,0.466056,0.511193,0.204795,1.229253,0.022964,0.505636,0.566458,...,0.053394,276.115971,0.41345,0.58655,0.436364,0.581029,0.0,1.229253,0.343856,3.98983
9,9,-75.125493,-0.074791,0.089882,0.231704,0.036224,0.634169,0.006688,0.082267,0.229354,...,0.008153,-75.125493,0.255629,0.744371,0.05703,-0.091049,180.0,0.634169,-0.099504,-7.987953
10,10,36.089554,0.021588,0.140618,0.274528,0.065268,0.889185,0.006735,0.152795,0.302684,...,0.024758,36.089554,0.288241,0.711759,0.129388,-0.03903,0.0,0.889185,0.037554,28.039183


In [14]:
ts_dataset = pd.concat([ts_extracted_dataset, y], axis=1)
ts_dataset.to_csv(dataDir.parent/"ts_dataset.csv", index=False)
ts_dataset

Unnamed: 0,id,timeDelta_Seconds__sum_values,timeDelta_Seconds__ar_coefficient__coeff_0__k_10,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.2",...,"timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_0",timeDelta_Seconds__count_above__t_0,timeDelta_Seconds__count_below__t_0,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.4",timeDelta_Seconds__quantile__q_0.7,"timeDelta_Seconds__fft_coefficient__attr_""angle""__coeff_0",timeDelta_Seconds__mean_abs_change,timeDelta_Seconds__mean,timeDelta_Seconds__variation_coefficient,label
1,1,-258.270455,-0.059432,0.028329,0.083289,0.021393,0.331732,0.003371,0.026488,0.07955,...,-258.270455,0.200708,0.799292,0.013697,-0.22619,180.0,0.331732,-0.152462,-5.832477,0
2,2,-65.81825,-0.052343,0.092498,0.217239,0.045306,0.769818,0.005436,0.099213,0.236574,...,-65.81825,0.261468,0.738532,0.074653,-0.122847,180.0,0.769818,-0.07548,-12.314639,0
3,3,222.515677,0.224628,0.243725,0.39394,0.088553,1.092149,0.015533,0.202925,0.358721,...,222.515677,0.436019,0.563981,0.153784,0.362611,0.0,1.092149,0.351526,3.822593,1
4,4,191.531087,0.421148,0.323197,0.45469,0.117113,1.072078,0.015992,0.253044,0.403896,...,191.531087,0.470054,0.529946,0.193257,0.45569,0.0,1.072078,0.347606,3.096461,1
5,5,-15.281514,-0.015041,0.080499,0.22583,0.029538,0.68012,0.004516,0.062471,0.199104,...,-15.281514,0.258065,0.741935,0.051094,-0.074841,180.0,0.68012,-0.016998,-50.369007,0
6,6,79.167425,0.044996,0.168901,0.317342,0.068374,0.871197,0.011733,0.170684,0.331177,...,79.167425,0.353096,0.646904,0.13374,0.125825,0.0,0.871197,0.104305,10.689828,1
7,7,-127.520034,-0.106161,0.087597,0.220981,0.038765,0.643985,0.006356,0.089296,0.234857,...,-127.520034,0.247845,0.752155,0.061424,-0.103166,180.0,0.643985,-0.137414,-5.414806,0
8,8,276.115971,0.167051,0.466056,0.511193,0.204795,1.229253,0.022964,0.505636,0.566458,...,276.115971,0.41345,0.58655,0.436364,0.581029,0.0,1.229253,0.343856,3.98983,1
9,9,-75.125493,-0.074791,0.089882,0.231704,0.036224,0.634169,0.006688,0.082267,0.229354,...,-75.125493,0.255629,0.744371,0.05703,-0.091049,180.0,0.634169,-0.099504,-7.987953,0
10,10,36.089554,0.021588,0.140618,0.274528,0.065268,0.889185,0.006735,0.152795,0.302684,...,36.089554,0.288241,0.711759,0.129388,-0.03903,0.0,0.889185,0.037554,28.039183,1


In [62]:
from tsfresh.feature_selection.relevance import calculate_relevance_table
rt = calculate_relevance_table(extracted_features, y)
rt

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Age__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.2","Age__change_quantiles__f_agg_""mean""__isabs_Tru...",real,0.004662,False
"Age__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.2","Age__change_quantiles__f_agg_""mean""__isabs_Fal...",real,0.004662,False
"Age__fft_coefficient__attr_""real""__coeff_29","Age__fft_coefficient__attr_""real""__coeff_29",real,0.006993,False
"Age__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4","Age__change_quantiles__f_agg_""mean""__isabs_Tru...",real,0.006993,False
"Age__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.4","Age__change_quantiles__f_agg_""mean""__isabs_Fal...",real,0.006993,False
...,...,...,...,...
Age__permutation_entropy__dimension_4__tau_1,Age__permutation_entropy__dimension_4__tau_1,constant,,False
Age__permutation_entropy__dimension_5__tau_1,Age__permutation_entropy__dimension_5__tau_1,constant,,False
Age__permutation_entropy__dimension_6__tau_1,Age__permutation_entropy__dimension_6__tau_1,constant,,False
Age__permutation_entropy__dimension_7__tau_1,Age__permutation_entropy__dimension_7__tau_1,constant,,False
