In [1]:
from pathlib import Path
from matplotlib.style import use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters

dataDir = Path.cwd().parent.parent.parent/'Data/processed/learner_targetCows/'
usecols = ['id', 'MilkingEventDateTime', 'FarmName_Pseudo', 'TrafficDeviceName', 'MilkProduction', 'timeDelta_Seconds', 'LactationNumber', 'DaysInMilk', 'BreedName', 'Age']
threshold_time = 800

# labeling cow with good/bad learner(0/1)
def labeling_data(threshold_time, cow_total):
    global learner
    timeCost = cow_total.timeDelta_Seconds.mean()
    if timeCost > threshold_time:
        learner = 1 # good learner
    else:
        learner = 0 # bad learner
    cow_total['label'] = learner
    return cow_total

In [2]:
# integrate all the cows data into one dataset
filelist = list(Path(dataDir).glob('*.csv'))
for i, _ in enumerate(filelist):
    fileName = 'cow_' + str(i) + '.csv'
    single_cow = pd.read_csv(dataDir/fileName, encoding='utf-8', usecols=usecols)
    single_cow = labeling_data(threshold_time, single_cow)
    single_cow.sort_values(by=['MilkingEventDateTime'], inplace=True)
    if i == 0:
        cow_total = single_cow
    cow_total = pd.concat([cow_total, single_cow])

In [3]:
#df_static_features = cow_total[['FarmName_Pseudo']]
static_cols = ['Age', 'FarmName_Pseudo', 'TrafficDeviceName', 'LactationNumber', 'BreedName', 'DaysInMilk']
timeSeries_cols = ['MilkProduction', 'timeDelta_Seconds']
output_col = ['label']

In [4]:
cow_label = cow_total[['id', 'label']].copy()
cow_timeseries = cow_total[['id', 'MilkingEventDateTime']].copy()
cow_timeseries.index = range(len(cow_timeseries))
# fetch y for feature extraction
y = cow_label.drop_duplicates(subset=['id'])
y = y["label"]
y.index = range(1,len(y)+1) 

In [5]:
ts_extracted_dataset = cow_total[['id']].copy()
ts_extracted_dataset.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_dataset.index = range(1, len(ts_extracted_dataset)+1)

In [137]:
cow_timeseries

Unnamed: 0,id,MilkingEventDateTime
0,1,2021-07-05 11:02:14
1,1,2021-07-05 20:52:47
2,1,2021-07-06 06:41:09
3,1,2021-07-06 16:25:11
4,1,2021-07-07 01:39:16
...,...,...
13983,16,2022-08-06 06:38:20
13984,16,2022-08-06 14:50:23
13985,16,2022-08-07 01:52:24
13986,16,2022-08-07 11:38:29


In [6]:
for col in timeSeries_cols:
    ts_processed = pd.DataFrame(cow_total[col].copy())
    ts_processed.index = range(0,len(ts_processed)) 
    ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
    print(ts_processed)
    # extract time series features
    extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingEventDateTime")
    #extracted_features.dropna(axis=1, inplace=True)
    impute(extracted_features)
    features_filtered = select_features(extracted_features, y)
    ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)


       id MilkingEventDateTime  MilkProduction
0       1  2021-07-05 11:02:14            9.07
1       1  2021-07-05 20:52:47            7.67
2       1  2021-07-06 06:41:09            7.12
3       1  2021-07-06 16:25:11            7.31
4       1  2021-07-07 01:39:16            4.30
...    ..                  ...             ...
13983  16  2022-08-06 06:38:20            8.97
13984  16  2022-08-06 14:50:23            6.23
13985  16  2022-08-07 01:52:24            9.98
13986  16  2022-08-07 11:38:29            7.98
13987  16  2022-08-07 23:03:05            9.22

[13988 rows x 3 columns]


Feature Extraction: 100%|██████████| 16/16 [00:07<00:00,  2.15it/s]


       id MilkingEventDateTime  timeDelta_Seconds
0       1  2021-07-05 11:02:14             6215.0
1       1  2021-07-05 20:52:47               79.0
2       1  2021-07-06 06:41:09              392.0
3       1  2021-07-06 16:25:11               43.0
4       1  2021-07-07 01:39:16              232.0
...    ..                  ...                ...
13983  16  2022-08-06 06:38:20              143.0
13984  16  2022-08-06 14:50:23              114.0
13985  16  2022-08-07 01:52:24             1192.0
13986  16  2022-08-07 11:38:29             1071.0
13987  16  2022-08-07 23:03:05             2602.0

[13988 rows x 3 columns]


Feature Extraction: 100%|██████████| 16/16 [00:07<00:00,  2.21it/s]


In [139]:
ts_extracted_features = ts_extracted_dataset.iloc[:, 1:len(ts_extracted_dataset.columns)].copy()
# normalize numerical features
ts_extracted_cols = ts_extracted_features.columns
scaler_std = StandardScaler()
ts_std = scaler_std.fit_transform(ts_extracted_features)
# transform standard data into dataframe
ts_extracted_processed = pd.DataFrame(ts_std, columns=ts_extracted_cols)
ts_extracted_processed.index = range(1,len(ts_extracted_processed)+1)
# append id col to the dataframe
ts_extracted_processed = pd.concat([ts_extracted_dataset['id'], ts_extracted_processed], axis=1)

In [140]:
ts_extracted_processed

Unnamed: 0,id,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.6","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.6","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.2",timeDelta_Seconds__quantile__q_0.8,timeDelta_Seconds__quantile__q_0.7,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.6",timeDelta_Seconds__mean_abs_change,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.4",timeDelta_Seconds__mean
1,1,-0.892543,-1.967178,-1.864354,-2.005686,-0.939033,-1.813592,-1.294771,-1.111943,-1.158618,-1.073848,-1.844416,-0.887723,-1.09525,-1.056712,-1.224945,-1.967178,-1.034363,-0.982661
2,2,-0.368274,-0.095543,-0.333365,-0.040805,-0.412923,-0.552221,-0.274979,-0.532136,-0.775876,-0.425686,-0.41222,-0.310021,-0.511132,-0.626362,-0.387532,-0.095543,-0.4131,-0.559507
3,3,0.474841,1.281555,0.592292,0.280721,0.538522,1.111727,0.25796,0.834284,1.095254,0.498651,0.701862,0.478072,1.299679,1.395223,0.536108,1.281555,0.393414,1.787649
4,4,0.709707,1.195803,1.005099,0.994186,1.166862,1.683797,1.307773,1.552361,1.180452,0.945346,1.113899,0.858129,1.701406,1.782827,1.770966,1.195803,0.795727,1.766106
5,5,-0.708845,-0.478761,-0.627179,-0.629481,-0.759835,-0.471325,-0.785756,-0.640556,-0.946368,-0.753154,-0.753981,-0.819253,-0.632258,-0.426453,-0.790245,-0.478761,-0.65321,-0.238047
6,6,0.234937,0.337581,0.375677,0.196194,0.094586,0.390421,0.110778,0.158206,0.391069,0.211301,0.45064,0.140821,0.413916,0.409178,0.305719,0.337581,0.189125,0.428731
7,7,-0.552449,-0.63314,-0.500709,-0.477285,-0.556826,-0.516984,-0.632304,-0.576422,-0.605416,-0.514072,-0.427885,-0.5352,-0.732339,-0.544406,-0.497241,-0.63314,-0.547924,-0.899945
8,8,3.304686,1.867304,2.631367,1.934144,3.095905,2.215864,2.141279,2.843172,2.472536,3.196588,2.596602,3.232615,2.083023,2.304775,2.171732,1.867304,3.273493,1.745489
9,9,-0.630273,-0.675077,-0.534787,-0.564429,-0.612738,-0.416007,-0.753608,-0.555778,-0.543829,-0.576715,-0.478074,-0.647648,-0.753384,-0.493946,-0.869843,-0.675077,-0.592715,-0.691563
10,10,0.174488,0.414431,0.341585,1.032342,0.026257,-0.012746,0.859535,-0.097343,-0.535283,0.051867,0.190756,0.138926,0.148748,-0.277325,0.246491,0.414431,0.14477,0.061816


In [141]:
cow_total.BreedName.unique()

array([1, 2, 4], dtype=int64)

In [142]:
# add one-hot encoded categorical features
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
cow_breed = cow_total[['id', 'BreedName']].copy()
cow_breed.drop_duplicates(subset=['id'], inplace=True)
cat = ohe.fit_transform(np.array(cow_breed['BreedName']).reshape(-1, 1))
col_names = ohe.get_feature_names_out(['BreedName'])
cat_breed = pd.DataFrame(cat, columns=col_names)
cat_breed.index = range(1,len(cow_breed)+1)
cat_breed

Unnamed: 0,BreedName_1,BreedName_2,BreedName_4
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
5,0.0,1.0,0.0
6,0.0,1.0,0.0
7,1.0,0.0,0.0
8,0.0,1.0,0.0
9,1.0,0.0,0.0
10,0.0,1.0,0.0


In [143]:
ts_dataset = pd.concat([ts_extracted_processed, cat_breed], axis=1)
ts_dataset = pd.concat([ts_dataset, y], axis=1)
ts_dataset.to_csv(dataDir.parent/"ts_dataset_16_with_cat.csv", index=False)
ts_dataset

Unnamed: 0,id,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.6","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.6","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.2",timeDelta_Seconds__quantile__q_0.8,timeDelta_Seconds__quantile__q_0.7,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.6",timeDelta_Seconds__mean_abs_change,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.4",timeDelta_Seconds__mean,BreedName_1,BreedName_2,BreedName_4,label
1,1,-0.892543,-1.967178,-1.864354,-2.005686,-0.939033,-1.813592,-1.294771,-1.111943,-1.158618,-1.073848,-1.844416,-0.887723,-1.09525,-1.056712,-1.224945,-1.967178,-1.034363,-0.982661,1.0,0.0,0.0,0
2,2,-0.368274,-0.095543,-0.333365,-0.040805,-0.412923,-0.552221,-0.274979,-0.532136,-0.775876,-0.425686,-0.41222,-0.310021,-0.511132,-0.626362,-0.387532,-0.095543,-0.4131,-0.559507,0.0,1.0,0.0,0
3,3,0.474841,1.281555,0.592292,0.280721,0.538522,1.111727,0.25796,0.834284,1.095254,0.498651,0.701862,0.478072,1.299679,1.395223,0.536108,1.281555,0.393414,1.787649,1.0,0.0,0.0,1
4,4,0.709707,1.195803,1.005099,0.994186,1.166862,1.683797,1.307773,1.552361,1.180452,0.945346,1.113899,0.858129,1.701406,1.782827,1.770966,1.195803,0.795727,1.766106,0.0,0.0,1.0,1
5,5,-0.708845,-0.478761,-0.627179,-0.629481,-0.759835,-0.471325,-0.785756,-0.640556,-0.946368,-0.753154,-0.753981,-0.819253,-0.632258,-0.426453,-0.790245,-0.478761,-0.65321,-0.238047,0.0,1.0,0.0,0
6,6,0.234937,0.337581,0.375677,0.196194,0.094586,0.390421,0.110778,0.158206,0.391069,0.211301,0.45064,0.140821,0.413916,0.409178,0.305719,0.337581,0.189125,0.428731,0.0,1.0,0.0,1
7,7,-0.552449,-0.63314,-0.500709,-0.477285,-0.556826,-0.516984,-0.632304,-0.576422,-0.605416,-0.514072,-0.427885,-0.5352,-0.732339,-0.544406,-0.497241,-0.63314,-0.547924,-0.899945,1.0,0.0,0.0,0
8,8,3.304686,1.867304,2.631367,1.934144,3.095905,2.215864,2.141279,2.843172,2.472536,3.196588,2.596602,3.232615,2.083023,2.304775,2.171732,1.867304,3.273493,1.745489,0.0,1.0,0.0,1
9,9,-0.630273,-0.675077,-0.534787,-0.564429,-0.612738,-0.416007,-0.753608,-0.555778,-0.543829,-0.576715,-0.478074,-0.647648,-0.753384,-0.493946,-0.869843,-0.675077,-0.592715,-0.691563,1.0,0.0,0.0,0
10,10,0.174488,0.414431,0.341585,1.032342,0.026257,-0.012746,0.859535,-0.097343,-0.535283,0.051867,0.190756,0.138926,0.148748,-0.277325,0.246491,0.414431,0.14477,0.061816,0.0,1.0,0.0,1


In [40]:
from tsfresh.feature_selection.relevance import calculate_relevance_table
rt = calculate_relevance_table(extracted_features, y)
rt

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
timeDelta_Seconds__mean,timeDelta_Seconds__mean,real,3.440887e-10,True
timeDelta_Seconds__count_below__t_0,timeDelta_Seconds__count_below__t_0,real,4.695719e-10,True
timeDelta_Seconds__count_above__t_0,timeDelta_Seconds__count_above__t_0,real,4.695719e-10,True
timeDelta_Seconds__quantile__q_0.8,timeDelta_Seconds__quantile__q_0.8,real,5.202670e-10,True
timeDelta_Seconds__quantile__q_0.7,timeDelta_Seconds__quantile__q_0.7,real,5.205836e-10,True
...,...,...,...,...
timeDelta_Seconds__value_count__value_0,timeDelta_Seconds__value_count__value_0,constant,,False
timeDelta_Seconds__value_count__value_1,timeDelta_Seconds__value_count__value_1,constant,,False
timeDelta_Seconds__value_count__value_-1,timeDelta_Seconds__value_count__value_-1,constant,,False
timeDelta_Seconds__number_crossing_m__m_-1,timeDelta_Seconds__number_crossing_m__m_-1,constant,,False
