In [1]:
from pathlib import Path
from matplotlib.style import use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters
from tsfresh.feature_selection.relevance import calculate_relevance_table

dataDir = Path.cwd().parent.parent.parent/'Data/processed/learner_targetCows/'
usecols = ['id', 'FarmName_Pseudo', 'Gigacow_Cow_Id', 'Total_MilkProduction', 'Total_timeDelta_Seconds', 'LactationNumber', 'DaysInMilk', 'BreedName', 'Age', 'milking_times', 'MilkingDate']
threshold_time = 1500

# labeling cow with fast/slow learner(1/0)
def labeling_data(threshold_time, cow_total): 
    '''
    func: labeling learner dataset
    args: 
        threshold_time: threshold for milking time cost
        cow_total: A dataframe contains all data points for a single cow
    return: learner dataset with label
    '''
    global learner
    total_timeCost = cow_total.Total_timeDelta_Seconds.sum()
    totalEvents = cow_total.milking_times.sum()
    mean_timeCost = total_timeCost/totalEvents
    if mean_timeCost < threshold_time:
        learner = 1 # fast learner
    else:
        learner = 0 # slow learner
    cow_total['label'] = learner
    return cow_total

In [2]:
# integrate all the cows data into one dataset
filelist = list(Path(dataDir).glob('*.csv'))
for i, _ in enumerate(filelist):
    fileName = 'cow_' + str(i) + '.csv'
    single_cow = pd.read_csv(dataDir/fileName, encoding='utf-8', usecols=usecols)
    single_cow = labeling_data(threshold_time, single_cow)
    single_cow.sort_values(by=['MilkingDate'], inplace=True)
    if i == 0:
        cow_total = single_cow
    else:
        cow_total = pd.concat([cow_total, single_cow], axis=0, ignore_index=True)
cow_total.to_csv(dataDir.parent/"Cow_Learner_dataset_L1.csv", index=False)
cow_total

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id,label
0,a624fb9a,2560,2022-02-14,1.0,2.0,1,2.15,17.84,3528.0,2,1,1
1,a624fb9a,2560,2022-02-15,1.0,3.0,1,2.15,14.02,10271.0,2,1,1
2,a624fb9a,2560,2022-02-16,1.0,4.0,1,2.15,24.07,11297.0,4,1,1
3,a624fb9a,2560,2022-02-17,1.0,5.0,1,2.15,21.00,3608.0,4,1,1
4,a624fb9a,2560,2022-02-18,1.0,6.0,1,2.16,21.91,2313.0,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
28171,a624fb9a,2047,2022-10-02,1.0,281.0,1,2.91,18.20,2901.0,2,118,1
28172,a624fb9a,2047,2022-10-03,1.0,282.0,1,2.92,21.58,36.0,3,118,1
28173,a624fb9a,2047,2022-10-04,1.0,283.0,1,2.92,14.69,213.0,2,118,1
28174,a624fb9a,2047,2022-10-05,1.0,284.0,1,2.92,18.00,34.0,2,118,1


In [21]:
cow_total['mean_Total_timeDelta_Seconds'] = cow_total.Total_timeDelta_Seconds/cow_total.milking_times
cow_total

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id,label,mean_Total_timeDelta_Seconds
0,a624fb9a,2560,2022-02-14,1.0,2.0,1,2.15,17.84,3528.0,2,1,1,1764.00
1,a624fb9a,2560,2022-02-15,1.0,3.0,1,2.15,14.02,10271.0,2,1,1,5135.50
2,a624fb9a,2560,2022-02-16,1.0,4.0,1,2.15,24.07,11297.0,4,1,1,2824.25
3,a624fb9a,2560,2022-02-17,1.0,5.0,1,2.15,21.00,3608.0,4,1,1,902.00
4,a624fb9a,2560,2022-02-18,1.0,6.0,1,2.16,21.91,2313.0,3,1,1,771.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28171,a624fb9a,2047,2022-10-02,1.0,281.0,1,2.91,18.20,2901.0,2,118,1,1450.50
28172,a624fb9a,2047,2022-10-03,1.0,282.0,1,2.92,21.58,36.0,3,118,1,12.00
28173,a624fb9a,2047,2022-10-04,1.0,283.0,1,2.92,14.69,213.0,2,118,1,106.50
28174,a624fb9a,2047,2022-10-05,1.0,284.0,1,2.92,18.00,34.0,2,118,1,17.00


In [22]:
#df_static_features = cow_total[['FarmName_Pseudo']]
static_cols = ['FarmName_Pseudo', 'TrafficDeviceName', 'LactationNumber', 'BreedName']
#timeSeries_cols = ['Age', 'Total_MilkProduction', 'Total_timeDelta_Seconds', 'DaysInMilk', 'milking_times']
timeSeries_cols = ['Age', 'Total_MilkProduction', 'mean_Total_timeDelta_Seconds', 'DaysInMilk', 'milking_times']
output_col = ['label']

In [23]:
cow_label = cow_total[['id', 'label']].copy()
cow_timeseries = cow_total[['id', 'MilkingDate']].copy()
cow_timeseries.index = range(len(cow_timeseries))
# fetch y for feature extraction
y = cow_label.drop_duplicates(subset=['id'])
y = y["label"]
y.index = range(1,len(y)+1) 
y

1      1
2      0
3      1
4      0
5      0
      ..
114    1
115    1
116    0
117    1
118    1
Name: label, Length: 118, dtype: int64

In [24]:
y.value_counts()

1    63
0    55
Name: label, dtype: int64

In [86]:
#dataDir1 = Path.cwd().parent.parent.parent/'Data/processed/memory_targetCows/'
#cow_total = pd.read_csv(dataDir1/"cow_total/cow_total_1.csv", encoding='utf-8', usecols=usecols)

In [25]:
ts_extracted_dataset = cow_total[['id']].copy()
ts_extracted_dataset.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_dataset.index = range(1, len(ts_extracted_dataset)+1)

In [26]:
ts_processed = pd.DataFrame(cow_total[timeSeries_cols].copy())
ts_processed.index = range(0,len(ts_processed)) 
ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
ts_processed

Unnamed: 0,id,MilkingDate,Age,Total_MilkProduction,mean_Total_timeDelta_Seconds,DaysInMilk,milking_times
0,1,2022-02-14,2.15,17.84,1764.00,2.0,2
1,1,2022-02-15,2.15,14.02,5135.50,3.0,2
2,1,2022-02-16,2.15,24.07,2824.25,4.0,4
3,1,2022-02-17,2.15,21.00,902.00,5.0,4
4,1,2022-02-18,2.16,21.91,771.00,6.0,3
...,...,...,...,...,...,...,...
28171,118,2022-10-02,2.91,18.20,1450.50,281.0,2
28172,118,2022-10-03,2.92,21.58,12.00,282.0,3
28173,118,2022-10-04,2.92,14.69,106.50,283.0,2
28174,118,2022-10-05,2.92,18.00,17.00,284.0,2


In [37]:
'''
from tsfresh.feature_extraction import extract_features, EfficientFCParameters
extracted_dataset = extract_features(ts_processed, column_id="id", column_sort="MilkingDate", default_fc_parameters=EfficientFCParameters())
impute(extracted_dataset)
features_filtered = select_features(extracted_dataset, y)
ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)
'''

Feature Extraction: 100%|██████████| 39/39 [00:10<00:00,  3.65it/s]
 'Total_MilkProduction__query_similarity_count__query_None__threshold_0.0'
 'Total_timeDelta_Seconds__query_similarity_count__query_None__threshold_0.0'
 'DaysInMilk__query_similarity_count__query_None__threshold_0.0'
 'milking_times__friedrich_coefficients__coeff_0__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_1__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_2__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_3__m_3__r_30'
 'milking_times__max_langevin_fixed_point__m_3__r_30'
 'milking_times__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


In [27]:
settings_1 = MinimalFCParameters() 
#settings_2 = ComprehensiveFCParameters

for i, col in enumerate(timeSeries_cols):
    ts_processed = pd.DataFrame(cow_total[col].copy())
    ts_processed.index = range(0,len(ts_processed)) 
    ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
    print(ts_processed)
    # extract time series features
    extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingDate")
    #extracted_features.dropna(axis=1, inplace=True)
    impute(extracted_features)
    
    # select most relevant features based on relevance table
    if i == 0:
        temp = calculate_relevance_table(extracted_features, y)
        relevance_table = temp
    else:
        temp = calculate_relevance_table(extracted_features, y)
        relevance_table = pd.concat([relevance_table, temp], axis=0)
    features_filtered = select_features(extracted_features, y)
    ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)



        id MilkingDate   Age
0        1  2022-02-14  2.15
1        1  2022-02-15  2.15
2        1  2022-02-16  2.15
3        1  2022-02-17  2.15
4        1  2022-02-18  2.16
...    ...         ...   ...
28171  118  2022-10-02  2.91
28172  118  2022-10-03  2.92
28173  118  2022-10-04  2.92
28174  118  2022-10-05  2.92
28175  118  2022-10-06  2.92

[28176 rows x 3 columns]


Feature Extraction: 100%|██████████| 40/40 [00:08<00:00,  4.80it/s]


        id MilkingDate  Total_MilkProduction
0        1  2022-02-14                 17.84
1        1  2022-02-15                 14.02
2        1  2022-02-16                 24.07
3        1  2022-02-17                 21.00
4        1  2022-02-18                 21.91
...    ...         ...                   ...
28171  118  2022-10-02                 18.20
28172  118  2022-10-03                 21.58
28173  118  2022-10-04                 14.69
28174  118  2022-10-05                 18.00
28175  118  2022-10-06                 22.50

[28176 rows x 3 columns]


Feature Extraction: 100%|██████████| 40/40 [00:08<00:00,  4.88it/s]


        id MilkingDate  mean_Total_timeDelta_Seconds
0        1  2022-02-14                       1764.00
1        1  2022-02-15                       5135.50
2        1  2022-02-16                       2824.25
3        1  2022-02-17                        902.00
4        1  2022-02-18                        771.00
...    ...         ...                           ...
28171  118  2022-10-02                       1450.50
28172  118  2022-10-03                         12.00
28173  118  2022-10-04                        106.50
28174  118  2022-10-05                         17.00
28175  118  2022-10-06                        302.00

[28176 rows x 3 columns]


Feature Extraction: 100%|██████████| 40/40 [00:08<00:00,  4.91it/s]


        id MilkingDate  DaysInMilk
0        1  2022-02-14         2.0
1        1  2022-02-15         3.0
2        1  2022-02-16         4.0
3        1  2022-02-17         5.0
4        1  2022-02-18         6.0
...    ...         ...         ...
28171  118  2022-10-02       281.0
28172  118  2022-10-03       282.0
28173  118  2022-10-04       283.0
28174  118  2022-10-05       284.0
28175  118  2022-10-06       285.0

[28176 rows x 3 columns]


Feature Extraction: 100%|██████████| 40/40 [00:07<00:00,  5.15it/s]


        id MilkingDate  milking_times
0        1  2022-02-14              2
1        1  2022-02-15              2
2        1  2022-02-16              4
3        1  2022-02-17              4
4        1  2022-02-18              3
...    ...         ...            ...
28171  118  2022-10-02              2
28172  118  2022-10-03              3
28173  118  2022-10-04              2
28174  118  2022-10-05              2
28175  118  2022-10-06              3

[28176 rows x 3 columns]


Feature Extraction: 100%|██████████| 40/40 [00:07<00:00,  5.27it/s]
 'milking_times__friedrich_coefficients__coeff_1__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_2__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_3__m_3__r_30'
 'milking_times__max_langevin_fixed_point__m_3__r_30'
 'milking_times__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


In [28]:
# Select 30 most relevant features from the relevance table
relevance_table = relevance_table[relevance_table.relevant]
relevance_table.sort_values("p_value", ascending=False, inplace=True)
relevant_features_list = list(relevance_table.feature[:100])
relevance_table

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"mean_Total_timeDelta_Seconds__agg_autocorrelation__f_agg_""mean""__maxlag_40",mean_Total_timeDelta_Seconds__agg_autocorrelat...,real,3.059674e-03,True
"mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_4",mean_Total_timeDelta_Seconds__fft_coefficient_...,real,3.032926e-03,True
mean_Total_timeDelta_Seconds__fourier_entropy__bins_3,mean_Total_timeDelta_Seconds__fourier_entropy_...,real,2.944824e-03,True
"mean_Total_timeDelta_Seconds__fft_coefficient__attr_""imag""__coeff_62",mean_Total_timeDelta_Seconds__fft_coefficient_...,real,2.847723e-03,True
"mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_42",mean_Total_timeDelta_Seconds__fft_coefficient_...,real,2.728311e-03,True
...,...,...,...,...
mean_Total_timeDelta_Seconds__root_mean_square,mean_Total_timeDelta_Seconds__root_mean_square,real,1.802179e-20,True
mean_Total_timeDelta_Seconds__quantile__q_0.8,mean_Total_timeDelta_Seconds__quantile__q_0.8,real,1.802179e-20,True
mean_Total_timeDelta_Seconds__c3__lag_1,mean_Total_timeDelta_Seconds__c3__lag_1,real,1.548177e-20,True
mean_Total_timeDelta_Seconds__quantile__q_0.9,mean_Total_timeDelta_Seconds__quantile__q_0.9,real,1.141639e-20,True


In [29]:
ts_extracted_dataset_filtered = ts_extracted_dataset[relevant_features_list].copy()
ts_extracted_dataset_filtered = ts_extracted_dataset_filtered.loc[:,~ts_extracted_dataset_filtered.columns.duplicated()]
ts_extracted_dataset = ts_extracted_dataset_filtered
ts_extracted_dataset

Unnamed: 0,"mean_Total_timeDelta_Seconds__agg_autocorrelation__f_agg_""mean""__maxlag_40","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_4",mean_Total_timeDelta_Seconds__fourier_entropy__bins_3,"mean_Total_timeDelta_Seconds__fft_coefficient__attr_""imag""__coeff_62","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_42","mean_Total_timeDelta_Seconds__augmented_dickey_fuller__attr_""usedlag""__autolag_""AIC""","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_47","mean_Total_timeDelta_Seconds__cwt_coefficients__coeff_11__w_2__widths_(2, 5, 10, 20)",mean_Total_timeDelta_Seconds__count_below_mean,"mean_Total_timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.2",...,"Total_MilkProduction__fft_coefficient__attr_""abs""__coeff_52","mean_Total_timeDelta_Seconds__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)","DaysInMilk__cwt_coefficients__coeff_9__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_9","DaysInMilk__linear_trend__attr_""slope""","milking_times__fft_coefficient__attr_""imag""__coeff_3",milking_times__number_peaks__n_1,"DaysInMilk__fft_aggregated__aggtype_""variance""",milking_times__range_count__max_1000000000000.0__min_0
1,0.003972,4187.710460,0.581697,3923.872462,-123.714475,0.0,-3888.326689,250.544085,149.0,-16.343177,...,48.339380,71.497131,-67.725053,18.636641,11606.995536,1.000000,-21.548927,42.0,745.432361,235.0
2,0.120248,222539.188202,0.079983,-82026.109262,35490.309335,11.0,46926.875839,-10821.037364,212.0,-82.756629,...,94.263743,30075.177830,-48.029372,13.218712,132876.171522,1.000395,0.823698,60.0,973.910924,273.0
3,0.015946,-224.171444,0.784231,3038.177349,-580.228712,0.0,-1053.107577,62.328381,138.0,-2.830574,...,44.892958,104.698164,-14.720744,42.767179,3604.863734,1.045951,-23.572793,43.0,709.926303,229.0
4,0.134369,108371.270016,0.352048,-39565.189058,454.698406,7.0,16273.035910,-5543.615052,137.0,-422.251908,...,49.290444,18491.140538,-96.747276,16.223657,53876.000953,1.352231,4.111531,32.0,562.015415,205.0
5,-0.015854,38005.539789,0.175154,-23347.836788,-5966.666667,7.0,-10240.534729,-3134.827659,109.0,18.129630,...,38.360916,4720.953188,-388.977242,11.888515,13469.332624,2.063330,-1.809133,23.0,361.135425,168.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,0.000537,-8603.674903,0.720942,21478.469414,13061.451073,0.0,2727.177138,-200.046947,106.0,-91.185535,...,49.339120,1664.727919,-255.204530,10.993965,9070.008074,1.854047,5.011627,19.0,359.236321,170.0
115,-0.021485,-11465.920324,0.551301,5215.887294,-1843.214649,1.0,-9554.125232,804.035438,110.0,45.557895,...,17.277897,815.343022,-66.044619,20.589572,-14258.763424,1.000000,-9.161134,35.0,431.920583,176.0
116,0.057784,-16817.107452,0.090729,1425.391553,26358.884169,6.0,-11580.027631,2975.905339,191.0,28.208551,...,22.028089,10.510707,-69.302967,27.350405,-69524.153293,1.000000,-8.309385,56.0,1203.710592,300.0
117,0.036840,57891.599113,0.575647,-4525.049671,-22914.328600,4.0,-8705.048678,-1053.031645,194.0,28.689551,...,46.174371,5943.529583,-43.863195,6.613566,-15022.678568,1.009925,7.384548,39.0,1222.193462,302.0


In [30]:
ts_extracted_id = cow_total[['id']].copy()
ts_extracted_id.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_id.index = range(1, len(ts_extracted_id)+1)
ts_extracted_dataset = pd.concat([ts_extracted_id, ts_extracted_dataset], axis=1)
ts_extracted_dataset

Unnamed: 0,id,"mean_Total_timeDelta_Seconds__agg_autocorrelation__f_agg_""mean""__maxlag_40","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_4",mean_Total_timeDelta_Seconds__fourier_entropy__bins_3,"mean_Total_timeDelta_Seconds__fft_coefficient__attr_""imag""__coeff_62","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_42","mean_Total_timeDelta_Seconds__augmented_dickey_fuller__attr_""usedlag""__autolag_""AIC""","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_47","mean_Total_timeDelta_Seconds__cwt_coefficients__coeff_11__w_2__widths_(2, 5, 10, 20)",mean_Total_timeDelta_Seconds__count_below_mean,...,"Total_MilkProduction__fft_coefficient__attr_""abs""__coeff_52","mean_Total_timeDelta_Seconds__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)","DaysInMilk__cwt_coefficients__coeff_9__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_9","DaysInMilk__linear_trend__attr_""slope""","milking_times__fft_coefficient__attr_""imag""__coeff_3",milking_times__number_peaks__n_1,"DaysInMilk__fft_aggregated__aggtype_""variance""",milking_times__range_count__max_1000000000000.0__min_0
1,1,0.003972,4187.710460,0.581697,3923.872462,-123.714475,0.0,-3888.326689,250.544085,149.0,...,48.339380,71.497131,-67.725053,18.636641,11606.995536,1.000000,-21.548927,42.0,745.432361,235.0
2,2,0.120248,222539.188202,0.079983,-82026.109262,35490.309335,11.0,46926.875839,-10821.037364,212.0,...,94.263743,30075.177830,-48.029372,13.218712,132876.171522,1.000395,0.823698,60.0,973.910924,273.0
3,3,0.015946,-224.171444,0.784231,3038.177349,-580.228712,0.0,-1053.107577,62.328381,138.0,...,44.892958,104.698164,-14.720744,42.767179,3604.863734,1.045951,-23.572793,43.0,709.926303,229.0
4,4,0.134369,108371.270016,0.352048,-39565.189058,454.698406,7.0,16273.035910,-5543.615052,137.0,...,49.290444,18491.140538,-96.747276,16.223657,53876.000953,1.352231,4.111531,32.0,562.015415,205.0
5,5,-0.015854,38005.539789,0.175154,-23347.836788,-5966.666667,7.0,-10240.534729,-3134.827659,109.0,...,38.360916,4720.953188,-388.977242,11.888515,13469.332624,2.063330,-1.809133,23.0,361.135425,168.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,114,0.000537,-8603.674903,0.720942,21478.469414,13061.451073,0.0,2727.177138,-200.046947,106.0,...,49.339120,1664.727919,-255.204530,10.993965,9070.008074,1.854047,5.011627,19.0,359.236321,170.0
115,115,-0.021485,-11465.920324,0.551301,5215.887294,-1843.214649,1.0,-9554.125232,804.035438,110.0,...,17.277897,815.343022,-66.044619,20.589572,-14258.763424,1.000000,-9.161134,35.0,431.920583,176.0
116,116,0.057784,-16817.107452,0.090729,1425.391553,26358.884169,6.0,-11580.027631,2975.905339,191.0,...,22.028089,10.510707,-69.302967,27.350405,-69524.153293,1.000000,-8.309385,56.0,1203.710592,300.0
117,117,0.036840,57891.599113,0.575647,-4525.049671,-22914.328600,4.0,-8705.048678,-1053.031645,194.0,...,46.174371,5943.529583,-43.863195,6.613566,-15022.678568,1.009925,7.384548,39.0,1222.193462,302.0


In [31]:
ts_extracted_features = ts_extracted_dataset.iloc[:, 1:len(ts_extracted_dataset.columns)].copy()
# normalize numerical features
ts_extracted_cols = ts_extracted_features.columns
scaler_std = StandardScaler()
ts_std = scaler_std.fit_transform(ts_extracted_features)
# transform standard data into dataframe
ts_extracted_processed = pd.DataFrame(ts_std, columns=ts_extracted_cols)
ts_extracted_processed.index = range(1,len(ts_extracted_processed)+1)
# append id col to the dataframe
ts_extracted_processed = pd.concat([ts_extracted_dataset['id'], ts_extracted_processed], axis=1)

In [32]:
ts_extracted_processed

Unnamed: 0,id,"mean_Total_timeDelta_Seconds__agg_autocorrelation__f_agg_""mean""__maxlag_40","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_4",mean_Total_timeDelta_Seconds__fourier_entropy__bins_3,"mean_Total_timeDelta_Seconds__fft_coefficient__attr_""imag""__coeff_62","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_42","mean_Total_timeDelta_Seconds__augmented_dickey_fuller__attr_""usedlag""__autolag_""AIC""","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_47","mean_Total_timeDelta_Seconds__cwt_coefficients__coeff_11__w_2__widths_(2, 5, 10, 20)",mean_Total_timeDelta_Seconds__count_below_mean,...,"Total_MilkProduction__fft_coefficient__attr_""abs""__coeff_52","mean_Total_timeDelta_Seconds__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)","DaysInMilk__cwt_coefficients__coeff_9__w_20__widths_(2, 5, 10, 20)","Total_MilkProduction__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_9","DaysInMilk__linear_trend__attr_""slope""","milking_times__fft_coefficient__attr_""imag""__coeff_3",milking_times__number_peaks__n_1,"DaysInMilk__fft_aggregated__aggtype_""variance""",milking_times__range_count__max_1000000000000.0__min_0
1,1,-0.651289,-0.263711,0.810985,0.373440,-0.265878,-1.009695,-0.365139,0.313452,-0.109891,...,-0.462261,-0.518093,-0.062967,0.253646,0.170742,-0.224828,-1.208569,-0.032362,-0.188563,-0.045738
2,2,0.446625,3.487119,-1.284190,-3.570144,1.120343,1.117877,1.847772,-1.901007,0.998566,...,0.636782,3.050002,0.117409,-0.341874,4.293424,-0.224552,0.208745,1.025141,0.235045,0.414106
3,3,-0.538228,-0.339498,1.656774,0.332803,-0.283647,-1.009695,-0.241670,0.275806,-0.303432,...,-0.544739,-0.514144,0.422453,2.905989,-0.101300,-0.192719,-1.336781,0.026388,-0.254393,-0.118345
4,4,0.579957,1.525949,-0.148035,-1.621939,-0.243364,0.344214,0.512852,-0.845455,-0.321026,...,-0.439500,1.672406,-0.328756,-0.011581,1.607724,0.021291,0.417031,-0.619864,-0.528625,-0.408773
5,5,-0.838488,0.317210,-0.886752,-0.877849,-0.493306,0.344214,-0.641766,-0.363666,-0.813674,...,-0.701061,0.034829,-3.005034,-0.488084,0.234054,0.518167,0.041955,-1.148616,-0.901064,-0.856517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,114,-0.683721,-0.483440,1.392475,1.178886,0.247334,-1.009695,-0.077045,0.223328,-0.866457,...,-0.438336,-0.328623,-1.779927,-0.586410,0.084494,0.371932,0.474052,-1.383617,-0.904585,-0.832314
115,115,-0.891655,-0.532608,0.684050,0.432721,-0.332807,-0.816279,-0.611874,0.424157,-0.796079,...,-1.205612,-0.429633,-0.047577,0.468305,-0.708594,-0.224828,-0.423797,-0.443614,-0.769826,-0.759707
116,116,-0.143176,-0.624530,-1.239313,0.258804,0.764916,0.150799,-0.700099,0.858559,0.629080,...,-1.091932,-0.525345,-0.077418,1.211432,-2.587403,-0.224828,-0.369839,0.790140,0.661102,0.740838
117,117,-0.340932,0.658812,0.785719,-0.014216,-1.152967,-0.236033,-0.574898,0.052720,0.681864,...,-0.514073,0.180220,0.155563,-1.067888,-0.734564,-0.217892,0.624378,-0.208613,0.695370,0.765040


In [17]:
from tsfresh.feature_extraction.settings import from_columns
para_dict = from_columns(ts_extracted_features.columns)
para_dict

{'Total_timeDelta_Seconds': {'sum_of_reoccurring_data_points': None,
  'sum_of_reoccurring_values': None,
  'agg_linear_trend': [{'attr': 'rvalue', 'chunk_len': 10, 'f_agg': 'mean'},
   {'attr': 'slope', 'chunk_len': 10, 'f_agg': 'min'},
   {'attr': 'rvalue', 'chunk_len': 50, 'f_agg': 'min'},
   {'attr': 'rvalue', 'chunk_len': 5, 'f_agg': 'mean'}],
  'autocorrelation': [{'lag': 9}, {'lag': 5}, {'lag': 7}, {'lag': 8}],
  'sample_entropy': None,
  'fft_coefficient': [{'attr': 'real', 'coeff': 60},
   {'attr': 'imag', 'coeff': 62},
   {'attr': 'abs', 'coeff': 54},
   {'attr': 'imag', 'coeff': 37},
   {'attr': 'imag', 'coeff': 12},
   {'attr': 'imag', 'coeff': 11},
   {'attr': 'abs', 'coeff': 29},
   {'attr': 'abs', 'coeff': 96}],
  'cwt_coefficients': [{'coeff': 13, 'w': 2, 'widths': (2, 5, 10, 20)},
   {'coeff': 9, 'w': 5, 'widths': (2, 5, 10, 20)}],
  'last_location_of_maximum': None,
  'index_mass_quantile': [{'q': 0.9}, {'q': 0.7}, {'q': 0.1}],
  'number_peaks': [{'n': 50}],
  'first_

In [99]:
ts_extracted_dataset.columns

Index(['id', 'DaysInMilk__quantile__q_0.9',
       'DaysInMilk__fft_coefficient__attr_"abs"__coeff_87',
       'DaysInMilk__agg_linear_trend__attr_"rvalue"__chunk_len_50__f_agg_"var"',
       'DaysInMilk__fft_coefficient__attr_"angle"__coeff_82',
       'DaysInMilk__fft_coefficient__attr_"real"__coeff_80',
       'DaysInMilk__fft_coefficient__attr_"real"__coeff_79',
       'DaysInMilk__mean_change',
       'DaysInMilk__change_quantiles__f_agg_"mean"__isabs_False__qh_1.0__ql_0.0',
       'DaysInMilk__mean_abs_change',
       ...
       'DaysInMilk__linear_trend__attr_"slope"',
       'DaysInMilk__change_quantiles__f_agg_"mean"__isabs_False__qh_1.0__ql_0.8',
       'DaysInMilk__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.8',
       'DaysInMilk__approximate_entropy__m_2__r_0.5',
       'DaysInMilk__fft_coefficient__attr_"imag"__coeff_82',
       'DaysInMilk__fft_coefficient__attr_"real"__coeff_24',
       'DaysInMilk__fft_coefficient__attr_"real"__coeff_25',
       'DaysInMil

In [33]:
cow_total.BreedName.unique()

array([ 1,  2,  4, 99], dtype=int64)

In [34]:
# add one-hot encoded categorical features
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
cow_breed = cow_total[['id', 'BreedName']].copy()
cow_breed.drop_duplicates(subset=['id'], inplace=True)
cat = ohe.fit_transform(np.array(cow_breed['BreedName']).reshape(-1, 1))
col_names = ohe.get_feature_names_out(['BreedName'])
cat_breed = pd.DataFrame(cat, columns=col_names)
cat_breed.index = range(1,len(cow_breed)+1)
cat_breed

Unnamed: 0,BreedName_1,BreedName_2,BreedName_4,BreedName_99
1,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0
5,0.0,1.0,0.0,0.0
...,...,...,...,...
114,0.0,1.0,0.0,0.0
115,1.0,0.0,0.0,0.0
116,0.0,1.0,0.0,0.0
117,1.0,0.0,0.0,0.0


In [35]:
ts_dataset = pd.concat([ts_extracted_processed, cat_breed], axis=1)
ts_dataset = pd.concat([ts_dataset, y], axis=1)
ts_dataset.to_csv(dataDir.parent/"learner_118_new_meanTimeCost.csv", index=False)
ts_dataset

Unnamed: 0,id,"mean_Total_timeDelta_Seconds__agg_autocorrelation__f_agg_""mean""__maxlag_40","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_4",mean_Total_timeDelta_Seconds__fourier_entropy__bins_3,"mean_Total_timeDelta_Seconds__fft_coefficient__attr_""imag""__coeff_62","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_42","mean_Total_timeDelta_Seconds__augmented_dickey_fuller__attr_""usedlag""__autolag_""AIC""","mean_Total_timeDelta_Seconds__fft_coefficient__attr_""real""__coeff_47","mean_Total_timeDelta_Seconds__cwt_coefficients__coeff_11__w_2__widths_(2, 5, 10, 20)",mean_Total_timeDelta_Seconds__count_below_mean,...,"DaysInMilk__linear_trend__attr_""slope""","milking_times__fft_coefficient__attr_""imag""__coeff_3",milking_times__number_peaks__n_1,"DaysInMilk__fft_aggregated__aggtype_""variance""",milking_times__range_count__max_1000000000000.0__min_0,BreedName_1,BreedName_2,BreedName_4,BreedName_99,label
1,1,-0.651289,-0.263711,0.810985,0.373440,-0.265878,-1.009695,-0.365139,0.313452,-0.109891,...,-0.224828,-1.208569,-0.032362,-0.188563,-0.045738,1.0,0.0,0.0,0.0,1
2,2,0.446625,3.487119,-1.284190,-3.570144,1.120343,1.117877,1.847772,-1.901007,0.998566,...,-0.224552,0.208745,1.025141,0.235045,0.414106,0.0,1.0,0.0,0.0,0
3,3,-0.538228,-0.339498,1.656774,0.332803,-0.283647,-1.009695,-0.241670,0.275806,-0.303432,...,-0.192719,-1.336781,0.026388,-0.254393,-0.118345,1.0,0.0,0.0,0.0,1
4,4,0.579957,1.525949,-0.148035,-1.621939,-0.243364,0.344214,0.512852,-0.845455,-0.321026,...,0.021291,0.417031,-0.619864,-0.528625,-0.408773,0.0,0.0,1.0,0.0,0
5,5,-0.838488,0.317210,-0.886752,-0.877849,-0.493306,0.344214,-0.641766,-0.363666,-0.813674,...,0.518167,0.041955,-1.148616,-0.901064,-0.856517,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,114,-0.683721,-0.483440,1.392475,1.178886,0.247334,-1.009695,-0.077045,0.223328,-0.866457,...,0.371932,0.474052,-1.383617,-0.904585,-0.832314,0.0,1.0,0.0,0.0,1
115,115,-0.891655,-0.532608,0.684050,0.432721,-0.332807,-0.816279,-0.611874,0.424157,-0.796079,...,-0.224828,-0.423797,-0.443614,-0.769826,-0.759707,1.0,0.0,0.0,0.0,1
116,116,-0.143176,-0.624530,-1.239313,0.258804,0.764916,0.150799,-0.700099,0.858559,0.629080,...,-0.224828,-0.369839,0.790140,0.661102,0.740838,0.0,1.0,0.0,0.0,0
117,117,-0.340932,0.658812,0.785719,-0.014216,-1.152967,-0.236033,-0.574898,0.052720,0.681864,...,-0.217892,0.624378,-0.208613,0.695370,0.765040,1.0,0.0,0.0,0.0,1


In [29]:
from tsfresh.feature_selection.relevance import calculate_relevance_table
rt = calculate_relevance_table(extracted_features, y)
rt

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
timeDelta_Seconds__ratio_value_number_to_time_series_length,timeDelta_Seconds__ratio_value_number_to_time_...,real,0.001554,False
timeDelta_Seconds__percentage_of_reoccurring_datapoints_to_all_datapoints,timeDelta_Seconds__percentage_of_reoccurring_d...,real,0.001554,False
"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""va...",real,0.001554,False
"timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""me...",real,0.001554,False
timeDelta_Seconds__quantile__q_0.4,timeDelta_Seconds__quantile__q_0.4,real,0.001554,False
...,...,...,...,...
timeDelta_Seconds__number_crossing_m__m_-1,timeDelta_Seconds__number_crossing_m__m_-1,constant,,False
timeDelta_Seconds__number_crossing_m__m_1,timeDelta_Seconds__number_crossing_m__m_1,constant,,False
timeDelta_Seconds__count_above__t_0,timeDelta_Seconds__count_above__t_0,constant,,False
timeDelta_Seconds__count_below__t_0,timeDelta_Seconds__count_below__t_0,constant,,False
