In [1]:
from pathlib import Path
from matplotlib.style import use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters
from tsfresh.feature_selection.relevance import calculate_relevance_table

dataDir = Path.cwd().parent.parent/'Data/processed/learner_targetCows/'
usecols = ['id', 'FarmName_Pseudo', 'Gigacow_Cow_Id', 'Total_MilkProduction', 'Total_timeDelta_Seconds', 'LactationNumber', 'DaysInMilk', 'BreedName', 'Age', 'milking_times', 'MilkingDate']
threshold_time = 1200

# labeling cow with fast/slow learner(1/0)
def labeling_data(threshold_time, cow_total): 
    '''
    func: labeling learner dataset
    args: 
        threshold_time: threshold for milking time cost
        cow_total: A dataframe contains all data points for a single cow
    return: learner dataset with label
    '''
    global learner
    total_timeCost = cow_total.Total_timeDelta_Seconds.sum()
    totalEvents = cow_total.milking_times.sum()
    mean_timeCost = total_timeCost/totalEvents
    if mean_timeCost < threshold_time:
        learner = 1 # fast learner
    else:
        learner = 0 # slow learner
    cow_total['label'] = learner
    return cow_total

In [2]:
# integrate all the cows data into one dataset
filelist = list(Path(dataDir).glob('*.csv'))
for i, _ in enumerate(filelist):
    fileName = 'cow_' + str(i) + '.csv'
    single_cow = pd.read_csv(dataDir/fileName, encoding='utf-8', usecols=usecols)
    single_cow = labeling_data(threshold_time, single_cow)
    single_cow.sort_values(by=['MilkingDate'], inplace=True)
    if i == 0:
        cow_total = single_cow
    else:
        cow_total = pd.concat([cow_total, single_cow], axis=0, ignore_index=True)
cow_total.to_csv(dataDir.parent/"Cow_Learner_dataset_L1.csv", index=False)
cow_total

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id,label
0,a624fb9a,2560,2022-02-14,1.0,2.0,1,2.15,17.84,3528.0,2,1,1
1,a624fb9a,2560,2022-02-15,1.0,3.0,1,2.15,14.02,10271.0,2,1,1
2,a624fb9a,2560,2022-02-16,1.0,4.0,1,2.15,24.07,11297.0,4,1,1
3,a624fb9a,2560,2022-02-17,1.0,5.0,1,2.15,21.00,3608.0,4,1,1
4,a624fb9a,2560,2022-02-18,1.0,6.0,1,2.16,21.91,2313.0,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
21128,a624fb9a,2047,2022-10-02,1.0,281.0,1,2.91,18.20,2901.0,2,81,1
21129,a624fb9a,2047,2022-10-03,1.0,282.0,1,2.92,21.58,36.0,3,81,1
21130,a624fb9a,2047,2022-10-04,1.0,283.0,1,2.92,14.69,213.0,2,81,1
21131,a624fb9a,2047,2022-10-05,1.0,284.0,1,2.92,18.00,34.0,2,81,1


In [3]:
cow_total['mean_Total_timeDelta_Seconds'] = cow_total.Total_timeDelta_Seconds/cow_total.milking_times
cow_total

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id,label,mean_Total_timeDelta_Seconds
0,a624fb9a,2560,2022-02-14,1.0,2.0,1,2.15,17.84,3528.0,2,1,1,1764.00
1,a624fb9a,2560,2022-02-15,1.0,3.0,1,2.15,14.02,10271.0,2,1,1,5135.50
2,a624fb9a,2560,2022-02-16,1.0,4.0,1,2.15,24.07,11297.0,4,1,1,2824.25
3,a624fb9a,2560,2022-02-17,1.0,5.0,1,2.15,21.00,3608.0,4,1,1,902.00
4,a624fb9a,2560,2022-02-18,1.0,6.0,1,2.16,21.91,2313.0,3,1,1,771.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21128,a624fb9a,2047,2022-10-02,1.0,281.0,1,2.91,18.20,2901.0,2,81,1,1450.50
21129,a624fb9a,2047,2022-10-03,1.0,282.0,1,2.92,21.58,36.0,3,81,1,12.00
21130,a624fb9a,2047,2022-10-04,1.0,283.0,1,2.92,14.69,213.0,2,81,1,106.50
21131,a624fb9a,2047,2022-10-05,1.0,284.0,1,2.92,18.00,34.0,2,81,1,17.00


In [4]:
#df_static_features = cow_total[['FarmName_Pseudo']]
static_cols = ['FarmName_Pseudo', 'TrafficDeviceName', 'LactationNumber', 'BreedName']
#timeSeries_cols = ['Age', 'Total_MilkProduction', 'Total_timeDelta_Seconds', 'DaysInMilk', 'milking_times']
timeSeries_cols = ['Age', 'Total_MilkProduction', 'mean_Total_timeDelta_Seconds', 'DaysInMilk', 'milking_times']
output_col = ['label']

In [5]:
cow_label = cow_total[['id', 'label']].copy()
cow_timeseries = cow_total[['id', 'MilkingDate']].copy()
cow_timeseries.index = range(len(cow_timeseries))
# fetch y for feature extraction
y = cow_label.drop_duplicates(subset=['id'])
y = y["label"]
y.index = range(1,len(y)+1) 
y

1     1
2     0
3     1
4     0
5     1
     ..
77    1
78    0
79    0
80    0
81    1
Name: label, Length: 81, dtype: int64

In [6]:
y.value_counts()

0    44
1    37
Name: label, dtype: int64

In [86]:
#dataDir1 = Path.cwd().parent.parent.parent/'Data/processed/memory_targetCows/'
#cow_total = pd.read_csv(dataDir1/"cow_total/cow_total_1.csv", encoding='utf-8', usecols=usecols)

In [7]:
ts_extracted_dataset = cow_total[['id']].copy()
ts_extracted_dataset.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_dataset.index = range(1, len(ts_extracted_dataset)+1)

In [8]:
ts_processed = pd.DataFrame(cow_total[timeSeries_cols].copy())
ts_processed.index = range(0,len(ts_processed)) 
ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
ts_processed

Unnamed: 0,id,MilkingDate,Age,Total_MilkProduction,mean_Total_timeDelta_Seconds,DaysInMilk,milking_times
0,1,2022-02-14,2.15,17.84,1764.00,2.0,2
1,1,2022-02-15,2.15,14.02,5135.50,3.0,2
2,1,2022-02-16,2.15,24.07,2824.25,4.0,4
3,1,2022-02-17,2.15,21.00,902.00,5.0,4
4,1,2022-02-18,2.16,21.91,771.00,6.0,3
...,...,...,...,...,...,...,...
21128,81,2022-10-02,2.91,18.20,1450.50,281.0,2
21129,81,2022-10-03,2.92,21.58,12.00,282.0,3
21130,81,2022-10-04,2.92,14.69,106.50,283.0,2
21131,81,2022-10-05,2.92,18.00,17.00,284.0,2


In [37]:
'''
from tsfresh.feature_extraction import extract_features, EfficientFCParameters
extracted_dataset = extract_features(ts_processed, column_id="id", column_sort="MilkingDate", default_fc_parameters=EfficientFCParameters())
impute(extracted_dataset)
features_filtered = select_features(extracted_dataset, y)
ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)
'''

Feature Extraction: 100%|██████████| 39/39 [00:10<00:00,  3.65it/s]
 'Total_MilkProduction__query_similarity_count__query_None__threshold_0.0'
 'Total_timeDelta_Seconds__query_similarity_count__query_None__threshold_0.0'
 'DaysInMilk__query_similarity_count__query_None__threshold_0.0'
 'milking_times__friedrich_coefficients__coeff_0__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_1__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_2__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_3__m_3__r_30'
 'milking_times__max_langevin_fixed_point__m_3__r_30'
 'milking_times__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


In [9]:
settings_1 = MinimalFCParameters() 
#settings_2 = ComprehensiveFCParameters

for i, col in enumerate(timeSeries_cols):
    ts_processed = pd.DataFrame(cow_total[col].copy())
    ts_processed.index = range(0,len(ts_processed)) 
    ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
    print(ts_processed)
    # extract time series features
    extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingDate", default_fc_parameters=settings_1)
    #extracted_features.dropna(axis=1, inplace=True)
    impute(extracted_features)
    
    # select most relevant features based on relevance table
    if i == 0:
        temp = calculate_relevance_table(extracted_features, y)
        relevance_table = temp
    else:
        temp = calculate_relevance_table(extracted_features, y)
        relevance_table = pd.concat([relevance_table, temp], axis=0)
    features_filtered = select_features(extracted_features, y)
    ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)



       id MilkingDate   Age
0       1  2022-02-14  2.15
1       1  2022-02-15  2.15
2       1  2022-02-16  2.15
3       1  2022-02-17  2.15
4       1  2022-02-18  2.16
...    ..         ...   ...
21128  81  2022-10-02  2.91
21129  81  2022-10-03  2.92
21130  81  2022-10-04  2.92
21131  81  2022-10-05  2.92
21132  81  2022-10-06  2.92

[21133 rows x 3 columns]


Feature Extraction: 100%|██████████| 27/27 [00:05<00:00,  4.86it/s]


       id MilkingDate  Total_MilkProduction
0       1  2022-02-14                 17.84
1       1  2022-02-15                 14.02
2       1  2022-02-16                 24.07
3       1  2022-02-17                 21.00
4       1  2022-02-18                 21.91
...    ..         ...                   ...
21128  81  2022-10-02                 18.20
21129  81  2022-10-03                 21.58
21130  81  2022-10-04                 14.69
21131  81  2022-10-05                 18.00
21132  81  2022-10-06                 22.50

[21133 rows x 3 columns]


Feature Extraction: 100%|██████████| 27/27 [00:05<00:00,  4.99it/s]


       id MilkingDate  mean_Total_timeDelta_Seconds
0       1  2022-02-14                       1764.00
1       1  2022-02-15                       5135.50
2       1  2022-02-16                       2824.25
3       1  2022-02-17                        902.00
4       1  2022-02-18                        771.00
...    ..         ...                           ...
21128  81  2022-10-02                       1450.50
21129  81  2022-10-03                         12.00
21130  81  2022-10-04                        106.50
21131  81  2022-10-05                         17.00
21132  81  2022-10-06                        302.00

[21133 rows x 3 columns]


Feature Extraction: 100%|██████████| 27/27 [00:04<00:00,  5.48it/s]


       id MilkingDate  DaysInMilk
0       1  2022-02-14         2.0
1       1  2022-02-15         3.0
2       1  2022-02-16         4.0
3       1  2022-02-17         5.0
4       1  2022-02-18         6.0
...    ..         ...         ...
21128  81  2022-10-02       281.0
21129  81  2022-10-03       282.0
21130  81  2022-10-04       283.0
21131  81  2022-10-05       284.0
21132  81  2022-10-06       285.0

[21133 rows x 3 columns]


Feature Extraction: 100%|██████████| 27/27 [00:04<00:00,  5.44it/s]


       id MilkingDate  milking_times
0       1  2022-02-14              2
1       1  2022-02-15              2
2       1  2022-02-16              4
3       1  2022-02-17              4
4       1  2022-02-18              3
...    ..         ...            ...
21128  81  2022-10-02              2
21129  81  2022-10-03              3
21130  81  2022-10-04              2
21131  81  2022-10-05              2
21132  81  2022-10-06              3

[21133 rows x 3 columns]


Feature Extraction: 100%|██████████| 27/27 [00:05<00:00,  5.34it/s]


In [11]:
# Select 30 most relevant features from the relevance table
relevance_table = relevance_table[relevance_table.relevant]
relevance_table.sort_values("p_value", ascending=False, inplace=True)
relevant_features_list = list(relevance_table.feature[:])
relevance_table

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
milking_times__variance,milking_times__variance,real,0.0003983528,True
milking_times__standard_deviation,milking_times__standard_deviation,real,0.0003983528,True
milking_times__maximum,milking_times__maximum,real,6.761171e-06,True
milking_times__absolute_maximum,milking_times__absolute_maximum,real,6.761171e-06,True
mean_Total_timeDelta_Seconds__minimum,mean_Total_timeDelta_Seconds__minimum,real,4.543933e-06,True
milking_times__median,milking_times__median,real,4.402639e-06,True
milking_times__sum_values,milking_times__sum_values,real,2.280688e-08,True
milking_times__root_mean_square,milking_times__root_mean_square,real,1.17819e-08,True
milking_times__mean,milking_times__mean,real,1.114164e-08,True
mean_Total_timeDelta_Seconds__sum_values,mean_Total_timeDelta_Seconds__sum_values,real,6.268754e-11,True


In [12]:
ts_extracted_dataset_filtered = ts_extracted_dataset[relevant_features_list].copy()
ts_extracted_dataset_filtered = ts_extracted_dataset_filtered.loc[:,~ts_extracted_dataset_filtered.columns.duplicated()]
ts_extracted_dataset = ts_extracted_dataset_filtered
ts_extracted_dataset

Unnamed: 0,milking_times__variance,milking_times__standard_deviation,milking_times__maximum,milking_times__absolute_maximum,mean_Total_timeDelta_Seconds__minimum,milking_times__median,milking_times__sum_values,milking_times__root_mean_square,milking_times__mean,mean_Total_timeDelta_Seconds__sum_values,mean_Total_timeDelta_Seconds__maximum,mean_Total_timeDelta_Seconds__absolute_maximum,mean_Total_timeDelta_Seconds__variance,mean_Total_timeDelta_Seconds__standard_deviation,mean_Total_timeDelta_Seconds__median,mean_Total_timeDelta_Seconds__root_mean_square,mean_Total_timeDelta_Seconds__mean
1,0.813291,0.901827,6.0,6.0,5.5,3.0,726.0,3.218299,3.089362,1.910528e+05,5135.500000,5135.500000,5.693446e+05,754.549301,600.250000,1109.188339,812.990851
2,0.285875,0.534673,4.0,4.0,33.0,2.0,612.0,2.304638,2.241758,7.510155e+05,54718.000000,54718.000000,3.696336e+07,6079.749951,1247.500000,6673.170859,2750.972527
3,0.704868,0.839564,6.0,6.0,14.0,3.0,781.0,3.512299,3.410480,7.453965e+04,1301.500000,1301.500000,5.797373e+04,240.777349,261.666667,404.875793,325.500655
4,0.302058,0.549598,3.0,3.0,14.0,1.0,291.0,1.522194,1.419512,1.242360e+06,34143.000000,34143.000000,4.660101e+07,6826.493287,3324.000000,9128.426362,6060.293496
5,0.487961,0.698542,5.0,5.0,7.0,3.0,573.0,2.894383,2.808824,1.632220e+05,5373.333333,5373.333333,6.147114e+05,784.035332,555.083333,1120.215917,800.107680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,0.989012,0.994491,6.0,6.0,11.0,3.0,761.0,2.979016,2.808118,1.299012e+05,3154.500000,3154.500000,2.296925e+05,479.262416,343.750000,677.834471,479.340283
78,0.432099,0.657342,5.0,5.0,26.0,2.0,616.0,2.237620,2.138889,1.762428e+06,43576.000000,43576.000000,4.738766e+07,6883.869517,3920.000000,9210.671351,6119.543056
79,0.478889,0.692018,5.0,5.0,7.0,3.0,790.0,2.722744,2.633333,5.034337e+05,7698.000000,7698.000000,2.569374e+06,1602.926718,1208.000000,2320.653896,1678.112167
80,0.560337,0.748557,4.0,4.0,25.0,2.0,705.0,2.451517,2.334437,4.197974e+05,16792.000000,16792.000000,2.635619e+06,1623.459055,928.166667,2137.259842,1390.057671


In [13]:
ts_extracted_id = cow_total[['id']].copy()
ts_extracted_id.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_id.index = range(1, len(ts_extracted_id)+1)
ts_extracted_dataset = pd.concat([ts_extracted_id, ts_extracted_dataset], axis=1)
ts_extracted_dataset

Unnamed: 0,id,milking_times__variance,milking_times__standard_deviation,milking_times__maximum,milking_times__absolute_maximum,mean_Total_timeDelta_Seconds__minimum,milking_times__median,milking_times__sum_values,milking_times__root_mean_square,milking_times__mean,mean_Total_timeDelta_Seconds__sum_values,mean_Total_timeDelta_Seconds__maximum,mean_Total_timeDelta_Seconds__absolute_maximum,mean_Total_timeDelta_Seconds__variance,mean_Total_timeDelta_Seconds__standard_deviation,mean_Total_timeDelta_Seconds__median,mean_Total_timeDelta_Seconds__root_mean_square,mean_Total_timeDelta_Seconds__mean
1,1,0.813291,0.901827,6.0,6.0,5.5,3.0,726.0,3.218299,3.089362,1.910528e+05,5135.500000,5135.500000,5.693446e+05,754.549301,600.250000,1109.188339,812.990851
2,2,0.285875,0.534673,4.0,4.0,33.0,2.0,612.0,2.304638,2.241758,7.510155e+05,54718.000000,54718.000000,3.696336e+07,6079.749951,1247.500000,6673.170859,2750.972527
3,3,0.704868,0.839564,6.0,6.0,14.0,3.0,781.0,3.512299,3.410480,7.453965e+04,1301.500000,1301.500000,5.797373e+04,240.777349,261.666667,404.875793,325.500655
4,4,0.302058,0.549598,3.0,3.0,14.0,1.0,291.0,1.522194,1.419512,1.242360e+06,34143.000000,34143.000000,4.660101e+07,6826.493287,3324.000000,9128.426362,6060.293496
5,5,0.487961,0.698542,5.0,5.0,7.0,3.0,573.0,2.894383,2.808824,1.632220e+05,5373.333333,5373.333333,6.147114e+05,784.035332,555.083333,1120.215917,800.107680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,77,0.989012,0.994491,6.0,6.0,11.0,3.0,761.0,2.979016,2.808118,1.299012e+05,3154.500000,3154.500000,2.296925e+05,479.262416,343.750000,677.834471,479.340283
78,78,0.432099,0.657342,5.0,5.0,26.0,2.0,616.0,2.237620,2.138889,1.762428e+06,43576.000000,43576.000000,4.738766e+07,6883.869517,3920.000000,9210.671351,6119.543056
79,79,0.478889,0.692018,5.0,5.0,7.0,3.0,790.0,2.722744,2.633333,5.034337e+05,7698.000000,7698.000000,2.569374e+06,1602.926718,1208.000000,2320.653896,1678.112167
80,80,0.560337,0.748557,4.0,4.0,25.0,2.0,705.0,2.451517,2.334437,4.197974e+05,16792.000000,16792.000000,2.635619e+06,1623.459055,928.166667,2137.259842,1390.057671


In [14]:
ts_extracted_features = ts_extracted_dataset.iloc[:, 1:len(ts_extracted_dataset.columns)].copy()
# normalize numerical features
ts_extracted_cols = ts_extracted_features.columns
scaler_std = StandardScaler()
ts_std = scaler_std.fit_transform(ts_extracted_features)
# transform standard data into dataframe
ts_extracted_processed = pd.DataFrame(ts_std, columns=ts_extracted_cols)
ts_extracted_processed.index = range(1,len(ts_extracted_processed)+1)
# append id col to the dataframe
ts_extracted_processed = pd.concat([ts_extracted_dataset['id'], ts_extracted_processed], axis=1)

In [15]:
ts_extracted_processed

Unnamed: 0,id,milking_times__variance,milking_times__standard_deviation,milking_times__maximum,milking_times__absolute_maximum,mean_Total_timeDelta_Seconds__minimum,milking_times__median,milking_times__sum_values,milking_times__root_mean_square,milking_times__mean,mean_Total_timeDelta_Seconds__sum_values,mean_Total_timeDelta_Seconds__maximum,mean_Total_timeDelta_Seconds__absolute_maximum,mean_Total_timeDelta_Seconds__variance,mean_Total_timeDelta_Seconds__standard_deviation,mean_Total_timeDelta_Seconds__median,mean_Total_timeDelta_Seconds__root_mean_square,mean_Total_timeDelta_Seconds__mean
1,1,1.088510,1.021468,1.034954,1.034954,-0.356615,0.738213,0.194535,0.902397,0.879292,-0.741387,-0.752622,-0.752622,-0.597270,-0.750556,-0.659154,-0.752443,-0.727357
2,2,-1.309538,-1.310689,-0.711531,-0.711531,0.048705,-0.756669,-0.333179,-0.613560,-0.547124,0.700082,2.905393,2.905393,2.456457,2.204441,-0.045488,1.624051,0.541289
3,3,0.595535,0.625979,1.034954,1.034954,-0.231334,0.738213,0.449134,1.390206,1.419696,-1.041318,-1.035480,-1.035480,-0.640178,-1.035652,-0.980169,-1.053270,-1.046479
4,4,-1.235957,-1.215884,-1.584773,-1.584773,-0.231334,-2.251551,-1.819110,-1.911801,-1.930867,1.964913,1.387445,1.387445,3.265127,2.618815,1.923269,2.672743,2.707645
5,5,-0.390695,-0.269792,0.161712,0.161712,-0.334507,0.738213,-0.513712,0.364952,0.407179,-0.813030,-0.735075,-0.735075,-0.593463,-0.734194,-0.701977,-0.747733,-0.735791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,77,1.887474,1.610070,1.034954,1.034954,-0.275551,0.738213,0.356553,0.505377,0.405992,-0.898805,-0.898773,-0.898773,-0.625769,-0.903315,-0.902345,-0.936683,-0.945772
78,78,-0.644691,-0.531495,0.161712,0.161712,-0.054467,-0.756669,-0.314662,-0.724756,-0.720241,3.303685,2.083377,2.083377,3.331132,2.650653,2.488344,2.707871,2.746431
79,79,-0.431946,-0.311235,0.161712,0.161712,-0.334507,0.738213,0.490796,0.080166,0.111850,0.062751,-0.563570,-0.563570,-0.429453,-0.279785,-0.082938,-0.235001,-0.161029
80,80,-0.061617,0.047899,-0.711531,-0.711531,-0.069206,-0.756669,0.097325,-0.369857,-0.391157,-0.152548,0.107352,0.107352,-0.423894,-0.268391,-0.348252,-0.313332,-0.349596


In [34]:
from tsfresh.feature_extraction.settings import from_columns
para_dict = from_columns(ts_extracted_features.columns)
para_dict

{'DaysInMilk': {'sum_values': None, 'minimum': None, 'length': None},
 'Age': {'sum_values': None, 'length': None},
 'mean_Total_timeDelta_Seconds': {'length': None,
  'minimum': None,
  'sum_values': None,
  'maximum': None,
  'absolute_maximum': None,
  'standard_deviation': None,
  'variance': None,
  'median': None,
  'root_mean_square': None,
  'mean': None},
 'milking_times': {'length': None,
  'variance': None,
  'standard_deviation': None,
  'median': None,
  'sum_values': None,
  'absolute_maximum': None,
  'maximum': None,
  'mean': None,
  'root_mean_square': None},
 'Total_MilkProduction': {'length': None, 'sum_values': None}}

In [35]:
ts_extracted_dataset.columns

Index(['id', 'DaysInMilk__sum_values', 'DaysInMilk__minimum',
       'Age__sum_values', 'Age__length', 'DaysInMilk__length',
       'mean_Total_timeDelta_Seconds__length', 'milking_times__length',
       'Total_MilkProduction__length', 'Total_MilkProduction__sum_values',
       'milking_times__variance', 'milking_times__standard_deviation',
       'milking_times__median', 'milking_times__sum_values',
       'milking_times__absolute_maximum', 'milking_times__maximum',
       'milking_times__mean', 'milking_times__root_mean_square',
       'mean_Total_timeDelta_Seconds__minimum',
       'mean_Total_timeDelta_Seconds__sum_values',
       'mean_Total_timeDelta_Seconds__maximum',
       'mean_Total_timeDelta_Seconds__absolute_maximum',
       'mean_Total_timeDelta_Seconds__standard_deviation',
       'mean_Total_timeDelta_Seconds__variance',
       'mean_Total_timeDelta_Seconds__median',
       'mean_Total_timeDelta_Seconds__root_mean_square',
       'mean_Total_timeDelta_Seconds__mean'],
 

In [16]:
cow_total.BreedName.unique()

array([ 1,  2,  4, 99], dtype=int64)

In [17]:
# add one-hot encoded categorical features
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
cow_breed = cow_total[['id', 'BreedName']].copy()
cow_breed.drop_duplicates(subset=['id'], inplace=True)
cat = ohe.fit_transform(np.array(cow_breed['BreedName']).reshape(-1, 1))
col_names = ohe.get_feature_names_out(['BreedName'])
cat_breed = pd.DataFrame(cat, columns=col_names)
cat_breed.index = range(1,len(cow_breed)+1)
cat_breed

Unnamed: 0,BreedName_1,BreedName_2,BreedName_4,BreedName_99
1,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0
5,0.0,1.0,0.0,0.0
...,...,...,...,...
77,1.0,0.0,0.0,0.0
78,0.0,1.0,0.0,0.0
79,0.0,1.0,0.0,0.0
80,1.0,0.0,0.0,0.0


In [18]:
ts_dataset = pd.concat([ts_extracted_processed, cat_breed], axis=1)
ts_dataset = pd.concat([ts_dataset, y], axis=1)
ts_dataset.to_csv(dataDir.parent/"learner_81_new_meanTimeCost_minPara_1200.csv", index=False)
ts_dataset

Unnamed: 0,id,milking_times__variance,milking_times__standard_deviation,milking_times__maximum,milking_times__absolute_maximum,mean_Total_timeDelta_Seconds__minimum,milking_times__median,milking_times__sum_values,milking_times__root_mean_square,milking_times__mean,...,mean_Total_timeDelta_Seconds__variance,mean_Total_timeDelta_Seconds__standard_deviation,mean_Total_timeDelta_Seconds__median,mean_Total_timeDelta_Seconds__root_mean_square,mean_Total_timeDelta_Seconds__mean,BreedName_1,BreedName_2,BreedName_4,BreedName_99,label
1,1,1.088510,1.021468,1.034954,1.034954,-0.356615,0.738213,0.194535,0.902397,0.879292,...,-0.597270,-0.750556,-0.659154,-0.752443,-0.727357,1.0,0.0,0.0,0.0,1
2,2,-1.309538,-1.310689,-0.711531,-0.711531,0.048705,-0.756669,-0.333179,-0.613560,-0.547124,...,2.456457,2.204441,-0.045488,1.624051,0.541289,0.0,1.0,0.0,0.0,0
3,3,0.595535,0.625979,1.034954,1.034954,-0.231334,0.738213,0.449134,1.390206,1.419696,...,-0.640178,-1.035652,-0.980169,-1.053270,-1.046479,1.0,0.0,0.0,0.0,1
4,4,-1.235957,-1.215884,-1.584773,-1.584773,-0.231334,-2.251551,-1.819110,-1.911801,-1.930867,...,3.265127,2.618815,1.923269,2.672743,2.707645,0.0,0.0,1.0,0.0,0
5,5,-0.390695,-0.269792,0.161712,0.161712,-0.334507,0.738213,-0.513712,0.364952,0.407179,...,-0.593463,-0.734194,-0.701977,-0.747733,-0.735791,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,77,1.887474,1.610070,1.034954,1.034954,-0.275551,0.738213,0.356553,0.505377,0.405992,...,-0.625769,-0.903315,-0.902345,-0.936683,-0.945772,1.0,0.0,0.0,0.0,1
78,78,-0.644691,-0.531495,0.161712,0.161712,-0.054467,-0.756669,-0.314662,-0.724756,-0.720241,...,3.331132,2.650653,2.488344,2.707871,2.746431,0.0,1.0,0.0,0.0,0
79,79,-0.431946,-0.311235,0.161712,0.161712,-0.334507,0.738213,0.490796,0.080166,0.111850,...,-0.429453,-0.279785,-0.082938,-0.235001,-0.161029,0.0,1.0,0.0,0.0,0
80,80,-0.061617,0.047899,-0.711531,-0.711531,-0.069206,-0.756669,0.097325,-0.369857,-0.391157,...,-0.423894,-0.268391,-0.348252,-0.313332,-0.349596,1.0,0.0,0.0,0.0,0


In [29]:
from tsfresh.feature_selection.relevance import calculate_relevance_table
rt = calculate_relevance_table(extracted_features, y)
rt

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
timeDelta_Seconds__ratio_value_number_to_time_series_length,timeDelta_Seconds__ratio_value_number_to_time_...,real,0.001554,False
timeDelta_Seconds__percentage_of_reoccurring_datapoints_to_all_datapoints,timeDelta_Seconds__percentage_of_reoccurring_d...,real,0.001554,False
"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""va...",real,0.001554,False
"timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""me...",real,0.001554,False
timeDelta_Seconds__quantile__q_0.4,timeDelta_Seconds__quantile__q_0.4,real,0.001554,False
...,...,...,...,...
timeDelta_Seconds__number_crossing_m__m_-1,timeDelta_Seconds__number_crossing_m__m_-1,constant,,False
timeDelta_Seconds__number_crossing_m__m_1,timeDelta_Seconds__number_crossing_m__m_1,constant,,False
timeDelta_Seconds__count_above__t_0,timeDelta_Seconds__count_above__t_0,constant,,False
timeDelta_Seconds__count_below__t_0,timeDelta_Seconds__count_below__t_0,constant,,False
