In [32]:
from pathlib import Path
from matplotlib.style import use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters
from tsfresh.feature_selection.relevance import calculate_relevance_table
pd.options.mode.chained_assignment = None

dataDir = Path.cwd().parent.parent/'Data/processed/learner_targetCows/'
usecols = ['id', 'FarmName_Pseudo', 'Gigacow_Cow_Id', 'Total_MilkProduction', 'Total_timeDelta_Seconds', 'LactationNumber', 'DaysInMilk', 'BreedName', 'Age', 'milking_times', 'MilkingDate']
threshold_time = 1500

# labeling cow with fast/slow learner(1/0)
def labeling_data(threshold_time, cow_total): 
    '''
    func: labeling learner dataset
    args: 
        threshold_time: threshold for milking time cost
        cow_total: A dataframe contains all data points for a single cow
    return: learner dataset with label
    '''
    global learner
    total_timeCost = cow_total.Total_timeDelta_Seconds.sum()
    totalEvents = cow_total.milking_times.sum()
    mean_timeCost = total_timeCost/totalEvents
    if mean_timeCost < threshold_time:
        learner = 1 # fast learner
    else:
        learner = 0 # slow learner
    cow_total['label'] = learner
    return cow_total

In [33]:
# integrate all the cows data into one dataset
filelist = list(Path(dataDir).glob('*.csv'))
for i, _ in enumerate(filelist):
    fileName = 'cow_' + str(i) + '.csv'
    single_cow = pd.read_csv(dataDir/fileName, encoding='utf-8', usecols=usecols)
    single_cow = labeling_data(threshold_time, single_cow)
    single_cow.sort_values(by=['MilkingDate'], inplace=True)
    if i == 0:
        cow_total = single_cow
    else:
        cow_total = pd.concat([cow_total, single_cow], axis=0, ignore_index=True)
cow_total.to_csv(dataDir.parent/"Cow_Learner_dataset_L1_1500.csv", index=False)
cow_total

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id,label
0,a624fb9a,2560,2022-02-14,1.0,2.0,1,2.15,17.84,3528.0,2,1,1
1,a624fb9a,2560,2022-02-15,1.0,3.0,1,2.15,14.02,10271.0,2,1,1
2,a624fb9a,2560,2022-02-16,1.0,4.0,1,2.15,24.07,11297.0,4,1,1
3,a624fb9a,2560,2022-02-17,1.0,5.0,1,2.15,21.00,3608.0,4,1,1
4,a624fb9a,2560,2022-02-18,1.0,6.0,1,2.16,21.91,2313.0,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
28171,a624fb9a,2047,2022-10-02,1.0,281.0,1,2.91,18.20,2901.0,2,118,1
28172,a624fb9a,2047,2022-10-03,1.0,282.0,1,2.92,21.58,36.0,3,118,1
28173,a624fb9a,2047,2022-10-04,1.0,283.0,1,2.92,14.69,213.0,2,118,1
28174,a624fb9a,2047,2022-10-05,1.0,284.0,1,2.92,18.00,34.0,2,118,1


In [44]:
cow_total['mean_Total_timeDelta_Seconds'] = cow_total.Total_timeDelta_Seconds/cow_total.milking_times
cow_total

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id,label,mean_Total_timeDelta_Seconds
0,a624fb9a,2560,2022-02-14,1.0,2.0,1,2.15,17.84,3528.0,2,1,1,1764.00
1,a624fb9a,2560,2022-02-15,1.0,3.0,1,2.15,14.02,10271.0,2,1,1,5135.50
2,a624fb9a,2560,2022-02-16,1.0,4.0,1,2.15,24.07,11297.0,4,1,1,2824.25
3,a624fb9a,2560,2022-02-17,1.0,5.0,1,2.15,21.00,3608.0,4,1,1,902.00
4,a624fb9a,2560,2022-02-18,1.0,6.0,1,2.16,21.91,2313.0,3,1,1,771.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28171,a624fb9a,2047,2022-10-02,1.0,281.0,1,2.91,18.20,2901.0,2,118,1,1450.50
28172,a624fb9a,2047,2022-10-03,1.0,282.0,1,2.92,21.58,36.0,3,118,1,12.00
28173,a624fb9a,2047,2022-10-04,1.0,283.0,1,2.92,14.69,213.0,2,118,1,106.50
28174,a624fb9a,2047,2022-10-05,1.0,284.0,1,2.92,18.00,34.0,2,118,1,17.00


In [45]:
#df_static_features = cow_total[['FarmName_Pseudo']]
static_cols = ['FarmName_Pseudo', 'TrafficDeviceName', 'LactationNumber', 'BreedName']
#timeSeries_cols = ['Age', 'Total_MilkProduction', 'Total_timeDelta_Seconds', 'DaysInMilk', 'milking_times']
timeSeries_cols = ['Age', 'Total_MilkProduction', 'mean_Total_timeDelta_Seconds', 'DaysInMilk', 'milking_times']
output_col = ['label']

In [48]:
cow_label = cow_total[['id', 'label']].copy()
cow_timeseries = cow_total[['id', 'MilkingDate']].copy()
cow_timeseries.index = range(len(cow_timeseries))
# fetch y for feature extraction
y = cow_label.drop_duplicates(subset=['id'])
y = y["label"]
y.index = range(1,len(y)+1) 
y

1      1
2      0
3      1
4      0
5      0
      ..
114    1
115    0
116    0
117    0
118    1
Name: label, Length: 118, dtype: int64

In [49]:
y.value_counts()

0    75
1    43
Name: label, dtype: int64

In [86]:
#dataDir1 = Path.cwd().parent.parent.parent/'Data/processed/memory_targetCows/'
#cow_total = pd.read_csv(dataDir1/"cow_total/cow_total_1.csv", encoding='utf-8', usecols=usecols)

In [50]:
ts_extracted_dataset = cow_total[['id']].copy()
ts_extracted_dataset.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_dataset.index = range(1, len(ts_extracted_dataset)+1)

In [51]:
ts_processed = pd.DataFrame(cow_total[timeSeries_cols].copy())
ts_processed.index = range(0,len(ts_processed)) 
ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
ts_processed

Unnamed: 0,id,MilkingDate,Age,Total_MilkProduction,mean_Total_timeDelta_Seconds,DaysInMilk,milking_times
0,1,2022-02-14,2.15,17.84,1764.00,2.0,2
1,1,2022-02-15,2.15,14.02,5135.50,3.0,2
2,1,2022-02-16,2.15,24.07,2824.25,4.0,4
3,1,2022-02-17,2.15,21.00,902.00,5.0,4
4,1,2022-02-18,2.16,21.91,771.00,6.0,3
...,...,...,...,...,...,...,...
28171,118,2022-10-02,2.91,18.20,1450.50,281.0,2
28172,118,2022-10-03,2.92,21.58,12.00,282.0,3
28173,118,2022-10-04,2.92,14.69,106.50,283.0,2
28174,118,2022-10-05,2.92,18.00,17.00,284.0,2


In [37]:
'''
from tsfresh.feature_extraction import extract_features, EfficientFCParameters
extracted_dataset = extract_features(ts_processed, column_id="id", column_sort="MilkingDate", default_fc_parameters=EfficientFCParameters())
impute(extracted_dataset)
features_filtered = select_features(extracted_dataset, y)
ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)
'''

Feature Extraction: 100%|██████████| 39/39 [00:10<00:00,  3.65it/s]
 'Total_MilkProduction__query_similarity_count__query_None__threshold_0.0'
 'Total_timeDelta_Seconds__query_similarity_count__query_None__threshold_0.0'
 'DaysInMilk__query_similarity_count__query_None__threshold_0.0'
 'milking_times__friedrich_coefficients__coeff_0__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_1__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_2__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_3__m_3__r_30'
 'milking_times__max_langevin_fixed_point__m_3__r_30'
 'milking_times__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


In [52]:
settings_1 = MinimalFCParameters() 
#settings_2 = ComprehensiveFCParameters

for i, col in enumerate(timeSeries_cols):
    ts_processed = pd.DataFrame(cow_total[col].copy())
    ts_processed.index = range(0,len(ts_processed)) 
    ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
    print(ts_processed)
    # extract time series features
    extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingDate", default_fc_parameters=settings_1)
    #extracted_features.dropna(axis=1, inplace=True)
    impute(extracted_features)
    
    # select most relevant features based on relevance table
    if i == 0:
        temp = calculate_relevance_table(extracted_features, y)
        relevance_table = temp
    else:
        temp = calculate_relevance_table(extracted_features, y)
        relevance_table = pd.concat([relevance_table, temp], axis=0)
    features_filtered = select_features(extracted_features, y)
    ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)



        id MilkingDate   Age
0        1  2022-02-14  2.15
1        1  2022-02-15  2.15
2        1  2022-02-16  2.15
3        1  2022-02-17  2.15
4        1  2022-02-18  2.16
...    ...         ...   ...
28171  118  2022-10-02  2.91
28172  118  2022-10-03  2.92
28173  118  2022-10-04  2.92
28174  118  2022-10-05  2.92
28175  118  2022-10-06  2.92

[28176 rows x 3 columns]


Feature Extraction: 100%|██████████| 40/40 [00:05<00:00,  7.25it/s]


        id MilkingDate  Total_MilkProduction
0        1  2022-02-14                 17.84
1        1  2022-02-15                 14.02
2        1  2022-02-16                 24.07
3        1  2022-02-17                 21.00
4        1  2022-02-18                 21.91
...    ...         ...                   ...
28171  118  2022-10-02                 18.20
28172  118  2022-10-03                 21.58
28173  118  2022-10-04                 14.69
28174  118  2022-10-05                 18.00
28175  118  2022-10-06                 22.50

[28176 rows x 3 columns]


Feature Extraction: 100%|██████████| 40/40 [00:05<00:00,  7.85it/s]


        id MilkingDate  mean_Total_timeDelta_Seconds
0        1  2022-02-14                       1764.00
1        1  2022-02-15                       5135.50
2        1  2022-02-16                       2824.25
3        1  2022-02-17                        902.00
4        1  2022-02-18                        771.00
...    ...         ...                           ...
28171  118  2022-10-02                       1450.50
28172  118  2022-10-03                         12.00
28173  118  2022-10-04                        106.50
28174  118  2022-10-05                         17.00
28175  118  2022-10-06                        302.00

[28176 rows x 3 columns]


Feature Extraction: 100%|██████████| 40/40 [00:05<00:00,  7.23it/s]


        id MilkingDate  DaysInMilk
0        1  2022-02-14         2.0
1        1  2022-02-15         3.0
2        1  2022-02-16         4.0
3        1  2022-02-17         5.0
4        1  2022-02-18         6.0
...    ...         ...         ...
28171  118  2022-10-02       281.0
28172  118  2022-10-03       282.0
28173  118  2022-10-04       283.0
28174  118  2022-10-05       284.0
28175  118  2022-10-06       285.0

[28176 rows x 3 columns]


Feature Extraction: 100%|██████████| 40/40 [00:05<00:00,  7.31it/s]


        id MilkingDate  milking_times
0        1  2022-02-14              2
1        1  2022-02-15              2
2        1  2022-02-16              4
3        1  2022-02-17              4
4        1  2022-02-18              3
...    ...         ...            ...
28171  118  2022-10-02              2
28172  118  2022-10-03              3
28173  118  2022-10-04              2
28174  118  2022-10-05              2
28175  118  2022-10-06              3

[28176 rows x 3 columns]


Feature Extraction: 100%|██████████| 40/40 [00:05<00:00,  6.80it/s]


In [53]:
# Select 30 most relevant features from the relevance table
relevance_table = relevance_table[relevance_table.relevant]
relevance_table.sort_values("p_value", ascending=False, inplace=True)
relevant_features_list = list(relevance_table.feature[:])
relevance_table

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total_MilkProduction__root_mean_square,Total_MilkProduction__root_mean_square,real,0.006465081,True
Total_MilkProduction__mean,Total_MilkProduction__mean,real,0.005357387,True
Total_MilkProduction__median,Total_MilkProduction__median,real,0.004703561,True
DaysInMilk__sum_values,DaysInMilk__sum_values,real,0.001277917,True
Age__sum_values,Age__sum_values,real,0.0002140946,True
Age__length,Age__length,real,0.0001732308,True
DaysInMilk__length,DaysInMilk__length,real,0.0001732308,True
mean_Total_timeDelta_Seconds__length,mean_Total_timeDelta_Seconds__length,real,0.0001732308,True
milking_times__length,milking_times__length,real,0.0001732308,True
Total_MilkProduction__length,Total_MilkProduction__length,real,0.0001732308,True


In [54]:
ts_extracted_dataset_filtered = ts_extracted_dataset[relevant_features_list].copy()
ts_extracted_dataset_filtered = ts_extracted_dataset_filtered.loc[:,~ts_extracted_dataset_filtered.columns.duplicated()]
ts_extracted_dataset = ts_extracted_dataset_filtered
ts_extracted_dataset

Unnamed: 0,Total_MilkProduction__root_mean_square,Total_MilkProduction__mean,Total_MilkProduction__median,DaysInMilk__sum_values,Age__sum_values,Age__length,DaysInMilk__length,mean_Total_timeDelta_Seconds__length,milking_times__length,Total_MilkProduction__length,...,milking_times__root_mean_square,milking_times__sum_values,mean_Total_timeDelta_Seconds__sum_values,mean_Total_timeDelta_Seconds__absolute_maximum,mean_Total_timeDelta_Seconds__maximum,mean_Total_timeDelta_Seconds__standard_deviation,mean_Total_timeDelta_Seconds__variance,mean_Total_timeDelta_Seconds__median,mean_Total_timeDelta_Seconds__root_mean_square,mean_Total_timeDelta_Seconds__mean
1,28.818956,27.557277,28.340,27965.0,579.46,235.0,235.0,235.0,235.0,235.0,...,3.218299,726.0,1.910528e+05,5135.5,5135.5,754.549301,5.693446e+05,600.250000,1109.188339,812.990851
2,21.302654,20.654835,19.820,41218.0,620.04,273.0,273.0,273.0,273.0,273.0,...,2.304638,612.0,7.510155e+05,54718.0,54718.0,6079.749951,3.696336e+07,1247.500000,6673.170859,2750.972527
3,33.731063,32.889083,33.190,41230.0,554.76,229.0,229.0,229.0,229.0,229.0,...,3.512299,781.0,7.453965e+04,1301.5,1301.5,240.777349,5.797373e+04,261.666667,404.875793,325.500655
4,17.033128,15.656195,12.780,31881.0,484.97,205.0,205.0,205.0,205.0,205.0,...,1.522194,291.0,1.242360e+06,34143.0,34143.0,6826.493287,4.660101e+07,3324.000000,9128.426362,6060.293496
5,14.748492,14.113988,14.290,48140.0,509.34,168.0,168.0,168.0,168.0,168.0,...,1.980560,309.0,3.073912e+05,11083.0,11083.0,1811.520666,3.281607e+06,1360.666667,2574.770541,1829.709325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,14.592172,13.968412,13.795,46903.0,496.91,170.0,170.0,170.0,170.0,170.0,...,1.875539,296.0,1.723325e+05,9235.0,9235.0,1036.148412,1.073604e+06,754.666667,1449.563025,1013.720588
115,31.839673,30.917784,30.795,15928.0,391.78,176.0,176.0,176.0,176.0,176.0,...,3.165869,540.0,2.324201e+05,5541.0,5541.0,1215.353915,1.477085e+06,977.500000,1794.710969,1320.568939
116,27.604951,26.482400,26.415,45150.0,731.10,300.0,300.0,300.0,300.0,300.0,...,2.722744,790.0,5.034337e+05,7698.0,7698.0,1602.926718,2.569374e+06,1208.000000,2320.653896,1678.112167
117,27.838808,26.566192,27.700,51484.0,699.55,302.0,302.0,302.0,302.0,302.0,...,2.451517,705.0,4.197974e+05,16792.0,16792.0,1623.459055,2.635619e+06,928.166667,2137.259842,1390.057671


In [55]:
ts_extracted_id = cow_total[['id']].copy()
ts_extracted_id.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_id.index = range(1, len(ts_extracted_id)+1)
ts_extracted_dataset = pd.concat([ts_extracted_id, ts_extracted_dataset], axis=1)
ts_extracted_dataset

Unnamed: 0,id,Total_MilkProduction__root_mean_square,Total_MilkProduction__mean,Total_MilkProduction__median,DaysInMilk__sum_values,Age__sum_values,Age__length,DaysInMilk__length,mean_Total_timeDelta_Seconds__length,milking_times__length,...,milking_times__root_mean_square,milking_times__sum_values,mean_Total_timeDelta_Seconds__sum_values,mean_Total_timeDelta_Seconds__absolute_maximum,mean_Total_timeDelta_Seconds__maximum,mean_Total_timeDelta_Seconds__standard_deviation,mean_Total_timeDelta_Seconds__variance,mean_Total_timeDelta_Seconds__median,mean_Total_timeDelta_Seconds__root_mean_square,mean_Total_timeDelta_Seconds__mean
1,1,28.818956,27.557277,28.340,27965.0,579.46,235.0,235.0,235.0,235.0,...,3.218299,726.0,1.910528e+05,5135.5,5135.5,754.549301,5.693446e+05,600.250000,1109.188339,812.990851
2,2,21.302654,20.654835,19.820,41218.0,620.04,273.0,273.0,273.0,273.0,...,2.304638,612.0,7.510155e+05,54718.0,54718.0,6079.749951,3.696336e+07,1247.500000,6673.170859,2750.972527
3,3,33.731063,32.889083,33.190,41230.0,554.76,229.0,229.0,229.0,229.0,...,3.512299,781.0,7.453965e+04,1301.5,1301.5,240.777349,5.797373e+04,261.666667,404.875793,325.500655
4,4,17.033128,15.656195,12.780,31881.0,484.97,205.0,205.0,205.0,205.0,...,1.522194,291.0,1.242360e+06,34143.0,34143.0,6826.493287,4.660101e+07,3324.000000,9128.426362,6060.293496
5,5,14.748492,14.113988,14.290,48140.0,509.34,168.0,168.0,168.0,168.0,...,1.980560,309.0,3.073912e+05,11083.0,11083.0,1811.520666,3.281607e+06,1360.666667,2574.770541,1829.709325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,114,14.592172,13.968412,13.795,46903.0,496.91,170.0,170.0,170.0,170.0,...,1.875539,296.0,1.723325e+05,9235.0,9235.0,1036.148412,1.073604e+06,754.666667,1449.563025,1013.720588
115,115,31.839673,30.917784,30.795,15928.0,391.78,176.0,176.0,176.0,176.0,...,3.165869,540.0,2.324201e+05,5541.0,5541.0,1215.353915,1.477085e+06,977.500000,1794.710969,1320.568939
116,116,27.604951,26.482400,26.415,45150.0,731.10,300.0,300.0,300.0,300.0,...,2.722744,790.0,5.034337e+05,7698.0,7698.0,1602.926718,2.569374e+06,1208.000000,2320.653896,1678.112167
117,117,27.838808,26.566192,27.700,51484.0,699.55,302.0,302.0,302.0,302.0,...,2.451517,705.0,4.197974e+05,16792.0,16792.0,1623.459055,2.635619e+06,928.166667,2137.259842,1390.057671


In [56]:
ts_extracted_features = ts_extracted_dataset.iloc[:, 1:len(ts_extracted_dataset.columns)].copy()
# normalize numerical features
ts_extracted_cols = ts_extracted_features.columns
scaler_std = StandardScaler()
ts_std = scaler_std.fit_transform(ts_extracted_features)
# transform standard data into dataframe
ts_extracted_processed = pd.DataFrame(ts_std, columns=ts_extracted_cols)
ts_extracted_processed.index = range(1,len(ts_extracted_processed)+1)
# append id col to the dataframe
ts_extracted_processed = pd.concat([ts_extracted_dataset['id'], ts_extracted_processed], axis=1)

In [57]:
ts_extracted_processed

Unnamed: 0,id,Total_MilkProduction__root_mean_square,Total_MilkProduction__mean,Total_MilkProduction__median,DaysInMilk__sum_values,Age__sum_values,Age__length,DaysInMilk__length,mean_Total_timeDelta_Seconds__length,milking_times__length,...,milking_times__root_mean_square,milking_times__sum_values,mean_Total_timeDelta_Seconds__sum_values,mean_Total_timeDelta_Seconds__absolute_maximum,mean_Total_timeDelta_Seconds__maximum,mean_Total_timeDelta_Seconds__standard_deviation,mean_Total_timeDelta_Seconds__variance,mean_Total_timeDelta_Seconds__median,mean_Total_timeDelta_Seconds__root_mean_square,mean_Total_timeDelta_Seconds__mean
1,1,0.497555,0.462867,0.509399,-0.396065,-0.114193,-0.045738,-0.045738,-0.045738,-0.045738,...,1.090565,0.475278,-0.796468,-0.834043,-0.834043,-0.832853,-0.613775,-0.756838,-0.839589,-0.813586
2,2,-0.645265,-0.594454,-0.708623,-0.036699,0.035717,0.414106,0.414106,0.414106,0.414106,...,-0.450755,0.027778,0.688368,2.668925,2.668925,1.808096,1.757425,-0.229413,1.242509,0.265862
3,3,1.244419,1.279597,1.202757,-0.036374,-0.205439,-0.118345,-0.118345,-0.118345,-0.118345,...,1.586535,0.691178,-1.105423,-1.104912,-1.104912,-1.087650,-0.647093,-1.032739,-1.103150,-1.085117
4,4,-1.294427,-1.360149,-1.715063,-0.289880,-0.463257,-0.408773,-0.408773,-0.408773,-0.408773,...,-1.770716,-1.232290,1.991252,1.215316,1.215316,2.178431,2.385353,1.462665,2.161290,2.109142
5,5,-1.641796,-1.596386,-1.499193,0.150997,-0.373229,-0.856517,-0.856517,-0.856517,-0.856517,...,-0.997465,-1.161632,-0.487977,-0.413856,-0.413856,-0.308665,-0.437062,-0.137197,-0.291153,-0.247278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,114,-1.665564,-1.618685,-1.569959,0.117454,-0.419148,-0.832314,-0.832314,-0.832314,-0.832314,...,-1.174632,-1.212663,-0.846108,-0.544416,-0.544416,-0.693198,-0.580921,-0.631008,-0.712217,-0.701781
115,115,0.956841,0.977632,0.860367,-0.722459,-0.807518,-0.759707,-0.759707,-0.759707,-0.759707,...,1.002117,-0.254855,-0.686776,-0.805394,-0.805394,-0.604324,-0.554633,-0.449428,-0.583059,-0.530867
116,116,0.312971,0.298216,0.234200,0.069920,0.445993,0.740838,0.740838,0.740838,0.740838,...,0.254578,0.726507,0.031863,-0.653004,-0.653004,-0.412113,-0.483466,-0.261600,-0.386246,-0.331717
117,117,0.348528,0.311052,0.417904,0.241672,0.329442,0.765040,0.765040,0.765040,0.765040,...,-0.202975,0.392844,-0.189913,-0.010519,-0.010519,-0.401931,-0.479150,-0.489628,-0.454874,-0.492162


In [58]:
from tsfresh.feature_extraction.settings import from_columns
para_dict = from_columns(ts_extracted_features.columns)
para_dict

{'Total_MilkProduction': {'root_mean_square': None,
  'mean': None,
  'median': None,
  'length': None,
  'sum_values': None},
 'DaysInMilk': {'sum_values': None, 'length': None},
 'Age': {'sum_values': None, 'length': None},
 'mean_Total_timeDelta_Seconds': {'length': None,
  'minimum': None,
  'sum_values': None,
  'absolute_maximum': None,
  'maximum': None,
  'standard_deviation': None,
  'variance': None,
  'median': None,
  'root_mean_square': None,
  'mean': None},
 'milking_times': {'length': None,
  'variance': None,
  'standard_deviation': None,
  'median': None,
  'absolute_maximum': None,
  'maximum': None,
  'mean': None,
  'root_mean_square': None,
  'sum_values': None}}

In [35]:
ts_extracted_dataset.columns

Index(['id', 'DaysInMilk__sum_values', 'DaysInMilk__minimum',
       'Age__sum_values', 'Age__length', 'DaysInMilk__length',
       'mean_Total_timeDelta_Seconds__length', 'milking_times__length',
       'Total_MilkProduction__length', 'Total_MilkProduction__sum_values',
       'milking_times__variance', 'milking_times__standard_deviation',
       'milking_times__median', 'milking_times__sum_values',
       'milking_times__absolute_maximum', 'milking_times__maximum',
       'milking_times__mean', 'milking_times__root_mean_square',
       'mean_Total_timeDelta_Seconds__minimum',
       'mean_Total_timeDelta_Seconds__sum_values',
       'mean_Total_timeDelta_Seconds__maximum',
       'mean_Total_timeDelta_Seconds__absolute_maximum',
       'mean_Total_timeDelta_Seconds__standard_deviation',
       'mean_Total_timeDelta_Seconds__variance',
       'mean_Total_timeDelta_Seconds__median',
       'mean_Total_timeDelta_Seconds__root_mean_square',
       'mean_Total_timeDelta_Seconds__mean'],
 

In [16]:
cow_total.BreedName.unique()

array([ 1,  2,  4, 99], dtype=int64)

In [59]:
# add one-hot encoded categorical features
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
cow_breed = cow_total[['id', 'BreedName']].copy()
cow_breed.drop_duplicates(subset=['id'], inplace=True)
cat = ohe.fit_transform(np.array(cow_breed['BreedName']).reshape(-1, 1))
col_names = ohe.get_feature_names_out(['BreedName'])
cat_breed = pd.DataFrame(cat, columns=col_names)
cat_breed.index = range(1,len(cow_breed)+1)
cat_breed

Unnamed: 0,BreedName_1,BreedName_2,BreedName_4,BreedName_99
1,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0
5,0.0,1.0,0.0,0.0
...,...,...,...,...
114,0.0,1.0,0.0,0.0
115,1.0,0.0,0.0,0.0
116,0.0,1.0,0.0,0.0
117,1.0,0.0,0.0,0.0


In [60]:
ts_dataset = pd.concat([ts_extracted_processed, cat_breed], axis=1)
ts_dataset = pd.concat([ts_dataset, y], axis=1)
ts_dataset.to_csv(dataDir.parent/"learner_118_new_meanTimeCost_minPara_1200.csv", index=False)
ts_dataset

Unnamed: 0,id,Total_MilkProduction__root_mean_square,Total_MilkProduction__mean,Total_MilkProduction__median,DaysInMilk__sum_values,Age__sum_values,Age__length,DaysInMilk__length,mean_Total_timeDelta_Seconds__length,milking_times__length,...,mean_Total_timeDelta_Seconds__standard_deviation,mean_Total_timeDelta_Seconds__variance,mean_Total_timeDelta_Seconds__median,mean_Total_timeDelta_Seconds__root_mean_square,mean_Total_timeDelta_Seconds__mean,BreedName_1,BreedName_2,BreedName_4,BreedName_99,label
1,1,0.497555,0.462867,0.509399,-0.396065,-0.114193,-0.045738,-0.045738,-0.045738,-0.045738,...,-0.832853,-0.613775,-0.756838,-0.839589,-0.813586,1.0,0.0,0.0,0.0,1
2,2,-0.645265,-0.594454,-0.708623,-0.036699,0.035717,0.414106,0.414106,0.414106,0.414106,...,1.808096,1.757425,-0.229413,1.242509,0.265862,0.0,1.0,0.0,0.0,0
3,3,1.244419,1.279597,1.202757,-0.036374,-0.205439,-0.118345,-0.118345,-0.118345,-0.118345,...,-1.087650,-0.647093,-1.032739,-1.103150,-1.085117,1.0,0.0,0.0,0.0,1
4,4,-1.294427,-1.360149,-1.715063,-0.289880,-0.463257,-0.408773,-0.408773,-0.408773,-0.408773,...,2.178431,2.385353,1.462665,2.161290,2.109142,0.0,0.0,1.0,0.0,0
5,5,-1.641796,-1.596386,-1.499193,0.150997,-0.373229,-0.856517,-0.856517,-0.856517,-0.856517,...,-0.308665,-0.437062,-0.137197,-0.291153,-0.247278,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,114,-1.665564,-1.618685,-1.569959,0.117454,-0.419148,-0.832314,-0.832314,-0.832314,-0.832314,...,-0.693198,-0.580921,-0.631008,-0.712217,-0.701781,0.0,1.0,0.0,0.0,1
115,115,0.956841,0.977632,0.860367,-0.722459,-0.807518,-0.759707,-0.759707,-0.759707,-0.759707,...,-0.604324,-0.554633,-0.449428,-0.583059,-0.530867,1.0,0.0,0.0,0.0,0
116,116,0.312971,0.298216,0.234200,0.069920,0.445993,0.740838,0.740838,0.740838,0.740838,...,-0.412113,-0.483466,-0.261600,-0.386246,-0.331717,0.0,1.0,0.0,0.0,0
117,117,0.348528,0.311052,0.417904,0.241672,0.329442,0.765040,0.765040,0.765040,0.765040,...,-0.401931,-0.479150,-0.489628,-0.454874,-0.492162,1.0,0.0,0.0,0.0,0


In [29]:
from tsfresh.feature_selection.relevance import calculate_relevance_table
rt = calculate_relevance_table(extracted_features, y)
rt

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
timeDelta_Seconds__ratio_value_number_to_time_series_length,timeDelta_Seconds__ratio_value_number_to_time_...,real,0.001554,False
timeDelta_Seconds__percentage_of_reoccurring_datapoints_to_all_datapoints,timeDelta_Seconds__percentage_of_reoccurring_d...,real,0.001554,False
"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""va...",real,0.001554,False
"timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""me...",real,0.001554,False
timeDelta_Seconds__quantile__q_0.4,timeDelta_Seconds__quantile__q_0.4,real,0.001554,False
...,...,...,...,...
timeDelta_Seconds__number_crossing_m__m_-1,timeDelta_Seconds__number_crossing_m__m_-1,constant,,False
timeDelta_Seconds__number_crossing_m__m_1,timeDelta_Seconds__number_crossing_m__m_1,constant,,False
timeDelta_Seconds__count_above__t_0,timeDelta_Seconds__count_above__t_0,constant,,False
timeDelta_Seconds__count_below__t_0,timeDelta_Seconds__count_below__t_0,constant,,False
