In [12]:
from pathlib import Path
from matplotlib.style import use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters

dataDir = Path.cwd().parent.parent.parent/'Data/processed/learner_targetCows/'
usecols = ['id', 'FarmName_Pseudo', 'Gigacow_Cow_Id', 'Total_MilkProduction', 'Total_timeDelta_Seconds', 'LactationNumber', 'DaysInMilk', 'BreedName', 'Age', 'milking_times', 'MilkingDate']
threshold_time = 800

# labeling cow with good/bad learner(1/0)
def labeling_data(threshold_time, cow_total): 
    global learner
    total_timeCost = cow_total.Total_timeDelta_Seconds.sum()
    totalEvents = cow_total.milking_times.sum()
    mean_timeCost = total_timeCost/totalEvents
    if mean_timeCost < threshold_time:
        learner = 1 # good learner
    else:
        learner = 0 # bad learner
    cow_total['label'] = learner
    return cow_total

In [14]:
# integrate all the cows data into one dataset
filelist = list(Path(dataDir).glob('*.csv'))
for i, _ in enumerate(filelist):
    fileName = 'cow_' + str(i) + '.csv'
    single_cow = pd.read_csv(dataDir/fileName, encoding='utf-8', usecols=usecols)
    single_cow = labeling_data(threshold_time, single_cow)
    single_cow.sort_values(by=['MilkingDate'], inplace=True)
    if i == 0:
        cow_total = single_cow
    else:
        cow_total = pd.concat([cow_total, single_cow], axis=0, ignore_index=True)
cow_total.to_csv(dataDir.parent/"Cow_Learner_dataset_L1.csv", index=False)
cow_total

Unnamed: 0,FarmName_Pseudo,Gigacow_Cow_Id,MilkingDate,LactationNumber,DaysInMilk,BreedName,Age,Total_MilkProduction,Total_timeDelta_Seconds,milking_times,id,label
0,f454e660,6351,2020-11-12,1.0,84.0,4,2.75,15.72,5660.0,2,1,0
1,f454e660,6351,2020-11-14,1.0,86.0,4,2.75,6.02,1030.0,1,1,0
2,f454e660,6351,2020-11-16,1.0,88.0,4,2.76,7.26,1446.0,1,1,0
3,f454e660,6351,2020-11-17,1.0,89.0,4,2.76,12.98,1459.0,2,1,0
4,f454e660,6351,2020-11-18,1.0,90.0,4,2.76,9.51,224.0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
22558,a624fb9a,2374,2022-08-03,1.0,340.0,1,3.36,25.34,853.0,3,116,1
22559,a624fb9a,2374,2022-08-04,1.0,341.0,1,3.36,16.95,276.0,2,116,1
22560,a624fb9a,2374,2022-08-05,1.0,342.0,1,3.37,23.50,1231.0,3,116,1
22561,a624fb9a,2374,2022-08-06,1.0,343.0,1,3.37,15.20,257.0,2,116,1


In [23]:
#df_static_features = cow_total[['FarmName_Pseudo']]
static_cols = ['FarmName_Pseudo', 'TrafficDeviceName', 'LactationNumber', 'BreedName']
timeSeries_cols = ['Age', 'Total_MilkProduction', 'Total_timeDelta_Seconds', 'DaysInMilk', 'milking_times']
output_col = ['label']

In [17]:
cow_label = cow_total[['id', 'label']].copy()
cow_timeseries = cow_total[['id', 'MilkingDate']].copy()
cow_timeseries.index = range(len(cow_timeseries))
# fetch y for feature extraction
y = cow_label.drop_duplicates(subset=['id'])
y = y["label"]
y.index = range(1,len(y)+1) 

In [86]:
#dataDir1 = Path.cwd().parent.parent.parent/'Data/processed/memory_targetCows/'
#cow_total = pd.read_csv(dataDir1/"cow_total/cow_total_1.csv", encoding='utf-8', usecols=usecols)

In [18]:
y

1      0
2      0
3      0
4      0
5      0
      ..
112    0
113    1
114    1
115    0
116    1
Name: label, Length: 116, dtype: int64

In [19]:
ts_extracted_dataset = cow_total[['id']].copy()
ts_extracted_dataset.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_dataset.index = range(1, len(ts_extracted_dataset)+1)

In [20]:
cow_timeseries

Unnamed: 0,id,MilkingDate
0,1,2020-11-12
1,1,2020-11-14
2,1,2020-11-16
3,1,2020-11-17
4,1,2020-11-18
...,...,...
22558,116,2022-08-03
22559,116,2022-08-04
22560,116,2022-08-05
22561,116,2022-08-06


In [None]:
from tsfresh.feature_extraction import extract_features, EfficientFCParameters
extract_features(df, default_fc_parameters=EfficientFCParameters())


In [24]:
for col in timeSeries_cols:
    ts_processed = pd.DataFrame(cow_total[col].copy())
    ts_processed.index = range(0,len(ts_processed)) 
    ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
    print(ts_processed)
    # extract time series features
    extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingDate")
    #extracted_features.dropna(axis=1, inplace=True)
    impute(extracted_features)
    features_filtered = select_features(extracted_features, y)
    ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)

        id MilkingDate   Age
0        1  2020-11-12  2.75
1        1  2020-11-14  2.75
2        1  2020-11-16  2.76
3        1  2020-11-17  2.76
4        1  2020-11-18  2.76
...    ...         ...   ...
22558  116  2022-08-03  3.36
22559  116  2022-08-04  3.36
22560  116  2022-08-05  3.37
22561  116  2022-08-06  3.37
22562  116  2022-08-07  3.37

[22563 rows x 3 columns]


Feature Extraction: 100%|██████████| 39/39 [00:06<00:00,  5.84it/s]


        id MilkingDate  Total_MilkProduction
0        1  2020-11-12                 15.72
1        1  2020-11-14                  6.02
2        1  2020-11-16                  7.26
3        1  2020-11-17                 12.98
4        1  2020-11-18                  9.51
...    ...         ...                   ...
22558  116  2022-08-03                 25.34
22559  116  2022-08-04                 16.95
22560  116  2022-08-05                 23.50
22561  116  2022-08-06                 15.20
22562  116  2022-08-07                 27.18

[22563 rows x 3 columns]


Feature Extraction: 100%|██████████| 39/39 [00:06<00:00,  5.61it/s]


        id MilkingDate  Total_timeDelta_Seconds
0        1  2020-11-12                   5660.0
1        1  2020-11-14                   1030.0
2        1  2020-11-16                   1446.0
3        1  2020-11-17                   1459.0
4        1  2020-11-18                    224.0
...    ...         ...                      ...
22558  116  2022-08-03                    853.0
22559  116  2022-08-04                    276.0
22560  116  2022-08-05                   1231.0
22561  116  2022-08-06                    257.0
22562  116  2022-08-07                   4865.0

[22563 rows x 3 columns]


Feature Extraction: 100%|██████████| 39/39 [00:06<00:00,  5.90it/s]


        id MilkingDate  DaysInMilk
0        1  2020-11-12        84.0
1        1  2020-11-14        86.0
2        1  2020-11-16        88.0
3        1  2020-11-17        89.0
4        1  2020-11-18        90.0
...    ...         ...         ...
22558  116  2022-08-03       340.0
22559  116  2022-08-04       341.0
22560  116  2022-08-05       342.0
22561  116  2022-08-06       343.0
22562  116  2022-08-07       344.0

[22563 rows x 3 columns]


Feature Extraction: 100%|██████████| 39/39 [00:06<00:00,  5.92it/s]


        id MilkingDate  milking_times
0        1  2020-11-12              2
1        1  2020-11-14              1
2        1  2020-11-16              1
3        1  2020-11-17              2
4        1  2020-11-18              1
...    ...         ...            ...
22558  116  2022-08-03              3
22559  116  2022-08-04              2
22560  116  2022-08-05              3
22561  116  2022-08-06              2
22562  116  2022-08-07              3

[22563 rows x 3 columns]


Feature Extraction: 100%|██████████| 39/39 [00:06<00:00,  6.33it/s]
 'milking_times__friedrich_coefficients__coeff_1__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_2__m_3__r_30'
 'milking_times__friedrich_coefficients__coeff_3__m_3__r_30'
 'milking_times__max_langevin_fixed_point__m_3__r_30'
 'milking_times__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


In [82]:
ts_extracted_features = ts_extracted_dataset.iloc[:, 1:len(ts_extracted_dataset.columns)].copy()
# normalize numerical features
ts_extracted_cols = ts_extracted_features.columns
scaler_std = StandardScaler()
ts_std = scaler_std.fit_transform(ts_extracted_features)
# transform standard data into dataframe
ts_extracted_processed = pd.DataFrame(ts_std, columns=ts_extracted_cols)
ts_extracted_processed.index = range(1,len(ts_extracted_processed)+1)
# append id col to the dataframe
ts_extracted_processed = pd.concat([ts_extracted_dataset['id'], ts_extracted_processed], axis=1)

In [25]:
ts_extracted_dataset

Unnamed: 0,id,"Age__fft_coefficient__attr_""angle""__coeff_23","Age__fft_coefficient__attr_""angle""__coeff_24","Age__fft_coefficient__attr_""imag""__coeff_60","Age__fft_coefficient__attr_""angle""__coeff_53","Age__fft_coefficient__attr_""imag""__coeff_53","Age__fft_coefficient__attr_""angle""__coeff_60","Age__fft_coefficient__attr_""angle""__coeff_54",Age__number_cwt_peaks__n_5,"Age__fft_coefficient__attr_""angle""__coeff_42",...,"milking_times__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""max""","milking_times__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""var""",milking_times__index_mass_quantile__q_0.9,"milking_times__linear_trend__attr_""stderr""","milking_times__agg_linear_trend__attr_""rvalue""__chunk_len_10__f_agg_""min""",milking_times__fourier_entropy__bins_3,milking_times__autocorrelation__lag_3,"milking_times__fft_coefficient__attr_""imag""__coeff_1","milking_times__fft_aggregated__aggtype_""centroid""","milking_times__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""max"""
1,1,109.381973,110.446617,0.339277,129.255861,0.419433,134.156800,130.096874,3.0,122.501477,...,2.200000,0.007980,0.906250,0.007664,0.000000,0.752975,0.311527,-2.525224,4.076384,4.400000
2,2,171.670592,149.442561,0.339277,129.255861,0.419433,134.156800,130.096874,13.0,122.501477,...,2.311111,0.010312,0.903614,0.003252,0.273861,0.898515,0.009244,14.068238,11.678342,3.000000
3,3,164.930467,137.469412,0.339277,129.255861,0.419433,134.156800,130.096874,8.0,122.501477,...,5.400000,0.018819,0.879518,0.004152,-0.098374,0.728504,0.256427,-7.484272,10.048291,6.000000
4,4,109.381973,110.446617,0.339277,129.255861,0.419433,134.156800,130.096874,4.0,122.501477,...,3.300000,0.058904,0.900000,0.009700,0.000000,0.998941,0.061271,-2.234388,5.850249,4.400000
5,5,101.929389,122.691964,0.339277,129.255861,0.419433,134.156800,130.096874,10.0,122.501477,...,4.222222,0.011316,0.890244,0.003014,-0.091287,0.633088,0.064476,-0.304478,10.958675,5.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,112,98.308400,98.850413,0.751172,111.404442,0.881863,118.708244,112.278856,49.0,106.367581,...,4.215909,0.000957,0.872611,0.000361,-0.446042,0.045395,0.457540,-86.259177,45.950943,4.892857
113,113,102.690545,103.302045,0.651866,119.680457,0.756781,121.076637,120.331299,49.0,113.576883,...,3.113636,0.001050,0.905063,0.000303,-0.083894,0.399813,-0.095884,35.501954,53.963165,3.964286
114,114,98.168284,98.051185,0.866244,109.915451,0.973816,115.072198,111.912572,52.0,105.125063,...,4.475936,0.001987,0.869697,0.000392,-0.773574,0.045395,0.591285,-132.450889,49.850962,4.821429
115,115,99.774851,99.616716,0.802298,111.235330,0.997864,113.779929,110.220150,53.0,106.869723,...,4.603361,0.001438,0.861446,0.000351,-0.705461,0.045395,0.631696,-145.658935,48.225048,4.857143


In [42]:
cow_total.BreedName.unique()

array([ 1,  2, 99,  4], dtype=int64)

In [43]:
# add one-hot encoded categorical features
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
cow_breed = cow_total[['id', 'BreedName']].copy()
cow_breed.drop_duplicates(subset=['id'], inplace=True)
cat = ohe.fit_transform(np.array(cow_breed['BreedName']).reshape(-1, 1))
col_names = ohe.get_feature_names_out(['BreedName'])
cat_breed = pd.DataFrame(cat, columns=col_names)
cat_breed.index = range(1,len(cow_breed)+1)
cat_breed

Unnamed: 0,BreedName_1,BreedName_2,BreedName_4,BreedName_99
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0
5,0.0,1.0,0.0,0.0
6,1.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0
9,0.0,1.0,0.0,0.0
10,1.0,0.0,0.0,0.0


In [44]:
ts_dataset = pd.concat([ts_extracted_processed, cat_breed], axis=1)
ts_dataset = pd.concat([ts_dataset, y], axis=1)
ts_dataset.to_csv(dataDir.parent/"ts_dataset_34_with_cat.csv", index=False)
ts_dataset

Unnamed: 0,id,timeDelta_Seconds__mean,timeDelta_Seconds__quantile__q_0.8,timeDelta_Seconds__quantile__q_0.9,"timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0",timeDelta_Seconds__quantile__q_0.7,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.0",timeDelta_Seconds__c3__lag_2,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.2",...,"timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4",timeDelta_Seconds__c3__lag_3,timeDelta_Seconds__c3__lag_1,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.2",BreedName_1,BreedName_2,BreedName_4,BreedName_99,label
1,1,-0.411087,-0.337273,-0.460966,-0.35652,-0.326767,-0.41798,-0.286915,-0.231237,-0.416559,...,-0.180169,-0.428851,-0.282601,-0.265592,-0.237058,1.0,0.0,0.0,0.0,0
2,2,-1.42321,-1.402307,-1.485056,-1.272109,-1.226703,-0.795247,-0.699348,-0.251819,-0.779632,...,-1.099019,-1.316508,-0.320612,-0.292364,-0.798315,1.0,0.0,0.0,0.0,1
3,3,-0.467454,-0.512278,-0.515642,-0.5494,-0.558293,-0.535669,-0.367123,-0.215246,-0.542825,...,-0.306303,-0.428939,-0.244313,-0.223679,-0.598981,0.0,1.0,0.0,0.0,0
4,4,1.575712,1.785891,0.996953,2.254924,2.188218,2.189295,2.153466,-0.050316,1.666391,...,2.117754,0.331714,0.363898,0.113767,2.425048,0.0,1.0,0.0,0.0,0
5,5,0.883018,1.25135,1.172377,1.155937,1.051125,1.033167,0.406585,-0.130904,1.082852,...,0.641422,0.484863,-0.045822,-0.08016,0.858644,0.0,1.0,0.0,0.0,0
6,6,-0.545717,-0.539422,-0.633336,-0.53045,-0.415867,-0.510161,-0.399752,-0.232985,-0.509488,...,-0.323021,-0.67249,-0.282313,-0.259023,-0.609913,1.0,0.0,0.0,0.0,1
7,7,-0.762169,-0.745143,-0.779635,-0.743015,-0.71993,-0.611673,-0.538328,-0.239428,-0.611377,...,-0.648516,-0.620489,-0.286379,-0.267964,-0.629228,1.0,0.0,0.0,0.0,1
8,8,0.856021,1.30278,1.007903,1.281259,1.089547,1.099612,0.428618,-0.10059,1.204747,...,0.6785,0.951874,-0.000804,-0.035695,1.072138,0.0,1.0,0.0,0.0,0
9,9,-0.201892,-0.233936,-0.172839,-0.283191,-0.232037,-0.378526,-0.280416,-0.190074,-0.359859,...,-0.155404,-0.178647,-0.162675,-0.205124,-0.258143,0.0,1.0,0.0,0.0,0
10,10,-0.866585,-1.005866,-0.895988,-0.983939,-0.986566,-0.689121,-0.675162,-0.243078,-0.632964,...,-1.125836,-0.420586,-0.270328,-0.28153,-0.693665,1.0,0.0,0.0,0.0,1


In [29]:
from tsfresh.feature_selection.relevance import calculate_relevance_table
rt = calculate_relevance_table(extracted_features, y)
rt

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
timeDelta_Seconds__ratio_value_number_to_time_series_length,timeDelta_Seconds__ratio_value_number_to_time_...,real,0.001554,False
timeDelta_Seconds__percentage_of_reoccurring_datapoints_to_all_datapoints,timeDelta_Seconds__percentage_of_reoccurring_d...,real,0.001554,False
"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""va...",real,0.001554,False
"timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""me...",real,0.001554,False
timeDelta_Seconds__quantile__q_0.4,timeDelta_Seconds__quantile__q_0.4,real,0.001554,False
...,...,...,...,...
timeDelta_Seconds__number_crossing_m__m_-1,timeDelta_Seconds__number_crossing_m__m_-1,constant,,False
timeDelta_Seconds__number_crossing_m__m_1,timeDelta_Seconds__number_crossing_m__m_1,constant,,False
timeDelta_Seconds__count_above__t_0,timeDelta_Seconds__count_above__t_0,constant,,False
timeDelta_Seconds__count_below__t_0,timeDelta_Seconds__count_below__t_0,constant,,False
