In [17]:
from pathlib import Path
from matplotlib.style import use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters

dataDir = Path.cwd().parent.parent.parent/'Data/processed/learner_targetCows/'
usecols = ['id', 'MilkingEventDateTime', 'FarmName_Pseudo', 'TrafficDeviceName', 'MilkProduction', 'timeDelta_Seconds', 'LactationNumber', 'DaysInMilk', 'BreedName', 'Age']
threshold_time = 800

# labeling cow with good/bad learner(0/1)
def labeling_data(threshold_time, cow_total): 
    global learner
    timeCost = cow_total.timeDelta_Seconds.mean()
    if timeCost < threshold_time:
        learner = 1 # good learner
    else:
        learner = 0 # bad learner
    cow_total['label'] = learner
    return cow_total

In [18]:
# integrate all the cows data into one dataset
filelist = list(Path(dataDir).glob('*.csv'))
for i, _ in enumerate(filelist):
    fileName = 'cow_' + str(i) + '.csv'
    single_cow = pd.read_csv(dataDir/fileName, encoding='utf-8', usecols=usecols)
    single_cow = labeling_data(threshold_time, single_cow)
    single_cow.sort_values(by=['MilkingEventDateTime'], inplace=True)
    if i == 0:
        cow_total = single_cow
    else:
        cow_total = pd.concat([cow_total, single_cow], axis=0, ignore_index=True)
cow_total

Unnamed: 0,FarmName_Pseudo,MilkingEventDateTime,TrafficDeviceName,MilkProduction,timeDelta_Seconds,LactationNumber,DaysInMilk,BreedName,Age,id,label
0,a624fb9a,2021-07-05 11:02:14,vms2,9.07,6215.0,1.0,2.0,1,1.86,1,1
1,a624fb9a,2021-07-05 20:52:47,vms1,7.67,79.0,1.0,2.0,1,1.86,1,1
2,a624fb9a,2021-07-06 06:41:09,vms2,7.12,392.0,1.0,3.0,1,1.86,1,1
3,a624fb9a,2021-07-06 16:25:11,vms2,7.31,43.0,1.0,3.0,1,1.86,1,1
4,a624fb9a,2021-07-07 01:39:16,vms2,4.30,232.0,1.0,4.0,1,1.87,1,1
...,...,...,...,...,...,...,...,...,...,...,...
13136,a624fb9a,2022-08-06 06:38:20,vms2,8.97,143.0,1.0,343.0,1,3.37,16,1
13137,a624fb9a,2022-08-06 14:50:23,vms2,6.23,114.0,1.0,343.0,1,3.37,16,1
13138,a624fb9a,2022-08-07 01:52:24,vms1,9.98,1192.0,1.0,344.0,1,3.37,16,1
13139,a624fb9a,2022-08-07 11:38:29,vms1,7.98,1071.0,1.0,344.0,1,3.37,16,1


In [20]:
#df_static_features = cow_total[['FarmName_Pseudo']]
static_cols = ['Age', 'FarmName_Pseudo', 'TrafficDeviceName', 'LactationNumber', 'BreedName']
timeSeries_cols = ['MilkProduction', 'timeDelta_Seconds', 'DaysInMilk']
output_col = ['label']

In [21]:
cow_label = cow_total[['id', 'label']].copy()
cow_timeseries = cow_total[['id', 'MilkingEventDateTime']].copy()
cow_timeseries.index = range(len(cow_timeseries))
# fetch y for feature extraction
y = cow_label.drop_duplicates(subset=['id'])
y = y["label"]
y.index = range(1,len(y)+1) 

In [86]:
#dataDir1 = Path.cwd().parent.parent.parent/'Data/processed/memory_targetCows/'
#cow_total = pd.read_csv(dataDir1/"cow_total/cow_total_1.csv", encoding='utf-8', usecols=usecols)

In [15]:
y

1     1
2     1
3     0
4     0
5     1
6     0
7     1
8     0
9     1
10    0
11    0
12    0
13    1
14    1
15    0
16    1
Name: label, dtype: int64

In [22]:
ts_extracted_dataset = cow_total[['id']].copy()
ts_extracted_dataset.drop_duplicates(subset=['id'], inplace=True)
ts_extracted_dataset.index = range(1, len(ts_extracted_dataset)+1)

In [23]:
cow_timeseries

Unnamed: 0,id,MilkingEventDateTime
0,1,2021-07-05 11:02:14
1,1,2021-07-05 20:52:47
2,1,2021-07-06 06:41:09
3,1,2021-07-06 16:25:11
4,1,2021-07-07 01:39:16
...,...,...
13136,16,2022-08-06 06:38:20
13137,16,2022-08-06 14:50:23
13138,16,2022-08-07 01:52:24
13139,16,2022-08-07 11:38:29


In [26]:
for col in timeSeries_cols:
    ts_processed = pd.DataFrame(cow_total[col].copy())
    ts_processed.index = range(0,len(ts_processed)) 
    ts_processed = pd.concat([cow_timeseries, ts_processed], axis=1)
    print(ts_processed)
    # extract time series features
    extracted_features = extract_features(ts_processed, column_id="id", column_sort="MilkingEventDateTime")
    #extracted_features.dropna(axis=1, inplace=True)
    impute(extracted_features)
    features_filtered = select_features(extracted_features, y)
    ts_extracted_dataset = pd.concat([ts_extracted_dataset, features_filtered], axis=1)

       id MilkingEventDateTime  MilkProduction
0       1  2021-07-05 11:02:14            9.07
1       1  2021-07-05 20:52:47            7.67
2       1  2021-07-06 06:41:09            7.12
3       1  2021-07-06 16:25:11            7.31
4       1  2021-07-07 01:39:16            4.30
...    ..                  ...             ...
13136  16  2022-08-06 06:38:20            8.97
13137  16  2022-08-06 14:50:23            6.23
13138  16  2022-08-07 01:52:24            9.98
13139  16  2022-08-07 11:38:29            7.98
13140  16  2022-08-07 23:03:05            9.22

[13141 rows x 3 columns]


Feature Extraction: 100%|██████████| 16/16 [00:07<00:00,  2.22it/s]


       id MilkingEventDateTime  timeDelta_Seconds
0       1  2021-07-05 11:02:14             6215.0
1       1  2021-07-05 20:52:47               79.0
2       1  2021-07-06 06:41:09              392.0
3       1  2021-07-06 16:25:11               43.0
4       1  2021-07-07 01:39:16              232.0
...    ..                  ...                ...
13136  16  2022-08-06 06:38:20              143.0
13137  16  2022-08-06 14:50:23              114.0
13138  16  2022-08-07 01:52:24             1192.0
13139  16  2022-08-07 11:38:29             1071.0
13140  16  2022-08-07 23:03:05             2602.0

[13141 rows x 3 columns]


Feature Extraction: 100%|██████████| 16/16 [00:07<00:00,  2.23it/s]


       id MilkingEventDateTime  DaysInMilk
0       1  2021-07-05 11:02:14         2.0
1       1  2021-07-05 20:52:47         2.0
2       1  2021-07-06 06:41:09         3.0
3       1  2021-07-06 16:25:11         3.0
4       1  2021-07-07 01:39:16         4.0
...    ..                  ...         ...
13136  16  2022-08-06 06:38:20       343.0
13137  16  2022-08-06 14:50:23       343.0
13138  16  2022-08-07 01:52:24       344.0
13139  16  2022-08-07 11:38:29       344.0
13140  16  2022-08-07 23:03:05       344.0

[13141 rows x 3 columns]


Feature Extraction: 100%|██████████| 16/16 [00:07<00:00,  2.19it/s]


In [82]:
ts_extracted_features = ts_extracted_dataset.iloc[:, 1:len(ts_extracted_dataset.columns)].copy()
# normalize numerical features
ts_extracted_cols = ts_extracted_features.columns
scaler_std = StandardScaler()
ts_std = scaler_std.fit_transform(ts_extracted_features)
# transform standard data into dataframe
ts_extracted_processed = pd.DataFrame(ts_std, columns=ts_extracted_cols)
ts_extracted_processed.index = range(1,len(ts_extracted_processed)+1)
# append id col to the dataframe
ts_extracted_processed = pd.concat([ts_extracted_dataset['id'], ts_extracted_processed], axis=1)

In [27]:
ts_extracted_dataset

Unnamed: 0,id,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.6","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.6","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.6","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.2","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.2",timeDelta_Seconds__mean,timeDelta_Seconds__quantile__q_0.7,timeDelta_Seconds__quantile__q_0.8,timeDelta_Seconds__mean_abs_change,timeDelta_Seconds__abs_energy,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4"
1,1,47538.593037,240.809963,105520.80499,214.25,38012.422497,10083.518418,149.037037,16000.554184,73871.479888,860.147754,111336.06345,260.507331,43577.522674,601.028335,506.0,810.0,860.147754,1430337000.0,27969.569853
2,2,76059.668632,281.473118,155285.452949,279.963504,86054.657596,9126.495156,245.619048,25730.950113,125326.275028,997.439724,166558.202211,306.525316,72604.901258,700.772936,639.4,1062.8,997.439724,1693010000.0,47422.07896
3,3,148661.636005,510.421569,409164.405421,410.582418,128291.459961,26076.22085,282.46875,48795.061523,258171.861369,1415.079114,340668.858662,464.788546,125460.140309,1254.036335,1268.4,1837.2,1415.079114,2914267000.0,93277.495955
4,4,196608.351491,589.13467,542582.115483,468.833333,211491.768707,26847.98954,364.238095,79630.562358,324439.110544,1389.072727,424809.479395,523.321244,150949.482295,1248.958258,1389.0,2009.0,1389.072727,1931160000.0,106051.400794
5,5,49587.803554,292.603774,135141.210225,238.503597,45574.227732,7582.091503,178.151515,15674.855831,85776.402567,881.219376,104875.350773,257.975831,38452.174642,776.546162,701.6,1011.0,881.219376,1648481000.0,28899.070131
6,6,114786.156182,411.174442,283550.016441,380.016,116626.889648,19697.325046,272.78125,43042.045898,224522.306816,1128.795515,286542.753429,429.100719,102841.644532,933.716733,961.6,1458.4,1128.795515,2245845000.0,80229.567744
7,7,65078.827271,286.321489,147057.09829,256.34965,57735.710738,10670.619903,195.594595,22991.430241,103118.695291,834.400216,149909.61551,304.3,57502.822121,620.525862,664.9,968.2,834.400216,1219855000.0,37405.136486
8,8,343807.920529,662.343629,782413.08606,698.314961,277549.32937,38552.404699,471.965517,89638.033294,732566.304793,1592.721945,848859.630469,733.949495,310199.70452,1244.098381,1551.4,2172.2,1592.721945,3780180000.0,247187.381115
9,9,60812.361491,300.215164,150892.821213,251.540984,48122.095663,11228.507995,185.607143,13687.23852,95740.920451,821.681698,138109.910247,297.17029,49961.184769,669.645033,680.6,959.2,821.681698,1139309000.0,33172.477829
10,10,109572.202985,355.701299,236069.467606,375.205298,175967.777778,11305.920634,368.611111,41563.070988,217216.261918,1152.102083,256511.414273,392.18232,102714.568969,847.228928,748.0,1345.0,1152.102083,2478628000.0,76941.858515


In [42]:
cow_total.BreedName.unique()

array([ 1,  2, 99,  4], dtype=int64)

In [43]:
# add one-hot encoded categorical features
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
cow_breed = cow_total[['id', 'BreedName']].copy()
cow_breed.drop_duplicates(subset=['id'], inplace=True)
cat = ohe.fit_transform(np.array(cow_breed['BreedName']).reshape(-1, 1))
col_names = ohe.get_feature_names_out(['BreedName'])
cat_breed = pd.DataFrame(cat, columns=col_names)
cat_breed.index = range(1,len(cow_breed)+1)
cat_breed

Unnamed: 0,BreedName_1,BreedName_2,BreedName_4,BreedName_99
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0
5,0.0,1.0,0.0,0.0
6,1.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0
9,0.0,1.0,0.0,0.0
10,1.0,0.0,0.0,0.0


In [44]:
ts_dataset = pd.concat([ts_extracted_processed, cat_breed], axis=1)
ts_dataset = pd.concat([ts_dataset, y], axis=1)
ts_dataset.to_csv(dataDir.parent/"ts_dataset_34_with_cat.csv", index=False)
ts_dataset

Unnamed: 0,id,timeDelta_Seconds__mean,timeDelta_Seconds__quantile__q_0.8,timeDelta_Seconds__quantile__q_0.9,"timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0",timeDelta_Seconds__quantile__q_0.7,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.0",timeDelta_Seconds__c3__lag_2,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.2",...,"timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.0","timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4",timeDelta_Seconds__c3__lag_3,timeDelta_Seconds__c3__lag_1,"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.2",BreedName_1,BreedName_2,BreedName_4,BreedName_99,label
1,1,-0.411087,-0.337273,-0.460966,-0.35652,-0.326767,-0.41798,-0.286915,-0.231237,-0.416559,...,-0.180169,-0.428851,-0.282601,-0.265592,-0.237058,1.0,0.0,0.0,0.0,0
2,2,-1.42321,-1.402307,-1.485056,-1.272109,-1.226703,-0.795247,-0.699348,-0.251819,-0.779632,...,-1.099019,-1.316508,-0.320612,-0.292364,-0.798315,1.0,0.0,0.0,0.0,1
3,3,-0.467454,-0.512278,-0.515642,-0.5494,-0.558293,-0.535669,-0.367123,-0.215246,-0.542825,...,-0.306303,-0.428939,-0.244313,-0.223679,-0.598981,0.0,1.0,0.0,0.0,0
4,4,1.575712,1.785891,0.996953,2.254924,2.188218,2.189295,2.153466,-0.050316,1.666391,...,2.117754,0.331714,0.363898,0.113767,2.425048,0.0,1.0,0.0,0.0,0
5,5,0.883018,1.25135,1.172377,1.155937,1.051125,1.033167,0.406585,-0.130904,1.082852,...,0.641422,0.484863,-0.045822,-0.08016,0.858644,0.0,1.0,0.0,0.0,0
6,6,-0.545717,-0.539422,-0.633336,-0.53045,-0.415867,-0.510161,-0.399752,-0.232985,-0.509488,...,-0.323021,-0.67249,-0.282313,-0.259023,-0.609913,1.0,0.0,0.0,0.0,1
7,7,-0.762169,-0.745143,-0.779635,-0.743015,-0.71993,-0.611673,-0.538328,-0.239428,-0.611377,...,-0.648516,-0.620489,-0.286379,-0.267964,-0.629228,1.0,0.0,0.0,0.0,1
8,8,0.856021,1.30278,1.007903,1.281259,1.089547,1.099612,0.428618,-0.10059,1.204747,...,0.6785,0.951874,-0.000804,-0.035695,1.072138,0.0,1.0,0.0,0.0,0
9,9,-0.201892,-0.233936,-0.172839,-0.283191,-0.232037,-0.378526,-0.280416,-0.190074,-0.359859,...,-0.155404,-0.178647,-0.162675,-0.205124,-0.258143,0.0,1.0,0.0,0.0,0
10,10,-0.866585,-1.005866,-0.895988,-0.983939,-0.986566,-0.689121,-0.675162,-0.243078,-0.632964,...,-1.125836,-0.420586,-0.270328,-0.28153,-0.693665,1.0,0.0,0.0,0.0,1


In [29]:
from tsfresh.feature_selection.relevance import calculate_relevance_table
rt = calculate_relevance_table(extracted_features, y)
rt

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
timeDelta_Seconds__ratio_value_number_to_time_series_length,timeDelta_Seconds__ratio_value_number_to_time_...,real,0.001554,False
timeDelta_Seconds__percentage_of_reoccurring_datapoints_to_all_datapoints,timeDelta_Seconds__percentage_of_reoccurring_d...,real,0.001554,False
"timeDelta_Seconds__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""va...",real,0.001554,False
"timeDelta_Seconds__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.4","timeDelta_Seconds__change_quantiles__f_agg_""me...",real,0.001554,False
timeDelta_Seconds__quantile__q_0.4,timeDelta_Seconds__quantile__q_0.4,real,0.001554,False
...,...,...,...,...
timeDelta_Seconds__number_crossing_m__m_-1,timeDelta_Seconds__number_crossing_m__m_-1,constant,,False
timeDelta_Seconds__number_crossing_m__m_1,timeDelta_Seconds__number_crossing_m__m_1,constant,,False
timeDelta_Seconds__count_above__t_0,timeDelta_Seconds__count_above__t_0,constant,,False
timeDelta_Seconds__count_below__t_0,timeDelta_Seconds__count_below__t_0,constant,,False
