In [1]:
'''
Extract features from time series data of temperature.
- Statistical features in time domaion:(max, min, mean, std, var, median), (skew, kurt) for only 30, 60 minutes
- Time delta of Resampling : 10 minutes
- Window size of rolling window calculations : 10 minutes, 30 minutes, 1 hour(60 minutes)

Concetanate all above features (22 columns)
Construct labels(normal / abnormal_cold / abnormal_hot) with lag features for every hour 

'''
from datetime import datetime
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sys

## Read csv file of temperature data
df_Temp = pd.read_csv('ts_temperature.csv')
print (df_Temp.columns)
df_Temp['date'] = pd.to_datetime(df_Temp['date'], format="%Y-%m-%d %H:%M:%S")
# Set index as date
df_Temp = df_Temp.set_index('date')

print (df_Temp)
# print ("type(df_Temp['date'])", type(df_Temp['date']).__name__)
# print ("type(df_Temp['date'][0])", type(df_Temp['date'][0]).__name__)

Index(['date', 'temperature'], dtype='object')
                     temperature
date                            
2018-01-31 00:00:00    19.619791
2018-01-31 00:01:00    18.802944
2018-01-31 00:02:00    19.857184
2018-01-31 00:03:00    20.208154
2018-01-31 00:04:00    18.432066
...                          ...
2020-03-30 23:56:00    19.419647
2020-03-30 23:57:00    19.381692
2020-03-30 23:58:00    19.827619
2020-03-30 23:59:00    20.136786
2020-03-31 00:00:00    20.130036

[1137601 rows x 1 columns]


In [2]:
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning) 


## Feature calculation for window size 10
features = ['max', 'min', 'mean', 'std', 'var', 'median']
temprl = []

for f in features:
    print (f)
    temprl.append( df_Temp.resample('10T', closed='left', label='right', how=f) )

temp_10_min = pd.concat(temprl, axis=1)
temp_10_min.columns = [f + '_temp_10_min' for f in features]
temp_10_min.reset_index(inplace=True)

print ("temp_10_min", temp_10_min)
# temp_max_3mins=pd.DataFrame()
# temp_max_3mins = df_Temp.resample('10T', closed='left', label='right', how='min')
# temp_max_3mins.columns = ['10mins_'+'']
# temp_max_3mins.reset_index(inplace=True)
# print (type(temp_max_3mins))
# print ("temp_max_3mins", temp_max_3mins)


## Feature calculation for window size 30 and 60
features = ['max', 'min', 'mean', 'std', 'var', 'median', 'skew','kurt']
window_size = ['30T', '60T']
temprl = []
temp_rolling = []
for ws in window_size:
    print (ws)
    print (int(ws[:2]))
    temprl.append( df_Temp.rolling(window=int(ws[:2]),center=False).max().resample('10T', closed='left', label='right',how='first'))
    temprl.append( df_Temp.rolling(window=int(ws[:2]),center=False).min().resample('10T', closed='left', label='right',how='first'))
    temprl.append( df_Temp.rolling(window=int(ws[:2]),center=False).mean().resample('10T', closed='left', label='right',how='first'))
    temprl.append( df_Temp.rolling(window=int(ws[:2]),center=False).std().resample('10T', closed='left', label='right',how='first'))
    temprl.append( df_Temp.rolling(window=int(ws[:2]),center=False).var().resample('10T', closed='left', label='right',how='first'))
    temprl.append( df_Temp.rolling(window=int(ws[:2]),center=False).median().resample('10T', closed='left', label='right',how='first'))
    temprl.append( df_Temp.rolling(window=int(ws[:2]),center=False).skew().resample('10T', closed='left', label='right',how='first'))
    temprl.append( df_Temp.rolling(window=int(ws[:2]),center=False).kurt().resample('10T', closed='left', label='right',how='first'))
    
    temp = pd.concat(temprl, axis=1)
    temp.columns = [f + '_temp_'+ str(ws[:2]) + '_min' for f in features]
    temp = temp.loc[-temp['max' + '_temp_'+ str(ws[:2]) + '_min'].isnull()]
    temp.reset_index(inplace=True)
    temp_rolling.append(temp)
    temprl = []
    
# temp_10_min = pd.concat(temprl, axis=1)
# temp_10_min.columns = [f + '_temp_10_min' for f in features]
# temp_10_min.reset_index(inplace=True)

# print ("temp_10_min", temp_10_min)
# print ("end of line")

print ("temp_rolling[0]", temp_rolling[0])
print ("temp_rolling[1]", temp_rolling[1])


temp_feat = pd.concat([temp_10_min, temp_rolling[0].ix[:, 1:9], temp_rolling[1].ix[:, 1:9]], axis=1).dropna()
print ("temp_feat", temp_feat)



max
min
mean
std
var
median


the new syntax is .resample(...).max()
  # This is added back by InteractiveShellApp.init_path()
the new syntax is .resample(...).min()
  # This is added back by InteractiveShellApp.init_path()
the new syntax is .resample(...).mean()
  # This is added back by InteractiveShellApp.init_path()
the new syntax is .resample(...).std()
  # This is added back by InteractiveShellApp.init_path()
the new syntax is .resample(...).var()
  # This is added back by InteractiveShellApp.init_path()
the new syntax is .resample(...).median()
  # This is added back by InteractiveShellApp.init_path()


temp_10_min                       date  max_temp_10_min  min_temp_10_min  \
0      2018-01-31 00:10:00        20.742374        18.432066   
1      2018-01-31 00:20:00        20.721078        18.640298   
2      2018-01-31 00:30:00        20.326355        18.887713   
3      2018-01-31 00:40:00        23.860218        19.153188   
4      2018-01-31 00:50:00        22.802077        18.699538   
...                    ...              ...              ...   
113756 2020-03-30 23:30:00        20.699504        17.201191   
113757 2020-03-30 23:40:00        20.396278        19.261914   
113758 2020-03-30 23:50:00        20.893922        19.374886   
113759 2020-03-31 00:00:00        20.734774        19.118251   
113760 2020-03-31 00:10:00        20.130036        20.130036   

        mean_temp_10_min  std_temp_10_min  var_temp_10_min  median_temp_10_min  
0              19.732618         0.705365         0.497540           19.822494  
1              19.769415         0.637207         0.40603

the new syntax is .resample(...).first()
the new syntax is .resample(...).first()
the new syntax is .resample(...).first()
the new syntax is .resample(...).first()
the new syntax is .resample(...).first()
the new syntax is .resample(...).first()
the new syntax is .resample(...).first()
the new syntax is .resample(...).first()


60T
60
temp_rolling[0]                       date  max_temp_30_min  min_temp_30_min  \
0      2018-01-31 00:30:00        20.742374        18.432066   
1      2018-01-31 00:40:00        20.742374        18.432066   
2      2018-01-31 00:50:00        23.860218        18.640298   
3      2018-01-31 01:00:00        23.860218        18.699538   
4      2018-01-31 01:10:00        23.860218        18.699538   
...                    ...              ...              ...   
113754 2020-03-30 23:30:00        21.053281        18.305959   
113755 2020-03-30 23:40:00        20.699504        17.201191   
113756 2020-03-30 23:50:00        20.699504        17.201191   
113757 2020-03-31 00:00:00        20.893922        17.201191   
113758 2020-03-31 00:10:00        20.893922        19.118251   

        mean_temp_30_min  std_temp_30_min  var_temp_30_min  \
0              19.691256         0.601428         0.361715   
1              19.675702         0.609321         0.371272   
2              20.6378

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  retval = getattr(retval, self.name)._getitem_axis(key, axis=i)


In [3]:
## Label construction
# labeled_features = final_feat.merge(failures, on=['datetime', 'machineID'], how='left')

# 1hr time delta resampling with min,max 
# Create indicator array ( min<17:cold, max>23:hot)
min_10_arr = temp_10_min['min_temp_10_min']
max_10_arr = temp_10_min['max_temp_10_min']
label_arr = np.zeros(len(max_10_arr), dtype = object)
# label_arr = np.empty(len(max_60_arr))
# label_arr[:] = np.nan
print ("label_arr", label_arr)


label_arr[((min_10_arr>=17) & (max_10_arr<=23)) ] = 'normal'
label_arr[min_10_arr<17] = 'abnormal(cold)'
label_arr[max_10_arr>23] ='abnormal(hot)'

print ("label_arr", label_arr)
print (np.sum([label_arr == 'normal']))
print (np.sum([label_arr == 'abnormal(cold)']))
print (np.sum([label_arr == 'abnormal(hot)']))

temp_10_min['label'] = label_arr
print ("temp_10_min", temp_10_min)
# comp_rep.loc[comp_rep[comp] < 1, comp] = None


# Concatenate to feature frame
# labeled_temp_feat = temp_feat.merge(label, on=['date'], how='left')
labeled_temp_feat = pd.merge(temp_feat,temp_10_min[['date','label']],on='date', how='left')
# Convert 'normal'state as 'NaN' (same as missing data aligned to date)
labeled_temp_feat.loc[labeled_temp_feat['label'] == 'normal','label'] = np.nan
pd.set_option('display.max_rows', 1000)
# Fill N/A with bfill limit 5 (fill backward up to 1hr (60mins))
labeled_temp_feat['label'] = labeled_temp_feat['label'].fillna(method='bfill', limit=5)
# Otherwise(normal operation) should be 'none'
labeled_temp_feat['label'] = labeled_temp_feat['label'].fillna('none')
# print ("labeled_temp_feat", labeled_temp_feat)
# print ("labeled_temp_feat.head(500)", labeled_temp_feat.head(500))



pd.set_option('display.max_rows', 1000)
print (labeled_temp_feat[['min_temp_10_min','max_temp_10_min','date','label']].head(1000))

labeled_temp_feat.to_csv('labeled_temp_feat.csv', index=False)
print ("csv file saved")

label_arr [0 0 0 ... 0 0 0]
label_arr ['normal' 'normal' 'normal' ... 'normal' 'normal' 'normal']
111181
1322
1258
temp_10_min                       date  max_temp_10_min  min_temp_10_min  \
0      2018-01-31 00:10:00        20.742374        18.432066   
1      2018-01-31 00:20:00        20.721078        18.640298   
2      2018-01-31 00:30:00        20.326355        18.887713   
3      2018-01-31 00:40:00        23.860218        19.153188   
4      2018-01-31 00:50:00        22.802077        18.699538   
...                    ...              ...              ...   
113756 2020-03-30 23:30:00        20.699504        17.201191   
113757 2020-03-30 23:40:00        20.396278        19.261914   
113758 2020-03-30 23:50:00        20.893922        19.374886   
113759 2020-03-31 00:00:00        20.734774        19.118251   
113760 2020-03-31 00:10:00        20.130036        20.130036   

        mean_temp_10_min  std_temp_10_min  var_temp_10_min  \
0              19.732618         0.705365 

csv file saved
