forked from shelfwise/Mars-Express-Challenge
-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_data2.py
154 lines (121 loc) · 5.05 KB
/
prepare_data2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# -*- coding: utf-8 -*-
"""
@author: fornax
"""
import os
import pandas as pd
import json
import prepare_data1 as prep
DATA_PATH = prep.DATA_PATH
INTERVALS = ['2H', '6H', '12H', '1D']
FUNCTIONS = ['mean', 'std', 'count']
def resample(df, df_name='generic', intervals=INTERVALS, functions=FUNCTIONS, cols=None):
"""
Function computes summary statistics over past values of each feature for a series
of intervals, as well as a 'count' feature which represents the number of
changes/activities in the past across all features (i.e. describes how many
measurements were made in the data frame in the last X hours).
:df: dataset from which to create past-based features
:df_name: name of the dataset
:intervals: a list of intervals for which to compute summary statistics w.r.t.
each time-point
:functions: a list of summary statistics to compute
:cols: a list of features on which to compute past statistics (if None,
then all features will be processed)
:return: input data frame with new features joined
"""
if cols is None:
cols = df.columns
if 'm_year' in cols:
cols = list(cols)
cols.remove('m_year')
for interval in intervals:
for function in functions:
if function == 'count':
resampler = df[cols[:1]].resample(interval, label='right')
df_resampled = resampler.count()
df_resampled.columns = ['_'.join([df_name, function, 'past', interval])]
df = df.join(df_resampled, how='outer')
else:
resampler = df[cols].resample(interval, label='right')
df_resampled = eval('resampler.' + function + '()')
df_resampled.columns = ['_'.join([c, function, 'past', interval]) for c in cols]
df = df.join(df_resampled, how='outer')
return df
def parse_ltdata():
print('Preparing LTDATA...')
f = os.path.join(DATA_PATH, 'ltdata.csv')
df_raw = prep.parse_data(f)
intervals = ['3D', '7D']
functions = ['mean', 'std']
df = resample(df_raw, 'ltdata', intervals=intervals, functions=functions)
df = df.resample('1H').mean()
return df
def parse_saaf():
print('Preparing SAAF...')
f = os.path.join(DATA_PATH, 'saaf.csv')
df_raw = prep.parse_data(f)
df = resample(df_raw, 'saaf')
df = df.resample('1H').mean()
return df
def parse_ftl():
print('Preparing FTL...')
f = os.path.join(DATA_PATH, 'ftl_processed.csv')
df = pd.read_csv(f, index_col=0)
df['ut_ms'] = df['utb_ms']
df.drop(['utb_ms', 'ute_ms'], axis=1, inplace=True)
df = prep.to_datetime(df)
df = df.set_index('ut_ms')
df.drop(['type', 'is_d5pphb', 'is_d7plts', 'is_d8pltp', 'is_d9pspo',
'is_nadir_lander', 'is_radio_science', 'is_specular', 'is_spot'], axis=1, inplace=True)
df = resample(df, 'ftl')
df = df.resample('1H').mean()
return df
def parse_evtf():
print('Preparing EVTF...')
f = os.path.join(DATA_PATH, 'evtf_processed.csv')
df = prep.parse_data(f)
to_remove = map(lambda x: filter(lambda y: x in y, df.columns),
['OCC_DEIMOS', 'OCC_PHOBOS', 'PHO_UMBRA', 'PHO_PENUMBRA', 'DEI_PENUMBRA', 'DEI_UMBRA'])
to_remove = [i for sub in to_remove for i in sub]
df.drop(to_remove, axis=1, inplace=True)
to_expand=['OCC', 'MAR_PENUMBRA', 'MAR_UMBRA', 'OCC_MARS']
df = resample(df, 'evtf', cols=to_expand)
df = df.resample('1H').mean()
return df
if __name__ == '__main__':
filename = 'dataset2'
print('Preparing powers...')
powers_train = prep.parse_power(os.path.join(DATA_PATH, 'power.csv'), intervals='1H')
powers_test = prep.parse_power('test_set/power-prediction-sample-2014-04-14_2016-03-01.csv', intervals='1H', dropna=False)
powers_test['m_year'] = 3
powers = pd.concat([powers_train, powers_test])
filling_method = 'nearest'
saaf = parse_saaf()
saaf = prep.align_to_power(saaf, powers, method=filling_method)
evtf = parse_evtf()
evtf = prep.align_to_power(evtf, powers, method=filling_method)
ftl = parse_ftl()
ftl = prep.align_to_power(ftl, powers, method=filling_method)
ltdata = parse_ltdata()
ltdata = prep.align_to_power(ltdata, powers, method=filling_method)
print('Creating the dataframe...')
df = powers.copy()
df = df.join(saaf)
df = df.join(evtf)
df = df.join(ftl)
df = df.join(ltdata)
df['mission_time'] = prep.to_utms(df.index)
path_to_save = os.path.join(DATA_PATH, filename + '.csv')
print('Saving to %s' % path_to_save)
df.to_csv(path_to_save)
features = {}
features['NPWD'] = list(powers.columns)
features['saaf'] = list(saaf.columns)
features['evtf'] = list(evtf.columns)
features['ftl'] = list(ftl.columns)
features['ltdata'] = list(ltdata.columns)
features['aux_time'] = ['m_year', 'mission_time']
with open(os.path.join(DATA_PATH, filename + '.features'), 'w') as f:
json.dump(features, f)
print('Done!')