# Data Set for Monitor Time Series Prediction

We need the following fields from a single collection, raw:

In [1]:
CONCEPTS = ["HR", "InvBPDias", "InvBPSys", "SpO2",
            "Temperature monitor", "Central Venous Pressure",
                   "RRtotal"]

The data will go into files, pickled.

In [2]:
RAW_STAY = "../data/monitor-dataset-{}.pkl"
STAY_TAGS = "../data/stay_tags.pkl"

Let's check that we at least have all of these features.

In [7]:
import sys
sys.path.append("/home/david/work/intensone/analytics/")

In [8]:
from knowledge.knowledge_dao.knowledge_read_dao_mongo import knowledgeReadDAOMongo
from db.mongo.queries import connect_to_mongo
from res.DB_PARAMS import CONFIGDB
from predictor.features.data_collector.data_collector import DataCollector
import pandas
import numpy
import numpy.random
import pickle

In [29]:
CONFIGDB

'KnowledgeApi'

In [9]:
config_connection = connect_to_mongo(CONFIGDB)
knowledge_read_dao = knowledgeReadDAOMongo(config_connection)
primary_concept_names = knowledge_read_dao.get_all_primary_concept_names()
derived_concept_names = knowledge_read_dao.get_all_derived_concept_names()
assert not set(CONCEPTS).difference(set(primary_concept_names).union(set(derived_concept_names)))

If the assert above is silent, we are ok. Now, we follow `./predictor/features/sample_concepts/sample_concepts.py` to read the concept data into a data frame.

In [38]:
# COLLECTION = "IntensixDBIchilov5"
# COLLECTION = "IntensixDBMayo7"
COLLECTION = "IntensixDB2015_4"
data_connection = connect_to_mongo(COLLECTION)

In [39]:
collector = DataCollector(data_connection, concepts=CONCEPTS, 
                          config_connection=config_connection)

We also load the dictionary of stay tags, retrieved earlier.

In [32]:
with open(STAY_TAGS, "rb") as f:
    stay_tags = pickle.load(f)

We need to get the list of all stays in the collection because we are going to iterate through them.

In [40]:
all_stays = collector.get_all_db_stays()
len(all_stays)

1026

In [41]:
SUBSET = False
NSTAYS = 200
if SUBSET:
    stays = numpy.random.choice(all_stays, size=NSTAYS, replace=False)
else:
    stays = all_stays

We'll just build a dataset as a collection of pickled dataframes.

In [42]:
dataset = {}

for i, stay in enumerate(stays):
    if i % 10 == 0:
        print(".", end="")
    if (i + 1) % 100 == 0:
        print(i + 1, end=" ")
    stay_info = collector.get_stay_info(stay)
    df = collector.get_series_list(CONCEPTS, stay,
                                   stay_info["start_date"], stay_info["end_date"],
                                   usage="prospective")
    if not df.empty:
        # Augment the stay with tags, any better way to do that?
        if stay in stay_tags:
            tags = pandas.DataFrame.from_dict(stay_tags[stay])
            tags.drop('value', axis=1, inplace=True)
            tags.rename(columns={'concept': 'tag'}, inplace=True)
            tags.set_index('time', inplace=True)
            df = df.join(tags)
            df['tag'].fillna('', inplace=True)
        else:
            df['tag'] = ''

        with open(RAW_STAY.format(stay), "wb") as file:
            pickle.dump(df, file)
        dataset[stay] = df

..........100 ..........200 ..........300 ..........400 ..........500 ..........600 ..........700 ..........800 ..........900 ..........1000 ...

We can play with the data here a little bit:

In [16]:
stay_id = numpy.random.choice(stays)
# stay_id = 'Mayo_MICU_Y8MM44A_20160727'
# stay_id = 'Mayo_MICU_A086O0Y_20710815'
print(stay_id)
with open(RAW_STAY.format(stay_id), "rb") as f:
    df = pickle.load(f)
df[numpy.logical_not(numpy.isnan(df['HR']))]

ValueError: a must be non-empty