In [None]:
from nbdev import *

In [None]:
#default_exp load

In [None]:
#hide
import sys
sys.path.append("..")
%load_ext autoreload
%autoreload 2

# Load data from the longwall

> Methods for loading data

In [None]:
#hide
from nbdev.showdoc import *
from fastcore import test

In [None]:
#export
import pandas as pd
import numpy as np
from fastcore.all import *
import wandb
from datetime import datetime, timedelta
from timecluster_extension.utils import *

## Read data from HMB longwall

We will take data from one day of the shearer. The data is hosted at https://aida.ii.uam.es/2018-01-15.csv

In [None]:
# !wget -O /data/input_data.csv https://aida.ii.uam.es/2018-01-06.csv

In [None]:
data = pd.read_csv('/home/user/data/PACMEL-2019/2018-01-06.csv', sep=';', skiprows=2)

In [None]:
data.head()

Unnamed: 0,description,Status kombajnu,Łącznik sterowania,Stan ramienia lewego,Stan ramienia prawego,Stan ciągnika lewego,Stan ciągnika prawego,Stan pompy lewej,Stan pompy prawej,Stan kruszarki,...,Prąd fazy 3.15,Zabezpieczenie.15,Przeciążenie.15,Zwarcie.21,Asymetria.21,Ciągłość żyły.15,Doziemienie.15,Temperatura.15,Unnamed: 373,Unnamed: 374
0,2018-01-06 00:00:00,2091,2.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,3.0,False,False,False,False,False,False,False,,
1,2018-01-06 00:00:01,2091,2.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,3.0,False,False,False,False,False,False,False,,
2,2018-01-06 00:00:02,2091,2.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,3.0,False,False,False,False,False,False,False,,
3,2018-01-06 00:00:03,2091,2.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,3.0,False,False,False,False,False,False,False,,
4,2018-01-06 00:00:04,2091,2.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,3.0,False,False,False,False,False,False,False,,


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86400 entries, 0 to 86399
Columns: 375 entries, description to Unnamed: 374
dtypes: float64(181), int64(10), object(184)
memory usage: 247.2+ MB


The timestamp is given in the column `description`

In [None]:
# Parse the `description` column into datetimes and store the result in a new `timestamp` column
data['timestamp'] = pd.to_datetime(data['description'])
# `description` is redundant from here on. Note that this cell cannot be re-run on its own:
# after the drop, `data['description']` no longer exists.
data = data.drop('description', axis=1)

In [None]:
# Columns whose dtype is not `object` are kept unchanged
df1 = data.select_dtypes(exclude='object')
# `object` columns are cast to bool.
# NOTE(review): this cast maps NaN and any non-empty string to True - confirm that the
# object columns really hold Python booleans before relying on these values.
df2 = data.select_dtypes(include='object').astype('bool')
# Rebuild the frame with the boolean columns first, followed by the remaining ones
data = pd.concat([df2.reset_index(drop = True), df1], axis = 1)

For the dimensionality reduction we might be interested only in the numeric columns

In [None]:
data_numeric = data.select_dtypes(include=['float', 'datetime'])

As detailed in the TimeCluster paper, the data will be normalized into the range $[0, 1]$. Also, columns containing only NaN values will be removed, and the remaining NaN values will be replaced by 0.

In [None]:
# Min-max scale every float column into [0, 1]. The float column labels are computed once
# and reused on both sides of the assignment. A constant column yields 0/0 = NaN here.
float_cols = data_numeric.select_dtypes(include='float').columns
tmp = data_numeric[float_cols]
data_numeric[float_cols] = (tmp - tmp.min())/(tmp.max()-tmp.min())

In [None]:
data_numeric = data_numeric.dropna(axis=1, how='all').fillna(0)

Finally, we define a function that gathers all these operations

In [None]:
#export
def fpreprocess_numeric_vars(data, cname_ts=None, normalize=True, nan_replacement=0):
    """Preprocess a dataframe `data` containing the monitoring data from a mining longwall.

    Only float (and datetime) columns are kept; every other column is removed. Each column
    is expected to hold the values of a variable in form of a time series. The dataframe
    passed by the caller is never modified.

    Arguments:
        data: (DataFrame) Raw monitoring data.
        cname_ts: (str, optional) Name of the column holding the timestamps. It is parsed
            with `pd.to_datetime`, used as the index of the result and removed from the
            columns. If None (default), the index of `data` is kept as it is.
        normalize: (bool, optional) If True (default), min-max scale every float column
            into the range [0, 1]. A constant column becomes all-NaN (0/0) and is
            therefore dropped from the result.
        nan_replacement: (scalar, optional) Value that replaces the remaining NaN values.
            Default 0.

    Returns:
        A new DataFrame without all-NaN columns, whose NaN values have been replaced by
        `nan_replacement`.
    """
    if cname_ts is not None:
        timestamps = pd.to_datetime(data[cname_ts])
        # `drop` returns a new frame, so re-indexing it leaves the caller's `data` untouched
        data = data.drop(cname_ts, axis=1)
        data.index = timestamps
    # Object columns would be discarded by this selection anyway, so they are not cast to
    # bool first. The explicit copy makes the assignment below safe.
    data_numeric = data.select_dtypes(include=['float', 'datetime']).copy()
    if normalize:
        float_cols = data_numeric.select_dtypes(include='float').columns
        floats = data_numeric[float_cols]
        data_numeric[float_cols] = (floats - floats.min())/(floats.max() - floats.min())
    return data_numeric.dropna(axis=1, how='all').fillna(nan_replacement)

In [None]:
path = Path("/data/PACMEL-2019/JNK/jnk_before_handling_missing.pickle")

In [None]:
# The pickled object is indexed with [0] to obtain the dataframe.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open(path, 'rb') as f:
    df = pickle.load(f)[0]

In [None]:
preprocessed_df = fpreprocess_numeric_vars(data=df)

In [None]:
df.SM_ShearerLocation.plot()

In [None]:
preprocessed_df.SM_ShearerLocation.plot()

## Read multiple monitoring files, given as daily CSVs

Since the mining monitoring data is given as a set of CSV files, one per day, it is useful to have a function to load multiple files in order to analyse data from multiple days

In [None]:
#export
def fread_and_concat(paths, **read_args):
    """Load every CSV listed in `paths` and stack them into a single dataframe.

    Each entry of `paths` is read with `pd.read_csv`, forwarding `read_args` unchanged.
    The frames are concatenated row-wise and the resulting index is renumbered from 0.
    All dataframes must have the same columns.
    """
    frames = []
    for csv_path in paths:
        frames.append(pd.read_csv(csv_path, **read_args))
    return pd.concat(frames, ignore_index=True)

In [None]:
paths = ['/data/PACMEL-2019/343_HMB/2018-01-14.csv', '/data/PACMEL-2019/343_HMB/2018-01-15.csv']
df1 = pd.read_csv(paths[0], sep=';', skiprows=2, nrows=3)
df2 = pd.read_csv(paths[1], sep=';', skiprows=2, nrows=3)
df = fread_and_concat(paths, sep=';', skiprows=2, nrows=3)

In [None]:
test.equals(df1.shape[0] + df2.shape[0], df.shape[0])
test.all_equal([df1.shape[1], df2.shape[1], df.shape[1]], np.repeat(df1.shape[1], 3))

In [None]:
#export
def fread_mining_monitoring_files(paths, **kwargs):
    """Read monitoring files from the PACMEL mining use case.

    The files in `paths` are read with `fread_and_concat` using, by default, `;` as
    separator, `skiprows=2` and `low_memory=False`. Any option given in `kwargs` is
    forwarded to `pd.read_csv` and takes precedence over these defaults.

    Returns a single DataFrame whose `description` column (the timestamp) has been
    converted with `pd.to_datetime`.
    """
    read_args = dict(sep=';', low_memory=False, skiprows=2)
    # Merge instead of passing both sets of keywords: overriding a default such as `sep`
    # would otherwise fail with "got multiple values for keyword argument"
    read_args.update(kwargs)
    df = fread_and_concat(paths, **read_args)
    # Convert the timestamp column into a proper datetime object
    df['description'] = pd.to_datetime(df['description'])
    return df

In [None]:
paths = ['/data/PACMEL-2019/343_HMB/2018-01-14.csv', '/data/PACMEL-2019/343_HMB/2018-01-15.csv']
df = fread_mining_monitoring_files(paths, nrows=3)

In [None]:
isinstance(df, pd.core.frame.DataFrame)

## Time series artifacts (to be used with weights and biases)

This class is meant to extend `wandb.Artifact` for logging/using files with time series data.

In [None]:
#export
class TSArtifact(wandb.Artifact):
    """Class that represents a wandb artifact containing time series data. sd stands for
    start_date and ed for end_date. Both should be pd.Timestamps"""

    # Directory where `from_df` stores the pickled dataframe when no `path` is given
    default_storage_path = Path('/home/user/data/PACMEL-2019/wandb_artifacts/')
    # Format used to write `sd` and `ed` into the artifact metadata
    date_format = '%Y-%m-%d %H:%M:%S' # TODO add milliseconds
    # Maps the name of a missing-values technique to a function DataFrame -> DataFrame
    handle_missing_values_techniques = {
        'linear_interpolation': lambda df : df.interpolate(method='linear', limit_direction='both'),
        'overall_mean': lambda df : df.fillna(df.mean()),
        'overall_median': lambda df : df.fillna(df.median())
    }

    @delegates(wandb.Artifact.__init__)
    def __init__(self, name, sd:pd.Timestamp, ed:pd.Timestamp, **kwargs):
        "Create a `dataset` artifact called `name` and record `sd`/`ed` under the metadata key `TS`."
        super().__init__(type='dataset', name=name, **kwargs)
        self.sd = sd
        self.ed = ed
        if self.metadata is None:
            self.metadata = dict()
        self.metadata['TS'] = dict(sd = self.sd.strftime(self.date_format),
                                   ed = self.ed.strftime(self.date_format))


    @classmethod
    def from_daily_csv_files(cls, root_path, fread=pd.read_csv, start_date=None, end_date=None, metadata=None, **kwargs):
        """
        Create a wandb artifact of type `dataset`, containing the CSV files from `start_date`
        to `end_date`. Dates must be passed as `datetime.datetime` objects. If a `wandb_run` is
        defined, the created artifact will be logged to that run, using the longwall name as
        artifact name, and the date range as version.

        NOTE: not implemented yet; this method currently always returns None.
        """
        return None


    @classmethod
    @delegates(__init__)
    def from_df(cls, df:pd.DataFrame, name:str, path:str=None, sd:pd.Timestamp=None, ed:pd.Timestamp=None,
                normalize:bool=False, missing_values_technique:str=None, resampling_freq:str=None, **kwargs):
        """
        Create a TSArtifact of type `dataset`, using the DataFrame `df` samples from
        `sd` (start date) to `ed` (end date). Dates must be passed as `datetime.datetime`
        objects. The transformed DataFrame is stored as a pickle file in the path `path`
        and its reference is added to the artifact entries. Additionally, the dataset can
        be normalized (see `normalize` argument) or transformed using missing values
        handling techniques (see `missing_values_technique` argument) or resampling (see
        `resampling_freq` argument).

        Arguments:
            df: (DataFrame) The dataframe you want to convert into an artifact.
            name: (str) The artifact name.
            path: (str, optional) The directory where the file, containing the new
                transformed dataframe, is saved. It is created if it does not exist.
                Default None, which means `default_storage_path`.
            sd: (pd.Timestamp, optional) Start date. By default, the first index of `df` is taken.
            ed: (pd.Timestamp, optional) End date. By default, the last index of `df` is taken.
            normalize: (bool, optional) If the dataset values should be normalized. Default
                False.
            missing_values_technique: (str, optional) The technique used to handle missing
                values. Options: "linear_interpolation", "overall_mean", "overall_median" or
                None. Default None.
            resampling_freq: (str, optional) The offset string or object representing
                frequency conversion for time series resampling. Default None, which
                means that the data is not resampled.

        Returns:
            TSArtifact object.

        Raises:
            KeyError: if `missing_values_technique` is not one of the supported options.
        """

        sd = df.index[0] if sd is None else sd
        ed = df.index[-1] if ed is None else ed
        obj = cls(name, sd=sd, ed=ed, **kwargs)
        df = df.query('@obj.sd <= index <= @obj.ed')
        obj.metadata['TS']['created'] = 'from-df'
        obj.metadata['TS']['n_vars'] = len(df.columns)

        # Handle Missing Values
        if missing_values_technique is not None:
            df = obj.handle_missing_values_techniques[missing_values_technique](df)
        obj.metadata['TS']['handle_missing_values_technique'] = str(missing_values_technique)
        obj.metadata['TS']['has_missing_values'] = str(np.any(df.isna().values))

        # Resample (only when requested: resampling with the default None is not valid)
        if resampling_freq is not None:
            df = df.resample(resampling_freq).mean()
        obj.metadata['TS']['n_samples'] = len(df)
        # Not every index type exposes `freq`, hence the getattr
        obj.metadata['TS']['freq'] = str(getattr(df.index, 'freq', None))

        # Time Series Variables
        obj.metadata['TS']['vars'] = list(df.columns)

        # Normalization - Save the previous means and stds
        if normalize:
            stats = df.describe()
            obj.metadata['TS']['normalization'] = dict(means = stats.loc['mean'].to_dict(),
                                                       stds = stats.loc['std'].to_dict())
            df = normalize_columns(df)

        # Hash and save
        # NOTE(review): the built-in `hash` of a bytes object is salted per interpreter
        # session unless PYTHONHASHSEED is fixed, so this value is not reproducible across
        # sessions. Consider a `hashlib` digest if a stable content hash is needed.
        hash_code = str(hash(df.values.tobytes()))
        storage_dir = obj.default_storage_path if path is None else Path(path)
        # Make sure the target directory exists before writing the pickle
        storage_dir.mkdir(parents=True, exist_ok=True)
        path = storage_dir/f'{hash_code}'
        df.to_pickle(path)
        obj.metadata['TS']['hash'] = hash_code
        obj.add_file(str(path))

        return obj

In [None]:
# TSArtifact class TEST

# resampling frequency
resampling_freq = '5s'
# handle missing values technique
missing_values_technique='overall_median'

# testing dataframe: 30 one-second samples of 4 variables, with roughly 10% of missing values.
# It is built explicitly with a fixed seed so that the cell is reproducible and does not
# depend on `pd.util.testing`, deprecated since pandas 1.0 and unavailable in recent releases.
rng = np.random.default_rng(42)
values = rng.standard_normal((30, 4))
values[rng.random(values.shape) > 0.9] = np.nan
df_test = pd.DataFrame(values,
                       columns=list('ABCD'),
                       index=pd.date_range(start='2021-01-01', periods=len(values), freq='s'))

artifact = TSArtifact.from_df(df_test, 
                              name='JNK', 
                              missing_values_technique=missing_values_technique,
                              resampling_freq=resampling_freq, 
                              normalize=True)
artifact.metadata

{'TS': {'sd': '2021-01-01 00:00:00',
  'ed': '2021-01-01 00:00:29',
  'created': 'from-df',
  'n_vars': 4,
  'handle_missing_values_technique': 'overall_median',
  'has_missing_values': 'False',
  'n_samples': 6,
  'freq': '<5 * Seconds>',
  'vars': ['A', 'B', 'C', 'D'],
  'normalization': {'means': {'A': 0.3837807973595773,
    'B': -0.1702837787510083,
    'C': 0.0003015766713802802,
    'D': -0.18042534224509024},
   'stds': {'A': 0.28755677338495317,
    'B': 0.29094157184208197,
    'C': 0.5287787048279057,
    'D': 0.5244784339968553}},
  'hash': '7582266588543823729'}}

At the end, we are interested in working with time series as a dataframe. So we need a function to download the files contained in a `wandb.apis.public.Artifact` object and process them into a TS dataframe. The process of passing from files to dataframe must be different depending on what type of creation method was used to generate the original `TSArtifact`.

In [None]:
#export
@patch
def to_df(self:wandb.apis.public.Artifact):
    """Download the files of a saved wandb artifact and return them as a single dataframe.

    The artifact must come from a call to `run.use_artifact` with a proper wandb run.
    Returns None, after printing an error message, when the artifact has no `TS` metadata
    or when it was not created with `TSArtifact.from_df`.
    """
    # The metadata is the only way to tell whether the artifact comes from a TSArtifact
    if self.metadata.get('TS') is None:
        print(f'ERROR:{self} does not come from a logged TSArtifact')
        return None
    download_dir = Path(self.download())
    if self.metadata['TS']['created'] != 'from-df':
        print("ERROR: Only from_df method is allowed yet")
        return None
    # Artifacts created from a dataframe are read back from the first file in the folder
    return pd.read_pickle(download_dir.ls()[0])

For convenience, we can write a method to cast a downloaded wandb artifact (an instance of `wandb.apis.public.Artifact`) to a TSArtifact

In [None]:
#export
@patch
def to_tsartifact(self:wandb.apis.public.Artifact):
    """Cast an artifact as a TS artifact.

    The artifact must have been created from one of the class creation methods of the
    class `TSArtifact`. This is useful to go back to a TSArtifact after downloading an
    artifact through the wandb API. The start and end dates are parsed from the `sd` and
    `ed` entries of the `TS` metadata, using `TSArtifact.date_format`.
    """
    ts_metadata = self.metadata['TS']
    return TSArtifact(name=self.digest, #TODO change this
                      sd=pd.to_datetime(ts_metadata['sd'], format=TSArtifact.date_format),
                      # The end date comes from its own entry, `ed`, and not from `sd`
                      ed=pd.to_datetime(ts_metadata['ed'], format=TSArtifact.date_format),
                      description=self.description,
                      metadata=self.metadata)

In [None]:
#export
def create_longwall_data_artifact(root_path, start_date, end_date, longwall_name='Unnamed_longwall', wandb_run=None):
    """Create a wandb artifact of type `dataset` that references the daily CSV files from
    `start_date` to `end_date`.

    Arguments:
        root_path: Folder containing one CSV file per day, named `YYYY-MM-DD.csv`.
        start_date: (datetime.datetime) Start of the period of interest.
        end_date: (datetime.datetime) End of the period of interest.
        longwall_name: (str, optional) Used as artifact name. If empty, the name of the
            folder `root_path` is used instead.
        wandb_run: (optional) If defined, the created artifact is logged to that run,
            with the aliases `latest` and the date range (`<start day>_<end day>`).

    Returns:
        The created `wandb.Artifact`. Its metadata stores the longwall name, the start and
        end times and the number of variables (columns, excluding the timestamp).
    """
    root_path = Path(root_path)
    day_format = '%Y-%m-%d'
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    sd_str = start_date.strftime(day_format)
    ed_str = end_date.strftime(day_format)
    # Read a single row of the first file, just to count the variables for the metadata
    mock_data = fread_mining_monitoring_files([f'{root_path/sd_str}.csv'],
                                             nrows=1)
    # Fall back to the folder name (a plain string), not to the Path object itself
    artifact_name = longwall_name if longwall_name else root_path.name
    artifact = wandb.Artifact(type='dataset',
                              name=artifact_name,
                              description='Dataset from the PACMEL mining use case. It contains '
                                          'monitoring data from a longwall shearer',
                              metadata={
                              'longwall': longwall_name,
                              'start_time': start_date.strftime(timestamp_format),
                              'end_time': end_date.strftime(timestamp_format),
                              'n_variables': len(mock_data.columns)-1 # Exclude timestamp
                              })
    # Add files as references (we do not upload files for confidential reasons).
    # Calendar days are counted, not elapsed 24h periods: a range such as 23:00 -> 01:00
    # lasts less than a day but spans two daily files.
    n_days = (end_date.date() - start_date.date()).days + 1
    for n in range(n_days):
        day = start_date + timedelta(days=n)
        artifact.add_reference(f'file://{root_path/day.strftime(day_format)}.csv')

    if wandb_run:
        artifact_version = f'{sd_str}_{ed_str}'
        wandb_run.log_artifact(artifact,
                               aliases=['latest', artifact_version])
    return artifact

In [None]:
run = wandb.init(job_type='create_dataset', resume=True)

In [None]:
sd = datetime.strptime("2018-01-01", "%Y-%m-%d")
ed = sd + timedelta(hours=3)
ar = create_longwall_data_artifact(root_path='/data/PACMEL-2019/343_HMB', 
                                   start_date=sd, 
                                   end_date=ed,
                                   longwall_name='HMB', 
                                   wandb_run=None)
ar.metadata, ar.manifest.entries

NameError: name 'create_longwall_data_artifact' is not defined

Try to use the logged artifact

In [None]:
ar_recovered = run.use_artifact(name='HMB:2018-01-01_2018-01-04', type='dataset')

In [None]:
dir = Path(ar_recovered.download())

In [None]:
retrieved_data = fread_mining_monitoring_files(dir.ls(), nrows=3)

In [None]:
retrieved_data.shape

### Load longwall data artifact

This function is quite handy to turn the contents of a longwall artifact, created with the function `create_longwall_data_artifact`, into a dataframe. This is especially useful in the case where the monitoring files are given on a daily basis, but you are only interested in analysing a couple of hours of data. In that case, the artifact will link the whole day file but, using the metadata, this function will only return the rows that fall within the corresponding time range.

In [None]:
#export
def load_longwall_data_artifact(a:wandb.Artifact):
    """Return the longwall data referenced by the artifact `a` as a dataframe.

    Every file referenced in the artifact manifest is read with
    `fread_mining_monitoring_files`. Only the rows whose `description` (timestamp) lies
    between the `start_time` and `end_time` stored in the artifact metadata, both
    included, are returned.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    file_refs = []
    for entry in a.manifest.entries.values():
        file_refs.append(entry.ref)
    monitoring = fread_mining_monitoring_files(file_refs)
    window_start = datetime.strptime(a.metadata['start_time'], timestamp_format)
    window_end = datetime.strptime(a.metadata['end_time'], timestamp_format)
    in_window = (monitoring['description'] >= window_start) & (monitoring['description'] <= window_end)
    return monitoring[in_window]

TODO: An error should be raised if the start date passed is earlier than the beginning of the first file, and likewise if the end date is later than the end of the last file

In [None]:
paths = ['/data/PACMEL-2019/343_HMB/2018-01-01.csv']

In [None]:
df = fread_mining_monitoring_files(paths, nrows=1000)

In [None]:
start_date = df['description'][0]
end_date = start_date + timedelta(minutes=15)
start_date, end_date

In [None]:
a = create_longwall_data_artifact(root_path='/data/PACMEL-2019/343_HMB/', 
                                  start_date=start_date, 
                                  end_date=start_date + timedelta(minutes=15), 
                                  wandb_run=None)

In [None]:
df_subset = load_longwall_data_artifact(a)

In [None]:
test.equals(df.columns, df_subset.columns) 

In [None]:
test.equals(df_subset['description'][0], start_date)

In [None]:
test.equals(df_subset['description'][len(df_subset['description']) -1], end_date)

## JNK data

The data from the JNK longwall comes in three formats:
1. Queryable database
2. Preprocessed pickle files
3. Preprocessed CSV files

### JNK pickle files

In [None]:
base_path_JNK = Path('/data/PACMEL-2019/JNK/') # *
[path for path in base_path_JNK.ls()]

Let's read the pickle files and show the contents of each of them

In [None]:
#hide
import pickle

In [None]:
#hide
# Load every pickle file found in the JNK folder, in the order returned by `ls`.
# NOTE(review): later cells refer to these files by position - confirm that `ls` returns
# them in a stable order.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
jnk_files = []
filepaths_pickle = base_path_JNK.ls(file_exts='.pickle')
for i, path in enumerate(filepaths_pickle):
    # The context manager closes each file handle instead of leaving it open
    with open(path, 'rb') as f:
        bin_data = f.read()
    print(f'Loading file {i}...')
    jnk_files.append(pickle.loads(bin_data))

The first file is a dataframe with information about the *boolean variables*

In [None]:
print(filepaths_pickle[0].name)
df_bool_dict = jnk_files[0]
df_bool_dict

The second file is a dataframe with information about the *categorical variables*

In [None]:
print(filepaths_pickle[1].name)
df_categorical_dict = jnk_files[1]
df_categorical_dict

The third file is a dataframe with the raw data of one month of the longwall (June 2019). It contains missing values.

In [None]:
print(filepaths_pickle[2].name)
df_jnk_base = jnk_files[2]
df_jnk_base

This dataframe contains all the 95 bool variables described in the first pickle file.

In [None]:
len(df_jnk_base.columns) - len(set(df_jnk_base.columns) - set(df_bool_dict.bool_variables))

However, not all the variables listed in the dataframe for the categorical variables are present in the base data. (TODO: ask why!). We can check which categorical variables are in the base data.

In [None]:
df_jnk_base.filter(items=df_categorical_dict.categorical_variables).columns

The 4th file is called `jnk_before_handling_missing`, and it is a list of 4 elements:

In [None]:
print(filepaths_pickle[3].name)
jnk_before_handling_missing = jnk_files[3]
len(jnk_before_handling_missing)

The first element is a dataframe with a subset of 16 columns from the base data (the number of rows is the same).

In [None]:
jnk_before_handling_missing[0]

The second, third and fourth elements of the list mark which of the variables of this dataframe are boolean, numerical, and categorical respectively.

In [None]:
print(jnk_before_handling_missing[1])
set(jnk_before_handling_missing[1]).issubset(df_bool_dict.bool_variables)

In [None]:
print(jnk_before_handling_missing[2])
set(jnk_before_handling_missing[2]).issubset(df_bool_dict.bool_variables), \
set(jnk_before_handling_missing[2]).issubset(df_categorical_dict.categorical_variables)

In [None]:
jnk_before_handling_missing[3]

This subset of data may represent the most interesting variables of analysis from an expert perspective (TODO: confirm).

The 5th and 6th pickle files look like an output instead of an input. More specifically, both of them are lists with two items:
1. A 11 $\times$ 11 array, containing the description of 11 clusters.
2. An array of 1363601 elements with 11 different values (from 0 to 10), containing the assignation of each data point to each of the 11 clusters described in the first element of the file.

The input dataset to achieve this result must be a preprocessed input, since the size of the assignment array is much lower than the size of the base dataframe. Also, the number of columns of this input dataset should be 11, in case the first element of the list represents a multidimensional description of a cluster.

In [None]:
filepaths_pickle[4].name

In [None]:
jnk_files[4][0][0].shape, jnk_files[4][0][0]

In [None]:
len(jnk_files[4][1][0]), set(jnk_files[4][1][0])

In [None]:
print(filepaths_pickle[6].name)

The 7th file, `jnk_filled_removelong_imputeinterpolate_nothing.pickle`, is a list with two elements.

In [None]:
print(filepaths_pickle[6].name)
jnk_filled_removelong_imputeinterpolate_nothing = jnk_files[6]
len(jnk_filled_removelong_imputeinterpolate_nothing)

The first item in the list contains a dataframe with the same number of rows as the clustering results mentioned above. Therefore, this is the preprocessed data used for those clustering computations.

In [None]:
print(jnk_filled_removelong_imputeinterpolate_nothing[0].__len__(), 
     jnk_files[4][1][0].shape)
jnk_filled_removelong_imputeinterpolate_nothing[0]

The columns of this dataframe are classified in the second element of the list. It's the same set of variables as the dataset seen in the file `jnk_before_handling_missing`.

In [None]:
jnk_files[6][1]

In [None]:
set(jnk_files[3][0].columns) == set(jnk_files[6][0].columns)

We can check the effect of the removed periods if we compare the evolution of the timestamps between this dataset and the one without preprocessing.

In [None]:
pd.Series(jnk_files[3][0].index).plot(), pd.Series(jnk_files[6][0].index).plot()

This must be taken into account when making use of these datasets. If one wants to consider timestamps as a continuous source of information, even where there is no data, neither this last dataset nor any other with removed periods is an option for that analysis. For example, a forecasting task should handle this very carefully.

Files 8th to 13th are the results of processing the data with the library `ts_learn`, and the resulting clusters found in the data when using that preprocessing. Each file corresponds to the use of a different feature extracted from the package (mean, first element...) but I do not know more details about it. For now, these files can be ignored.

Finally, the last file is a list with the types of all the 146 variables found in the base dataframe.
1. The first element lists the boolean variables
2. The second element lists the numeric variables
3. The third element lists the categorical variables

In [None]:
print(filepaths_pickle[13].name)
types_variables = jnk_files[13]
types_variables[0], types_variables[1], types_variables[2]

## Export notebook

In [None]:
#hide
from nbdev.export import *
notebook2script()