### Imports and dependencies

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
sns.set_style("ticks")

### Load the data, group it by meter ID (`dataid`), and obtain the list of meter IDs.

In [None]:
# Read the raw export and discard the 'localminute' column in one go.
# NOTE(review): zoom() requires a datetime-indexed frame, but 'localminute' (presumably the
# timestamp) is dropped here and no index is set — confirm whether it should become the index.
df_all = pd.read_csv('data/dataport-export_gas_oct2015-mar2016.csv').drop(columns='localminute')

# One group per meter id ('dataid').
groups = df_all.groupby('dataid')
keys = groups.groups.keys()  # dict view over the distinct dataids (meter ids)
keys_list = [*keys]
print(keys_list)

In [None]:
# Pull out the rows belonging to one meter, then drop the 'dataid' column,
# which holds the same value on every row of a single group.
key = 1185
df_i = groups.get_group(key)
df_i = df_i.drop('dataid', axis=1)

In [None]:
def zoom(df, start_date, end_date):
    """Return the rows of ``df`` whose index lies within a time period.

    Both bounds are inclusive. They are converted with ``pd.to_datetime``,
    so date strings such as ``'2016-01-01'`` are accepted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. Pre-condition: it is indexed by a ``DatetimeIndex``.
    start_date, end_date : str or datetime-like
        First and last instants to keep.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``start_date <= index <= end_date``.

    Raises
    ------
    TypeError
        If ``df`` is not indexed by a ``DatetimeIndex``.
    """
    # Fail early with a clear message instead of a cryptic comparison error.
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError(
            "zoom() requires a DataFrame indexed by datetime, got index of type "
            + type(df.index).__name__
        )

    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    mask = (df.index >= start_date) & (df.index <= end_date)

    # Boolean selection through .loc, as in the other cleaning helpers.
    return df.loc[mask]

In [None]:
# Example use of zoom: keep the single-meter rows from 2016-01-01 to 2016-02-01 (both inclusive).
# NOTE(review): zoom expects a datetime-indexed frame — confirm df_i has one, since the load
# cell drops 'localminute' without setting an index.
df_i_1mth = zoom(df_i, start_date='2016-01-01', end_date='2016-02-01')

### Cleaning steps.

In [5]:
# step 1 (optional part)
def remove_mal_data(df, threshold=2000):
    """Drop the whole malfunctioning period from one meter's readings.

    A reading is flagged as a spike when its marginal change (difference from
    the previous reading) is greater than ``threshold`` or less than
    ``-threshold``. Every row from the earliest flagged index label to the
    latest flagged index label (both inclusive) is removed, i.e. the entire
    chunk of data from the faulty period is dropped.

    NOTE: a new column 'marginal_change' is added to ``df`` itself (the
    argument is modified in place), so it is also present in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Data from one meter id. Must contain a column 'meter_value'.
    threshold : number, optional
        Magnitude of marginal change above which a reading counts as a spike.
        The default of 2000 was chosen arbitrarily from visualising the data.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the rows of the faulty period, or ``df`` itself when
        no spike is found.
    """
    df['marginal_change'] = df['meter_value'].diff()

    # |change| > threshold covers both the upward and the downward spikes.
    # The first row has a NaN change and is never flagged.
    is_spike = (df['marginal_change'].abs() > threshold).to_numpy()
    flagged_dates = df.index[is_spike]

    if len(flagged_dates) == 0:
        return df

    # Use min/max rather than first/last position: the faulty period must span
    # from the earliest to the latest spike regardless of which direction
    # (up or down) the first spike goes.
    start = flagged_dates.min()
    end = flagged_dates.max()
    mask = (df.index >= start) & (df.index <= end)
    return df.loc[~mask]

In [None]:
# step 2. exclude meters with low sampling frequency

# Meter ids flagged as malfunctioning after visual inspection of their plots
# (spikes or other irregularities).
to_remove = [
    1185, 1556, 2335, 2449, 3134, 3544, 4447, 4514,
    5129, 5403, 6836, 7030, 7117, 8156, 9134, 9639, 9982,
]

low_samplefreq = []  # TODO

# Merge both exclusion lists into to_remove (in place).
to_remove += low_samplefreq

display(to_remove, len(to_remove))

# Meters retained for the analysis: every known id that is not excluded.
to_keep = list(filter(lambda meter_id: meter_id not in to_remove, keys_list))

In [None]:
# step 3. Change cumulative to marginal change.

# We already have the 'marginal_change' column (it is added by remove_mal_data in step 1).
# NOTE: the 'meter_value' (cumulative reading) column is NO LONGER VALID once any row has been dropped.
# The only exception is the very first row, which holds the first cumulative reading.
# If we need cumulative readings again, we must reconstruct them from this first cumulative reading plus all
# the remaining (cleaned) marginal changes.

In [6]:
# step 4. remove marginal decrease.
def remove_negative_marginal(df):
    """Remove data points whose marginal change (from the previous value) is negative.

    Rows with a missing (NaN) marginal change are kept: they are not negative.
    This includes the first reading of a series produced by ``diff()``, whose
    cumulative value is needed to reconstruct cumulative readings later.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a column 'marginal_change'.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose 'marginal_change' is not below zero.
        ``df`` itself is not modified.
    """
    # Test for "negative" explicitly: `>= 0` would also discard NaN rows.
    is_negative = df['marginal_change'] < 0
    return df.loc[~is_negative]

# same notes apply as in step 3.

In [7]:
# step 5. Get rate of increase from marginal changes (per second)

In [None]:
# step 6. Mark anomalies from rate of increase with threshold = 0.166/s
# step 7. Remove these anomalies
# step 8. Resample hourly, finding the new hourly marginal reading. Construct cumulative readings as necessary.
# step 9. visualise cleaned readings. confirm that results are as intended
# step 10. save combined data into a new csv file.