In [None]:
import os

def call(dir, id, env, path):
    """ Read parquet files from the data lake into a Spark dataframe.

    The location that is read has the form
    ``abfss://lake-<dir>@<id>lake<env>.dfs.core.windows.net<path>``.

    Arguments:
        dir {string} -- Suffix of the ``lake-`` part in front of the ``@``.
        id {string} -- Prefix of the host name, placed before ``lake``.
        env {string} -- Environment suffix of the host name, placed after
            ``lake``. Used only as a fallback: when the ``platform_env``
            environment variable is set, its value takes precedence.
        path {string} -- Appended directly after the host name without a
            separator, so it presumably has to start with ``/``.
    Returns:
        df -- Whatever ``spark.read.parquet`` returns for that location.
    """
    # The environment variable wins, exactly as before. The ``env`` argument is
    # consulted only when the variable is unset, instead of formatting the
    # literal text "None" into the host name.
    platform_env = os.getenv('platform_env')
    if platform_env is None:
        platform_env = env

    location = 'abfss://lake-{0}@{1}lake{2}.dfs.core.windows.net{3}'.format(
        dir, id, platform_env, path)

    # ``spark`` is not defined in this function; it has to exist as a global
    # (for example the session object a notebook runtime provides).
    return spark.read.parquet(location)

def clean(df, sensor, start_date, end_date):
    """ Select one sensor and date range, then cast the readings to float.

    The selection, renaming, scaling, sorting and date filtering are the same
    steps as in filter_dataset, so they are delegated to that function; this
    function only adds the float conversion of the three measurement columns.

    Arguments:
            df {dataframe} -- Spark dataframe holding the raw measurements.
            sensor {string} -- Value of Location_code to keep.
            start_date {string} -- Exclusive lower bound on datetime.
            end_date {string} -- Exclusive upper bound on datetime.

    Returns:
            cleaned_data {dataframe} -- pandas dataframe with the columns
            datetime, cond, ph, temp, cond_qual, ph_qual and temp_qual, where
            cond, ph and temp are of dtype float.
    """
    filtered_data = filter_dataset(df, sensor, start_date, end_date)

    # astype with a mapping returns a new frame, which avoids assigning
    # columns on a boolean-filtered slice (SettingWithCopyWarning).
    cleaned_data = filtered_data.astype({'cond': float, 'ph': float, 'temp': float})
    return cleaned_data

def filter_dataset(df, sensor, start_date, end_date):
    """ Extract the measurements of one sensor within a date range.

    The rows of the requested sensor are selected in Spark, the measurement
    columns are renamed to short names, EC is divided by 1000 and the result
    is converted to pandas. Note that the conversion happens before the date
    filter, so every row of the sensor is brought into pandas first.

    Arguments:
        df {dataframe} -- Spark dataframe with the columns Location_code,
            MeasurementDTTM, EC, pH, WaterTemp, EC_quality, pH_quality and
            WaterTemp_quality.
        sensor {string} -- Value of Location_code to keep.
        start_date {string} -- Exclusive lower bound on datetime.
        end_date {string} -- Exclusive upper bound on datetime.

    Returns:
        filtered_data {dataframe} -- pandas dataframe sorted by datetime with
        the columns datetime, cond, ph, temp, cond_qual, ph_qual and
        temp_qual. The index holds each row's position in the sorted data of
        the sensor before date filtering; it is not renumbered afterwards.
    """
    # (source column, new name) pairs, in the order of the output columns.
    column_names = [
        ('MeasurementDTTM', 'datetime'),
        ('EC', 'cond'),
        ('pH', 'ph'),
        ('WaterTemp', 'temp'),
        ('EC_quality', 'cond_qual'),
        ('pH_quality', 'ph_qual'),
        ('WaterTemp_quality', 'temp_qual'),
    ]

    sensor_data = df.filter(df.Location_code == sensor)
    sensor_data = sensor_data.select(*[source for source, _ in column_names])
    for source, target in column_names:
        sensor_data = sensor_data.withColumnRenamed(source, target)
    sensor_data = sensor_data.withColumn("cond", sensor_data.cond / 1000)

    # Sort chronologically and renumber the rows 0..n-1. This replaces the
    # former set_index('datetime') / reset_index() round trip, which produced
    # the same frame.
    sorted_data = sensor_data.toPandas().sort_values('datetime')
    sorted_data = sorted_data.reset_index(drop=True)

    # Both bounds are exclusive.
    in_range = (sorted_data.datetime > start_date) & (sorted_data.datetime < end_date)
    filtered_data = sorted_data[in_range]
    return filtered_data

def get_data(sensors, df):
    """
    get_data splits a dataframe of time series into one dataframe per sensor.
    For every requested sensor the column with that name is copied as 'raw'.
    If a column named <sensor>_cor exists it is copied as 'cor', and if a
    column named <sensor>_qual exists it is copied as 'labeled_anomaly'.
    The input dataframe is not modified.
    Arguments:
        sensors: list of name(s) of the sensor/variable data of interest. These must be column names in df;
            a missing name raises the KeyError of the column lookup.
        df: pandas DataFrame holding one column per sensor, plus the optional
            <sensor>_cor and <sensor>_qual columns.
    Returns:
        sensor_array: dict mapping each sensor name to a pandas DataFrame that shares the index of df and has
        the column 'raw' and, when the source columns exist, 'cor' and 'labeled_anomaly'.
    """
    # NOTE: `pd` (pandas) has to be available in the module namespace; the
    # function does not import it itself.
    sensor_array = {}
    for snsr in sensors:
        # start from an empty frame on the same index so every column aligns with the input
        sensor_df = pd.DataFrame(index=df.index)
        sensor_df['raw'] = df[snsr]

        # corrected data and quality labels are optional in the dataset
        cor_column = snsr + '_cor'
        if cor_column in df.columns:
            sensor_df['cor'] = df[cor_column]
        qual_column = snsr + '_qual'
        if qual_column in df.columns:
            sensor_df['labeled_anomaly'] = df[qual_column]

        sensor_array[snsr] = sensor_df

    return sensor_array