### Imputing Small Gaps
##### Note: make sure no column under the feature name starts with a null value
##### HourlyPressureChange and HourlyPressureTendency are like that. Change to 0

In [None]:
from scipy.interpolate import CubicSpline
from scipy.interpolate import interp1d
import numpy as np
import pandas as pd

def imputeArrValues(index, arr, width):
    '''Helper function for imputeArr, which is used inside smallGapImputation.

    Tries to fill the run of nulls arr[index : index + width] in place by
    interpolating from the non-null values found on either side of it.

    Input:
        index: position of the first null of the gap
        arr: 1-D float array containing the gap (modified in place)
        width: number of consecutive nulls in the gap
    Returns: (arr, smallGapFailed). smallGapFailed is True when the gap is
        small enough to impute but too few known values surround it.
    '''
    gapEnd = index + width          # first position after the gap

    # only gaps of fewer than 5 nulls followed by at least two more positions are attempted
    if width >= 5 or gapEnd + 1 >= len(arr):
        return arr, False

    # counting known values after the gap (capped at 10); arr[gapEnd] is the first one.
    # The bounds check stops the scan at the end of the array instead of raising IndexError.
    lenForwards = 1
    while (lenForwards < 10 and gapEnd + lenForwards < len(arr)
            and not np.isnan(arr[gapEnd + lenForwards])):
        lenForwards += 1

    # counting known values before the gap (lenBackwards - 1 of them, capped at 9).
    # The bounds check stops the scan at index 0 instead of wrapping around to the
    # end of the array through negative indexing.
    lenBackwards = 1
    while (lenBackwards < 10 and index - lenBackwards >= 0
            and not np.isnan(arr[index - lenBackwards])):
        lenBackwards += 1

    if lenBackwards < 2:
        # nothing known before the gap, so there is nothing to interpolate between
        return arr, False

    # known points on both sides of the gap and their values
    x = list(range(index - lenBackwards + 1, index)) + list(range(gapEnd, gapEnd + lenForwards))
    y = [arr[i] for i in x]

    # cubic spline needs plenty of points on one side and a few on the other,
    # e.g. [x1, x2, x3, x4, x5, x6, nan, nan, nan, x7, x8, x9]
    enoughForSpline = ((lenForwards > 5 and lenBackwards > 2)
                       or (lenForwards > 2 and lenBackwards > 5))
    if enoughForSpline:
        interpolator = CubicSpline(x, y)
    elif width < 3:
        # too few neighbours for a spline, but the gap is tiny: linear interpolation
        interpolator = interp1d(x, y)
    else:
        return arr, True

    # replacing null values in array with interpolated values
    for i in range(index, gapEnd):
        arr[i] = interpolator(i)

    return arr, False

def imputeArr(arr):
    '''Helper function for smallGapImputation.

    Walks through arr once, hands every run of consecutive nulls to
    imputeArrValues, and reports whether any run could not be filled.

    Input: 1-D array of floats, possibly containing nulls
    Returns: (arr, reimputeColumn); reimputeColumn is True when at least one
        gap was flagged as failed by imputeArrValues
    '''
    needsAnotherPass = False
    cursor = 0
    while cursor < len(arr):
        # known value: nothing to do, keep scanning
        if not np.isnan(arr[cursor]):
            cursor += 1
            continue

        # measuring the run of nulls that starts at cursor
        gapWidth = 1
        while cursor + gapWidth < len(arr) and np.isnan(arr[cursor + gapWidth]):
            gapWidth += 1

        # the actual imputation is delegated to the helper
        arr, gapFailed = imputeArrValues(cursor, arr, gapWidth)
        if gapFailed:
            needsAnotherPass = True

        # jumping over the gap that was just handled
        cursor += gapWidth

    return arr, needsAnotherPass

def smallGapImputation(df):
    '''Fills short null gaps in every column of df except the first one.

    Each column is passed through imputeArr, which decides per gap whether it
    can be filled. A column is passed through again, up to 5 passes
    in total, for as long as imputeArr reports a gap it could not fill.

    Input: pandas dataframe whose columns have null values, first column is timestamp
    Returns: the same df, with the imputed columns written back into it
    '''
    maxPasses = 5
    for col in df.columns[1:]:
        values = np.array(df[col])

        # a later pass can succeed where an earlier one failed, because
        # neighbouring gaps may have been filled in the meantime
        for _ in range(maxPasses):
            values, retry = imputeArr(values)
            if not retry:
                break

        # writing the imputed column back into the dataframe
        df[col] = values

    return df

def imputeSmallGaps():
    '''Reads the joined dataset, imputes its small null gaps and saves the result.

    Reads "Joined Influent and Rainfall and Weather and Groundwater and Creek Gauge.csv",
    treats "SWTP Total Influent Flow" readings below 3.7 as missing, runs
    smallGapImputation, adds calendar columns derived from "DateTime" and
    writes everything to "Small Gap Imputed Data.csv".
    '''
    # get original combined data with all null values
    df = pd.read_csv("Joined Influent and Rainfall and Weather and Groundwater and Creek Gauge.csv", parse_dates=["DateTime"])

    # influent flow readings below 3.7 are turned into nulls so they get imputed too
    flow = df["SWTP Total Influent Flow"]
    df["SWTP Total Influent Flow"] = flow.mask(flow < 3.7)

    # imputing all small gaps with cubic splines and linear lines
    df = smallGapImputation(df)

    # adding year, month, day, hour and week columns
    df["Year"] = df["DateTime"].dt.year
    df["Month"] = df["DateTime"].dt.month
    df["Week Day"] = df["DateTime"].dt.dayofweek
    df["Hour"] = df["DateTime"].dt.hour
    # Series.dt.week was removed in pandas 2.0; isocalendar().week gives the same ISO week number
    df["Week"] = df["DateTime"].dt.isocalendar().week

    # saving imputed data
    df.to_csv("Small Gap Imputed Data.csv", index=False)

# Runs the small gap imputation as soon as this cell executes;
# the result is written to "Small Gap Imputed Data.csv".
imputeSmallGaps()

### Validating Small Gap Imputation

In [None]:
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib qt

def normalize(arr, maximum, minimum):
    '''Rescales the values of arr so that minimum maps to 0 and maximum maps to 1.

    When maximum equals minimum there is no range to scale by and arr is
    returned untouched; otherwise a new list is returned.
    '''
    if maximum == minimum:
        return arr
    span = maximum - minimum
    return [(value - minimum) / span for value in arr]

def createIndicies(index, gapSize):
    '''Lists the gapSize consecutive indicies around 'index' that will later be turned null.

    Odd sizes are centered on 'index'; even sizes cannot be centered and
    hold one more index after 'index' than before it.
    '''
    half = int(gapSize / 2)
    first = index - half if gapSize % 2 != 0 else index - half + 1
    return list(range(first, index + half + 1))

def testSmallSpot(arr, index, length):
    '''Tests if the spot is valid to turn into a small gap'''
    if len(arr) - index > 25 and index > 25:    # not in last 25 or in first 25 indicies
        # tests if 5 vals before and after index are null
        # also if index is null
        for i in range(length):
            if np.isnan(arr[index + i]):
                return False          
            if np.isnan(arr[index - i]):
                return False
        return True                         # only if all are not null will this be hit
    return False

def testSmallGap(feature, count = 5, smallGapsPerTest = 30):
    '''Performs small gap validation on a given feature.

    For each of 'count' runs, 'smallGapsPerTest' artificial gaps of 1 to 4
    values are cut into known data, the column is imputed with
    smallGapImputation, and the imputed values are compared with the values
    that were removed.

    Returns: (average r2, average MSE, average normalized MSE) over the runs
    Raises: NotImplementedError when the gaps cannot be placed within the attempt limit
    '''
    # getting data
    df = pd.read_csv("Joined Influent and Rainfall and Weather and Groundwater and Creek Gauge.csv",
        usecols = ["DateTime", feature])
    arr = np.array(df[feature])

    # to remove sus values in a particular feature
    if feature == "SWTP Total Influent Flow":
        arr = np.array([np.nan if x < 3.7 else x for x in df["SWTP Total Influent Flow"]])

    r2Sum = mseSum = nmseSum = 0
    attempts = 0                                # shared by all runs; guards against an endless search
    for _ in range(count):
        maskedArr = deepcopy(arr)               # copy of the data that receives the artificial gaps
        hiddenIndicies = []                     # every index forced to null in this run
        gapsMade = 0

        # searching random spots until enough gaps have been created
        while gapsMade < smallGapsPerTest:
            candidate = np.random.randint(0, len(arr))

            if testSmallSpot(maskedArr, candidate, 7):
                gapsMade += 1
                gapWidth = np.random.randint(1, 5)      # either 1, 2, 3, or 4
                for position in createIndicies(candidate, gapWidth):
                    maskedArr[position] = np.nan
                    hiddenIndicies.append(position)

            if attempts > 5000000:              # just some large number
                raise NotImplementedError("Failed to create all small null gaps")
            attempts += 1

        # imputing the column that now holds the artificial gaps
        df[feature] = maskedArr
        df = smallGapImputation(df)
        imputedArr = df[feature]

        # comparing the removed values with their imputed replacements
        truth = [arr[i] for i in hiddenIndicies]
        filled = [imputedArr[i] for i in hiddenIndicies]
        normTruth = normalize(truth, np.max(truth), np.min(truth))
        normFilled = normalize(filled, np.max(truth), np.min(truth))

        r2Sum += r2_score(truth, filled)
        mseSum += mean_squared_error(truth, filled)
        nmseSum += mean_squared_error(normTruth, normFilled)

    return r2Sum / count, mseSum / count, nmseSum / count

def performSmallGapValidation(featDf):
    '''Performs small gap validation for every feature named in featDf["Feature"].

    Each feature is validated with testSmallGap using 10 runs of 100
    artificial gaps. The averaged metrics are added to featDf as the columns
    "Avg R2", "Avg MSE" and "Avg NMSE", and the table is saved to
    "Validated Features.csv".

    Returns: featDf with the three metric columns added
    '''
    results = [testSmallGap(feature, 10, 100) for feature in np.array(featDf["Feature"])]

    # one (r2, mse, nmse) tuple per feature, split into one column per metric
    featDf["Avg R2"] = [metrics[0] for metrics in results]
    featDf["Avg MSE"] = [metrics[1] for metrics in results]
    featDf["Avg NMSE"] = [metrics[2] for metrics in results]

    featDf.to_csv("Validated Features.csv", index=False)
    return featDf

# Validates small gap imputation for every feature listed in "Features.csv".
# performSmallGapValidation already saves its result to "Validated Features.csv";
# the same table is printed and saved once more as "Small Gap Validation.csv".
featDf = pd.read_csv("Features.csv")
featDf = performSmallGapValidation(featDf)
print(featDf)
featDf.to_csv("Small Gap Validation.csv", index=False)

# # df = pd.read_csv("Validated Features.csv")
# df = pd.read_csv("Small Gap Validation.csv")
# df.plot.bar(x="Feature", y="Avg NMSE", rot=37)
# # df.plot.bar(x="Feature", y="Avg R2", rot=37)
# # plt.bar([i for i in range(len(df["Feature"]))], df["Avg R2"])
# plt.show()

### Large Gap Imputation Hyperparameter Tuning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from dcor import distance_correlation
%matplotlib qt

def normalize(arr, maximum, minimum):
    '''Used in finding normalized MSE.

    Maps minimum to 0 and maximum to 1 and returns the rescaled values as a
    list. When maximum equals minimum, an all-zero array shaped like arr is
    returned instead.
    '''
    if maximum == minimum:
        # no range to scale by: every value becomes zero
        return np.zeros_like(arr)
    span = maximum - minimum
    return [(value - minimum) / span for value in arr]

def findNulls(arr):
    '''Finds the runs of consecutive nulls in arr.

    Returns: list of (start index, number of nulls) tuples in order of
        appearance. A run that reaches the end of arr is not reported: the
        search stops as soon as it walks off the end of the array.
    '''
    gaps = []                   # formatted like [(start index, num values)]
    cursor = 0
    while cursor < len(arr):
        if not np.isnan(arr[cursor]):
            cursor += 1
            continue

        run = 1
        try:
            while np.isnan(arr[cursor + run]):
                run += 1
        except IndexError:      # means end of array is null
            break

        gaps.append((cursor, run))
        cursor += run

    return gaps

def createIndicies(index, gapSize):
    '''Returns the list of gapSize consecutive indicies placed around 'index'.

    An odd gapSize gives a list centered on 'index'; an even gapSize gives
    a list that starts one position later, so it extends further after
    'index' than before it.
    '''
    halfWidth = int(gapSize / 2)
    lastIndex = index + halfWidth
    firstIndex = index - halfWidth
    if gapSize % 2 == 0:
        firstIndex += 1
    return list(range(firstIndex, lastIndex + 1))

def testLargeSpot(arr, index, length):
    if len(arr) - index - 25 > length and index + 25 > length: 
        for i in range(length):
            if np.isnan(arr[index + i]):
                return False          
            if np.isnan(arr[index - i]):
                return False
        return True                         # only if all are not null will this be hit
    return False
 
def createLargeGapIndicies(arr, arrWithNulls, test_size):
    '''Picks the indicies of artificial large gaps used to validate large gap imputation.

    Gap sizes are drawn between 5 and the longest existing null gap of more
    than 5 values in arrWithNulls (150 when there is none). The very first
    attempt always uses that longest size. Gaps keep being added until the
    fraction of arr they cover reaches test_size or 500000 attempts were made.

    Returns: list of all indicies that belong to the created gaps
    '''
    minGapSize = 5
    largeGapLengths = [length for _, length in findNulls(arrWithNulls) if length > 5]
    maxGapSize = max(largeGapLengths, default=150)

    workingArr = deepcopy(arr)              # receives the nulls so later gaps avoid earlier ones
    validationIndicies = []                 # every index forced to null
    nullsCreated = 0
    triedLargestGap = False

    for _ in range(500000):
        if not (nullsCreated / len(arr) < test_size):
            break

        # randomly getting index and how large of gap to create
        spot = np.random.randint(0, len(arr))
        gapSize = np.random.randint(minGapSize, maxGapSize)
        if not triedLargestGap:
            gapSize = maxGapSize
            triedLargestGap = True

        if not testLargeSpot(workingArr, spot, gapSize):
            continue

        # making a null gap where data was previously
        nullsCreated += gapSize
        for position in createIndicies(spot, gapSize):
            workingArr[position] = np.nan
            validationIndicies.append(position)

    return validationIndicies

def train_test_split_largeGap(data, target, targetWithNulls, test_size = 0.1):
    '''Splits data/target into training and test sets where the test set is made of large gaps.

    The test rows are the indicies chosen by createLargeGapIndicies; every
    other row is used for training.

    Returns: (trainX, testX, trainY, testY, sorted test indicies)
    '''
    heldOut = np.sort(np.array(createLargeGapIndicies(target, targetWithNulls, test_size)))

    # rows that fall inside the artificial gaps
    testX = np.array([data[i] for i in heldOut])
    testY = np.array([target[i] for i in heldOut])

    # everything else, taken from copies so the inputs stay untouched
    trainX = np.delete(deepcopy(data), heldOut, 0)
    trainY = np.delete(deepcopy(target), heldOut)

    return trainX, testX, trainY, testY, heldOut



def saveForestPredictions(originFilename, predFilename, feature):
    '''Copies predicted values into a feature column and saves it next to its timestamps.

    Input:
        originFilename: csv with a "DateTime" column and the column 'feature'
        predFilename: csv with a "DateTime" column; its last column holds the predictions
        feature: name of the column of originFilename that receives the predictions
    Output: writes "<feature> imputed data.csv" holding "DateTime" and 'feature'
    '''
    # building dataframes
    originDf = pd.read_csv(originFilename, parse_dates=["DateTime"])
    predDf = pd.read_csv(predFilename, parse_dates=["DateTime"])

    # getting arrays from dataframes
    predArr = np.array(predDf[predDf.columns[-1]])
    predDates = np.array(predDf["DateTime"])
    originArr = np.array(originDf[feature])
    originDates = np.array(originDf["DateTime"])

    # copying over predicted values to every row that carries the same timestamp
    for predDate, predValue in zip(predDates, predArr):
        locations = np.nonzero(originDates == predDate)[0]
        originArr[locations] = predValue

    # re-reading only the timestamps (unparsed) so they are written back exactly as stored
    newDf = pd.read_csv(originFilename, usecols=["DateTime"])
    # the column is named after the 'feature' argument, not after a module-level variable
    newDf[feature] = originArr
    newDf.to_csv(feature + " imputed data.csv", index=False)

def findSharedNullValueFeatures(df, targetFeature, tol = 0.5):
    '''Finds the features that are null too often at the rows where the target is null.

    Input:
        df: dataframe holding targetFeature and the candidate features
        targetFeature: name of the target column
        tol: fraction of the target's null rows a feature may share
    Returns: list of column names (other than "DateTime" and targetFeature)
        that are null on more than tol * (number of null target rows) of
        the rows where the target is null
    '''
    targetIsNull = np.isnan(np.array(df[targetFeature]))
    sharedLimit = tol * np.count_nonzero(targetIsNull)

    badColumns = []
    for col in df.columns:
        if col in ("DateTime", targetFeature):
            continue

        # counting rows where both the target and this feature are null
        colIsNull = np.isnan(np.array(df[col]))
        if np.count_nonzero(targetIsNull & colIsNull) > sharedLimit:
            badColumns.append(col)

    return badColumns

def separateDataIntoSets(target, data, dates):
    '''Splits the rows into a training set (target known) and a set to predict (target null).

    Rows containing a null feature value are removed from both sets so that
    the random forest can train and predict on them.

    Input:
        target: 1-D float array of the target feature, nulls mark rows to predict
        data: 2-D numeric array of feature values, one row per target value
        dates: timestamps, one per target value
    Returns: (trainData, trainTarget, testData, testDates). testData and
        testDates are empty arrays when the target has no nulls.
    '''
    # getting null locations from target feature array
    targetNulls = np.nonzero(np.isnan(target))[0]

    # getting data corresponding to where the target feature is null
    testData = np.array([data[i] for i in targetNulls])
    testDates = np.array([dates[i] for i in targetNulls])

    # deleting all indicies that are null from target and data
    trainTarget = np.delete(target, targetNulls)
    trainData = np.delete(data, targetNulls, 0)     # the 0 means delete a row

    # removing training rows with null features so that rand forest can train;
    # this happens even when there is nothing to predict
    if len(trainData) > 0:
        badIndicies = _rowsWithNulls(trainData)
        if len(badIndicies) > 0:
            trainData = np.delete(trainData, badIndicies, 0)
            trainTarget = np.delete(trainTarget, badIndicies)

    if len(testData) == 0:
        return trainData, trainTarget, np.array([]), np.array([])

    # removing rows with null features so that rand forest can predict
    badIndicies = _rowsWithNulls(testData)
    if len(badIndicies) > 0:
        testData = np.delete(testData, badIndicies, 0)
        testDates = np.delete(testDates, badIndicies)

    return trainData, trainTarget, testData, testDates


def _rowsWithNulls(rows):
    '''Returns the indicies of the rows of a 2-D numeric array that hold at least one null.'''
    return np.nonzero(np.isnan(np.asarray(rows, dtype=float)).any(axis=1))[0]

def scalePredictedValues(dates, fullTarget, predictedValues, predictedIndicies, scaleFactor):
    '''Pulls predicted values towards a linear trendline fitted around each predicted gap.

    For every run of predicted positions, a straight line is fitted through
    up to 10 known values on each side of the run. Each prediction p at
    position x is then replaced by trend(x) + scaleFactor * (p - trend(x)).

    Input:
        dates: array of timestamps, indexed by predictedIndicies to locate predictions
        fullTarget: 1-D float array of target values (not modified)
        predictedValues: predicted values, one per entry of predictedIndicies
        predictedIndicies: positions the predictions belong to
        scaleFactor: 1 keeps the predictions as they are, 0 puts them on the trendline
    Returns: list of scaled predictions, ordered like predictedIndicies
    Raises: NotImplementedError when a prediction cannot be written into the target
    '''
    fullPredTarget = deepcopy(fullTarget)
    validNullTarget = deepcopy(fullTarget)                 # used in linear scaling step to find where to scale

    # copying over predicted values
    # NOTE(review): positions are found by matching timestamps, so every row that
    # shares a timestamp with a predicted row is overwritten as well - confirm intended
    for i, predIndex in enumerate(predictedIndicies):
        locations = np.nonzero(dates == dates[predIndex])[0]
        try:
            fullPredTarget[locations] = predictedValues[i]
        except (IndexError, ValueError, TypeError) as err:
            # exception type kept for backward compatibility; the message now says what went wrong
            raise NotImplementedError(
                "Could not place prediction {i} at locations {loc}: target has {t} values, "
                "dates has {d} values, {p} predictions were given".format(
                    i=i, loc=locations, t=len(fullTarget), d=len(dates), p=len(predictedValues))
            ) from err
        validNullTarget[locations] = np.nan

    for gapStart, gapWidth in findNulls(validNullTarget):
        # getting points to make trendline, using up to 10 points on each side
        xs, ys = [], []
        for offset in range(1, 11):
            xBefore = gapStart - offset
            xAfter = gapStart + gapWidth - 1 + offset

            # stop at the array edges instead of wrapping around or raising IndexError
            if xBefore < 0 or xAfter >= len(fullTarget):
                break

            yBefore = fullTarget[xBefore]
            yAfter = fullTarget[xAfter]

            # condition if cannot get all 10 points desired due to other null gaps close to current gap
            if np.isnan(yBefore) or np.isnan(yAfter):
                break

            xs.extend((xBefore, xAfter))
            ys.extend((yBefore, yAfter))

        # without any surrounding point no trendline exists; the raw predictions are kept
        if not xs:
            continue

        # creating trendline from points
        trendline = np.poly1d(np.polyfit(np.array(xs), np.array(ys), 1))

        # scaling predicted values
        for position in range(gapStart, gapStart + gapWidth):
            trend = trendline(position)
            fullPredTarget[position] = trend + scaleFactor * (fullPredTarget[position] - trend)

    # slicing out scaled predicted values
    return [fullPredTarget[i] for i in predictedIndicies]

def tuneForest(df, targetFeature):
    '''Grid searches random forest hyperparameters for imputing targetFeature and saves predictions.

    Features that are null on any row where the target is null are left
    out. Every combination of number of features, max depth, number of trees
    and scale factor is scored by MSE on 3 large gap validation splits; the
    averages are printed from best to worst. A forest with the best
    combination is then trained on all known rows and its predictions for
    the null rows are saved to "predicted forest.csv". The winning scale
    factor is only reported: the saved predictions are not scaled.

    Input:
        df: dataframe with "DateTime", targetFeature and the features to use
        targetFeature: name of the column to impute
    '''
    # getting data to use
    target = np.array(df[targetFeature])
    dates = np.array(df["DateTime"])
    excludedColumns = findSharedNullValueFeatures(df, targetFeature, 0) + [targetFeature, "DateTime"]
    df = df.drop(columns=excludedColumns)
    data = df.to_numpy()

    # splitting data up into respective datasets
    validData, validTarget, nullData, nullDates = separateDataIntoSets(target, data, dates)

    # setting up possible hyperparameter values
    maxNumFeatures = list(range(5, int(len(df.columns)/1.5) + 1, 2))
    maxDepths = [5, 7, 10]
    numTrees = [75, 100]
    scaleFactors = [.1, .2, .5, 1]
    numValidations = 3

    # summed MSE per (features, depth, trees, scale) combination over all validation splits
    mseTotals = {}
    for _ in range(numValidations):
        # splitting up known data into training and validation sets
        XTrain, XTest, YTrain, YTest, testIndicies = train_test_split_largeGap(validData, validTarget, target, test_size=0.20)

        for numFeats in maxNumFeatures:
            for maxDepth in maxDepths:
                for trees in numTrees:
                    # the scale factor does not touch the forest, so one fit serves all scale factors
                    forest = RandomForestRegressor(n_estimators=trees, max_depth=maxDepth, max_features=numFeats)
                    forest.fit(XTrain, YTrain)
                    rawPredictions = forest.predict(XTest)

                    for scale in scaleFactors:
                        # linear trendline scaling
                        scaledPredictions = scalePredictedValues(dates, validTarget, rawPredictions, testIndicies, scale)
                        key = (numFeats, maxDepth, trees, scale)
                        mseTotals[key] = mseTotals.get(key, 0) + mean_squared_error(YTest, scaledPredictions)

    # averaging and ranking the combinations, lowest MSE first
    ranked = sorted(
        ((total / numValidations, params) for params, total in mseTotals.items()),
        key=lambda entry: entry[0],
    )
    for avgMse, (f, d, t, s) in ranked:
        print(avgMse, f"with {f} features, max depth of {d}, {t} trees, and a scale factor of {s}")

    # creating best tree predictions
    bestMse, (bestFeats, bestDepth, bestTrees, bestScale) = ranked[0]
    print(f"Best hyperparas are: {bestFeats} features, max depth of {bestDepth}, {bestTrees} trees, and a scale factor of {bestScale}")
    print(f"With an MSE of: {bestMse}")
    imputer = RandomForestRegressor(max_features=bestFeats, max_depth=bestDepth, n_estimators=bestTrees)
    imputer.fit(validData, validTarget)
    imputedValues = imputer.predict(nullData)

    # saving to a dataframe
    predData = np.array((np.array(nullDates), imputedValues)).T
    newDf = pd.DataFrame(predData, columns=["DateTime", "Predicted Ozark Groundwater Depth (ft)"])
    newDf.to_csv("predicted forest.csv", index=False)


def getCorrelationPerFeature(df, targetFeature):
    '''Computes the distance correlation between the target and every other feature.

    For each column other than "DateTime" and targetFeature, the rows where
    either the target or the column is null are left out before the
    correlation is computed.

    Returns: list of (column name, correlation) tuples, highest correlation first
    '''
    targetArr = np.array(df[targetFeature])
    targetIsNull = np.isnan(targetArr)

    correlationList = []
    for col in df.columns:
        if col in ("DateTime", targetFeature):
            continue

        # keeping only the rows where both the target and the feature are known
        colArr = np.array(df[col])
        bothKnown = ~(targetIsNull | np.isnan(colArr))

        correlationValue = distance_correlation(targetArr[bothKnown], colArr[bothKnown])
        correlationList.append((col, correlationValue))

    # ascending sort followed by a reversal, which keeps the original ordering of ties
    correlationList.sort(key=lambda pair: pair[1])
    correlationList.reverse()

    return correlationList


In [None]:
import numpy as np
import pandas as pd

# have a list of all features needing large gap imputation here

# features = ['SWTP Plant 1 Influent Flow', 'Wilsons Gauge Height (ft)', 'SW_Peak_Flow', 
#     'SWTP Plant 1 Gravity Flow', 'James Gauge Height (ft)', 'HourlyWetBulbTemperature', 'HourlyStationPressure', 'HourlyWindSpeed']
# features = ['HourlyStationPressure', 'HourlyWindSpeed']
features = ['HourlyPressureChange']
# features = ["HourlyPressureTendency"]

# For each feature: rank the other features by correlation with it, tune a
# random forest on the 20 most correlated ones and save the imputed column.
for targetFeature in features:
    # the full table is only needed for ranking; getCorrelationPerFeature
    # already leaves out "DateTime" and the target itself
    df = pd.read_csv("Small Gap Imputed Data.csv")
    correlationList = getCorrelationPerFeature(df, targetFeature)
    topCorrelatedFeatures = [x[0] for x in correlationList[:20]] + ["DateTime", targetFeature]

    # re-reading only the selected columns keeps them in the order of the file
    df = pd.read_csv("Small Gap Imputed Data.csv", usecols=topCorrelatedFeatures)
    # df = pd.read_csv("Imputed Data.csv", usecols=topCorrelatedFeatures)
    print("Imputing:", targetFeature)
    tuneForest(df, targetFeature)
    saveForestPredictions("Small Gap Imputed Data.csv", "predicted forest.csv", targetFeature)
    print("\n\n")


# results
# ozark: best was 9 features, 10 max depth, 75 trees, and a scale factor of 0.1 in full grid search with mse of 0.9284514998810384
# springfield: best was 9 features, 10 max depth, 75 trees, and a scale factor of 0.1 in full grid search with mse of 0.02750132169140654
# swtp total: best was 9 features, max depth of 10, 75 trees, and a scale factor of 1 with special features with mse of 28.72129856053067
# swtp plant 2: best was 9 features, max depth of 10, 75 trees, and a scale factor of 0.5 in full grid search with mse of 24.51435301094517
# swtp plant 1: best was 7 features, max depth of 10, 100 trees, and a scale factor of 0.5 in full grid search with mse of 8.780532754986643 
# wilson gauge: best was 7 features, max depth of 10, 75 trees, and a scale factor of 0.5 in full grid search with mse of 0.1307122486409281 
# peak flow: best was 7 features, max depth of 10, 75 trees, and a scale factor of 0.5 in full grid search with mse of 3.966058622312969
# swtp gravity: best was 5 features, max depth of 10, 75 trees, and a scale factor of 0.5 in full grid search with mse of 4.519902798066215
# james gauge: best was 7 features, max depth of 10, 75 trees, and a scale factor of 1 in full grid search with mse of 0.4629773919971907
# wet bulb: best was 11 features, max depth of 10, 100 trees, and a scale factor of 1 in full grid search with mse of 0.08290534402734484
# station pressure: best was 11 features, max depth of 10, 75 trees, and a scale factor of 1 in full grid search with mse of 6.434824542627539e-06
# wind speed: best was 9 features, max depth of 10, 100 trees, and a scale factor of 1 in full grid search with mse of 19.531978664613003
# pressure tend: best was 11 features, max depth of 10, 75 trees, and a scale factor of 1 in full grid search with mse of 4.365107909935982
# pressure change: best was 9 features, max depth of 10, 100 trees, and a scale factor of 1 in full grid search with mse of 0.0008534304525081505

### Large Gap Validation and Imputation

In [None]:
import os
import json

def saveJsonFile(filename, obj):
    '''Writes obj to the file 'filename' as JSON, replacing any existing content.'''
    with open(filename, mode='w') as handle:
        json.dump(obj, fp=handle)

def readJsonFile(filename):
    '''Reads filename and returns the python object decoded from its
    JSON contents.

    Input:
        filename: path of the JSON file to read

    Returns:
        the decoded object (JSON arrays come back as lists, objects as dicts)
    '''
    with open(filename, "r") as inFile:
        return json.loads(inFile.read())

# Best random forest settings per target feature.
# Each entry is (feature name, (max features, max depth, number of trees, scale factor)).
hyperparameters = [
    ("HourlyWetBulbTemperature", (11, 10, 100, 1)),
    ("HourlyStationPressure", (11, 10, 75, 1)),
    ("HourlyWindSpeed", (9, 10, 100, 1)),
    ("HourlyPressureChange", (9, 10, 100, 1)),
    ("HourlyPressureTendency", (11, 10, 75, 1)),
    ("SW_Peak_Flow", (7, 10, 75, 0.5)),
    ("SWTP Plant 1 Gravity Flow", (5, 10, 75, 0.5)),
    ("Wilsons Gauge Height (ft)", (7, 10, 75, 0.5)),
    ("James Gauge Height (ft)", (7, 10, 75, 1)),
    ("Springfield Plateau Aquifer Depth to Water Level (ft)", (9, 10, 75, 0.1)),
    ("Ozark Aquifer Depth to Water Level (ft)", (9, 10, 75, 0.1)),
    ("SWTP Plant 1 Influent Flow", (7, 10, 100, 0.5)),
    ("SWTP Plant 2 Influent Flow", (9, 10, 75, 0.5)),
    ("SWTP Total Influent Flow", (9, 10, 75, 1)),
]

# For every target feature: validate a random forest on artificially removed
# large gaps (averaging several runs), then fit on all known values, impute
# the real gaps, and copy the result into "New Imputed Data.csv".
validationResults = []
for tup in hyperparameters:
    # getting data
    targetFeature = tup[0]
    numFeatures, maxDepth, numTrees, scaleFactor = tup[1]
    correlationPath = "Correlation Lists/" + targetFeature + " correlation list.json"
    if os.path.exists(correlationPath):
        correlationList = readJsonFile(correlationPath)
    else:
        # the full dataset is only needed when the correlation list has to be computed,
        # so it is only read in this branch
        df = pd.read_csv("Small Gap Imputed Data.csv")
        correlationList = getCorrelationPerFeature(df, targetFeature)
        os.makedirs(os.path.dirname(correlationPath), exist_ok=True)    # first run: folder may not exist yet
        saveJsonFile(correlationPath, correlationList)
    if targetFeature == "SWTP Total Influent Flow":
        # hand-picked correlation list that overrides the computed one for this feature
        correlationList = [('SWTP Plant 2 Influent Flow', 0.9559068108777935), ('Wilsons Gauge Height (ft)', 0.8137102534536554), 
                            ('James Gauge Height (ft)', 0.7535434901158702), ('SWTP Plant 1 Influent Flow', 0.6894131379860265), 
                            ('Fire 168 Hour Rainfall Aggregate', 0.6761004366376544), ('AT&T 168 Hour Rainfall Aggregate', 0.672011459843413), 
                            ('Field 168 Hour Rainfall Aggregate', 0.6708505528284172), ('Springfield Plateau Aquifer Depth to Water Level (ft)', 0.6505778879978816),
                            ('Ozark Aquifer Depth to Water Level (ft)', 0.5220396032101949), ('Month', 0.34707577687495317),
                            ('Hour', 0.16784404944993966), ('Month', 0.34707577687495317), ('Week', 0.34260778783378193), ('Year', 0.3153056413507134)]
    
    addedFeats = 0
    topCorrelatedFeatures = [x[0] for x in correlationList[:20]] + ["DateTime", targetFeature]
    temporalFeatures = ["Month", "Hour", "Week", "Year"]
    if "SW" in targetFeature:
        # features whose name contains "SW" always get the temporal features as predictors
        for feat in temporalFeatures:
            if feat not in topCorrelatedFeatures:
                topCorrelatedFeatures.append(feat)
        addedFeats = 1

    # data preprocessing
    targetArrDf = pd.read_csv("Small Gap Imputed Data.csv", usecols=[targetFeature])
    df = pd.read_csv("Small Gap Imputed Data.csv", usecols=topCorrelatedFeatures)
    target = np.array(targetArrDf[targetFeature])
    dates = np.array(df["DateTime"])

    badFeaturesToUse = findSharedNullValueFeatures(df, targetFeature, 0)
    badFeaturesToUse += [targetFeature, "DateTime"]
    df = df.drop(columns=badFeaturesToUse)
    data = df.to_numpy()                   # matrix of top correlated features that can be used to impute the target feature

    # splitting data up into respective datasets
    validData, validTarget, nullData, nullDates = separateDataIntoSets(target, data, dates)

    # starting validation
    avgMSE = 0
    avgR2 = 0
    avgNMSE = 0
    numValidations = 10                     # an average of this many validations
    currentValidations = 0
    validationFailedAttempts = 0
    while currentValidations < numValidations:
        try:                                                        # there's an index error that pops up (rarely) from the
                                                                    # linear trendline scaling null index selection; when it
                                                                    # happens the attempt is discarded and retried
            if validationFailedAttempts > numValidations * 5:
                raise RuntimeError("Too many failed validation attempts")
            
            # getting datasets
            XTrain, XTest, YTrain, YTest, testIndicies = train_test_split_largeGap(validData, validTarget, target, test_size=0.20)

            # random forest model and predicting
            imputer = RandomForestRegressor(n_estimators=numTrees, max_depth=maxDepth, max_features=numFeatures+addedFeats)
            imputer.fit(XTrain, YTrain)
            predictedValues = imputer.predict(XTest)

            # scaling random forest predictions and evaluation of predictions
            scaledPredictedValues = scalePredictedValues(dates, validTarget, predictedValues, testIndicies, scaleFactor)
            normScaledPredictedValues = normalize(scaledPredictedValues, np.max(YTest), np.min(YTest))
            normYTest = normalize(YTest, np.max(YTest), np.min(YTest))
            avgMSE += mean_squared_error(YTest, scaledPredictedValues)
            avgNMSE += mean_squared_error(normYTest, normScaledPredictedValues)
            avgR2 += r2_score(YTest, scaledPredictedValues)
            currentValidations += 1

        except IndexError:
            # the RuntimeError raised above is deliberately not caught here, so the
            # loop cannot spin forever when every attempt fails
            print("Failed:", validationFailedAttempts)
            validationFailedAttempts += 1
            continue


    # getting dates of all training data in order
    targetNulls = np.nonzero(np.isnan(target))[0]
    trainTestDates = np.delete(dates, targetNulls)          # list of all dates that have a non-null value being regressed
    testDates = np.array([trainTestDates[i] for i in testIndicies])

    # saving test data (from the last validation run) to a dataframe for future comparison
    predData = np.array((np.array(testDates), scaledPredictedValues)).T
    newDf = pd.DataFrame(predData, columns=["DateTime", targetFeature])
    newDf.to_csv("validated forest.csv", index=False)
    


    # if needed, imputing missing data again and saving
    # NOTE(review): the validation models above use max_features + addedFeats, this
    # final model does not -- confirm which of the two is intended
    imputer = RandomForestRegressor(n_estimators=numTrees, max_depth=maxDepth, max_features=numFeatures)
    imputer.fit(validData, validTarget)
    imputedValues = imputer.predict(nullData)
    imputedValues = scalePredictedValues(dates, target, imputedValues, [i for i in range(len(target)) if np.isnan(target[i])], scaleFactor)

    # saving missing value imputation results
    imputedData = np.array((np.array(nullDates), imputedValues)).T
    newDf = pd.DataFrame(imputedData, columns=["DateTime", targetFeature])
    newDf.to_csv("predicted forest.csv", index=False)
    saveForestPredictions("Small Gap Imputed Data.csv", "predicted forest.csv", targetFeature)
    
    # copying imputation results over into New Imputed Data.csv
    doesExist = os.path.exists("New Imputed Data.csv")
    if not doesExist:
        print("Creating file")
        dataframe = pd.read_csv("Small Gap Imputed Data.csv")
        dataframe.to_csv("New Imputed Data.csv", index=False)  # creating file since doesn't exist yet

    # presumably "<feature> imputed data.csv" is written by saveForestPredictions -- verify against that helper
    newDf = pd.read_csv(targetFeature + " imputed data.csv")
    imputedDf = pd.read_csv("New Imputed Data.csv")
    imputedDf[targetFeature] = newDf[targetFeature]
    imputedDf.to_csv("New Imputed Data.csv", index=False)
    os.remove(targetFeature + " imputed data.csv")


    # validation results
    validationList = [targetFeature, avgMSE / numValidations, avgNMSE / numValidations, avgR2 / numValidations, tup[1]]
    validationResults.append(validationList)
    print("For feature:", targetFeature)
    print("MSE:", validationList[1])
    print("NMSE:", validationList[2])
    print("R2:", validationList[3])
    print()
    print("Imputed:", targetFeature)
    print()

saveJsonFile("Validation Results.json", validationResults)

# the per-feature prediction file is only an intermediate artifact
if os.path.exists("predicted forest.csv"):
    os.remove("predicted forest.csv")

In [None]:
# Load the validation metrics stored in "Validation Results.json", order the
# features from lowest to highest NMSE, then save and show the table.
validationResults = readJsonFile("Validation Results.json")
df = pd.DataFrame(
    validationResults,
    columns=["Feature", "MSE", "NMSE", "R^2", "Hyperparameters"],
).sort_values(by="NMSE", ascending=True)
df.to_csv("Large Gap Validation.csv", index=False)
print(df)

### Feature Null Gap and Imputation Visualization
##### Uncomment the feature you want to visualize

In [None]:
import numpy as np
import matplotlib.patches as mpatches
import matplotlib.dates as mdates

def makeNullRects(dates, y):
    '''This function returns a list of matplotlib.patches.Rectangles where
    np.nan values are present in the y array. Consecutive nan values are
    covered by a single, wider rectangle.
    Note that this function is made for a figure with an x-axis of dates
    Input:
        dates: x axis date time values
        y: y axis range values as np.array, contains np.nan values

    Returns:
        list of matplotlib.patches.Rectangles located where
        y has np.nan values. Each rectangle spans from the last point
        before the gap to the first point after it (clamped to the first /
        last date when the gap touches an end of the series) and covers the
        full non-nan range of y vertically.
        An empty list is returned when y is empty or entirely nan, since
        there is then no y range to draw a rectangle over.

    Rectangle Parameters in function:
        opacityCoeff: how solid rectangles appear
        longRectColor: the color of rectangles spanning more than 5 index steps
        shortRectColor: the color of rectangles spanning 5 or fewer index steps
    '''
    # setting up rectangle parameters
    opacityCoeff = 0.5
    longRectColor = "red"
    shortRectColor = "magenta"

    numPoints = len(y)
    nullMask = np.isnan(np.asarray(y, dtype=float))
    if numPoints == 0 or nullMask.all():
        return []

    # every rectangle covers the full vertical extent of the valid data
    yRectCoor = np.nanmin(y)
    rectHeight = np.nanmax(y) - yRectCoor
    allRects = []   # this is what will be returned

    index = 0
    while index < numPoints:
        if not nullMask[index]:
            index += 1
            continue

        # left edge sits on the last valid point before the gap,
        # or on the first point when the series starts with nan
        xRectCoorIndex = max(index - 1, 0)

        # walking to the first valid point after the gap without running off the end
        index += 1
        while index < numPoints and nullMask[index]:
            index += 1

        # right edge sits on the first valid point after the gap,
        # or on the last point when the series ends with nan
        rightEdgeIndex = min(index, numPoints - 1)

        # making rectangle
        xRectCoor = mdates.date2num(dates[xRectCoorIndex])
        width = mdates.date2num(dates[rightEdgeIndex]) - xRectCoor
        color = longRectColor if index - xRectCoorIndex > 5 else shortRectColor
        allRects.append(mpatches.Rectangle((xRectCoor, yRectCoor), width, rectHeight, color=color, alpha=opacityCoeff))

    return allRects

def visualizeMissingValues(dates, arr, fig, ax, wantToMakeNullRects = True):
    '''Draws arr against its datetime x values on the given axis and,
    unless disabled, overlays the rectangle patches from makeNullRects
    on the stretches where arr is null.

    Input:
        dates: a numpy array of datetime objs that are the x-axis for the array with missing data to plot
        arr: a numpy array that has missing data
        fig: a matplotlib figure that contains the axis with the plot
        ax: a matplotlib axis that will be plotted upon
        wantToMakeNullRects: when False, the null value patches are not drawn

    Returns:
        fig: edited matplotlib figure
        ax: edited matplotlib axis
    '''
    ax.plot(dates, arr)

    # overlaying the null gap patches unless the caller opted out
    nullRects = makeNullRects(dates, arr) if wantToMakeNullRects else []
    for nullRect in nullRects:
        ax.add_patch(nullRect)

    # concise date tick labels, one format string per tick level
    tickFormats = ["%Y", "%Y-%b", "%b-%d", "%d %H:%M", "%d %H:%M", "%H:%M"]
    dateFormatter = mdates.ConciseDateFormatter(ax.xaxis.get_major_locator(), formats=tickFormats)
    dateLocator = mdates.AutoDateLocator()
    ax.xaxis.set_major_formatter(dateFormatter)
    ax.xaxis.set_major_locator(dateLocator)

    fig.autofmt_xdate()
    return fig, ax

def plotImputedData(dates, nullArr, imputedArr, ax):
    '''This function plots imputed data as a green dashed line on a given
    matplotlib axis. One line segment is drawn per gap of consecutive null
    values in nullArr, running from the point just before the gap to the
    point just after it. When a gap touches the start or the end of the
    array, the segment is clamped to the first / last point instead of
    wrapping around or indexing past the end.

    Input:
        dates: a numpy array of datetime objs that are the x-axis for the array with missing data to plot
        nullArr: a numpy array that has missing data
        imputedArr: a numpy array that has some of the missing values imputed
        ax: a matplotlib axis that will be plotted upon
    
    Returns:
        ax: edited matplotlib axis
    '''
    numPoints = len(nullArr)
    index = 0
    while index < numPoints:                                    # looping through nullArr since it has the null values
        if not np.isnan(nullArr[index]):
            index += 1
            continue

        # finding the first non-null index after the gap (or the end of the array)
        gapEnd = index + 1
        while gapEnd < numPoints and np.isnan(nullArr[gapEnd]):
            gapEnd += 1

        # domain to plot is [index-1, gapEnd], clamped to valid indices so that a
        # leading gap does not wrap to dates[-1] and a trailing gap stays in bounds
        domain = range(max(index - 1, 0), min(gapEnd, numPoints - 1) + 1)
        datesToPlot = [dates[i] for i in domain]
        pointsToPlot = [imputedArr[i] for i in domain]
        ax.plot(datesToPlot, pointsToPlot, "g--")       # green dashed line

        # moving index forward past null gap
        index = gapEnd
    return ax


In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
%matplotlib qt

# Feature to visualize: leave exactly one of these lines uncommented.
feature = "SWTP Total Influent Flow"
# feature = "SWTP Plant 2 Influent Flow"
# feature = "SW_Peak_Flow"
# feature = "SWTP Plant 1 Gravity Flow"
# feature = "Total 168 Hour Rainfall Aggregate"
# feature = "Ozark Aquifer Depth to Water Level (ft)"
# feature = "Springfield Plateau Aquifer Depth to Water Level (ft)"
# feature = "James Gauge Height (ft)"
# feature = "Wilsons Gauge Height (ft)"
# feature = "Fire 168 Hour Rainfall Aggregate"
# feature = "HourlyPressureChange"

# Data that still contains the null gaps (alternative sources are commented out).
df = pd.read_csv("Joined Influent and Rainfall and Weather and Groundwater and Creek Gauge.csv", parse_dates=["DateTime"])
# df = pd.read_csv("Small Gap Imputed Data.csv", parse_dates=["DateTime"])
# df = pd.read_csv("Small Gap Imputed Data Editted.csv", parse_dates=["DateTime"])

# total influent flow readings below 3.7 are treated as missing
df["SWTP Total Influent Flow"] = np.where(
    df["SWTP Total Influent Flow"] < 3.7, np.nan, df["SWTP Total Influent Flow"]
)


# Data with the gaps filled in (alternative sources are commented out).
# imputedDf = pd.read_csv("Small Gap Imputed Data.csv")
# imputedDf = pd.read_csv("Small Gap Imputed Data Editted.csv")
# imputedDf = pd.read_csv("test.csv")
# imputedDf = pd.read_csv("Imputed Data.csv")
imputedDf = pd.read_csv("New Imputed Data.csv")


dates = np.array(df["DateTime"])
imputedArr = np.array(imputedDf[feature])
nullArr = df[feature].to_numpy(copy=True)       # independent copy of the gappy series

# gappy series with its null patches, then the imputed values drawn over the gaps
fig, ax = plt.subplots()
fig, ax = visualizeMissingValues(dates, nullArr, fig, ax)
ax = plotImputedData(dates, nullArr, imputedArr, ax)
ax.set_ylabel(feature, fontsize=20)
ax.set_title(f"Missing Data in the {feature}", fontsize=25)
plt.show()