In [None]:
import numpy as np
import matplotlib.patches as mpatches
import matplotlib.dates as mdates

def makeNullRects(dates, y, opacityCoeff=0.5, longRectColor="red",
                  shortRectColor="magenta", longRectThreshold=5):
    '''This function returns a list of matplotlib.patches.Rectangles, one per
    run of consecutive np.nan values in the y array. Each rectangle spans from
    the last non-null point before the run to the first non-null point after it,
    and covers the full [nanmin(y), nanmax(y)] range vertically.
    Note that this function is made for a figure with an x-axis of dates.

    Input:
        dates: x axis date time values (anything mdates.date2num accepts per element)
        y: y axis range values as np.array, contains np.nan values
        opacityCoeff: alpha passed to every rectangle (how solid it appears)
        longRectColor: color used when the rectangle spans more than
            longRectThreshold index steps
        shortRectColor: color used for all other rectangles
        longRectThreshold: index-span threshold separating "long" from "short"

    Returns:
        list of matplotlib.patches.Rectangles located where
        y has np.nan values. Empty list if y is empty or has no np.nan values.
    '''
    y = np.asarray(y, dtype=float)
    numPoints = len(y)
    if numPoints == 0:
        return []

    # every rectangle shares the same vertical extent
    yMin = np.nanmin(y)
    rectHeight = np.nanmax(y) - yMin
    allRects = []   # this is what will be returned

    index = 0
    while index < numPoints:
        if not np.isnan(y[index]):
            index += 1
            continue

        # left edge is the last valid point before the run; when the run starts
        # at index 0 there is no such point, so the first date is used instead
        leftIndex = max(index - 1, 0)

        # walk to the end of the run; the bounds check keeps a run that reaches
        # the end of y from indexing past the array
        while index < numPoints and np.isnan(y[index]):
            index += 1

        # right edge is the first valid point after the run, or the last date
        # when the run extends to the end of the data
        rightIndex = min(index, numPoints - 1)

        leftEdge = mdates.date2num(dates[leftIndex])
        rightEdge = mdates.date2num(dates[rightIndex])
        isLong = rightIndex - leftIndex > longRectThreshold
        allRects.append(mpatches.Rectangle(
            (leftEdge, yMin),
            rightEdge - leftEdge,
            rectHeight,
            color=longRectColor if isLong else shortRectColor,
            alpha=opacityCoeff,
        ))

    return allRects

def visualizeMissingValues(dates, arr, fig, ax):
    '''This function plots an array of values with datetime x axis values onto
    a given axis, showing patches of null values if present.

    Input:
        dates: a numpy array of datetime objs that are the x-axis for the array with missing data to plot
        arr: a numpy array that has missing data
        fig: a matplotlib figure that contains the axis with the plot
        ax: a matplotlib axis that will be plotted upon

    Returns:
        fig: edited matplotlib figure
        ax: edited matplotlib axis
    '''
    ax.plot(dates, arr)

    # shade every run of null values
    for rect in makeNullRects(dates, arr):
        ax.add_patch(rect)

    # Build the locator first and hand the SAME object to the formatter, so the
    # formatter is tied to the locator that is actually installed on the axis.
    locator = mdates.AutoDateLocator()
    formatter = mdates.ConciseDateFormatter(
        locator,
        formats=["%Y", "%Y-%b", "%b-%d", "%d %H:%M", "%d %H:%M", "%H:%M"],
    )
    ax.xaxis.set_major_locator(locator)
    ax.xaxis.set_major_formatter(formatter)

    fig.autofmt_xdate()
    return fig, ax

def plotImputedData(dates, nullArr, imputedArr, ax):
    '''This function plots imputed data as a green dashed line on a given
    matplotlib axis. One line segment is drawn per run of null values in
    nullArr, covering the run plus one neighbouring point on each side
    (where such a neighbour exists) so the segment connects to the real data.

    Input:
        dates: a numpy array of datetime objs that are the x-axis for the array with missing data to plot
        nullArr: a numpy array that has missing data
        imputedArr: a numpy array that has some of the missing values imputed
        ax: a matplotlib axis that will be plotted upon

    Returns:
        ax: edited matplotlib axis
    '''
    total = len(nullArr)
    index = 0
    while index < total:                    # nullArr tells us where the gaps are
        if not np.isnan(nullArr[index]):
            index += 1
            continue

        # find the first position after the null run (bounded by the array end)
        gapEnd = index
        while gapEnd < total and np.isnan(nullArr[gapEnd]):
            gapEnd += 1

        # domain to plot is [index-1, gapEnd], clipped to valid positions so a
        # gap at the start does not wrap around to the last element and a gap
        # at the end does not index past the array
        first = max(index - 1, 0)
        last = min(gapEnd, total - 1)
        domain = range(first, last + 1)
        datesToPlot = [dates[i] for i in domain]
        pointsToPlot = [imputedArr[i] for i in domain]
        ax.plot(datesToPlot, pointsToPlot, "g--")       # green dashed line

        # moving index forward past null gap
        index = gapEnd
    return ax


In [None]:
import numpy as np
import pandas as pd

def getImputationData(nullFilename, targetFeature, correlatedFeatures):
    '''For a given feature and set of correlated features,
    this function returns a set of training data and target values
    and testing data.
    The training data is every row with no null feature values whose target
    is non-null; the matching targets are the training target.
    The testing data is every row with no null feature values whose target
    is null.
    Rows that contain a null feature value are left out of both sets, and
    the returned dates are filtered the same way so they stay aligned with
    the rows of the testing data.

    Input:
        nullFilename: path (or file-like object) of the csv to read
        targetFeature: name of the column to impute
        correlatedFeatures: columns to read from the csv; must include
            targetFeature and "DateTime" because it is passed to usecols

    Returns:
        trainData: 2D numpy array of feature rows with a known target
        trainTarget: 1D numpy array of the known targets, one per trainData row
        testData: 2D numpy array of feature rows whose target is null
        testDates: list of "DateTime" values, one per testData row
    '''
    df = pd.read_csv(nullFilename, usecols=correlatedFeatures)   # selected areas that seem believable and kept them from large gap imputation
    target = df[targetFeature].to_numpy(dtype=float)
    dates = df["DateTime"].to_numpy()
    data = df.drop(columns=[targetFeature, "DateTime"]).to_numpy(dtype=float)

    targetIsNull = np.isnan(target)
    rowHasNull = np.isnan(data).any(axis=1)     # True where any feature in the row is null

    # boolean masks keep data, targets and dates aligned and also work when
    # either set ends up empty
    trainMask = ~targetIsNull & ~rowHasNull
    testMask = targetIsNull & ~rowHasNull

    trainData = data[trainMask]
    trainTarget = target[trainMask]
    testData = data[testMask]
    testDates = list(dates[testMask])

    return trainData, trainTarget, testData, testDates

def joinImputedData(imputedFilename, nullFilename, targetFeature, filename="test.csv"):
    '''This function merges imputed values for a single feature back into the
    original dataset and writes the result to a csv.

    Both csvs are matched on their "DateTime" column. The imputed values are
    taken from the LAST column of the imputed csv, and every row of the
    original dataset whose DateTime equals an imputed DateTime has its
    targetFeature value overwritten.

    Input:
        imputedFilename: csv holding a "DateTime" column and the imputed values
        nullFilename: csv of the original dataset (with null values)
        targetFeature: name of the column in the original dataset to fill in
        filename: where the joined dataset is written (default "test.csv")
    '''
    imputedFrame = pd.read_csv(imputedFilename, parse_dates=["DateTime"])
    originalFrame = pd.read_csv(nullFilename, parse_dates=["DateTime"])

    lastColumn = imputedFrame.columns[-1]
    imputedValues = np.array(imputedFrame[lastColumn])
    imputedStamps = np.array(imputedFrame["DateTime"])
    mergedValues = np.array(originalFrame[targetFeature])
    originalStamps = np.array(originalFrame["DateTime"])

    # overwrite every original row that shares a timestamp with an imputed row
    for stamp, value in zip(imputedStamps, imputedValues):
        mergedValues[originalStamps == stamp] = value

    originalFrame[targetFeature] = mergedValues
    originalFrame.to_csv(filename, index=False)

def normalize1D(arr):
    '''This function min-max scales a 1D array into the range [0, 1].

    Input:
        arr: 1D array-like of numbers

    Returns:
        a new float numpy array with (arr - min) / (max - min).
        A constant array (max == min) maps to all zeros instead of
        dividing by zero, and an empty array is returned as an empty array.
    '''
    arr = np.asarray(arr, dtype=float)
    if arr.size == 0:
        return arr.copy()

    low = np.min(arr)
    span = np.max(arr) - low
    if span == 0:
        # every value is identical, so there is no range to scale by
        return np.zeros_like(arr)
    return (arr - low) / span

def normalize2D(data):
    '''This function min-max scales every column of a 2D array into [0, 1].

    Input:
        data: 2D array-like of numbers, one feature per column

    Returns:
        a new float numpy array of the same shape. The input is not modified,
        so integer input is not truncated. A constant column maps to all zeros
        instead of dividing by zero, and an empty array is returned unchanged
        (as a float copy).
    '''
    scaled = np.array(data, dtype=float)    # float copy: never write into the caller's array
    if scaled.size == 0:
        return scaled

    low = scaled.min(axis=0)
    span = scaled.max(axis=0) - low
    span = np.where(span == 0, 1.0, span)   # constant columns: avoid 0/0, result is 0
    return (scaled - low) / span

def scaleImputedData(trainData, trainTarget, testData):
    '''This function min-max scales the training data, training target and
    testing data, each one independently of the others.

    Input:
        trainData: 2D numpy array of training features
        trainTarget: 1D numpy array of training targets
        testData: 2D numpy array of testing features

    Returns:
        scaled trainData, scaled trainTarget, scaled testData (in that order)
    '''
    # NOTE(review): testData is scaled with its own min/max rather than the
    # training set's - confirm this is intended before fitting a model on it.
    scaledTrain = normalize2D(trainData)
    scaledTarget = normalize1D(trainTarget)
    scaledTest = normalize2D(testData)
    return scaledTrain, scaledTarget, scaledTest


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge

# Column being imputed, and every column read from the csv.
# correlatedFeatures must contain the target and "DateTime" as well, because
# getImputationData passes the list straight to pd.read_csv(usecols=...).
targetFeature = "Ozark Aquifer Depth to Water Level (ft)"
correlatedFeatures = [
    targetFeature,
    "Hour",
    "Month",
    "Year",
    "Week",
    "DateTime",
    "Sequiota 168 Hour Rainfall Aggregate",
    "SWTP Total Influent Flow",
    "James Gauge Height (ft)",
    "Wilsons Gauge Height (ft)",
]

imputedFilename = "predicted groundwater test.csv"
# nullFilename = "Small Gap Imputed Data Editted.csv"
nullFilename = "Small Gap Imputed Data.csv"

# fixed seed so re-running the cell reproduces the same predictions
RANDOM_SEED = 42

trainData, trainTarget, testData, testDates = getImputationData(nullFilename, targetFeature, correlatedFeatures)
imputer = RandomForestRegressor(min_samples_split=5, random_state=RANDOM_SEED)
# imputer = DecisionTreeRegressor(min_samples_split=5)
imputer.fit(trainData, trainTarget)
predictedValues = imputer.predict(testData)


# alternative: linear models on normalized data
# normTrainData, normTrainTarget, normTestData = scaleImputedData(trainData, trainTarget, testData)
# # imputer = LinearRegression()
# imputer = Lasso()
# # imputer = Ridge()
# imputer.fit(trainData, trainTarget)
# predictedValues = imputer.predict(testData)


# every prediction needs exactly one timestamp to be joined back on
assert len(testDates) == len(predictedValues), "testDates and predictedValues are misaligned"

# build the frame column by column so predictions stay numeric instead of
# being coerced to strings by stacking them with the dates in one array
newDf = pd.DataFrame({
    "DateTime": np.array(testDates),
    "Predicted Ozark Groundwater Depth (ft)": predictedValues,
})
newDf.to_csv(imputedFilename, index=False)

# writes the original dataset with the predictions filled in to "test.csv"
joinImputedData(imputedFilename, nullFilename, targetFeature)

In [165]:
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline

# Settings for each large null gap; all were hand chosen by trial and error.
#   p1 / p2   : (row index, depth) points the straight line is drawn between
#   scaleLoc / scaleStd : mean and st dev of the factor that pulls the line
#                         toward the random forest prediction
#   adjustStd : st dev of the zero-mean noise added on every 6th row
#   n         : every n-th shifted point is kept as a knot for the cubic spline
LARGE_GAP_SETTINGS = {
    1: {"p1": (25806, 199.75),   "p2": (26487, 189.66),   "scaleLoc": 0.1,   "scaleStd": 0.04, "adjustStd": 0.25,  "n": 10},
    2: {"p1": (27465, 181.0573), "p2": (27584, 182.5138), "scaleLoc": 0.02,  "scaleStd": 0.04, "adjustStd": 0.04,  "n": 6},
    3: {"p1": (27746, 181.1897), "p2": (27781, 179.1884), "scaleLoc": 0.05,  "scaleStd": 0.04, "adjustStd": 0.2,   "n": 2},
    4: {"p1": (27895, 179.9647), "p2": (27984, 177.9796), "scaleLoc": 0.04,  "scaleStd": 0.04, "adjustStd": 0.15,  "n": 3},
    5: {"p1": (28040, 178.8463), "p2": (28118, 173.26),   "scaleLoc": 0.038, "scaleStd": 0.03, "adjustStd": 0.125, "n": 2},
}
GAP_NUMBER = 5      # which large gap this run fills (fifth and last by default)
NOISE_SEED = 0      # fixed seed so the generated noise is reproducible

gapSettings = LARGE_GAP_SETTINGS[GAP_NUMBER]
p1 = gapSettings["p1"]
p2 = gapSettings["p2"]
n = gapSettings["n"]
depthColumn = "Ozark Aquifer Depth to Water Level (ft)"

# create a linear line from start of null gap to end
x = np.arange(p1[0], p2[0])         # row indices p1[0], ..., p2[0] - 1
m = (p2[1] - p1[1]) / (p2[0] - p1[0])
b = p2[1] - m * p2[0]
xLinear = m * x + b

# scaling and adjustment factors, depends on which gap as differences might be larger or smaller
rng = np.random.default_rng(NOISE_SEED)
scale = rng.normal(loc=gapSettings["scaleLoc"], scale=gapSettings["scaleStd"], size=np.shape(xLinear))
adjust = rng.normal(loc=0, scale=gapSettings["adjustStd"], size=np.shape(xLinear))

# get original data with large null gaps and then the random forest predictions for those gaps
# originDf = pd.read_csv("Small Gap Imputed Data.csv", usecols=[depthColumn, "DateTime"])
originDf = pd.read_csv("Small Gap Imputed Data Editted.csv", usecols=[depthColumn, "DateTime"])
predDf = pd.read_csv("test.csv", usecols=[depthColumn])
depths = np.array(originDf[depthColumn])
predDepths = np.array(predDf[depthColumn])

# linear line shifted toward the prediction value; gaussian noise is added
# only on rows whose index is a multiple of 6
newDepths = xLinear + scale * (predDepths[x] - xLinear)
newDepths = np.where(x % 6 == 0, newDepths + adjust, newDepths)

# since the new depth values oscillate a lot, we are going to smooth it out
# take every n-th point and then cubic spline interpolate using those few points
# the spline is pinned to the existing depths at rows p1[0] - 1 and p2[0] - 1
isKnot = (x % n == 0) & (x != p2[0] - 1)
interpX = [p1[0] - 1] + list(x[isKnot]) + [p2[0] - 1]
interpY = [depths[p1[0] - 1]] + list(newDepths[isKnot]) + [depths[p2[0] - 1]]

cs = CubicSpline(interpX, interpY)
indicies = np.arange(p1[0] - 1, p2[0])
newDepths = cs(indicies)
depths[indicies] = newDepths        # replacing values with cubic spline values

# save the smoothed depths, then join them onto "test.csv" and write the result to "test2.csv"
# NOTE(review): joinImputedData reads the values from the LAST column of
# "linear line test.csv" - confirm the depth column comes after "DateTime" in the source file
originDf[depthColumn] = depths
originDf.to_csv("linear line test.csv")
joinImputedData("linear line test.csv", "test.csv", depthColumn, "test2.csv")

# after this, pick out the wanted data by hand and put it in a master file

In [166]:
from copy import deepcopy
import matplotlib.pyplot as plt
%matplotlib qt

feature = "Ozark Aquifer Depth to Water Level (ft)"

# original data (with null gaps) supplies the dates and the gap locations
# df = pd.read_csv("Small Gap Imputed Data Editted.csv", parse_dates=["DateTime"])
df = pd.read_csv("Small Gap Imputed Data.csv", parse_dates=["DateTime"])
nullArr = np.array(df[feature])     # np.array already returns an independent copy
dates = np.array(df["DateTime"])

# imputed data supplies the values drawn across each gap
imputedDf = pd.read_csv("Small Gap Imputed Data Editted.csv")
# imputedDf = pd.read_csv("test.csv")
# imputedDf = pd.read_csv("test2.csv")
imputedArr = np.array(imputedDf[feature])

# plotImputedData indexes imputedArr with positions taken from nullArr,
# so both files must have the same number of rows
assert len(imputedArr) == len(nullArr), "original and imputed files have different row counts"

fig, ax = plt.subplots()
fig, ax = visualizeMissingValues(dates, nullArr, fig, ax)
ax = plotImputedData(dates, nullArr, imputedArr, ax)
# ax.scatter(testDf["DateTime"], testDf[testDf.columns[-1]], s=8, color="red", marker="x")

ax.set_xlabel("DateTime")
ax.set_ylabel(feature)
ax.set_title("Missing values and imputed data")

plt.show()