In [4]:
import numpy as np
import matplotlib.patches as mpatches
import matplotlib.dates as mdates

def makeNullRects(dates, y, opacityCoeff=0.5, longRectColor="red", shortRectColor="magenta"):
    '''Build one matplotlib.patches.Rectangle for every run of np.nan values in y.

    Each rectangle starts at the last sample before the run (or at the first
    sample when the run starts the array) and ends at the first sample after
    the run (or at the last sample when the run reaches the end of the array).
    Vertically it spans from the smallest to the largest non-nan value of y.
    Note that this function is made for a figure with an x-axis of dates.

    Input:
        dates: x axis date time values, same length as y
        y: y axis range values as np.array, may contain np.nan values
        opacityCoeff: how solid the rectangles appear (alpha)
        longRectColor: color used when the rectangle spans more than 5
            sample intervals (right edge index - left edge index > 5)
        shortRectColor: color used for all other rectangles

    Returns:
        list of matplotlib.patches.Rectangles located where y has np.nan
        values. The list is empty when y is empty, has no np.nan values, or
        consists only of np.nan values (no height can be derived then).
    '''
    numPoints = len(y)
    if numPoints == 0 or np.all(np.isnan(y)):
        return []

    # every rectangle shares the same vertical extent
    yMin = np.nanmin(y)
    rectHeight = np.nanmax(y) - yMin
    allRects = []   # this is what will be returned

    index = 0
    while index < numPoints:
        if not np.isnan(y[index]):
            index += 1
            continue

        # left edge sits on the sample before the run, clamped to the first sample
        leftIndex = max(index - 1, 0)

        # walk to the end of the run without stepping past the end of the array
        while index < numPoints and np.isnan(y[index]):
            index += 1

        # right edge sits on the sample after the run, clamped to the last sample
        rightIndex = min(index, numPoints - 1)

        xLeft = mdates.date2num(dates[leftIndex])
        width = mdates.date2num(dates[rightIndex]) - xLeft
        color = longRectColor if rightIndex - leftIndex > 5 else shortRectColor
        allRects.append(mpatches.Rectangle((xLeft, yMin), width, rectHeight, color=color, alpha=opacityCoeff))

    return allRects

def visualizeMissingValues(dates, arr, fig, ax, wantToMakeNullRects = True):
    '''This function plots an array of values with datetime x axis values onto
    a given axis, showing patches of null values if present.

    Input:
        dates: a numpy array of datetime objs that are the x-axis for the array with missing data to plot
        arr: a numpy array that has missing data
        fig: a matplotlib figure that contains the axis with the plot
        ax: a matplotlib axis that will be plotted upon
        wantToMakeNullRects: when True (default) the rectangles returned by
            makeNullRects(dates, arr) are added to ax as patches

    Returns:
        fig: edited matplotlib figure
        ax: edited matplotlib axis
    '''
    ax.plot(dates, arr)

    if wantToMakeNullRects:
        for rect in makeNullRects(dates, arr):
            ax.add_patch(rect)

    # Create the locator first so the formatter is built from the very locator
    # that gets installed on the axis (not from whatever locator the axis had before).
    locator = mdates.AutoDateLocator()
    formatter = mdates.ConciseDateFormatter(locator, formats=["%Y", "%Y-%b", "%b-%d", "%d %H:%M", "%d %H:%M", "%H:%M"])
    ax.xaxis.set_major_locator(locator)
    ax.xaxis.set_major_formatter(formatter)

    fig.autofmt_xdate()
    return fig, ax

def plotImputedData(dates, nullArr, imputedArr, ax):
    '''This function plots imputed data as a green dashed line on a given
    matplotlib axis.

    For every run of np.nan values in nullArr, the matching stretch of
    imputedArr is drawn from the sample just before the run to the sample just
    after it. When the run touches the start or the end of the array the
    stretch is clamped to the array bounds.

    Input:
        dates: a numpy array of datetime objs that are the x-axis for the array with missing data to plot
        nullArr: a numpy array that has missing data
        imputedArr: a numpy array that has some of the missing values imputed
        ax: a matplotlib axis that will be plotted upon

    Returns:
        ax: edited matplotlib axis
    '''
    numPoints = len(nullArr)
    index = 0
    while index < numPoints:                                    # nullArr tells us where the gaps are
        if not np.isnan(nullArr[index]):
            index += 1
            continue

        # find the first index after the null run, never reading past the end
        gapEnd = index
        while gapEnd < numPoints and np.isnan(nullArr[gapEnd]):
            gapEnd += 1

        # domain to plot is [index-1, gapEnd], clamped so that a gap at the very
        # start does not wrap around to the last element and a gap at the very
        # end does not index out of range
        firstIndex = max(index - 1, 0)
        lastIndex = min(gapEnd, numPoints - 1)
        domain = range(firstIndex, lastIndex + 1)
        datesToPlot = [dates[i] for i in domain]
        pointsToPlot = [imputedArr[i] for i in domain]
        ax.plot(datesToPlot, pointsToPlot, "g--")               # green dashed line

        # moving index forward past null gap
        index = gapEnd
    return ax


In [5]:
import numpy as np

def findNulls(arr):
    '''Locate every run of consecutive np.nan values in arr.

    Input:
        arr: a sequence of numeric values that may contain np.nan

    Returns:
        list of (start index, num values) tuples, one per run of np.nan
        values, in order of appearance. A run that reaches the end of the
        array is reported as well.
    '''
    pairs = []                  # formatted like [(start index, num values)]
    total = len(arr)
    index = 0
    while index < total:
        if np.isnan(arr[index]):
            # extend the run while still inside the array and still null
            width = 1
            while index + width < total and np.isnan(arr[index + width]):
                width += 1
            pairs.append((index, width))
            index += width
        else:
            index += 1

    return pairs


In [4]:
from scipy.interpolate import CubicSpline
from scipy.interpolate import interp1d

def imputeArrValues(index, arr, width):
    '''Helper function for imputeArr, which is used inside smallGapImputation.

    Tries to fill the null gap arr[index : index + width] in place by fitting
    an interpolant through the valid values directly before and after it.

    Input:
        index: position of the first np.nan of the gap
        arr: numpy array holding the data, modified in place
        width: number of consecutive np.nan values in the gap; arr[index + width]
            is expected to be the first valid value after the gap

    Returns:
        arr: the same array, with the gap filled when interpolation was possible
        smallGapFailed: True when the gap is small enough (width < 5) but there
            were too few valid neighbours to interpolate; False otherwise

    The gap is left untouched (and smallGapFailed stays False) when width >= 5,
    when fewer than two values follow the gap, or when no valid value directly
    precedes the gap, since filling it would be extrapolation.
    '''
    smallGapFailed = False      # small enough gap, but not enough data on either side to fill it
    arrLen = len(arr)

    if width >= 5 or index + width + 1 >= arrLen:
        return arr, smallGapFailed

    # Count valid values after the gap (at most 10). arr[index + width] is the
    # first of them, so counting starts at 1. Never read past the end of the array.
    lenForwards = 1
    while (lenForwards < 10
           and index + width + lenForwards < arrLen
           and not np.isnan(arr[index + width + lenForwards])):
        lenForwards += 1

    # lenBackwards ends up as 1 + the number of valid values before the gap
    # (at most 10). Stop at the start of the array so negative indices do not
    # wrap around to the end of the data.
    lenBackwards = 1
    while (lenBackwards < 10
           and index - lenBackwards >= 0
           and not np.isnan(arr[index - lenBackwards])):
        lenBackwards += 1

    if lenBackwards < 2:
        # nothing valid directly before the gap: interpolation is impossible
        return arr, smallGapFailed

    # getting values set up for imputation
    gapIndices = np.arange(index, index + width)
    x = list(range(index - lenBackwards + 1, index)) + list(range(index + width, index + width + lenForwards))   # impution data points
    y = [arr[i] for i in x]                                                                                          # function values of impution data points

    # cubic spline when both sides have plenty of data, or when one side has
    # plenty and the other has at least a couple of points,
    # e.g. [x1, x2, x3, x4, x5, x6, nan, nan, nan, x7, x8, x9]
    enoughForSpline = (lenForwards > 5 and lenBackwards > 2) or (lenForwards > 2 and lenBackwards > 5)

    if enoughForSpline:
        arr[index:index + width] = CubicSpline(x, y)(gapIndices)
    elif width < 3:
        # not enough neighbours for a spline, but the gap is tiny: linear impution
        arr[index:index + width] = interp1d(x, y)(gapIndices)
    else:
        smallGapFailed = True

    return arr, smallGapFailed

def imputeArr(arr):
    '''Helper function for smallGapImputation.

    Scans arr from left to right; every run of consecutive np.nan values is
    handed to imputeArrValues together with its start position and length.

    Returns:
        arr: the array as returned by the last imputeArrValues call (or the
            input itself when no nulls were found)
        reimputeColumn: True if imputeArrValues reported a failed small gap
            for at least one run
    '''
    anyGapFailed = False
    pos = 0
    while pos < len(arr):
        if not np.isnan(arr[pos]):
            # regular value, keep scanning
            pos += 1
            continue

        # measure the run of nulls that starts here, staying inside the array
        gapEnd = pos + 1
        while gapEnd < len(arr) and np.isnan(arr[gapEnd]):
            gapEnd += 1
        gapWidth = gapEnd - pos

        # the actual imputation is delegated to the helper
        arr, failed = imputeArrValues(pos, arr, gapWidth)
        if failed:
            anyGapFailed = True

        # jump past the gap and continue searching
        pos += gapWidth

    return arr, anyGapFailed

def smallGapImputation(df):
    '''Fill small null gaps in every column of df except the first one.

    Each column after the first is converted to a numpy array and passed to
    imputeArr. As long as imputeArr reports that a column should be
    re-imputed, it is run again, for at most 5 passes per column. The column
    in df is then overwritten with the resulting array.

    The gap-size limits and the choice between cubic spline and linear
    impution are made in imputeArrValues (gaps shorter than 5 values, linear
    only for gaps shorter than 3).

    Input: pandas dataframe whose columns have null values, first column is timestamp
    Returns: the same df object, with the imputed columns written back
    '''
    maxPasses = 5
    for columnName in df.columns[1:]:
        # work on a plain array copy of the column
        values = np.array(df[columnName])

        # repeat while the helper asks for another pass, up to maxPasses calls
        for _ in range(maxPasses):
            values, needsAnotherPass = imputeArr(values)
            if not needsAnotherPass:
                break

        # write the (partially) imputed values back into the frame
        df[columnName] = values

    return df


In [None]:
import numpy as np
import pandas as pd

def imputeSmallGaps(inputPath="Joined Influent and Rainfall and Weather and Groundwater and Creek Gauge.csv",
                    outputPath="Small Gap Imputed Data.csv",
                    minInfluentFlow=3.7):
    '''Read the combined data set, impute its small null gaps, add calendar
    columns and save the result as a csv file.

    Input:
        inputPath: csv file to read; must contain a "DateTime" column and a
            "SWTP Total Influent Flow" column
        outputPath: csv file the imputed data is written to (without index)
        minInfluentFlow: "SWTP Total Influent Flow" values below this
            threshold are replaced with np.nan before imputation

    Returns:
        None
    '''
    # get original combined data with all null values
    df = pd.read_csv(inputPath, parse_dates=["DateTime"])

    # influent flow readings below the threshold are treated as missing
    flow = df["SWTP Total Influent Flow"]
    df["SWTP Total Influent Flow"] = flow.mask(flow < minInfluentFlow)

    # imputing all small gaps with cubic splines and linear lines, gaps of size < 5
    df = smallGapImputation(df)

    # adding year, month, week day, hour and week columns
    timestamps = df["DateTime"].dt
    df["Year"] = timestamps.year
    df["Month"] = timestamps.month
    df["Week Day"] = timestamps.dayofweek
    df["Hour"] = timestamps.hour
    # Series.dt.week was removed in pandas 2.0; isocalendar().week gives the same ISO week number
    df["Week"] = timestamps.isocalendar().week

    # saving imputed data
    df.to_csv(outputPath, index=False)

# Running this cell executes the whole pipeline: it reads the joined csv and writes "Small Gap Imputed Data.csv".
imputeSmallGaps()

In [None]:
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn.metrics import r2_score
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib qt

def createIndicies(index, gapSize):
    '''Return the list of gapSize consecutive indices placed around index.

    For an odd gapSize the indices are centred on index. For an even gapSize
    there is one more index after index than before it,
    e.g. createIndicies(10, 4) -> [9, 10, 11, 12].
    '''
    half = int(gapSize / 2)
    lastIndex = index + half
    if gapSize % 2 != 0:
        firstIndex = index - half
    else:
        # even sizes cannot be centred: drop one index on the left
        firstIndex = index - half + 1
    return list(range(firstIndex, lastIndex + 1))

def testSmallSpot(arr, index, length):
    if len(arr) - index > 25 and index > 25:    # not in last 25 or in first 25 indicies
        # tests if 5 vals before and after index are null
        # also if index is null
        for i in range(length):
            if np.isnan(arr[index + i]):
                return False          
            if np.isnan(arr[index - i]):
                return False
        return True                         # only if all are not null will this be hit
    return False

def testSmallGap(feature, count = 5, smallGapsPerTest = 30):
    '''Estimate how well smallGapImputation restores values of one feature.

    Artificial null gaps of width 1 to 4 are punched into the feature at
    random valid spots, the column is imputed, and the imputed values are
    compared with the original ones via r2_score. This is repeated count
    times and the mean r2 is returned.

    Input:
        feature: name of the csv column to validate
        count: number of validation rounds that are averaged
        smallGapsPerTest: number of artificial gaps created per round

    Returns:
        the average r2 score over all rounds

    Raises:
        NotImplementedError: if more than 5000000 random draws (counted over
            all rounds) were made without placing all requested gaps

    NOTE(review): the array holding the artificial gaps is created once, before
    the rounds start, so gaps from earlier rounds are still present in later
    rounds; only the gaps of the current round are scored. Confirm this is intended.
    '''
    # getting data
    frame = pd.read_csv("Joined Influent and Rainfall and Weather and Groundwater and Creek Gauge.csv", 
        usecols = ["DateTime", feature])

    # the influent flow feature gets its suspicious low readings removed first
    if feature == "SWTP Total Influent Flow":
        truth = np.array([np.nan if x < 3.7 else x for x in frame["SWTP Total Influent Flow"]])
    else:
        truth = np.array(frame[feature])
    masked = deepcopy(truth)                    # artificial nulls are written into this copy

    r2Sum = 0
    draws = 0                                   # guards against an endless search for valid spots
    for _ in range(count):

        gapsMade = 0                            # gaps created in this round
        heldOut = []                            # every index forced to null in this round

        while gapsMade < smallGapsPerTest:
            candidate = np.random.randint(0, len(truth))

            # only spots surrounded by real data are used
            if testSmallSpot(masked, candidate, 7):
                gapsMade += 1
                gapWidth = np.random.randint(1, 5)      # either 1, 2, 3, or 4

                # blank out the values around the candidate
                for gapIndex in createIndicies(candidate, gapWidth):
                    masked[gapIndex] = np.nan
                    heldOut.append(gapIndex)

            if draws > 5000000:                 # just some large number
                raise NotImplementedError("Failed to create all small null gaps")

            draws += 1

        # impute the column containing the artificial gaps
        frame[feature] = masked
        frame = smallGapImputation(frame)
        restored = frame[feature]

        # score imputed values against the values that were removed
        expected = [truth[k] for k in heldOut]
        predicted = [restored[k] for k in heldOut]
        r2Sum += r2_score(expected, predicted)

    return r2Sum / count

def performSmallGapValidation(featDf):
    '''Run testSmallGap for every entry of the "Feature" column of featDf.

    Each feature name is printed before it is tested, and the list of average
    r2 values is printed at the end. The scores are not stored in featDf.

    Input:
        featDf: pandas dataframe with a "Feature" column of feature names

    Returns:
        featDf, unchanged
    '''
    r2PerFeature = []
    for featureName in np.array(featDf["Feature"]):
        print(featureName)
        # 10 validation rounds per feature, 100 artificial gaps per round
        score = testSmallGap(featureName, 10, 100)
        r2PerFeature.append(score)
    print(r2PerFeature)

    return featDf

# Validation run, currently disabled: reads the feature list and runs
# performSmallGapValidation on it.
# featDf = pd.read_csv("Features.csv")
# featDf = performSmallGapValidation(featDf)
# print(featDf)

# Bar chart of the "Avg R2" column per "Feature", read from a csv file.
# NOTE(review): no code in this notebook writes "Filtered Validated Features.csv";
# presumably it was produced outside of it -- confirm.
# df = pd.read_csv("Validated Features.csv")
df = pd.read_csv("Filtered Validated Features.csv")
df.plot.bar(x="Feature", y="Avg R2", rot=37)
# plt.bar([i for i in range(len(df["Feature"]))], df["Avg R2"])
plt.show()

In [6]:
import pandas as pd
import numpy as np
from dcor import distance_correlation

def getCorrelationPerFeature(df, targetFeature):
    '''Rank every feature of df by its distance correlation with targetFeature.

    For each column other than "DateTime" and targetFeature, the rows where
    either the column or the target is np.nan are dropped, and
    distance_correlation is computed on the remaining pairs.

    Input:
        df: pandas dataframe with numeric feature columns
        targetFeature: name of the column all other columns are compared with

    Returns:
        list of (column name, correlation value) tuples, sorted from the
        highest to the lowest correlation value
    '''
    targetValues = np.array(df[targetFeature])
    targetMissing = np.isnan(targetValues)

    scored = []
    for columnName in df.columns:
        if columnName in ["DateTime", targetFeature]:
            continue

        # keep only the rows where both the target and this column have data
        columnValues = np.array(df[columnName])
        usable = ~(targetMissing | np.isnan(columnValues))

        score = distance_correlation(targetValues[usable], columnValues[usable])
        scored.append((columnName, score))

    # ascending sort followed by a reversal, so the best feature comes first
    ranked = sorted(scored, key=lambda pair: pair[1])
    ranked.reverse()

    return ranked

# Pick the target column; the alternative target is kept commented out for quick switching.
# targetFeature = "Ozark Aquifer Depth to Water Level (ft)"
targetFeature = "Springfield Plateau Aquifer Depth to Water Level (ft)"
# NOTE(review): "Imputed Data.csv" is not written by any code in this notebook
# (imputeSmallGaps writes "Small Gap Imputed Data.csv"); presumably it comes from a later
# imputation step -- confirm.
# df = pd.read_csv("Small Gap Imputed Data.csv")
df = pd.read_csv("Imputed Data.csv")
# List of (feature, distance correlation with targetFeature), highest correlation first.
correlationList = getCorrelationPerFeature(df, targetFeature)

In [11]:
'''
ozarkCorrelationList = [('Springfield Plateau Aquifer Depth to Water Level (ft)', 0.6307472041607479), ('Week', 0.5765419244542733), ('Month', 0.5726279226642098), ('James Gauge Height (ft)', 0.5636879927117522), ('SWTP Total Influent Flow', 0.533093275904417), ('SWTP Plant 2 Influent Flow', 0.5205061617966512), ('Wilsons Gauge Height (ft)', 0.43951761698951675), ('SWTP Plant 1 Influent Flow', 0.3815965878440867), ('Sequiota 168 Hour Rainfall Aggregate', 0.3790800777831508), ('Sequiota 144 Hour Rainfall Aggregate', 0.35859381995089634), ('Sequiota 120 Hour Rainfall Aggregate', 0.3361124951332317), ('Year', 0.33475684589322774), ('Sequiota 96 Hour Rainfall Aggregate', 0.30806467669381055), ('Republic 168 Hour Rainfall Aggregate', 0.2985548939998324), ('AT&T 168 Hour Rainfall Aggregate', 0.29750850751625374), ('Hiland 168 Hour Rainfall Aggregate', 0.29675139217766533), ('Field 168 Hour Rainfall Aggregate', 0.2924034951313501), ('Willard 168 Hour Rainfall Aggregate', 0.29100906059925974), ('NW 168 Hour Rainfall Aggregate', 0.2855386912527847), ('Pittman 168 Hour Rainfall Aggregate', 0.2831864693005713), ('Hiland 144 Hour Rainfall Aggregate', 0.2823723855497944), ('Westport 168 Hour Rainfall Aggregate', 0.2823710115408092), ('Republic 144 Hour Rainfall Aggregate', 0.28043329631822683), ('AT&T 144 Hour Rainfall Aggregate', 0.2795171504344931), ('Waste 168 Hour Rainfall Aggregate', 0.27939688488900466), ('Pleasant 168 Hour Rainfall Aggregate', 0.2785139833307677), ('Weller 168 Hour Rainfall Aggregate', 0.27460603981370146), ('Field 144 Hour Rainfall Aggregate', 0.27458970133859273), ('Willard 144 Hour Rainfall Aggregate', 0.27445281575912894), ('Millwood 168 Hour Rainfall Aggregate', 0.2744424049919476), ('Sequiota 72 Hour Rainfall Aggregate', 0.2721167731721901), ('Rutledge 168 Hour Rainfall Aggregate', 0.2707097493453225), ('Fire 168 Hour Rainfall Aggregate', 0.2702133951630731), ('Sherwood 168 Hour Rainfall Aggregate', 0.26970477387268893), ('NW 144 Hour Rainfall 
Aggregate', 0.26932598698918603), ('Strafford 168 Hour Rainfall Aggregate', 0.26896867391737117), ('Disney 168 Hour Rainfall Aggregate', 0.2686604900200778), ('James 168 Hour Rainfall Aggregate', 0.26831648686722814), ('Bingham 168 Hour Rainfall Aggregate', 0.26803556971653875), ('Hiland 120 Hour Rainfall Aggregate', 0.2662906398387919), ('Pittman 144 Hour Rainfall Aggregate', 0.26579546912939356), ('Westport 144 Hour Rainfall Aggregate', 0.2653438267516973), ('Total 168 Hour Rainfall Aggregate', 0.2651211732477124), ('Willard_Intermediate 168 Hour Rainfall Aggregate', 0.2646803359843156), ('Sunshine 168 Hour Rainfall Aggregate', 0.26411604203806427), ('Valley 168 Hour Rainfall Aggregate', 0.2640368782184995), ('Pleasant 144 Hour Rainfall Aggregate', 0.2628851990562218), ('Le 168 Hour Rainfall Aggregate', 0.2626605734648382), ('Waste 144 Hour Rainfall Aggregate', 0.2626302529722998), ('Jefferies 168 Hour Rainfall Aggregate', 0.26186429498121505), ('Republic 120 Hour Rainfall Aggregate', 0.2601934265389541), ('AT&T 120 Hour Rainfall Aggregate', 0.26014802067271886), ('Roundtree 168 Hour Rainfall Aggregate', 0.2599020729344486), ('Millwood 144 Hour Rainfall Aggregate', 0.2580556435308486), ('Weller 144 Hour Rainfall Aggregate', 0.25783907998665784), ('Willard 120 Hour Rainfall Aggregate', 0.25663979900923933), ('Rutledge 144 Hour Rainfall Aggregate', 0.25599928879469974), ('Field 120 Hour Rainfall Aggregate', 0.2553522837563216), ('Strafford 144 Hour Rainfall Aggregate', 0.25500714120086543), ('James 144 Hour Rainfall Aggregate', 0.2541510307111001), ('Fire 144 Hour Rainfall Aggregate', 0.25404192724171476), ('Sherwood 144 Hour Rainfall Aggregate', 0.2539436667284397), 
('Disney 144 Hour Rainfall Aggregate', 0.2530527935941957), ('Bingham 144 Hour Rainfall Aggregate', 0.25210047154875603), ('NW 120 Hour Rainfall Aggregate', 0.25176045049815754), ('Total 144 Hour Rainfall Aggregate', 0.2517075864653168), ('Valley 144 Hour Rainfall Aggregate', 0.24958346865522912), ('Willard_Intermediate 144 Hour Rainfall Aggregate', 0.2490556283200971), ('Sunshine 144 Hour Rainfall Aggregate', 0.24852355557803882), ('Le 144 Hour Rainfall Aggregate', 0.24810669887320116), ('Pittman 120 Hour Rainfall Aggregate', 0.24749707982179972), ('Westport 120 Hour Rainfall Aggregate', 0.24700697344444747), ('Jefferies 144 Hour Rainfall Aggregate', 0.24665109623495626), ('Hiland 96 Hour Rainfall Aggregate', 0.24637857176725605), ('Pleasant 120 Hour Rainfall Aggregate', 0.24621414284580512), ('Waste 120 Hour Rainfall Aggregate', 0.2448116890440284), ('Roundtree 144 Hour Rainfall Aggregate', 0.24385020938943144), ('Cherokee 168 Hour Rainfall Aggregate', 0.24153369976082234), ('Shady 168 Hour Rainfall Aggregate', 0.24067834451030065), ('Weller 120 Hour Rainfall Aggregate', 0.2403943416513641), ('Rutledge 120 Hour Rainfall Aggregate', 0.24035668519467035), 
('Millwood 120 Hour Rainfall Aggregate', 0.239524347840239), ('Strafford 120 Hour Rainfall Aggregate', 0.2390755449468274), ('James 120 Hour Rainfall Aggregate', 0.2375659021974559), ('Sherwood 120 Hour Rainfall Aggregate', 0.23720134611318294), ('Fire 120 Hour Rainfall Aggregate', 0.23660288180650696), ('Airport 168 Hour Rainfall Aggregate', 0.23638541268319654), ('AT&T 96 Hour Rainfall Aggregate', 0.23630281835000333), ('Total 120 Hour Rainfall Aggregate', 0.2362648049132383), ('Republic 96 Hour Rainfall Aggregate', 0.2361654773917245), ('Disney 120 Hour Rainfall Aggregate', 0.23612157117533117), ('Bingham 120 Hour Rainfall Aggregate', 0.23532812910093917), ('Willard 96 Hour Rainfall Aggregate', 0.23489417683946853), ('Valley 120 Hour Rainfall Aggregate', 0.2343995447840297), ('Le 120 Hour Rainfall Aggregate', 0.23261859952637387), ('Field 96 Hour Rainfall Aggregate', 0.23243894817469085), ('Willard_Intermediate 120 Hour Rainfall Aggregate', 0.23234485570461447), ('Sunshine 120 Hour Rainfall Aggregate', 0.23231825501152442), ('NW 96 Hour Rainfall Aggregate', 0.23099448165634265), ('English 168 Hour Rainfall Aggregate', 0.23014656557358623), ('Jefferies 120 Hour Rainfall Aggregate', 0.22987353316879236), ('Cherokee 144 Hour Rainfall Aggregate', 0.2270747669766974), ('Pleasant 96 Hour Rainfall Aggregate', 0.22659688376348727), ('Shady 144 Hour Rainfall Aggregate', 0.22651379330817617), ('Roundtree 120 Hour Rainfall Aggregate', 0.22635119554380428), ('Airport 144 Hour Rainfall Aggregate', 0.2250796367732096), ('Westport 96 Hour Rainfall Aggregate', 0.22483879809254997), ('Pittman 96 Hour Rainfall Aggregate', 0.22479551237034942), ('Waste 96 Hour Rainfall Aggregate', 0.22393427553415146), ('SWTP Plant 1 Gravity Flow', 0.22281688798394458), ('English 144 Hour Rainfall Aggregate', 0.22246424915152896), ('Rutledge 96 Hour Rainfall Aggregate', 0.22098227058024936), ('Hiland 72 Hour Rainfall Aggregate', 0.22007452895313404), ('Weller 96 Hour Rainfall Aggregate', 
0.21919715915412416), ('Strafford 96 Hour Rainfall Aggregate', 0.21886583596037126), ('Total 96 Hour Rainfall Aggregate', 0.21742536634327664), ('Millwood 96 Hour Rainfall Aggregate', 0.2167716490321167), ('Sherwood 96 Hour Rainfall Aggregate', 0.216436510730393), ('James 96 Hour Rainfall Aggregate', 0.21592691132062838), ('Disney 96 Hour Rainfall Aggregate', 0.21571571232830455), ('Valley 96 Hour Rainfall Aggregate', 0.21539586767638869), ('Fire 96 Hour Rainfall Aggregate', 0.21509328189990626), ('Bingham 96 Hour Rainfall Aggregate', 0.2146857482148008), ('Le 96 Hour Rainfall Aggregate', 0.21357119459298754), ('Willard_Intermediate 96 Hour Rainfall Aggregate', 0.21300159517117412), ('Airport 120 Hour Rainfall Aggregate', 0.21244731066372874), ('Sunshine 96 Hour Rainfall Aggregate', 0.21240589104352825), ('Shady 120 Hour Rainfall Aggregate', 0.21158403461127104), ('English 120 Hour Rainfall Aggregate', 0.21125453984434878), ('Cherokee 120 Hour Rainfall Aggregate', 0.21068024178715256), ('Jefferies 96 Hour Rainfall Aggregate', 0.2093168536252737), ('Willard 72 Hour Rainfall Aggregate', 0.20760409983942765), ('AT&T 72 Hour Rainfall Aggregate', 0.2068192589103986), ('Republic 72 Hour Rainfall Aggregate', 0.20681367066403575), ('Roundtree 96 Hour Rainfall Aggregate', 0.2058601464175766), ('Field 72 Hour Rainfall Aggregate', 0.20437893840974616), ('NW 72 Hour Rainfall Aggregate', 0.20426445546022257), ('Airport_West 168 Hour Rainfall Aggregate', 0.2021557768059795), ('Pleasant 72 Hour Rainfall Aggregate', 0.20106420218329538), ('Waste 72 Hour Rainfall Aggregate', 0.197007686208034), ('Pittman 72 Hour Rainfall Aggregate', 0.1969301250353878), ('Westport 72 Hour Rainfall Aggregate', 0.19627575740380288), ('Airport 96 Hour Rainfall Aggregate', 0.19614977110207302), ('English 96 Hour Rainfall Aggregate', 0.1956702861951137), ('Rutledge 72 Hour Rainfall Aggregate', 0.19488770677348685), ('Strafford 72 Hour Rainfall Aggregate', 0.19386122089609864), ('Shady 96 Hour Rainfall 
Aggregate', 0.19328283360312926), ('Total 72 Hour Rainfall Aggregate', 0.1931401943341729), ('Weller 72 Hour Rainfall Aggregate', 0.19245475043649352), ('Cherokee 96 Hour Rainfall Aggregate', 0.19137376678409568), ('Valley 72 Hour Rainfall Aggregate', 0.19054834109648455), ('Airport_West 144 Hour Rainfall Aggregate', 0.19054201029338988), ('Disney 72 Hour Rainfall Aggregate', 0.19012695788202627), ('Sherwood 72 Hour Rainfall Aggregate', 0.1898180708397562), ('Le 72 Hour Rainfall Aggregate', 0.1891027941299336), ('Millwood 72 Hour Rainfall Aggregate', 0.18907934020622533), ('Willard_Intermediate 72 Hour Rainfall Aggregate', 0.1884987405617409), ('James 72 Hour Rainfall Aggregate', 0.1883429155017829), ('Bingham 72 Hour Rainfall Aggregate', 0.18832798282147675), ('Fire 72 Hour Rainfall Aggregate', 0.18801180220100128), ('Sunshine 72 Hour Rainfall Aggregate', 0.1867367317882274), ('Jefferies 72 Hour Rainfall Aggregate', 0.1835802593835982), ('Roundtree 72 Hour Rainfall Aggregate', 0.18108560664929352), ('Airport_West 120 Hour Rainfall Aggregate', 0.17803936392471822), ('Airport 72 Hour Rainfall Aggregate', 0.1753393818083737), ('English 72 Hour Rainfall Aggregate', 0.17399142700541345), ('Shady 72 Hour Rainfall Aggregate', 0.17001773181148486), ('Cherokee 72 Hour Rainfall Aggregate', 0.16759641015713841), 
('Airport_West 96 Hour Rainfall Aggregate', 0.16317252836079457), ('SW_Peak_Flow', 0.1616236444894937), ('HourlyStationPressure', 0.15271579902818724), ('HourlyAltimeterSetting', 0.1519876727661242), ('River 168 Hour Rainfall Aggregate', 0.1498452011767934), ('HourlySeaLevelPressure', 0.14718987868136907), ('Airport_West 72 Hour Rainfall Aggregate', 0.14531536260511274), ('River 144 Hour Rainfall Aggregate', 0.13970922022769972), ('Airport_Springfield 168 Hour Rainfall Aggregate', 0.13729891459175309), ('Mark 168 Hour Rainfall Aggregate', 0.13176908834283665), ('River 120 Hour Rainfall Aggregate', 0.12935352844065576), ('Airport_Springfield 144 Hour Rainfall Aggregate', 0.12910769031762823), ('Mark 144 Hour Rainfall Aggregate', 0.12396599583614991), ('Airport_Springfield 120 Hour Rainfall Aggregate', 0.11991303578479541), ('River 96 Hour Rainfall Aggregate', 0.11694787461798342), ('Mark 120 Hour Rainfall Aggregate', 0.11531315043345582), ('Airport_Springfield 96 Hour Rainfall Aggregate', 0.10988786858045745), ('Mark 96 Hour Rainfall Aggregate', 0.105340728223974), ('River 72 Hour Rainfall Aggregate', 0.10232202884542511), ('Airport_Springfield 72 Hour Rainfall Aggregate', 0.0967608660270247), ('Mark 72 Hour Rainfall Aggregate', 0.09337510242949196), ('HourlyWetBulbTemperature', 0.09059418198330822), ('HourlyDryBulbTemperature', 0.08819100260972183), ('HourlyDewPointTemperature', 0.08777041151979946), ('Blackman 168 Hour Rainfall Aggregate', 0.08765474719145999), ('Blackman 144 Hour Rainfall Aggregate', 0.083155716834374), ('Blackman 120 Hour Rainfall Aggregate', 0.07841367288398636), ('Blackman 96 Hour Rainfall Aggregate', 0.07257553890422525), ('Sequiota Rainfall (in)', 0.06750777069010394), ('Blackman 72 Hour Rainfall Aggregate', 0.0651078109755356), ('NW Rainfall (in)', 0.05543693777927901), ('Rutledge Rainfall (in)', 0.05381348224029626), ('Hiland Rainfall (in)', 0.053696110380969886), ('James Rainfall (in)', 0.05305243382951508), ('Willard Rainfall (in)', 
0.05220670432854757), ('Total Rainfall (in)', 0.05184139433221512), ('Pleasant Rainfall (in)', 0.05145325676525972), ('Republic Rainfall (in)', 0.05105062435299764), ('Valley Rainfall (in)', 0.05014734318512089), ('AT&T Rainfall (in)', 0.049703125829297604), ('Westport Rainfall (in)', 0.04892283315594852), ('Willard_Intermediate Rainfall (in)', 0.04850189246638779), ('Strafford Rainfall (in)', 0.04846012476350796), ('Le Rainfall (in)', 0.0482686296380321), ('Field Rainfall (in)', 0.04806007936708993), ('Sherwood Rainfall (in)', 0.04750710701739062), ('Millwood Rainfall (in)', 0.047282632930919485), ('Fire Rainfall (in)', 0.04700055987379547), ('Waste Rainfall (in)', 0.046909445387607156), ('Bingham Rainfall (in)', 0.04670387814383212), ('Sunshine Rainfall (in)', 0.046362475221268816), ('Weller Rainfall (in)', 0.04605963344976909), ('Pittman Rainfall (in)', 0.04502803967879383), ('Jefferies Rainfall (in)', 0.044918085631140525), ('Disney Rainfall (in)', 0.04435527020946567), ('Roundtree Rainfall (in)', 0.04332548479514482), ('Airport Rainfall (in)', 0.04248003034330316), ('Shady Rainfall (in)', 0.04166010289902156), ('Williams 72 Hour Rainfall Aggregate', 0.041383981013486656), ('Williams 96 Hour Rainfall Aggregate', 0.04098745824144807), ('Williams 168 Hour Rainfall Aggregate', 0.04091676146336944), ('Williams 144 Hour Rainfall Aggregate', 0.04091676146336944), ('Williams 120 Hour Rainfall Aggregate', 0.04091676146336944), ('English Rainfall (in)', 0.03989512268847416), ('Cherokee Rainfall (in)', 0.038858059014430485), ('HourlyRelativeHumidity', 0.037892409365553645), ('Airport_West Rainfall (in)', 0.0358809369705908), ('Airport_Springfield Rainfall (in)', 0.03318206211893972), ('River Rainfall (in)', 0.03160756583078008), ('HourlyWindSpeed', 0.02884656752918485), ('HourlyVisibility', 0.025626272725796147), ('Mark Rainfall (in)', 0.020894563511283545), ('Blackman Rainfall (in)', 0.015501730294682148), ('Williams Rainfall (in)', 0.01328631789601823), 
('HourlyPressureChange', 0.012152893951929169), ('HourlyPressureTendency', 0.009456778773684667), ('Week Day', 0.006847159600733715), ('Hour', 0.0018620492588146615)]

springfieldCorrelationList = [('James Gauge Height (ft)', 0.6682595437090044), ('SWTP Total Influent Flow', 0.6555463483051369), ('SWTP Plant 2 Influent Flow', 0.6361044465403347), ('Ozark Aquifer Depth to Water Level (ft)', 0.6307472041655565), ('Wilsons Gauge Height (ft)', 0.5348081187015494), ('SWTP Plant 1 Influent Flow', 0.5131380272745351), ('Sequiota 168 Hour Rainfall Aggregate', 0.49261155233603243), ('AT&T 168 Hour Rainfall Aggregate', 0.47924683180344546), ('Weller 168 Hour Rainfall Aggregate', 0.46914069961390753), ('Pleasant 168 Hour Rainfall Aggregate', 0.46836093693481723), ('Field 168 Hour Rainfall Aggregate', 0.4677687215030506), ('Fire 168 Hour Rainfall Aggregate', 0.4648271387497209), ('Le 168 Hour Rainfall Aggregate', 0.46482625262585736), ('Pittman 168 Hour Rainfall Aggregate', 0.4645865239595623), ('NW 168 Hour Rainfall Aggregate', 0.46418388798483584), ('Waste 168 Hour Rainfall Aggregate', 0.46401463740107984), ('Bingham 168 Hour Rainfall Aggregate', 0.46282093560916804), ('Jefferies 168 Hour Rainfall Aggregate', 0.4597704046209257), ('Westport 168 Hour Rainfall Aggregate', 0.4578908602354908), ('Republic 168 Hour Rainfall Aggregate', 0.45783171507823345), ('Strafford 168 Hour Rainfall Aggregate', 0.4561680567674403), ('Shady 168 Hour Rainfall Aggregate', 0.4555261201192056), ('Rutledge 168 Hour Rainfall Aggregate', 0.4549424533408891), ('Sequiota 144 Hour Rainfall Aggregate', 0.4548337679672771), ('Valley 168 Hour Rainfall Aggregate', 0.4537452680544154), ('Sunshine 168 Hour Rainfall Aggregate', 0.45250312519898506), ('Roundtree 168 Hour Rainfall Aggregate', 0.4518784630629673), ('Millwood 168 Hour Rainfall Aggregate', 0.4518611459088023), ('Disney 168 Hour Rainfall Aggregate', 0.45006996869229055), ('Willard_Intermediate 168 Hour Rainfall Aggregate', 0.44934023601414647), ('Sherwood 168 Hour Rainfall Aggregate', 0.4477758930119068), ('Willard 168 Hour Rainfall Aggregate', 0.4392164621833291), ('AT&T 144 Hour Rainfall Aggregate', 
0.4379238407575416), ('Hiland 168 Hour Rainfall Aggregate', 0.43374351624349583), ('James 168 Hour Rainfall Aggregate', 0.43335490923530623), ('Weller 144 Hour Rainfall Aggregate', 0.4276919084939065), ('Field 144 Hour Rainfall Aggregate', 0.4266477822652221), ('Cherokee 168 Hour Rainfall Aggregate', 0.42646576889995086), ('Pleasant 144 Hour Rainfall Aggregate', 0.42643839452297166), ('Total 168 Hour Rainfall Aggregate', 0.4263620198177264), ('Waste 144 Hour Rainfall Aggregate', 0.42457929882349604), ('Fire 144 Hour Rainfall Aggregate', 0.4237802319777868), ('Pittman 144 Hour Rainfall Aggregate', 0.4232479959638109), ('NW 144 Hour Rainfall Aggregate', 0.42287411280768006), ('Le 144 Hour Rainfall Aggregate', 0.42284569953342266), ('Bingham 144 Hour Rainfall Aggregate', 0.4216418216072208), ('Westport 144 Hour Rainfall Aggregate', 0.4190661929329971), ('Jefferies 144 Hour Rainfall Aggregate', 0.4187868444660492), ('Rutledge 144 Hour Rainfall Aggregate', 0.4171185721060713), ('Strafford 144 Hour Rainfall Aggregate', 0.41704922393872534), ('Republic 144 Hour Rainfall Aggregate', 0.41662770356318357), ('Shady 144 Hour Rainfall Aggregate', 0.41470354016303496), ('Millwood 144 Hour Rainfall Aggregate', 0.4129726433747951), ('Valley 144 Hour Rainfall Aggregate', 0.41288065817080816), ('Sunshine 144 Hour Rainfall Aggregate', 0.4127837324431027), ('Roundtree 144 Hour Rainfall Aggregate', 0.41111209421135764), ('Disney 144 Hour Rainfall Aggregate', 0.4107772592023044), ('Willard_Intermediate 144 Hour Rainfall Aggregate', 0.4096480939999576), ('Airport 168 Hour Rainfall Aggregate', 0.40959218711762146), ('Sequiota 120 Hour Rainfall Aggregate', 0.4091605305242023), ('Sherwood 144 Hour Rainfall Aggregate', 0.4090154502890437), ('Week', 0.40632921092381735), ('Hiland 144 Hour Rainfall Aggregate', 0.4036334264595414), ('English 168 Hour Rainfall Aggregate', 0.4033708901005385), ('Willard 144 Hour Rainfall Aggregate', 0.4015487469613252), ('James 144 Hour Rainfall Aggregate', 
0.3974771441413898), ('Month', 0.3968561060258641), ('Airport_West 168 Hour Rainfall Aggregate', 0.39317597518162056), ('Total 144 Hour Rainfall Aggregate', 0.3920454455796846), ('AT&T 120 Hour Rainfall Aggregate', 0.38821002985117103), ('Cherokee 144 Hour Rainfall Aggregate', 0.38697235190475276), ('Weller 120 Hour Rainfall Aggregate', 0.3777695811651127), ('Airport 144 Hour Rainfall Aggregate', 0.37678312757065285), ('Field 120 Hour Rainfall Aggregate', 0.3767742917612683), ('Waste 120 Hour Rainfall Aggregate', 0.37673313870994374), ('Pleasant 120 Hour Rainfall Aggregate', 0.3761025342332299), ('Fire 120 Hour Rainfall Aggregate', 0.37383521873783515), ('Pittman 120 Hour Rainfall Aggregate', 0.3736571987018945), ('NW 120 Hour Rainfall Aggregate', 0.3735336340619427), ('English 144 Hour Rainfall Aggregate', 0.3725638651262099), ('Le 120 Hour Rainfall Aggregate', 0.37244177223267433), ('Bingham 120 Hour Rainfall Aggregate', 0.37161253900093255), ('Westport 120 Hour Rainfall Aggregate', 0.3715929446810732), ('Rutledge 120 Hour Rainfall Aggregate', 0.37081821637666945), ('Strafford 120 Hour Rainfall Aggregate', 0.36918748312745786), ('Jefferies 120 Hour Rainfall Aggregate', 0.3690285571842881), ('Year', 0.367797611469703), ('Republic 120 Hour Rainfall Aggregate', 0.36664732728299854), ('Hiland 120 Hour Rainfall Aggregate', 0.3664518714314395), 
('Shady 120 Hour Rainfall Aggregate', 0.36541652094317023), ('Sunshine 120 Hour Rainfall Aggregate', 0.36477771773898915), ('Millwood 120 Hour Rainfall Aggregate', 0.3645824348493793), ('Valley 120 Hour Rainfall Aggregate', 0.36428518207968835), ('Disney 120 Hour Rainfall Aggregate', 0.36310969859833664), ('River 168 Hour Rainfall Aggregate', 0.36261469289321047), ('Sherwood 120 Hour Rainfall Aggregate', 0.3619826807420285), ('Willard_Intermediate 120 Hour Rainfall Aggregate', 0.36185996153310473), ('Roundtree 120 Hour Rainfall Aggregate', 0.36164645710460974), ('Airport_West 144 Hour Rainfall Aggregate', 0.35857359113807563), ('Willard 120 Hour Rainfall Aggregate', 0.3562853976311659), ('James 120 Hour Rainfall Aggregate', 0.3529357416943799), ('Sequiota 96 Hour Rainfall Aggregate', 0.35244389410195837), ('Total 120 Hour Rainfall Aggregate', 0.34914340948991257), ('Cherokee 120 Hour Rainfall Aggregate', 0.3389923448328174), ('Airport 120 Hour Rainfall Aggregate', 0.3367172103431759), ('English 120 Hour Rainfall Aggregate', 0.3338424646324984), ('River 144 Hour Rainfall Aggregate', 0.3296094403152723), ('AT&T 96 Hour Rainfall Aggregate', 0.32718954406544776), ('Hiland 96 Hour Rainfall Aggregate', 0.3189602479486714), ('Waste 96 Hour Rainfall Aggregate', 0.3177554191110962), ('Airport_West 120 Hour Rainfall Aggregate', 0.31662248873630155), ('Pleasant 96 Hour Rainfall Aggregate', 0.316184329528708), 
('Field 96 Hour Rainfall Aggregate', 0.31617771860153365), ('Weller 96 Hour Rainfall Aggregate', 0.31610568446439263), ('NW 96 Hour Rainfall Aggregate', 0.3143160136072081), ('Rutledge 96 Hour Rainfall Aggregate', 0.31396748332736335), ('Pittman 96 Hour Rainfall Aggregate', 0.3128531052204416), ('Fire 96 Hour Rainfall Aggregate', 0.3128350903882983), ('Westport 96 Hour Rainfall Aggregate', 0.3128339837491793), ('Le 96 Hour Rainfall Aggregate', 0.31085653387919177), ('Bingham 96 Hour Rainfall Aggregate', 0.3102929517669665), ('Strafford 96 Hour Rainfall Aggregate', 0.31024557112131496), ('Jefferies 96 Hour Rainfall Aggregate', 0.3088131999884786), ('Republic 96 Hour Rainfall Aggregate', 0.3075400591727575), ('Sunshine 96 Hour Rainfall Aggregate', 0.3061041048566286), ('Millwood 96 Hour Rainfall Aggregate', 0.30536170511495436), ('Disney 96 Hour Rainfall Aggregate', 0.3052465618401107), ('Shady 96 Hour Rainfall Aggregate', 0.3050697886388504), ('Valley 96 Hour Rainfall Aggregate', 0.3049518261674969), ('Willard_Intermediate 96 Hour Rainfall Aggregate', 0.3039421654322875), ('Sherwood 96 Hour Rainfall Aggregate', 0.3035103598458502), ('Roundtree 96 Hour Rainfall Aggregate', 0.30229947084786585), ('Willard 96 Hour Rainfall Aggregate', 0.30060981236386014), ('James 96 Hour Rainfall Aggregate', 0.297666205474958), ('Total 96 Hour Rainfall Aggregate', 0.2958642205203572), ('River 120 Hour Rainfall Aggregate', 0.28892368872524143), ('Airport 96 Hour Rainfall Aggregate', 0.2864535666295285), ('English 96 Hour Rainfall Aggregate', 0.2856508395647613), ('Sequiota 72 Hour Rainfall Aggregate', 0.2847959315212102), ('Cherokee 96 Hour Rainfall Aggregate', 0.28169401458996707), ('Airport_West 96 Hour Rainfall Aggregate', 0.2655000050988324), ('Hiland 72 Hour Rainfall Aggregate', 0.2601314787011252), ('AT&T 72 Hour Rainfall Aggregate', 0.25643917945082473), ('SWTP Plant 1 Gravity Flow', 0.2550182268014052), ('Waste 72 Hour Rainfall Aggregate', 0.24763822637706978), ('Rutledge 72 
Hour Rainfall Aggregate', 0.246621903513602), ('Pleasant 72 Hour Rainfall Aggregate', 0.2464847279134578), ('Field 72 Hour Rainfall Aggregate', 0.2461824866806548), ('NW 72 Hour Rainfall Aggregate', 0.24504870960543781), ('Weller 72 Hour Rainfall Aggregate', 0.24358256565129532), ('Westport 72 Hour Rainfall Aggregate', 0.2434961474781645), ('Pittman 72 Hour Rainfall Aggregate', 0.2427009810764945), ('Strafford 72 Hour Rainfall Aggregate', 0.24190201436589462), ('Fire 72 Hour Rainfall Aggregate', 0.24156832975314274), ('Republic 72 Hour Rainfall Aggregate', 0.2399226523912302), ('River 96 Hour Rainfall Aggregate', 0.2393625502246285), ('Le 72 Hour Rainfall Aggregate', 0.23926236121463998), ('Bingham 72 Hour Rainfall Aggregate', 0.23918997873656403), ('Jefferies 72 Hour Rainfall Aggregate', 0.23897830203937312), ('Disney 72 Hour Rainfall Aggregate', 0.23778244125423317), ('Sunshine 72 Hour Rainfall Aggregate', 0.23760310565212936), ('Millwood 72 Hour Rainfall Aggregate', 0.23719964687027006), ('Willard_Intermediate 72 Hour Rainfall Aggregate', 0.23628665756005987), ('Valley 72 Hour Rainfall Aggregate', 0.23598947098514328), ('Willard 72 Hour Rainfall Aggregate', 0.23525614908120793), 
('Sherwood 72 Hour Rainfall Aggregate', 0.23490158254293908), ('Shady 72 Hour Rainfall Aggregate', 0.2348709530125168), ('Roundtree 72 Hour Rainfall Aggregate', 0.23426203979355367), ('Total 72 Hour Rainfall Aggregate', 0.23245146267819022), ('James 72 Hour Rainfall Aggregate', 0.23201768549528615), ('English 72 Hour Rainfall Aggregate', 0.22827096580421713), ('Airport 72 Hour Rainfall Aggregate', 0.22731273424824494), ('Cherokee 72 Hour Rainfall Aggregate', 0.21605243990177425), ('Airport_West 72 Hour Rainfall Aggregate', 0.20637837942429413), ('River 72 Hour Rainfall Aggregate', 0.18283244391708428), ('Airport_Springfield 168 Hour Rainfall Aggregate', 0.14774064817739763), ('Mark 168 Hour Rainfall Aggregate', 0.14681624941590535), ('Blackman 168 Hour Rainfall Aggregate', 0.14099505050272113), ('HourlyWetBulbTemperature', 0.1408114302337495), ('Airport_Springfield 144 Hour Rainfall Aggregate', 0.14016438167916295), ('HourlyDewPointTemperature', 0.1371754294554511), ('Mark 144 Hour Rainfall Aggregate', 0.13692074686100758), ('SW_Peak_Flow', 0.1364631920330091), ('HourlyDryBulbTemperature', 0.13612507401310667), ('Airport_Springfield 120 Hour Rainfall Aggregate', 0.13142944335464757), ('Blackman 144 Hour Rainfall Aggregate', 0.13134106371579227), ('HourlyStationPressure', 0.12745412314880214), ('HourlyAltimeterSetting', 0.12678893423774354), ('Mark 120 Hour Rainfall Aggregate', 0.12465767254218987), ('Airport_Springfield 96 Hour Rainfall Aggregate', 0.12206016689766325), ('Blackman 120 Hour Rainfall Aggregate', 0.11923932055957005), ('HourlySeaLevelPressure', 0.11282506509930637), ('Airport_Springfield 72 Hour Rainfall Aggregate', 0.11005955643417992), ('Mark 96 Hour Rainfall Aggregate', 0.10938924792118732), ('Blackman 96 Hour Rainfall Aggregate', 0.10282427629932536), ('Mark 72 Hour Rainfall Aggregate', 0.09108120414500065), ('Blackman 72 Hour Rainfall Aggregate', 0.08290786102263553), ('Sequiota Rainfall (in)', 0.05584810633704488), ('HourlyRelativeHumidity', 
0.05163433537954279), ('HourlyWindSpeed', 0.05159415237561198), ('Hiland Rainfall (in)', 0.05019264265840611), ('NW Rainfall (in)', 0.043358478664041356), ('Rutledge Rainfall (in)', 0.04322496741056401), ('James Rainfall (in)', 0.04283251855051575), ('AT&T Rainfall (in)', 0.041385658532493375), ('Pleasant Rainfall (in)', 0.04110969926330502), ('English Rainfall (in)', 0.04045828530704011), ('Willard Rainfall (in)', 0.039664596091594874), ('Strafford Rainfall (in)', 0.03917619876931006), ('Total Rainfall (in)', 0.039070488663647984), ('Westport Rainfall (in)', 0.03829713464870178), ('Millwood Rainfall (in)', 0.038230716841559854), ('HourlyVisibility', 0.03807132978887514), ('Valley Rainfall (in)', 0.037924212737930625), ('Fire Rainfall (in)', 0.03755030616984465), ('Waste Rainfall (in)', 0.037487770354472504), ('Le Rainfall (in)', 0.03712608578678943), ('Sunshine Rainfall (in)', 0.03712408894501554), ('Weller Rainfall (in)', 0.037092468419905496), ('Willard_Intermediate Rainfall (in)', 0.03701233297143658), ('Bingham Rainfall (in)', 0.036902033062771344), ('Disney Rainfall (in)', 0.036885939902612845), ('Airport Rainfall (in)', 0.03671320456233719), ('Republic Rainfall (in)', 0.0366949714612916), ('Field Rainfall (in)', 0.036305832106271446), ('Jefferies Rainfall (in)', 0.036240114680229744), ('Williams 72 Hour Rainfall Aggregate', 0.0361833662449386), ('Sherwood Rainfall (in)', 0.03605416311979079), ('Shady Rainfall (in)', 0.03575687217934234), ('Pittman Rainfall (in)', 0.03569274340782442), ('Williams 96 Hour Rainfall Aggregate', 0.03564909361765943), ('Williams 168 Hour Rainfall Aggregate', 0.03557915569634159), ('Williams 144 Hour Rainfall Aggregate', 0.03557915569634159), ('Williams 120 Hour Rainfall Aggregate', 0.03557915569634159), ('Roundtree Rainfall (in)', 0.03391163050103573), ('Cherokee Rainfall (in)', 0.032726906835445435), ('River Rainfall (in)', 0.032076763035391524), ('Airport_West Rainfall (in)', 0.031153990493329013), ('Airport_Springfield Rainfall 
(in)', 0.030863161852477856), ('HourlyPressureChange', 0.027835771224437478), ('Week Day', 0.01794606611703306), ('Mark Rainfall (in)', 0.016258881660621793), ('HourlyPressureTendency', 0.013124291897146926), ('Blackman Rainfall (in)', 0.012761530675847892), ('Williams Rainfall (in)', 0.012036665431222568), ('Hour', 0.0026714172489968177)]
'''
correlationList = [('James Gauge Height (ft)', 0.6682595437090044), ('SWTP Total Influent Flow', 0.6555463483051369), ('SWTP Plant 2 Influent Flow', 0.6361044465403347), ('Ozark Aquifer Depth to Water Level (ft)', 0.6307472041655565), ('Wilsons Gauge Height (ft)', 0.5348081187015494), ('SWTP Plant 1 Influent Flow', 0.5131380272745351), ('Sequiota 168 Hour Rainfall Aggregate', 0.49261155233603243), ('AT&T 168 Hour Rainfall Aggregate', 0.47924683180344546), ('Weller 168 Hour Rainfall Aggregate', 0.46914069961390753), ('Pleasant 168 Hour Rainfall Aggregate', 0.46836093693481723), ('Field 168 Hour Rainfall Aggregate', 0.4677687215030506), ('Fire 168 Hour Rainfall Aggregate', 0.4648271387497209), ('Le 168 Hour Rainfall Aggregate', 0.46482625262585736), ('Pittman 168 Hour Rainfall Aggregate', 0.4645865239595623), ('NW 168 Hour Rainfall Aggregate', 0.46418388798483584), ('Waste 168 Hour Rainfall Aggregate', 0.46401463740107984), ('Bingham 168 Hour Rainfall Aggregate', 0.46282093560916804), ('Jefferies 168 Hour Rainfall Aggregate', 0.4597704046209257), ('Westport 168 Hour Rainfall Aggregate', 0.4578908602354908), ('Republic 168 Hour Rainfall Aggregate', 0.45783171507823345), ('Strafford 168 Hour Rainfall Aggregate', 0.4561680567674403), ('Shady 168 Hour Rainfall Aggregate', 0.4555261201192056), ('Rutledge 168 Hour Rainfall Aggregate', 0.4549424533408891), ('Sequiota 144 Hour Rainfall Aggregate', 0.4548337679672771), ('Valley 168 Hour Rainfall Aggregate', 0.4537452680544154), ('Sunshine 168 Hour Rainfall Aggregate', 0.45250312519898506), ('Roundtree 168 Hour Rainfall Aggregate', 0.4518784630629673), ('Millwood 168 Hour Rainfall Aggregate', 0.4518611459088023), ('Disney 168 Hour Rainfall Aggregate', 0.45006996869229055), ('Willard_Intermediate 168 Hour Rainfall Aggregate', 0.44934023601414647), ('Sherwood 168 Hour Rainfall Aggregate', 0.4477758930119068), ('Willard 168 Hour Rainfall Aggregate', 0.4392164621833291), ('AT&T 144 Hour Rainfall Aggregate', 0.4379238407575416), 
('Hiland 168 Hour Rainfall Aggregate', 0.43374351624349583), ('James 168 Hour Rainfall Aggregate', 0.43335490923530623), ('Weller 144 Hour Rainfall Aggregate', 0.4276919084939065), ('Field 144 Hour Rainfall Aggregate', 0.4266477822652221), ('Cherokee 168 Hour Rainfall Aggregate', 0.42646576889995086), ('Pleasant 144 Hour Rainfall Aggregate', 0.42643839452297166), ('Total 168 Hour Rainfall Aggregate', 0.4263620198177264), ('Waste 144 Hour Rainfall Aggregate', 0.42457929882349604), ('Fire 144 Hour Rainfall Aggregate', 0.4237802319777868), ('Pittman 144 Hour Rainfall Aggregate', 0.4232479959638109), ('NW 144 Hour Rainfall Aggregate', 0.42287411280768006), ('Le 144 Hour Rainfall Aggregate', 0.42284569953342266), ('Bingham 144 Hour Rainfall Aggregate', 0.4216418216072208), ('Westport 144 Hour Rainfall Aggregate', 0.4190661929329971), ('Jefferies 144 Hour Rainfall Aggregate', 0.4187868444660492), ('Rutledge 144 Hour Rainfall Aggregate', 0.4171185721060713), ('Strafford 144 Hour Rainfall Aggregate', 0.41704922393872534), ('Republic 144 Hour Rainfall Aggregate', 0.41662770356318357), ('Shady 144 Hour Rainfall Aggregate', 0.41470354016303496), ('Millwood 144 Hour Rainfall Aggregate', 0.4129726433747951), ('Valley 144 Hour Rainfall Aggregate', 0.41288065817080816), ('Sunshine 144 Hour Rainfall Aggregate', 0.4127837324431027), ('Roundtree 144 Hour Rainfall Aggregate', 0.41111209421135764), ('Disney 144 Hour Rainfall Aggregate', 0.4107772592023044), ('Willard_Intermediate 144 Hour Rainfall Aggregate', 0.4096480939999576), ('Airport 168 Hour Rainfall Aggregate', 0.40959218711762146), ('Sequiota 120 Hour Rainfall Aggregate', 0.4091605305242023), ('Sherwood 144 Hour Rainfall Aggregate', 0.4090154502890437), ('Week', 0.40632921092381735), ('Hiland 144 Hour Rainfall Aggregate', 0.4036334264595414), ('English 168 Hour Rainfall Aggregate', 0.4033708901005385), ('Willard 144 Hour Rainfall Aggregate', 0.4015487469613252), ('James 144 Hour Rainfall Aggregate', 0.3974771441413898), 
('Month', 0.3968561060258641), ('Airport_West 168 Hour Rainfall Aggregate', 0.39317597518162056), ('Total 144 Hour Rainfall Aggregate', 0.3920454455796846), ('AT&T 120 Hour Rainfall Aggregate', 0.38821002985117103), ('Cherokee 144 Hour Rainfall Aggregate', 0.38697235190475276), ('Weller 120 Hour Rainfall Aggregate', 0.3777695811651127), ('Airport 144 Hour Rainfall Aggregate', 0.37678312757065285), ('Field 120 Hour Rainfall Aggregate', 0.3767742917612683), ('Waste 120 Hour Rainfall Aggregate', 0.37673313870994374), ('Pleasant 120 Hour Rainfall Aggregate', 0.3761025342332299), ('Fire 120 Hour Rainfall Aggregate', 0.37383521873783515), ('Pittman 120 Hour Rainfall Aggregate', 0.3736571987018945), ('NW 120 Hour Rainfall Aggregate', 0.3735336340619427), ('English 144 Hour Rainfall Aggregate', 0.3725638651262099), ('Le 120 Hour Rainfall Aggregate', 0.37244177223267433), ('Bingham 120 Hour Rainfall Aggregate', 0.37161253900093255), ('Westport 120 Hour Rainfall Aggregate', 0.3715929446810732), ('Rutledge 120 Hour Rainfall Aggregate', 0.37081821637666945), ('Strafford 120 Hour Rainfall Aggregate', 0.36918748312745786), ('Jefferies 120 Hour Rainfall Aggregate', 0.3690285571842881), ('Year', 0.367797611469703), ('Republic 120 Hour Rainfall Aggregate', 0.36664732728299854), ('Hiland 120 Hour Rainfall Aggregate', 0.3664518714314395), 
('Shady 120 Hour Rainfall Aggregate', 0.36541652094317023), ('Sunshine 120 Hour Rainfall Aggregate', 0.36477771773898915), ('Millwood 120 Hour Rainfall Aggregate', 0.3645824348493793), ('Valley 120 Hour Rainfall Aggregate', 0.36428518207968835), ('Disney 120 Hour Rainfall Aggregate', 0.36310969859833664), ('River 168 Hour Rainfall Aggregate', 0.36261469289321047), ('Sherwood 120 Hour Rainfall Aggregate', 0.3619826807420285), ('Willard_Intermediate 120 Hour Rainfall Aggregate', 0.36185996153310473), ('Roundtree 120 Hour Rainfall Aggregate', 0.36164645710460974), ('Airport_West 144 Hour Rainfall Aggregate', 0.35857359113807563), ('Willard 120 Hour Rainfall Aggregate', 0.3562853976311659), ('James 120 Hour Rainfall Aggregate', 0.3529357416943799), ('Sequiota 96 Hour Rainfall Aggregate', 0.35244389410195837), ('Total 120 Hour Rainfall Aggregate', 0.34914340948991257), ('Cherokee 120 Hour Rainfall Aggregate', 0.3389923448328174), ('Airport 120 Hour Rainfall Aggregate', 0.3367172103431759), ('English 120 Hour Rainfall Aggregate', 0.3338424646324984), ('River 144 Hour Rainfall Aggregate', 0.3296094403152723), ('AT&T 96 Hour Rainfall Aggregate', 0.32718954406544776), ('Hiland 96 Hour Rainfall Aggregate', 0.3189602479486714), ('Waste 96 Hour Rainfall Aggregate', 0.3177554191110962), ('Airport_West 120 Hour Rainfall Aggregate', 0.31662248873630155), ('Pleasant 96 Hour Rainfall Aggregate', 0.316184329528708), 
('Field 96 Hour Rainfall Aggregate', 0.31617771860153365), ('Weller 96 Hour Rainfall Aggregate', 0.31610568446439263), ('NW 96 Hour Rainfall Aggregate', 0.3143160136072081), ('Rutledge 96 Hour Rainfall Aggregate', 0.31396748332736335), ('Pittman 96 Hour Rainfall Aggregate', 0.3128531052204416), ('Fire 96 Hour Rainfall Aggregate', 0.3128350903882983), ('Westport 96 Hour Rainfall Aggregate', 0.3128339837491793), ('Le 96 Hour Rainfall Aggregate', 0.31085653387919177), ('Bingham 96 Hour Rainfall Aggregate', 0.3102929517669665), ('Strafford 96 Hour Rainfall Aggregate', 0.31024557112131496), ('Jefferies 96 Hour Rainfall Aggregate', 0.3088131999884786), ('Republic 96 Hour Rainfall Aggregate', 0.3075400591727575), ('Sunshine 96 Hour Rainfall Aggregate', 0.3061041048566286), ('Millwood 96 Hour Rainfall Aggregate', 0.30536170511495436), ('Disney 96 Hour Rainfall Aggregate', 0.3052465618401107), ('Shady 96 Hour Rainfall Aggregate', 0.3050697886388504), ('Valley 96 Hour Rainfall Aggregate', 0.3049518261674969), ('Willard_Intermediate 96 Hour Rainfall Aggregate', 0.3039421654322875), ('Sherwood 96 Hour Rainfall Aggregate', 0.3035103598458502), ('Roundtree 96 Hour Rainfall Aggregate', 0.30229947084786585), ('Willard 96 Hour Rainfall Aggregate', 0.30060981236386014), ('James 96 Hour Rainfall Aggregate', 0.297666205474958), ('Total 96 Hour Rainfall Aggregate', 0.2958642205203572), ('River 120 Hour Rainfall Aggregate', 0.28892368872524143), ('Airport 96 Hour Rainfall Aggregate', 0.2864535666295285), ('English 96 Hour Rainfall Aggregate', 0.2856508395647613), ('Sequiota 72 Hour Rainfall Aggregate', 0.2847959315212102), ('Cherokee 96 Hour Rainfall Aggregate', 0.28169401458996707), ('Airport_West 96 Hour Rainfall Aggregate', 0.2655000050988324), ('Hiland 72 Hour Rainfall Aggregate', 0.2601314787011252), ('AT&T 72 Hour Rainfall Aggregate', 0.25643917945082473), ('SWTP Plant 1 Gravity Flow', 0.2550182268014052), ('Waste 72 Hour Rainfall Aggregate', 0.24763822637706978), ('Rutledge 72 
Hour Rainfall Aggregate', 0.246621903513602), ('Pleasant 72 Hour Rainfall Aggregate', 0.2464847279134578), ('Field 72 Hour Rainfall Aggregate', 0.2461824866806548), ('NW 72 Hour Rainfall Aggregate', 0.24504870960543781), ('Weller 72 Hour Rainfall Aggregate', 0.24358256565129532), ('Westport 72 Hour Rainfall Aggregate', 0.2434961474781645), ('Pittman 72 Hour Rainfall Aggregate', 0.2427009810764945), ('Strafford 72 Hour Rainfall Aggregate', 0.24190201436589462), ('Fire 72 Hour Rainfall Aggregate', 0.24156832975314274), ('Republic 72 Hour Rainfall Aggregate', 0.2399226523912302), ('River 96 Hour Rainfall Aggregate', 0.2393625502246285), ('Le 72 Hour Rainfall Aggregate', 0.23926236121463998), ('Bingham 72 Hour Rainfall Aggregate', 0.23918997873656403), ('Jefferies 72 Hour Rainfall Aggregate', 0.23897830203937312), ('Disney 72 Hour Rainfall Aggregate', 0.23778244125423317), ('Sunshine 72 Hour Rainfall Aggregate', 0.23760310565212936), ('Millwood 72 Hour Rainfall Aggregate', 0.23719964687027006), ('Willard_Intermediate 72 Hour Rainfall Aggregate', 0.23628665756005987), ('Valley 72 Hour Rainfall Aggregate', 0.23598947098514328), ('Willard 72 Hour Rainfall Aggregate', 0.23525614908120793), 
('Sherwood 72 Hour Rainfall Aggregate', 0.23490158254293908), ('Shady 72 Hour Rainfall Aggregate', 0.2348709530125168), ('Roundtree 72 Hour Rainfall Aggregate', 0.23426203979355367), ('Total 72 Hour Rainfall Aggregate', 0.23245146267819022), ('James 72 Hour Rainfall Aggregate', 0.23201768549528615), ('English 72 Hour Rainfall Aggregate', 0.22827096580421713), ('Airport 72 Hour Rainfall Aggregate', 0.22731273424824494), ('Cherokee 72 Hour Rainfall Aggregate', 0.21605243990177425), ('Airport_West 72 Hour Rainfall Aggregate', 0.20637837942429413), ('River 72 Hour Rainfall Aggregate', 0.18283244391708428), ('Airport_Springfield 168 Hour Rainfall Aggregate', 0.14774064817739763), ('Mark 168 Hour Rainfall Aggregate', 0.14681624941590535), ('Blackman 168 Hour Rainfall Aggregate', 0.14099505050272113), ('HourlyWetBulbTemperature', 0.1408114302337495), ('Airport_Springfield 144 Hour Rainfall Aggregate', 0.14016438167916295), ('HourlyDewPointTemperature', 0.1371754294554511), ('Mark 144 Hour Rainfall Aggregate', 0.13692074686100758), ('SW_Peak_Flow', 0.1364631920330091), ('HourlyDryBulbTemperature', 0.13612507401310667), ('Airport_Springfield 120 Hour Rainfall Aggregate', 0.13142944335464757), ('Blackman 144 Hour Rainfall Aggregate', 0.13134106371579227), ('HourlyStationPressure', 0.12745412314880214), ('HourlyAltimeterSetting', 0.12678893423774354), ('Mark 120 Hour Rainfall Aggregate', 0.12465767254218987), ('Airport_Springfield 96 Hour Rainfall Aggregate', 0.12206016689766325), ('Blackman 120 Hour Rainfall Aggregate', 0.11923932055957005), ('HourlySeaLevelPressure', 0.11282506509930637), ('Airport_Springfield 72 Hour Rainfall Aggregate', 0.11005955643417992), ('Mark 96 Hour Rainfall Aggregate', 0.10938924792118732), ('Blackman 96 Hour Rainfall Aggregate', 0.10282427629932536), ('Mark 72 Hour Rainfall Aggregate', 0.09108120414500065), ('Blackman 72 Hour Rainfall Aggregate', 0.08290786102263553), ('Sequiota Rainfall (in)', 0.05584810633704488), ('HourlyRelativeHumidity', 
0.05163433537954279), ('HourlyWindSpeed', 0.05159415237561198), ('Hiland Rainfall (in)', 0.05019264265840611), ('NW Rainfall (in)', 0.043358478664041356), ('Rutledge Rainfall (in)', 0.04322496741056401), ('James Rainfall (in)', 0.04283251855051575), ('AT&T Rainfall (in)', 0.041385658532493375), ('Pleasant Rainfall (in)', 0.04110969926330502), ('English Rainfall (in)', 0.04045828530704011), ('Willard Rainfall (in)', 0.039664596091594874), ('Strafford Rainfall (in)', 0.03917619876931006), ('Total Rainfall (in)', 0.039070488663647984), ('Westport Rainfall (in)', 0.03829713464870178), ('Millwood Rainfall (in)', 0.038230716841559854), ('HourlyVisibility', 0.03807132978887514), ('Valley Rainfall (in)', 0.037924212737930625), ('Fire Rainfall (in)', 0.03755030616984465), ('Waste Rainfall (in)', 0.037487770354472504), ('Le Rainfall (in)', 0.03712608578678943), ('Sunshine Rainfall (in)', 0.03712408894501554), ('Weller Rainfall (in)', 0.037092468419905496), ('Willard_Intermediate Rainfall (in)', 0.03701233297143658), ('Bingham Rainfall (in)', 0.036902033062771344), ('Disney Rainfall (in)', 0.036885939902612845), ('Airport Rainfall (in)', 0.03671320456233719), ('Republic Rainfall (in)', 0.0366949714612916), ('Field Rainfall (in)', 0.036305832106271446), ('Jefferies Rainfall (in)', 0.036240114680229744), ('Williams 72 Hour Rainfall Aggregate', 0.0361833662449386), ('Sherwood Rainfall (in)', 0.03605416311979079), ('Shady Rainfall (in)', 0.03575687217934234), ('Pittman Rainfall (in)', 0.03569274340782442), ('Williams 96 Hour Rainfall Aggregate', 0.03564909361765943), ('Williams 168 Hour Rainfall Aggregate', 0.03557915569634159), ('Williams 144 Hour Rainfall Aggregate', 0.03557915569634159), ('Williams 120 Hour Rainfall Aggregate', 0.03557915569634159), ('Roundtree Rainfall (in)', 0.03391163050103573), ('Cherokee Rainfall (in)', 0.032726906835445435), ('River Rainfall (in)', 0.032076763035391524), ('Airport_West Rainfall (in)', 0.031153990493329013), ('Airport_Springfield Rainfall 
(in)', 0.030863161852477856), ('HourlyPressureChange', 0.027835771224437478), ('Week Day', 0.01794606611703306), ('Mark Rainfall (in)', 0.016258881660621793), ('HourlyPressureTendency', 0.013124291897146926), ('Blackman Rainfall (in)', 0.012761530675847892), ('Williams Rainfall (in)', 0.012036665431222568), ('Hour', 0.0026714172489968177)]


In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
%matplotlib qt

def findNulls(arr):
    '''Locates runs of consecutive np.nan values inside arr.

    Input:
        arr: indexable sequence of floats (list or np.array)

    Returns:
        list of (start index, num values) tuples, one per run of nan
        values, in order of appearance.

    NOTE(review): a run of nan values that reaches the very end of arr is
    NOT reported; scanning simply stops there. Confirm whether callers rely
    on this before changing it.
    '''
    total = len(arr)
    runs = []
    start = 0
    while start < total:
        # skip over regular values one at a time
        if not np.isnan(arr[start]):
            start += 1
            continue

        # walk to the first non-nan value after the run (or the array end)
        stop = start + 1
        while stop < total and np.isnan(arr[stop]):
            stop += 1

        # the run touches the end of the array: stop without recording it
        if stop == total:
            break

        runs.append((start, stop - start))
        start = stop

    return runs

def createIndicies(index, gapSize):
    '''Builds the list of consecutive indicies that make up a gap placed
    around index.

    Input:
        index: position the gap is placed around
        gapSize: how many indicies the gap should contain

    Returns:
        list of ints. For an odd gapSize the gap is symmetric around index.
        For an even gapSize the gap holds one fewer value on the left of
        index than on the right, so the count still equals gapSize.
    '''
    halfSpan = int(gapSize / 2)

    # an even gap cannot be symmetric, so its left edge is pulled in by one
    leftTrim = 0 if gapSize % 2 != 0 else 1

    return list(range(index - halfSpan + leftTrim, index + halfSpan + 1))

def testLargeSpot(arr, index, length):
    if len(arr) - index > length and index > length: 
        # tests if 5 vals before and after index are null
        # also if index is null
        for i in range(length):
            if np.isnan(arr[index + i]):
                return False          
            if np.isnan(arr[index - i]):
                return False
        return True                         # only if all are not null will this be hit
    return False
 
def createLargeGapIndicies(arr, test_size):
    '''Randomly picks indicies of arr to blank out as artificial null gaps,
    so that those values can be held out for validation.

    Gaps are only placed where testLargeSpot reports no nan values nearby
    (including gaps created earlier in this same call). Gaps keep being
    added until the number of blanked values reaches test_size * len(arr)
    or the attempt limit is hit. One gap as large as the largest existing
    null gap in arr is attempted first.

    Input:
        arr: target values, contains np.nan where data is missing
        test_size: fraction of len(arr) to turn into artificial nulls

    Returns:
        list of ints, every index that was blanked out (not sorted).
        Empty list if arr is empty.
    '''
    minGapSize = 5
    maxAttempts = 500000
    # how many iterations may be spent insisting on the largest gap before
    # giving up on it, so an unplaceable largest gap cannot starve the loop
    maxLargestGapAttempts = 10000

    # nothing to sample from (also avoids dividing by len(arr) == 0 below)
    if len(arr) == 0:
        return []

    existingLargeNullGapLengths = [width for _, width in findNulls(arr) if width > minGapSize]
    maxGapSize = max(existingLargeNullGapLengths, default=150)
    validationIndicies = []                 # append all indicies forced to null here
    nullArr = deepcopy(arr)                 # will be adding null values to here for validation

    breakout = 0
    totalCreatedNulls = 0
    hasLargestGap = False
    while totalCreatedNulls / len(arr) < test_size and breakout < maxAttempts:
        # randomly getting index and how large of gap to create
        # (np.random.randint excludes the upper bound, so maxGapSize itself
        # is only ever produced by the forced attempt below)
        randIndex = np.random.randint(0, len(arr))
        randGapSize = np.random.randint(minGapSize, maxGapSize)
        forceLargest = not hasLargestGap and breakout < maxLargestGapAttempts
        if forceLargest:
            randGapSize = maxGapSize

        # testing if spot is valid
        if testLargeSpot(nullArr, randIndex, randGapSize):
            # only now is the largest gap really present; marking it before
            # the test would skip it whenever the first spot is rejected
            if forceLargest:
                hasLargestGap = True
            totalCreatedNulls += randGapSize

            # making a null gap where data was previously
            for i in createIndicies(randIndex, randGapSize):
                nullArr[i] = np.nan
                validationIndicies.append(i)

        breakout += 1
    return validationIndicies

def train_test_split_largeGap(data, target, test_size = 0.1):
    '''Splits data/target into train and test sets where the test set is
    made of contiguous gaps chosen by createLargeGapIndicies.

    Input:
        data: feature rows, one row per entry of target
        target: target values, may contain np.nan
        test_size: fraction of target to hold out (passed on unchanged to
            createLargeGapIndicies)

    Returns:
        trainX, testX, trainY, testY, validationIndicies
        where validationIndicies is the sorted integer array of held-out
        positions, test* are the rows at those positions and train* are
        all remaining rows.
    '''
    # dtype=int keeps an empty selection usable as an index array; a plain
    # np.array([]) is float64 and np.delete refuses it
    validationIndicies = np.sort(np.array(createLargeGapIndicies(target, test_size), dtype=int))

    dataArr, targetArr = np.asarray(data), np.asarray(target)

    # fancy indexing and np.delete both return new arrays, inputs are untouched
    testX, testY = dataArr[validationIndicies], targetArr[validationIndicies]
    trainX = np.delete(dataArr, validationIndicies, 0)
    trainY = np.delete(targetArr, validationIndicies)
    return trainX, testX, trainY, testY, validationIndicies


In [13]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

def saveForestPredictions(originFilename, predFilename, feature, outFilename="test.csv"):
    '''Copies predicted values into the original data set and saves the
    result as a csv.

    Both csv files must have a "DateTime" column. Every row of the
    prediction file is matched to the original rows with the exact same
    DateTime, and the value from the LAST column of the prediction file
    overwrites feature in those rows.

    Input:
        originFilename: csv holding the original data, including feature
        predFilename: csv holding the predictions
        feature: name of the column in the original data to overwrite
        outFilename: where the merged csv is written, "test.csv" by default

    Returns:
        None, the merged data is written to outFilename without the index.
    '''
    # building dataframes
    originDf = pd.read_csv(originFilename, parse_dates=["DateTime"])
    predDf = pd.read_csv(predFilename, parse_dates=["DateTime"])

    # getting arrays from dataframes
    predArr = np.array(predDf[predDf.columns[-1]])
    predDates = np.array(predDf["DateTime"])
    originArr = np.array(originDf[feature])
    originDates = np.array(originDf["DateTime"])

    # copying over predicted values wherever the timestamps line up
    for predDate, predValue in zip(predDates, predArr):
        originArr[originDates == predDate] = predValue

    # replacing array in the dataframe that was loaded here (not a notebook
    # global) and saving to csv
    originDf[feature] = originArr
    originDf.to_csv(outFilename, index=False)

def findSharedNullValueFeatures(df, targetFeature, tol = 0.5):
    '''Finds the columns that are missing in too many of the same rows
    where the target feature is missing.

    Input:
        df: pandas DataFrame holding the target and the other features
        targetFeature: name of the target column
        tol: fraction of the target's null rows a column may share before
            it is reported

    Returns:
        list of column names (in df column order) whose null rows overlap
        with MORE than tol * (number of null target rows) of the target's
        null rows. "DateTime" and targetFeature are never reported.
    '''
    # isna is used instead of np.isnan so that non-float columns
    # (strings, objects) do not raise a TypeError
    targetNullMask = df[targetFeature].isna().to_numpy()
    sharedLimit = tol * np.count_nonzero(targetNullMask)

    badColumns = []
    for col in df.columns:
        if col in ("DateTime", targetFeature):
            continue

        # how many rows are null in both the target and this feature
        sharedNullCount = np.count_nonzero(targetNullMask & df[col].isna().to_numpy())

        # strictly more than the allowed share, then leave that feature out
        if sharedNullCount > sharedLimit:
            badColumns.append(col)

    return badColumns

def _rowsContainingNan(rows):
    '''Returns a boolean array, one entry per row of the 2D array rows,
    that is True where the row holds at least one nan.'''
    # an array with no rows (or no columns) cannot hold a nan
    if rows.size == 0:
        return np.zeros(len(rows), dtype=bool)
    return np.isnan(rows.astype(float)).any(axis=1)


def separateDataIntoSets(target, data, dates):
    '''This function splits the data into a training set (rows whose target is
    known) and a set to predict (rows whose target is null). Any row
    that has a null feature value is dropped from whichever set it fell in.
    Input:
        target: 1D array of target values, np.nan marks a missing value
        data: 2D array of feature values, one row per target entry
        dates: 1D array of date values, one per target entry

    Returns:
        trainData: feature rows with a known target and no null features
        trainTarget: target values matching trainData
        testData: feature rows with a null target and no null features
        testDates: dates matching testData
        Either pair is empty (zero rows) when no row qualifies.
    '''
    target = np.asarray(target)
    data = np.asarray(data)
    dates = np.asarray(dates)

    # splitting rows by whether the target feature is null
    targetIsNull = np.isnan(target)
    trainData = data[~targetIsNull]
    trainTarget = target[~targetIsNull]
    testData = data[targetIsNull]
    testDates = dates[targetIsNull]

    # removing rows with null features so that rand forest can train
    trainKeep = ~_rowsContainingNan(trainData)
    trainData = trainData[trainKeep]
    trainTarget = trainTarget[trainKeep]

    # removing rows with null features so that rand forest can predict
    testKeep = ~_rowsContainingNan(testData)
    testData = testData[testKeep]
    testDates = testDates[testKeep]

    return trainData, trainTarget, testData, testDates

def scalePredictedValues(dates, fullTarget, predictedValues, predictedIndicies, scaleFactor):
    '''This function pulls predicted values toward a linear trendline fitted
    through the known target values on both sides of each gap.
    Each scaled value is trend + scaleFactor * (prediction - trend), so a
    scaleFactor of 1 leaves the prediction unchanged and 0 gives the trendline.
    Input:
        dates: array of date values, used to locate where each prediction goes
        fullTarget: array of target values, is not modified
        predictedValues: predictions, one per entry of predictedIndicies
        predictedIndicies: indicies into dates of the predicted rows
        scaleFactor: how much of the deviation from the trendline is kept

    Returns:
        list of scaled predictions in the order of predictedIndicies.
        A gap with no usable neighbours on both sides is left unscaled.
    '''
    # max number of known points taken from EACH side of a gap for the trendline
    numTrendPoints = 10

    # copying over predicted values (np.array makes independent float copies)
    fullPredTarget = np.array(fullTarget, dtype=float)
    validNullTarget = np.array(fullTarget, dtype=float)    # used in linear scaling step to find where to scale
    predDates = [dates[i] for i in predictedIndicies]
    for i in range(len(predDates)):
        locations = np.nonzero(dates == predDates[i])
        fullPredTarget[locations[0]] = predictedValues[i]
        validNullTarget[locations[0]] = np.nan

    # findNulls is defined elsewhere in this notebook; its result is indexed
    # here as (start index of gap, length of gap) tuples
    scalingSpots = findNulls(validNullTarget)
    for tup in scalingSpots:
        gapStart, gapLength = tup[0], tup[1]

        # getting points to make trendline, walking outwards from both gap edges
        points = []
        for i in range(1, numTrendPoints + 1):      # at gapStart, target is null, so start with i = 1
            xBefore = gapStart - i
            xAfter = gapStart + gapLength - 1 + i

            # stop at the edges of the series: a negative index would silently
            # wrap around to the end, an index past the end would raise
            if xBefore < 0 or xAfter >= len(fullTarget):
                break

            yBefore = fullTarget[xBefore]
            yAfter = fullTarget[xAfter]

            # condition if cannot get all points desired due to other null gaps close to current gap
            if np.isnan(yBefore) or np.isnan(yAfter):
                break

            # appending points
            points.append((xBefore, yBefore))
            points.append((xAfter, yAfter))

        # no known neighbours means no trendline can be fitted, leave gap unscaled
        if not points:
            continue

        # creating trendline from points
        trendlineCoeffs = np.polyfit(np.array([p[0] for p in points]), np.array([p[1] for p in points]), 1)
        trendline = np.poly1d(trendlineCoeffs)

        # scaling predicted values
        for i in range(gapLength):
            trendValue = trendline(gapStart + i)
            fullPredTarget[gapStart + i] = trendValue + scaleFactor * (fullPredTarget[gapStart + i] - trendValue)

    # slicing out scaled predicted values
    scaledPredictedValues = [fullPredTarget[i] for i in predictedIndicies]

    return scaledPredictedValues

def tuneForest(df, targetFeature):
    '''This function grid searches random forest hyperparameters (and the
    trendline scale factor) for imputing targetFeature. Every combination is
    printed ranked by its validation MSE averaged over the validation rounds.
    The best forest is then refit on all rows with a known target and its
    scaled predictions for the null target rows are saved to
    "predicted forest.csv".
    Input:
        df: dataframe with a "DateTime" column, targetFeature and the
            candidate predictor columns
        targetFeature: name of the column to impute

    Returns:
        None, results are printed and written to "predicted forest.csv".

    Raises:
        ValueError: if too few predictor columns remain to build the
            max_features grid
    '''
    # getting data to use, predictors that are null where the target is null cannot help
    target = np.array(df[targetFeature])
    dates = np.array(df["DateTime"])
    badFeaturesToUse = findSharedNullValueFeatures(df, targetFeature)
    badFeaturesToUse += [targetFeature, "DateTime"]
    df = df.drop(columns=badFeaturesToUse)
    data = df.to_numpy()

    # splitting data up into respective datasets
    validData, validTarget, nullData, nullDates = separateDataIntoSets(target, data, dates)

    # setting up possible hyperparameter values
    # (for a quick run shrink each list to one value, e.g. [9], [10], [75], [.1, 1])
    maxNumFeatures = list(range(5, int(len(df.columns)/1.75) + 1, 2))
    maxDepths = [3, 5, 7, 10]
    numTrees = [50, 75, 100]
    scaleFactors = [.1, .2, .5, 1]
    if not maxNumFeatures:
        raise ValueError("not enough predictor columns ({n}) to build the max_features grid".format(
            n = len(df.columns)))

    # NOTE: no random seed is set, so scores differ slightly between runs
    numValidations = 3
    avgDict = {}    # maps (numFeats, maxDepth, trees, scale) to the summed MSE over all rounds
    for n in range(numValidations):
        # splitting up known data into training and validation sets
        XTrain, XTest, YTrain, YTest, testIndicies = train_test_split_largeGap(validData, validTarget, test_size=0.20)

        # grid searching for best combination
        for numFeats in maxNumFeatures:
            for maxDepth in maxDepths:
                for trees in numTrees:
                    # only previous for loops impact the random forest's performance
                    imputer = RandomForestRegressor(n_estimators=trees, max_depth=maxDepth, max_features=numFeats)
                    imputer.fit(XTrain, YTrain)
                    predictedValues = imputer.predict(XTest)

                    # scaling is cheap, so each forest is fit once and reused for every scale factor
                    for scale in scaleFactors:
                        # linear trendline scaling
                        scaledPredictedValues = scalePredictedValues(dates, validTarget, predictedValues, testIndicies, scale)
                        mse = mean_squared_error(YTest, scaledPredictedValues)
                        params = (numFeats, maxDepth, trees, scale)
                        avgDict[params] = avgDict.get(params, 0) + mse

    # getting info out of avgDict and into a list of (average mse, params) to sort
    combos = [(total/numValidations, params) for params, total in avgDict.items()]

    combos.sort(key=lambda a: a[0])
    for tup in combos:
        print(tup[0], "with {f} features, {d} max depth, {t} trees, and a scale factor of {s}".format(
            f = tup[1][0], d = tup[1][1], t = tup[1][2], s = tup[1][3]))

    # creating best tree predictions
    bestMse, bestParams = combos[0]
    bestFeats, bestDepth, bestTrees, bestScale = bestParams
    print("Best hyperparas are: {f} features, {d} depth {t} trees, and a scale factor of {s}".format(
        f = bestFeats, d = bestDepth, t = bestTrees, s = bestScale))
    print("With an MSE of: {m}".format(m = bestMse))
    imputer = RandomForestRegressor(max_features=bestFeats, max_depth=bestDepth, n_estimators=bestTrees)
    imputer.fit(validData, validTarget)
    imputedValues = imputer.predict(nullData)

    # nullData only holds the null target rows that have NO null feature, so the
    # indicies handed to the scaling step must skip the same rows, otherwise the
    # predictions and the indicies differ in length
    rowHasNullFeature = pd.isnull(data).any(axis=1)
    imputedIndicies = [i for i in range(len(target)) if np.isnan(target[i]) and not rowHasNullFeature[i]]
    imputedValues = scalePredictedValues(dates, target, imputedValues, imputedIndicies, bestScale)

    # saving to a dataframe
    # NOTE: the value column is always labelled with the Ozark name, whatever targetFeature is
    predData = np.array((np.array(nullDates), imputedValues)).T
    newDf = pd.DataFrame(predData, columns=["DateTime", "Predicted Ozark Groundwater Depth (ft)"])
    newDf.to_csv("predicted forest.csv", index=False)


# Driver for the forest imputation: pick the target feature, load only its most
# correlated columns, tune + predict, then merge the predictions into "test.csv".
# targetFeature = "Ozark Aquifer Depth to Water Level (ft)"
targetFeature = "Springfield Plateau Aquifer Depth to Water Level (ft)"
# NOTE(review): correlationList is used below but the only assignment visible here
# is commented out, so it must already exist in the kernel - confirm which cell
# creates it before running this on a fresh kernel.
# correlationList = getCorrelationPerFeature(df, targetFeature)
# keeping the first 20 entries of correlationList plus the two columns tuneForest needs
topCorrelatedFeatures = [x[0] for x in correlationList[:20]] + ["DateTime", targetFeature]
# this rebinds the global df to the reduced set of columns
df = pd.read_csv("Small Gap Imputed Data.csv", usecols=topCorrelatedFeatures)
# prints the ranked grid search results and writes "predicted forest.csv"
tuneForest(df, targetFeature)
# ozark: best was 9 features, 10 max depth, 75 trees, and a scale factor of 0.1 in full grid search with mse of 0.9284514998810384
# springfield: best was 9 features, 10 max depth, 75 trees, and a scale factor of 0.1 in full grid search with mse of 0.02750132169140654
# copies the predictions written by tuneForest back into the dataset
saveForestPredictions("Small Gap Imputed Data.csv", "predicted forest.csv", targetFeature)

0.02750132169140654 with 9 features, 10 max depth, 75 trees, and a scale factor of 0.1
0.027520025706964074 with 7 features, 10 max depth, 100 trees, and a scale factor of 0.1
0.027528424581983707 with 7 features, 10 max depth, 50 trees, and a scale factor of 0.1
0.027546466555528504 with 7 features, 10 max depth, 75 trees, and a scale factor of 0.1
0.027577814762204233 with 9 features, 10 max depth, 100 trees, and a scale factor of 0.1
0.027629094048223682 with 5 features, 10 max depth, 75 trees, and a scale factor of 0.1
0.02765887322868521 with 5 features, 10 max depth, 50 trees, and a scale factor of 0.1
0.027668715628490683 with 9 features, 10 max depth, 50 trees, and a scale factor of 0.1
0.02772261642370331 with 5 features, 10 max depth, 100 trees, and a scale factor of 0.1
0.028209361747770265 with 7 features, 7 max depth, 50 trees, and a scale factor of 0.1
0.028211002777877858 with 9 features, 7 max depth, 100 trees, and a scale factor of 0.1
0.02821141643414227 with 5 featur

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
%matplotlib qt

# Plots one feature with its missing stretches marked and the imputed values
# from "test.csv" drawn over them. Uncomment exactly one feature line to switch.
# feature = "SWTP Total Influent Flow"
# feature = "Ozark Aquifer Depth to Water Level (ft)"
feature = "Springfield Plateau Aquifer Depth to Water Level (ft)"
# feature = "James Gauge Height (ft)"
# feature = "Wilsons Gauge Height (ft)"
# feature = "Fire 168 Hour Rainfall Aggregate"
# feature = "HourlyPressureChange"


# data that still contains the null values to be highlighted
# df = pd.read_csv("Joined Influent and Rainfall and Weather and Groundwater and Creek Gauge.csv", parse_dates=["DateTime"])
df = pd.read_csv("Small Gap Imputed Data.csv", parse_dates=["DateTime"])
# df = pd.read_csv("Small Gap Imputed Data Editted.csv", parse_dates=["DateTime"])
# df["SWTP Total Influent Flow"] = np.array([np.nan if x < 3.7 else x for x in df["SWTP Total Influent Flow"]])


# data with the imputed values filled in, "test.csv" is the file written by saveForestPredictions
# imputedDf = pd.read_csv("Small Gap Imputed Data.csv")
# imputedDf = pd.read_csv("Small Gap Imputed Data Editted.csv")
imputedDf = pd.read_csv("test.csv")


# both files are indexed by position below, so they are assumed to have the
# same rows in the same order - TODO confirm
dates = np.array(df["DateTime"])
imputedArr = np.array(imputedDf[feature])
nullArr = deepcopy(np.array(df[feature]))

# visualizeMissingValues and plotImputedData are not defined in this cell,
# they must already exist in the kernel
fig, ax = plt.subplots()
fig, ax = visualizeMissingValues(dates, nullArr, fig, ax)
ax = plotImputedData(dates, nullArr, imputedArr, ax)
# ax.scatter(testDf["DateTime"], testDf[testDf.columns[-1]], s=8, color="red", marker="x")
ax.set_ylabel(feature)
plt.show()