In [3]:
# get all csv and store as pandas dataframe
# What affects backflow the most: temperature or pressure
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
# get data from backflow_data.csv
# The file is read in chunks of up to 1001 rows; pd.concat joins the chunks
# back into a single DataFrame.
chunk = pd.read_csv('../data/backflow_data.csv',chunksize=1001, sep=",")
pd_df = pd.concat(chunk)
# NOTE(review): pd_df is already the concatenated frame; this wraps it again.
df = pd.DataFrame(pd_df)
# Plain list of row lists. The functions below index rows positionally:
# column 0 is compared by id_sort, columns 1-3 are parsed as numbers after
# dropping anything from the first "%" onwards.
data = df.values.tolist()

def id_sort(arr, ascending=True):
    """Sort ``arr`` in place by the first element (the ID) of each row.

    Parameters
    ----------
    arr : list
        List of indexable rows; ``row[0]`` is used as the sort key.
    ascending : bool, default True
        Truthy sorts smallest ID first, falsy sorts largest ID first.

    Returns
    -------
    None
        The list is reordered in place, as before.

    Notes
    -----
    Uses the built-in stable sort instead of a hand-written bubble sort, so
    the cost drops from O(n^2) to O(n log n). Rows with equal IDs keep their
    original relative order in both directions, which matches the previous
    strict-comparison swap behaviour.
    """
    arr.sort(key=lambda row: row[0], reverse=not ascending)

def partitionData(data, start, end):
    """Return a new list holding ``data[start]`` up to ``data[end - 2]``.

    The index range is ``range(start, end - 1)``, so the element at index
    ``end - 1`` is not included.
    NOTE(review): if ``end`` is meant to be an exclusive bound, the last
    element of the partition is being dropped - confirm the intended bound.
    """
    return [data[position] for position in range(start, end - 1)]

# Reorder the loaded rows in place so the smallest ID (column 0) comes first.
id_sort(data, ascending=True)

In [4]:
def getAvgData(data, interval):
    """Average columns 1-3 over consecutive groups of ``interval`` rows.

    Each cell is read as the text before its first "%" and converted to
    float. Rows left over after the last complete group are not reported.

    Returns a dict with the keys ``avg_temperature``, ``avg_pressure`` and
    ``avg_backflow``; each maps to a list with one mean per complete group.
    """
    def _number(cell):
        # "12.5%" -> 12.5 ; text without "%" is converted as-is
        return float(cell.split("%")[0])

    temperature_means = []
    pressure_means = []
    backflow_means = []
    temperature_sum = pressure_sum = backflow_sum = 0
    rows_in_group = 0

    for row in data:
        temperature_sum = temperature_sum + _number(row[1])
        pressure_sum = pressure_sum + _number(row[2])
        backflow_sum = backflow_sum + _number(row[3])
        rows_in_group += 1

        if rows_in_group % interval != 0:
            continue

        # group complete: record its means and start a fresh group
        temperature_means.append(temperature_sum / rows_in_group)
        pressure_means.append(pressure_sum / rows_in_group)
        backflow_means.append(backflow_sum / rows_in_group)
        temperature_sum = pressure_sum = backflow_sum = 0
        rows_in_group = 0

    return {
        "avg_temperature": temperature_means,
        "avg_pressure": pressure_means,
        "avg_backflow": backflow_means,
    }


In [5]:
def highestValue(data, start, end,  col="temp"):
    """Return the largest value of one column over rows ``start`` .. ``end - 1``.

    Example: ``start=0, end=10, col="pressure"`` returns the highest pressure
    found in rows 0 through 9.

    Parameters
    ----------
    data : sequence of rows
        Each row is indexable; the selected cell is read as the text before
        its first "%" and converted to float.
    start, end : int
        Half-open row range ``[start, end)`` to search.
    col : str, default "temp"
        "temp" (column 1), "pressure" (column 2) or "backflow" (column 3),
        matched case-insensitively. Any other name falls back to column 1,
        which is the column the previous implementation ended up scanning.

    Returns
    -------
    float
        Maximum of the selected column inside the window.

    Raises
    ------
    ValueError
        If the window contains no rows (``start >= end``).
    IndexError
        If the window reaches past the end of ``data``.

    Notes
    -----
    The maximum is taken only from rows inside the window. Previously the
    running maximum was seeded from row 0, so a large value in row 0 was
    returned even when row 0 was not part of ``[start, end)``.
    """
    columnIndex = {"temp": 1, "pressure": 2, "backflow": 3}.get(str(col).lower(), 1)
    window = [float(data[i][columnIndex].split("%")[0]) for i in range(start, end)]
    if not window:
        raise ValueError("highestValue: empty row window [%r, %r)" % (start, end))
    return max(window)
def lowestValue(data, start, end, col="Temp"):
    """Return the smallest value of one column over rows ``start`` .. ``end - 1``.

    Example: ``start=0, end=10, col="pressure"`` returns the lowest pressure
    found in rows 0 through 9.

    Parameters
    ----------
    data : sequence of rows
        Each row is indexable; the selected cell is read as the text before
        its first "%" and converted to float.
    start, end : int
        Half-open row range ``[start, end)`` to search.
    col : str, default "Temp"
        "temp" (column 1), "pressure" (column 2) or "backflow" (column 3),
        matched case-insensitively. Any other name falls back to column 1.

    Returns
    -------
    float
        Minimum of the selected column inside the window.

    Raises
    ------
    ValueError
        If the window contains no rows (``start >= end``).
    IndexError
        If the window reaches past the end of ``data``.

    Notes
    -----
    Two defects are fixed here:

    * The default ``col="Temp"`` never matched the lowercase ``"temp"``
      branch, so the running minimum started at 0 and the default call
      returned ``min(0, values)``. Column names are now case-insensitive.
    * The running minimum was seeded from row 0 even when row 0 was outside
      ``[start, end)``. Only rows inside the window are considered now.
    """
    columnIndex = {"temp": 1, "pressure": 2, "backflow": 3}.get(str(col).lower(), 1)
    window = [float(data[i][columnIndex].split("%")[0]) for i in range(start, end)]
    if not window:
        raise ValueError("lowestValue: empty row window [%r, %r)" % (start, end))
    return min(window)

In [6]:
# 3 day average (and... average of x number of days)
# getAvgData averages columns 1-3 over each consecutive group of 3 rows; change
# the second argument to average over a different number of rows.
threedayavg = getAvgData(data, 3)

In [7]:
def todayMinusYesterday(data):
    """Row-to-row differences for the temperature, pressure and backflow columns.

    For every adjacent pair of rows, entry ``i`` of each list is the value in
    row ``i`` minus the value in row ``i + 1`` (cells are read as the text
    before the first "%" and converted to float). The result therefore has
    ``len(data) - 1`` entries per key.

    NOTE(review): the name treats row ``i`` as "today" and row ``i + 1`` as
    "yesterday" - confirm this matches the row order of the input.

    Returns a dict with the keys ``temp``, ``pressure`` and ``backflow``.
    """
    columns = (("temp", 1), ("pressure", 2), ("backflow", 3))
    diff = {label: [] for label, _ in columns}
    for current, following in zip(data, data[1:]):
        for label, position in columns:
            here = float(current[position].split("%")[0])
            there = float(following[position].split("%")[0])
            diff[label].append(here - there)
    return diff
# Per-column differences between each row and the row that follows it.
result =  todayMinusYesterday(data)

In [8]:
def todayHighest_xdays(data, days, col="backflow"):
    """Divide row values by the highest value found in blocks of ``days`` rows.

    Entry ``i`` of the result is the value of row ``i`` divided by
    ``highestValue`` over rows ``i * days`` .. ``i * days + days - 1``.
    Entries are produced only while the block start is below
    ``len(data) - 1``.

    NOTE(review): row ``i`` is paired with the i-th block, not with a window
    around row ``i``, so this does not look like "today / highest of the last
    x days" - confirm the intended pairing before relying on the output.

    ``col`` selects "temp" (1), "pressure" (2) or "backflow" (3); any other
    name selects column 0 for the numerator.
    """
    numerator_column = {"temp": 1, "pressure": 2, "backflow": 3}.get(col, 0)
    last_row = len(data) - 1
    ratios = []
    for row_number in range(last_row):
        block_start = row_number * days
        if block_start >= last_row:
            continue
        block_peak = highestValue(data, block_start, block_start + days, col=col)
        today = float(data[row_number][numerator_column].split("%")[0])
        ratios.append(today / block_peak)
    return ratios
        
def todayLowest_xdays(data, days, col="backflow"):
    """Divide row values by the lowest value found in blocks of ``days`` rows.

    Entry ``i`` of the result is the value of row ``i`` divided by
    ``lowestValue`` over rows ``i * days`` .. ``i * days + days - 1``.
    Entries are produced only while the block start is below
    ``len(data) - 1``.

    NOTE(review): row ``i`` is paired with the i-th block, not with a window
    around row ``i``, so this does not look like "today / lowest of the last
    x days" - confirm the intended pairing before relying on the output.

    ``col`` selects "temp" (1), "pressure" (2) or "backflow" (3); any other
    name selects column 0 for the numerator.
    """
    numerator_column = {"temp": 1, "pressure": 2, "backflow": 3}.get(col, 0)
    last_row = len(data) - 1
    ratios = []
    block_start = 0
    for row_number in range(last_row):
        if block_start < last_row:
            block_floor = lowestValue(data, block_start, block_start + days, col=col)
            today = float(data[row_number][numerator_column].split("%")[0])
            ratios.append(today / block_floor)
        block_start += days
    return ratios

# Backflow ratios against the highest / lowest backflow, using blocks of 50 rows.
todayhighest_xdays = todayHighest_xdays(data, 50, col='backflow')
todaylowest_xdays = todayLowest_xdays(data, 50, col='backflow')
    

In [9]:
def daysWhereDataIsHigher(data, col="temp", window=10):
    """Count, per block of ``window`` comparisons, rows higher than the next row.

    Every row is compared with the row that follows it. The comparisons are
    grouped into consecutive blocks of ``window`` and, for each complete
    block, the number of comparisons where the current row's value is
    strictly greater than the next row's value is reported.

    Parameters
    ----------
    data : sequence of rows
        Cells are read as the text before the first "%" and converted to float.
    col : str, default "temp"
        "temp" (column 1), "pressure" (column 2) or "backflow" (column 3).
        Any other name falls back to column 1, the column that was
        previously hard-coded.
    window : int, default 10
        Number of comparisons per block (previously fixed at 10).

    Returns
    -------
    list of int
        One count per complete block; a trailing partial block is dropped.

    Raises
    ------
    ValueError
        If ``window`` is not positive.

    Notes
    -----
    The previous loop only emitted a block when it reached the first index of
    the following block, so the final block was silently lost whenever the
    number of comparisons was an exact multiple of 10 (e.g. 11 rows gave an
    empty result). Complete final blocks are now included.

    NOTE(review): "higher than the day before" assumes the next row is the
    previous day - confirm against the row order of the input.
    """
    if window <= 0:
        raise ValueError("window must be a positive integer")
    columnIndex = {"temp": 1, "pressure": 2, "backflow": 3}.get(col, 1)
    values = [float(row[columnIndex].split("%")[0]) for row in data]
    higher = [current > following for current, following in zip(values, values[1:])]
    return [
        sum(higher[begin:begin + window])
        for begin in range(0, len(higher) - window + 1, window)
    ]
def daysWhereDataIsLower(data, col="temp", window=10):
    """Count, per block of ``window`` comparisons, rows lower than the next row.

    Every row is compared with the row that follows it. The comparisons are
    grouped into consecutive blocks of ``window`` and, for each complete
    block, the number of comparisons where the current row's value is
    strictly less than the next row's value is reported.

    Parameters
    ----------
    data : sequence of rows
        Cells are read as the text before the first "%" and converted to float.
    col : str, default "temp"
        "temp" (column 1), "pressure" (column 2) or "backflow" (column 3).
        Any other name falls back to column 1, the column that was
        previously hard-coded.
    window : int, default 10
        Number of comparisons per block (previously fixed at 10).

    Returns
    -------
    list of int
        One count per complete block; a trailing partial block is dropped.

    Raises
    ------
    ValueError
        If ``window`` is not positive.

    Notes
    -----
    The previous loop only emitted a block when it reached the first index of
    the following block, so the final block was silently lost whenever the
    number of comparisons was an exact multiple of 10 (e.g. 11 rows gave an
    empty result). Complete final blocks are now included.

    NOTE(review): "lower than the day before" assumes the next row is the
    previous day - confirm against the row order of the input.
    """
    if window <= 0:
        raise ValueError("window must be a positive integer")
    columnIndex = {"temp": 1, "pressure": 2, "backflow": 3}.get(col, 1)
    values = [float(row[columnIndex].split("%")[0]) for row in data]
    lower = [current < following for current, following in zip(values, values[1:])]
    return [
        sum(lower[begin:begin + window])
        for begin in range(0, len(lower) - window + 1, window)
    ]

# NOTE(review): both results are bound to the same name, so the "higher" counts
# are discarded as soon as the second line runs; only the "lower" counts remain in d.
d = daysWhereDataIsHigher(data) 
d = daysWhereDataIsLower(data) 


In [10]:
def numberOfDaysRising(data, col="backflow"):
    """Count the rows whose value is strictly greater than the next row's value.

    The return value is a total over the whole input, not the length of a
    consecutive run.
    NOTE(review): the original header described this as the number of
    *consecutive* rising days - confirm which of the two is wanted.

    ``col`` selects "temp" (1), "pressure" (2) or "backflow" (3); any other
    name selects column 0. Cells are read as the text before the first "%"
    and converted to float.
    """
    column = {"temp": 1, "pressure": 2, "backflow": 3}.get(col, 0)
    rising_total = 0
    for position in range(len(data) - 1):
        following = float(data[position + 1][column].split("%")[0])
        present = float(data[position][column].split("%")[0])
        if following < present:
            rising_total += 1
    return rising_total

def numberOfDaysFalling(data, col="backflow"):
    """Count the rows whose value is strictly less than the next row's value.

    The return value is a total over the whole input, not the length of a
    consecutive run.
    NOTE(review): the original header described this as the number of
    *consecutive* falling days - confirm which of the two is wanted.

    ``col`` selects "temp" (1), "pressure" (2) or "backflow" (3); any other
    name selects column 0. Cells are read as the text before the first "%"
    and converted to float.
    """
    column = {"temp": 1, "pressure": 2, "backflow": 3}.get(col, 0)
    falling_total = 0
    for position in range(len(data) - 1):
        following = float(data[position + 1][column].split("%")[0])
        present = float(data[position][column].split("%")[0])
        if following > present:
            falling_total += 1
    return falling_total

# NOTE(review): both counts are bound to the same name, so the "rising" count
# is discarded as soon as the second line runs; only the "falling" count remains in res.
res = numberOfDaysRising(data, col="backflow")
res = numberOfDaysFalling(data, col="backflow")




In [11]:
# 1,000 rows of days, sorted them into deciles, and then sorted each decile group into quartiles.  
# You then calculated the 1st quartile less the 3rd quartile and the 2nd quartile less the 4th quartile.
# That gives you 10 results.  
# From the 10 results you divided the most extreme quartile by the average of all of the quartiles. 

    
def quartileDifferences(data, col="temp"):
    """Compute quartile-position differences at every 10th row of ``data``.

    For each row index ``i`` that is a positive multiple of 10, four rows are
    sampled at positions ``i // 4``, ``i // 2``, ``i * 3 // 4`` and ``i``
    (treated as the 1st to 4th quartile), and three numbers are recorded:

    * ``"1st-3rd"``: 1st quartile value minus 3rd quartile value
    * ``"2nd-4th"``: 2nd quartile value minus 4th quartile value
    * ``"avgs"``: mean of the four sampled values

    Parameters
    ----------
    data : sequence of rows
        Cells are read as the text before the first "%" and converted to float.
    col : str, default "temp"
        "temp" (column 1), "pressure" (column 2) or "backflow" (column 3);
        any other name falls back to column 1.

    Returns
    -------
    dict
        Keys ``"1st-3rd"``, ``"2nd-4th"`` and ``"avgs"``, each a list with one
        entry per sampled ``i``.

    Notes
    -----
    ``"2nd-4th"`` was previously computed from the 3rd quartile minus the 4th;
    it now uses the 2nd quartile as the key says.

    NOTE(review): the sampled positions are fractions of ``i`` measured from
    the start of ``data``, and rows are not sorted by value here - confirm
    this matches the intended decile/quartile grouping.
    """
    columnIndex = {"temp": 1, "pressure": 2, "backflow": 3}.get(col, 1)

    def value_at(position):
        # numeric value of the selected column in the given row
        return float(data[position][columnIndex].split("%")[0])

    quartiles = {"1st-3rd": [], "2nd-4th": [], "avgs": []}
    for i in range(10, len(data), 10):
        q1 = value_at(i // 4)
        q2 = value_at(i // 2)
        q3 = value_at(i * 3 // 4)
        q4 = value_at(i)
        quartiles["1st-3rd"].append(q1 - q3)
        quartiles["2nd-4th"].append(q2 - q4)
        quartiles["avgs"].append((q1 + q2 + q4 + q3) / 4)
    return quartiles

# Quartile differences and averages for the temperature column (column 1).
quartileData = quartileDifferences(data, col="temp")


[0.73, 1.01, -1.22, 2.52, -1.13, 0.37, -0.5900000000000001, -0.33999999999999997, -0.25, -0.24000000000000005, 1.9, -1.95, -2.1399999999999997, 1.46, -2.34, -0.16000000000000003, 0.8400000000000001, 0.22999999999999998, 1.1300000000000001, 5.96, 3.11, -0.97, 0.01999999999999999, 0.94, -0.030000000000000027, 0.44000000000000006, -0.040000000000000036, 0.08, -2.43, 1.49, 0.97, 1.2599999999999998, -2.32, 1.9600000000000002, -1.6400000000000001, 0.9299999999999999, -1.14, -0.14, 1.7799999999999998, 0.47000000000000003, -0.08999999999999997, 0.010000000000000009, 0.07000000000000006, 0.7700000000000001, 2.28, 0.22999999999999998, 0.36, 0.63, -1.27, 0.53, 0.06, -1.04, -0.19999999999999998, -1.12, 3.06, -0.75, -1.2000000000000002, -0.7200000000000001, 0.8999999999999999, -6.12, -1.9000000000000001, 6.03, -0.8199999999999998, -0.27, 4.1, 2.1, 2.24, -0.56, 0.38, 0.48, -0.26, 0.30000000000000004, 0.47000000000000003, 0.97, 0.41000000000000003, 0.45999999999999996, -0.2, 1.27, 0.23000000000000004

In [12]:
# From the 10 results you divided the most extreme quartile by the average of all of the quartiles. 
# Take the 10 results of the quartiles and subtract the 6th from the 1st, the 7th from the 2nd, etc.

# TODO: I'm unsure how to find the most extreme quartile.