In [18]:
import sys
import psycopg2
sys.path.append('../')
from helper import db
import pandas as pd
import numpy as np
import bokeh

In [41]:
# Based on https://www.usna.edu/Users/oceano/pguth/md_help/html/approx_equivalents.htm rounding latitude and longitude 
# to .01 degree approximates by 11.1km which is ok at our scale
query = """
SELECT
extract(year from data_date) as year,
depth,
ROUND(latitude) as latitude, 
ROUND(longitude) as longitude,
ROUND(AVG(temperature), 3) as temperature,
ROUND(AVG(salinity), 3) as salinity
FROM OCEAN_DATA
WHERE data_date BETWEEN '2009-01-01' AND '2011-12-31'
AND to_char(data_date,'Mon') in ('Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep')
GROUP BY year, depth, latitude, longitude
ORDER BY year, temperature;
"""
df = db.run_query(query)

In [42]:
df.head()

Unnamed: 0,year,depth,latitude,longitude,temperature,salinity
0,2009.0,"(0, 100]",55,-55,-1.609,32.72
1,2009.0,"(0, 100]",55,-55,-1.537,33.389
2,2009.0,"(0, 100]",55,-55,-1.534,33.227
3,2009.0,"(0, 100]",55,-54,-1.487,32.855
4,2009.0,"(0, 100]",55,-54,-1.485,32.839


In [43]:
len(df)

68172

In [44]:
temp_var = df.copy()

In [51]:
temp_var["temperature"] = temp_var["temperature"].astype(float)
temp_var["salinity"] = temp_var["salinity"].astype(float)
temp_var.sort_values(by=["depth", "latitude", "longitude", "year"]).head(15)
temp_var.groupby(["depth", "latitude", "longitude", "year"])[["temperature", "salinity"]].mean().reset_index()

Unnamed: 0,depth,latitude,longitude,year,temperature,salinity
0,"(0, 100]",38,-70,2011.0,20.405000,35.6510
1,"(0, 100]",38,-69,2010.0,12.590000,35.2700
2,"(0, 100]",38,-68,2010.0,16.997500,35.4315
3,"(0, 100]",38,-68,2011.0,16.741000,35.1930
4,"(0, 100]",38,-67,2010.0,23.496000,36.4320
...,...,...,...,...,...,...
20344,"(900, 1000]",59,-37,2011.0,3.787333,34.8970
20345,"(900, 1000]",59,-36,2009.0,3.822000,34.8915
20346,"(900, 1000]",59,-36,2010.0,3.696000,34.8530
20347,"(900, 1000]",59,-36,2011.0,4.014000,34.9380


In [116]:
query = """
SELECT
extract(year from data_date) as year,
depth,
ROUND(AVG(temperature), 3) as temperature,
ROUND(AVG(salinity), 3) as salinity
FROM OCEAN_DATA
WHERE data_date BETWEEN '2009-01-01' AND '2017-12-31'
AND salinity BETWEEN 30 and 41
AND temperature BETWEEN -2.5 and 40
AND to_char(data_date,'Mon') in ('Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep')
GROUP BY year, depth
ORDER BY year, temperature;
"""
df1 = db.run_query(query)

In [119]:
df1.sort_values(by=["depth", "year"]).head(5)

Unnamed: 0,year,depth,temperature,salinity
32,2009.0,"(0, 100]",10.632,35.102
54,2010.0,"(0, 100]",9.811,34.864
75,2011.0,"(0, 100]",8.28,34.787
101,2012.0,"(0, 100]",9.902,34.872
124,2013.0,"(0, 100]",9.057,34.946


In [179]:
df2 = df1.copy()
df2["temperature"] = df2["temperature"].astype(float)
df2["salinity"] = df2["salinity"].astype(float)
#df2.groupby(["depth", "year"])[["temperature", "salinity"]].mean().diff()


def variation(df, param):    
#     df[f"{param}_variation"] = 0
    for depth_range in df.depth.unique():
        # Get the baseline value of the parameter in 2009
        depth = (df.depth == depth_range)
        y2009 = (df.year == 2009)
        baseline = df.loc[depth & y2009, param].values
        print(baseline)
        # Iterate of each depth mask of the dataframe to return the difference with the baseline value
        for i, row in df[depth].iterrows():
            row[f"{param}_variation"] = row[param] - baseline
            #df[f"{param}_variation"].iloc[i] = df[param].iloc[i] - baseline
            
        
df2["temperature_variation"] = variation(df2, "temperature")
df2
# df2["temp_variation"] = df2.sort_values(by=["depth", "year"]).groupby(["depth", "year"])['temperature'].diff(-1)
# df2.sort_values(by=["depth", "year"])

[3.467]
[3.474]
[3.557]
[3.604]
[3.685]
[3.702]
[3.703]
[3.703]
[3.703]
[3.704]
[3.704]
[3.707]
[3.711]
[3.765]
[3.766]
[3.767]
[3.874]
[3.953]
[4.059]
[4.191]
[4.307]
[4.336]
[4.572]
[4.914]
[5.038]
[5.361]
[5.929]
[6.567]
[7.234]
[8.037]
[8.509]
[9.138]
[10.632]
[14.073]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


Unnamed: 0,year,depth,temperature,salinity,temperature_variation
0,2009.0,"(2000, 2100]",3.467,34.926,
1,2009.0,"(1900, 2000]",3.474,34.918,
2,2009.0,"(1800, 1900]",3.557,34.930,
3,2009.0,"(1700, 1800]",3.604,34.925,
4,2009.0,"(1600, 1700]",3.685,34.925,
...,...,...,...,...,...
262,2017.0,"(400, 500]",7.070,35.121,
263,2017.0,"(300, 400]",7.954,35.195,
264,2017.0,"(200, 300]",8.844,35.274,
265,2017.0,"(100, 200]",9.351,35.251,


In [188]:
from pandas.api.types import CategoricalDtype
#turn string back into categories... yes you have to do it manually apprarently
cat = ['(0, 100]', '(100, 200]', '(200, 300]', '(300, 400]', '(400, 500]', '(500, 600]', '(600, 700]', '(700, 800]', '(800, 900]', '(900, 1000]',
        '(1000, 1100]', '(1100, 1200]', '(1200, 1300]', '(1300, 1400]', '(1400, 1500]', '(1500, 1600]', '(1600, 1700]', '(1700, 1800]', '(1800, 1900]', '(1900, 2000]', 
        '(2000, 2100]', '(2100, 2200]', '(2200, 2300]', '(2300, 2400]', '(2400, 2500]', '(2500, 2600]', '(2600, 2700]', '(2700, 2800]', '(2800, 2900]', '(2900, 3000]',
        '(3000, 3100]', '(3100, 3200]', '(3200, 3300]', '(3300, 3400]', '(3400, 3500]', '(3500, 3600]', '(3600, 3700]', '(3700, 3800]', '(3800, 3900]', '(3900, 4000]', 
        '(4000, 4100]', '(4100, 4200]', '(4200, 4300]', '(4300, 4400]', '(4400, 4500]', '(4500, 4600]', '(4600, 4700]', '(4700, 4800]', '(4800, 4900]', '(4900, 5000]',
        '(5000, 5100]', '(5100, 5200]', '(5200, 5300]', '(5300, 5400]', '(5400, 5500]', '(5500, 5600]', '(5600, 5700]', '(5700, 5800]', '(5800, 5900]', '(5900, 6000]',
       ]

cat_type = CategoricalDtype(categories=cat, ordered=True)
# df_cat = df.astype(cat_type)

df2["depth"] = df2["depth"].astype(cat_type)
df2
# df2[df2.year == 2009].sort_values(by="depth")["depth"]

Unnamed: 0,year,depth,temperature,salinity,temperature_variation
0,2009.0,"(2000, 2100]",3.467,34.926,
1,2009.0,"(1900, 2000]",3.474,34.918,
2,2009.0,"(1800, 1900]",3.557,34.930,
3,2009.0,"(1700, 1800]",3.604,34.925,
4,2009.0,"(1600, 1700]",3.685,34.925,
...,...,...,...,...,...
262,2017.0,"(400, 500]",7.070,35.121,
263,2017.0,"(300, 400]",7.954,35.195,
264,2017.0,"(200, 300]",8.844,35.274,
265,2017.0,"(100, 200]",9.351,35.251,


In [190]:
df2[df2.year == 2009].sort_values(by="depth")

Unnamed: 0,year,depth,temperature,salinity,temperature_variation
32,2009.0,"(0, 100]",10.632,35.102,
31,2009.0,"(100, 200]",9.138,35.296,
30,2009.0,"(200, 300]",8.509,35.256,
29,2009.0,"(300, 400]",8.037,35.231,
28,2009.0,"(400, 500]",7.234,35.15,
27,2009.0,"(500, 600]",6.567,35.086,
26,2009.0,"(600, 700]",5.929,35.026,
25,2009.0,"(700, 800]",5.361,34.982,
23,2009.0,"(800, 900]",4.914,34.958,
22,2009.0,"(900, 1000]",4.572,34.945,


In [118]:
min_sal = df1[df1['salinity']== df1.salinity.min()].index.values
max_sal = df1[df1['salinity']== df1.salinity.max()].index.values
df1.salinity.min(), df1.salinity.max(), df1.iloc[min_sal], df1.iloc[max_sal]

(Decimal('33.444'),
 Decimal('37.386'),
       year         depth temperature salinity
 20  2009.0  (3100, 3200]       4.307   33.444,
       year         depth temperature salinity
 10  2009.0  (3400, 3500]       3.704   37.386)

In [77]:
len(df1), len(df1[df1["salinity"] > 30])

(271, 264)

In [78]:
df1[df1["salinity"] < 30]

Unnamed: 0,year,depth,temperature,salinity
34,2009.0,"(2100, 2200]",27.52,14.33
35,2009.0,"(2600, 2700]",28.426,14.323
36,2009.0,"(2200, 2300]",54.844,12.681
58,2010.0,"(2100, 2200]",34.969,5.999
98,2012.0,"(3000, 3100]",5.167,18.551
127,2013.0,"(3000, 3100]",8.373,15.975
141,2014.0,"(3100, 3200]",4.415,27.679


In [105]:
query = """
SELECT 
data_date,
depth,
floatid,
round(latitude, 1) as latitude,
round(longitude, 1) as longitude,
round(salinity, 2) as salinity
FROM OCEAN_DATA
WHERE data_date BETWEEN '2009-01-01' AND '2017-12-31'
AND salinity BETWEEN 2 and 30
AND to_char(data_date,'Mon') in ('Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep');
"""
test = db.run_query(query)

In [115]:
query = """
SELECT 
data_date,
depth,
floatid,
round(latitude, 2) as latitude,
round(longitude, 2) as longitude,
round(salinity, 2) as salinity
FROM OCEAN_DATA
WHERE data_date = '2009-08-15'
AND floatid = '4900545';
"""
db.run_query(query)

Unnamed: 0,data_date,depth,floatid,latitude,longitude,salinity
0,2009-08-15,"(0, 100]",4900545,39.86,-46.65,26.81
1,2009-08-15,"(100, 200]",4900545,39.86,-46.65,26.38
2,2009-08-15,"(200, 300]",4900545,39.86,-46.65,30.31
3,2009-08-15,"(300, 400]",4900545,39.86,-46.65,29.06
4,2009-08-15,"(400, 500]",4900545,39.86,-46.65,29.29
5,2009-08-15,"(500, 600]",4900545,39.86,-46.65,28.66
6,2009-08-15,"(600, 700]",4900545,39.86,-46.65,28.19
7,2009-08-15,"(700, 800]",4900545,39.86,-46.65,27.79
8,2009-08-15,"(800, 900]",4900545,39.86,-46.65,27.36
9,2009-08-15,"(900, 1000]",4900545,39.86,-46.65,26.57


In [106]:
test["salinity"] = test["salinity"].astype(int)
test.groupby(["floatid", "data_date", "depth"])[["salinity"]].mean().head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,salinity
floatid,data_date,depth,Unnamed: 3_level_1
4900500,2011-05-04,"(1900, 2000]",17
4900500,2011-05-15,"(1600, 1700]",27
4900500,2011-05-24,"(1900, 2000]",17
4900500,2011-06-04,"(1200, 1300]",29
4900500,2011-06-04,"(1400, 1500]",17
4900500,2011-06-13,"(1100, 1200]",25
4900500,2011-06-13,"(1400, 1500]",17
4900500,2011-06-24,"(900, 1000]",24
4900503,2011-08-06,"(300, 400]",21
4900503,2011-08-16,"(1500, 1600]",5
