## BIKE
```
select count(*) from bike;
```
```
   count   
-----------
 111,518,204
```


### Optimal ordering for most operations
```
create index bike_workoutid_idx on bike (workoutid);
```
```
cluster verbose bike using bike_workoutid_idx;
```

### REMOVE DUPES.

#### Count the number of dupes

```
-- Number of distinct measurement tuples that occur more than once in bike.
-- Every duplicated tuple is counted once, however many copies it has.
with dupe_groups as (
    select 1
    from bike
    group by altitude, heart_rate, latitude, longitude, speed, workoutid, time
    having count(*) > 1
)
select count(*)
from dupe_groups;
```
```
 count 
-------
 23,180
```

#### Remove the dupes

```
ALTER TABLE bike ADD COLUMN id SERIAL PRIMARY KEY;
```
```
-- Number the copies of every identical measurement tuple by ascending id,
-- then delete everything except the first copy of each tuple.
with ranked as (
    select id,
           row_number() over (
               partition by altitude, heart_rate, latitude, longitude, speed, workoutid, time
               order by id
           ) as copy_no
    from bike
)
delete from bike
using ranked
where bike.id = ranked.id
  and ranked.copy_no > 1;
```
```
DELETE 23293
```
```
ALTER TABLE bike drop column id;
```


```
-- Re-check after the delete: measurement tuples that still occur more than once.
with remaining_dupes as (
    select 1
    from bike
    group by altitude, heart_rate, latitude, longitude, speed, workoutid, time
    having count(*) > 1
)
select count(*)
from remaining_dupes;
```

```
 count 
-------
    14
```

### Vacuum
```
vacuum full bike;
```

## Speed

### Value given in kph. Range should be from 0-125 (~80 mph).

```
select * from histogram('speed', 'bike');
```
```
 bucket |                 range                 |   freq   |       bar
--------+---------------------------------------+----------+-----------------
      1 | [-11689.6730000000,-11689.6730000000] |        1 |
      8 | [-3774.3667000000,-3774.3667000000]   |        1 |
     10 | [-1607.8727000000,-717.3540000000]    |        4 |
     11 | [-210.5873100000,531.2680000000]      | 53677006 | ***************
     12 | [531.9980000000,809.9760000000]       |      377 |
     20 | [10353.1000000000,10353.1000000000]   |        1 |
     21 | [10531.2000000000,10531.2000000000]   |        1 |
```

#### Simply removing data has implications; since there is a relatively small number of outliers, we will just smooth them.

#### Let's find the average speed and smooth the outliers to that.

```
SELECT avg(speed) AS average FROM bike where speed < 50 and speed > 0;
```
```
       average       
---------------------
 24.2918175507782843
```

### Still too many > 125 kph (~80 mph; no one is that fast).

```
update bike set speed = 24 where speed > 125;
```
```
UPDATE 3864
```

```
select * from histogram('speed', 'bike');
```
```
 bucket |                 range                 |   freq   |       bar       
--------+---------------------------------------+----------+-----------------
      1 | [-11689.6730000000,-11689.6730000000] |        1 | 
     14 | [-3774.3667000000,-3774.3667000000]   |        1 | 
     18 | [-1607.8727000000,-1439.1677000000]   |        3 | 
     19 | [-717.3540000000,-717.3540000000]     |        1 | 
     20 | [-210.5873100000,124.9920000000]      | 53677383 | ***************
     21 | [124.9990000000,124.9990000000]       |        2 | 
```

### Still too many < 0kph.

```
update bike set speed = 24 where speed < 0;
```
```
UPDATE 502
```

```
select * from histogram('speed', 'bike');
```
```
 bucket |              range              |   freq   |       bar       
--------+---------------------------------+----------+-----------------
      1 | [0.0000000000,6.2499456000]     |  2150304 | **
      2 | [6.2499500000,12.4998910000]    |  3765002 | ****
      3 | [12.4999000000,18.7498470000]   |  8306070 | *********
      4 | [18.7498570000,24.9997980000]   | 13685788 | **************
      5 | [24.9998000000,31.2497270000]   | 14549492 | ***************
      6 | [31.2497650000,37.4996000000]   |  7612258 | ********
      7 | [37.4997000000,43.7496200000]   |  2469832 | ***
      8 | [43.7497000000,49.9995770000]   |   749316 | *
      9 | [49.9996000000,56.2494000000]   |   233941 | 
     10 | [56.2497140000,62.4984740000]   |    74839 | 
     11 | [62.4996000000,68.7492000000]   |    26836 | 
     12 | [68.7503800000,74.9988000000]   |    12471 | 
     13 | [75.0000000000,81.2490500000]   |     8304 | 
     14 | [81.2496000000,87.4992400000]   |     6712 | 
     15 | [87.5009500000,93.7477100000]   |     4781 | 
     16 | [93.7494300000,99.9976600000]   |     3839 | 
     17 | [100.0000000000,106.2480000000] |    13762 | 
     18 | [106.2500000000,112.4960000000] |     1793 | 
     19 | [112.5140000000,118.7390000000] |     1256 | 
     20 | [118.7500000000,124.9920000000] |      793 | 
     21 | [124.9990000000,124.9990000000] |        2 | 
```


#### Much better. Let's generate the first derivative of speed.
##### The update is split in two due to space restrictions

```
alter table bike add column speed_first numeric(10,5);
```

```
vacuum full verbose bike;
-- First derivative of speed for the upper half of the workoutid range:
-- change in speed divided by change in time between consecutive samples
-- (ordered by time) of the same workout, rounded to 5 decimals.
-- The first sample of a workout has no predecessor, so its derivative is NULL.
with dev_list as (
    select round(speed_difference / (case when time_difference = 0 then 1 else time_difference end), 5) as deriv,
           time,
           workoutid
    from (
        select speed - lag(speed) over w as speed_difference,
               time - lag(time) over w as time_difference,
               time,
               workoutid
        from bike
        -- workoutid is the partition key, so filtering here leaves every
        -- window intact and avoids computing the half that is not updated.
        where workoutid > 326636695
        window w as (partition by workoutid order by time)
    ) as deltas
    -- No ORDER BY: row order in the CTE cannot influence UPDATE ... FROM and
    -- would only add full-table sorts. A zero time step is replaced by 1
    -- above so the division cannot fail.
)
update bike r1
  set speed_first = d1.deriv
  from dev_list as d1
  where d1.workoutid = r1.workoutid and
        d1.time = r1.time and r1.workoutid > 326636695;
```
```
vacuum full verbose bike;
-- First derivative of speed for the lower half of the workoutid range:
-- change in speed divided by change in time between consecutive samples
-- (ordered by time) of the same workout, rounded to 5 decimals.
-- The first sample of a workout has no predecessor, so its derivative is NULL.
with dev_list as (
    select round(speed_difference / (case when time_difference = 0 then 1 else time_difference end), 5) as deriv,
           time,
           workoutid
    from (
        select speed - lag(speed) over w as speed_difference,
               time - lag(time) over w as time_difference,
               time,
               workoutid
        from bike
        -- workoutid is the partition key, so filtering here leaves every
        -- window intact and avoids computing the half that is not updated.
        where workoutid <= 326636695
        window w as (partition by workoutid order by time)
    ) as deltas
    -- No ORDER BY: row order in the CTE cannot influence UPDATE ... FROM and
    -- would only add full-table sorts. A zero time step is replaced by 1
    -- above so the division cannot fail.
)
update bike r1
  set speed_first = d1.deriv
  from dev_list as d1
  where d1.workoutid = r1.workoutid and
        d1.time = r1.time and r1.workoutid <= 326636695;
```


### Sanity check values
```
select * from histogram('speed_first', 'bike');
```
```
      1 | [-119.79000,-117.36700] |        2 |
      2 | [-100.00000,-97.83840]  |       49 |
      3 | [-97.72753,-86.74603]   |      144 |
      4 | [-86.70998,-75.71077]   |      245 |
      5 | [-75.69618,-64.70432]   |      318 |
      6 | [-64.65368,-53.66907]   |      295 |
      7 | [-53.65040,-42.65314]   |      572 |
      8 | [-42.62009,-31.61831]   |     1474 |
      9 | [-31.60865,-20.59078]   |     5013 |
     10 | [-20.58959,-9.56806]    |    30060 |
     11 | [-9.56787,1.45419]      | 52025512 | ***************
     12 | [1.45420,12.47635]      |  1454144 |
     13 | [12.47654,23.49701]     |    17822 |
     14 | [23.50044,34.52067]     |     5870 |
     15 | [34.52840,45.53847]     |     2701 |
     16 | [45.54520,56.52723]     |     1425 |
     17 | [56.57387,67.57965]     |      763 |
     18 | [67.61866,78.58357]     |      569 |
     19 | [78.62376,89.62135]     |      384 |
     20 | [89.68573,100.22500]    |     1161 |
     21 | [100.65400,100.65400]   |        1 |
```

#### (Re)Generate moving averages

##### Over 50 points
```
alter table bike add column speed_ma_50 numeric(8,5);
```
```
vacuum full verbose bike;

-- Trailing moving average of speed within each workout, lower half of the
-- workoutid range.
-- NOTE(review): `rows between 50 preceding and current row` spans 51 samples
-- (50 earlier rows plus the current one); use `49 preceding` in BOTH halves
-- if a strict 50-point window is intended.
with dev_list as (
    select time,
           workoutid,
           avg(speed) over (partition by workoutid order by time rows between 50 preceding and current row) as mavg
    from bike
    -- workoutid is the partition key, so this filter leaves every window
    -- intact while skipping the half of the table that is not updated here.
    -- The former `order by time` was dropped: it cannot affect the update.
    where workoutid <= 326636695
)
update bike r1 set speed_ma_50 = d1.mavg
   from dev_list as d1
   where d1.workoutid = r1.workoutid and
     d1.time = r1.time and
     r1.workoutid <= 326636695;
```

```
vacuum full verbose bike;

-- Trailing moving average of speed within each workout, upper half of the
-- workoutid range.
-- NOTE(review): `rows between 50 preceding and current row` spans 51 samples
-- (50 earlier rows plus the current one); use `49 preceding` in BOTH halves
-- if a strict 50-point window is intended.
with dev_list as (
    select time,
           workoutid,
           avg(speed) over (partition by workoutid order by time rows between 50 preceding and current row) as mavg
    from bike
    -- workoutid is the partition key, so this filter leaves every window
    -- intact while skipping the half of the table that is not updated here.
    -- The former `order by time` was dropped: it cannot affect the update.
    where workoutid > 326636695
)
update bike r1 set speed_ma_50 = d1.mavg
  from dev_list as d1
  where d1.workoutid = r1.workoutid and
    d1.time = r1.time and
    r1.workoutid > 326636695;
```


```
select * from histogram('speed_ma_50', 'bike');
```
```
 bucket |         range         |   freq   |       bar       
--------+-----------------------+----------+-----------------
      1 | [0.00000,6.16174]     |   891714 | *
      2 | [6.16175,12.32349]    |  2380679 | **
      3 | [12.32350,18.48524]   |  7841678 | *******
      4 | [18.48525,24.64699]   | 16055904 | *************
      5 | [24.64700,30.80874]   | 18029670 | ***************
      6 | [30.80875,36.97049]   |  7012569 | ******
      7 | [36.97051,43.13216]   |  1237132 | *
      8 | [43.13231,49.29395]   |   155860 | 
      9 | [49.29401,55.45562]   |    28764 | 
     10 | [55.45625,61.61696]   |    11370 | 
     11 | [61.61783,67.77826]   |     6493 | 
     12 | [67.78031,73.94033]   |     5553 | 
     13 | [73.94159,80.10225]   |     4587 | 
     14 | [80.10307,86.26364]   |     3854 | 
     15 | [86.27066,92.42568]   |     2846 | 
     16 | [92.42991,98.58608]   |     3292 | 
     17 | [98.59093,104.74927]  |     3297 | 
     18 | [104.77490,110.90801] |     1411 | 
     19 | [110.91210,117.06625] |      687 | 
     20 | [117.07649,122.79600] |       30 | 
     21 | [123.23500,123.23500] |        1 | 
```




##### Over 100 points
```
alter table bike add column speed_ma_100 numeric(8,5);
```
```
vacuum full verbose bike;
-- Trailing moving average of speed within each workout, upper half of the
-- workoutid range.
-- NOTE(review): `rows between 100 preceding and current row` spans 101
-- samples; use `99 preceding` in BOTH halves if a strict 100-point window
-- is intended.
with dev_list as (
    select time,
           workoutid,
           avg(speed) over (partition by workoutid order by time rows between 100 preceding and current row) as mavg
    from bike
    -- workoutid is the partition key, so this filter leaves every window
    -- intact while skipping the half of the table that is not updated here.
    -- The former `order by time` was dropped: it cannot affect the update.
    where workoutid > 326636695
)
update bike r1
  set speed_ma_100 = d1.mavg
  from dev_list as d1
  where d1.workoutid = r1.workoutid
    and d1.time = r1.time
    and r1.workoutid > 326636695;
```

```
vacuum full verbose bike;
-- Trailing moving average of speed within each workout, lower half of the
-- workoutid range.
-- NOTE(review): `rows between 100 preceding and current row` spans 101
-- samples; use `99 preceding` in BOTH halves if a strict 100-point window
-- is intended.
with dev_list as (
    select time,
           workoutid,
           avg(speed) over (partition by workoutid order by time rows between 100 preceding and current row) as mavg
    from bike
    -- workoutid is the partition key, so this filter leaves every window
    -- intact while skipping the half of the table that is not updated here.
    -- The former `order by time` was dropped: it cannot affect the update.
    where workoutid <= 326636695
)
update bike r1
  set speed_ma_100 = d1.mavg
  from dev_list as d1
  where d1.workoutid = r1.workoutid
    and d1.time = r1.time
    and r1.workoutid <= 326636695;
```

```
select * from histogram('speed_ma_100', 'bike');
```
```
 bucket |         range         |   freq   |       bar
--------+-----------------------+----------+-----------------
      1 | [0.00000,6.16174]     |   803081 | *
      2 | [6.16175,12.32349]    |  2158773 | **
      3 | [12.32350,18.48524]   |  7718307 | ******
      4 | [18.48525,24.64699]   | 16681162 | *************
      5 | [24.64700,30.80874]   | 18882233 | ***************
      6 | [30.80875,36.97048]   |  6439743 | *****
      7 | [36.97050,43.13221]   |   858428 | *
      8 | [43.13228,49.29390]   |    78473 |
      9 | [49.29434,55.45561]   |    18907 |
     10 | [55.45586,61.61684]   |     8514 |
     11 | [61.61783,67.77869]   |     5810 |
     12 | [67.78231,73.93745]   |     5710 |
     13 | [73.94270,80.10178]   |     5027 |
     14 | [80.10436,86.26385]   |     3202 |
     15 | [86.26563,92.42404]   |     2503 |
     16 | [92.42991,98.58608]   |     3307 |
     17 | [98.58930,104.72656]  |     2494 |
     18 | [104.77498,110.91032] |     1227 |
     19 | [110.91506,116.50542] |      483 |
     20 | [117.11500,122.79600] |        6 |
     21 | [123.23500,123.23500] |        1 |
```

## Altitude

### Big range of altitudes, some too low, some too high


```
select * from histogram('altitude', 'bike');
```
```
 bucket |                 range                 |   freq   |       bar
--------+---------------------------------------+----------+-----------------
      1 | [-78885.5000000000,-74390.9000000000] |       14 |
      2 | [-73694.4000000000,-69288.1000000000] |       28 |
      3 | [-69136.7000000000,-65031.8000000000] |       30 |
     15 | [-9999.0000000000,-9999.0000000000]   |     1519 |
     17 | [-1340.0000000000,2767.6000000000]    | 95884438 | ***************
     18 | [2767.7000000000,7569.4000000000]     |   154732 |
     19 | [7573.8000000000,12373.6000000000]    |    31709 |
     20 | [12374.0000000000,17001.0000000000]   |     2367 |
     21 | [17177.0000000000,17177.0000000000]   |        1 |
```

#### Let's see how many rows are more than two standard deviations from the mean

```
-- Count samples whose altitude lies more than two standard deviations away
-- from the mean altitude of their own workout.
with workout_stats as (
    select workoutid,
           avg(altitude) as mean_alt,
           stddev(altitude) as sd_alt
    from bike
    group by workoutid
)
select count(*)
from bike as b
inner join workout_stats as s
    on s.workoutid = b.workoutid
where b.altitude < s.mean_alt - s.sd_alt * 2
   or b.altitude > s.mean_alt + s.sd_alt * 2;
```

#####  3,290,430 / 111,518,204 => ~2.95%
##### We should just smooth them to be two standard deviations from the mean

##### Fix values too low
```
-- Raise altitudes that fall below (workout mean - 2 * stddev) to that bound.
with workout_stats as (
    select workoutid,
           avg(altitude) as mean_alt,
           stddev(altitude) as sd_alt
    from bike
    group by workoutid
)
update bike as b
   set altitude = s.mean_alt - (s.sd_alt * 2)
  from workout_stats as s
 where s.workoutid = b.workoutid
   and b.altitude < s.mean_alt - (s.sd_alt * 2);
```
```
UPDATE 825254
```

##### Fix values too high
```
-- Lower altitudes that exceed (workout mean + 2 * stddev) to that bound.
with workout_stats as (
    select workoutid,
           avg(altitude) as mean_alt,
           stddev(altitude) as sd_alt
    from bike
    group by workoutid
)
update bike as b
   set altitude = s.mean_alt + (s.sd_alt * 2)
  from workout_stats as s
 where s.workoutid = b.workoutid
   and b.altitude > s.mean_alt + (s.sd_alt * 2);
```
```
UPDATE 2510788
```

#### Sanity check

```
select * from histogram('altitude', 'bike');
```
```
 bucket |                 range                 |   freq   |       bar       
--------+---------------------------------------+----------+-----------------
      1 | [-78108.3003654220,-73483.5000000000] |       16 | 
      2 | [-73318.7000000000,-68742.4000000000] |       30 | 
      3 | [-68545.7000000000,-65031.8000000000] |       26 | 
     15 | [-9999.0000000000,-9999.0000000000]   |     1500 | 
     16 | [-6033.9358860479,-6033.9358860479]   |        1 | 
     17 | [-1702.1882519329,2884.2000000000]    | 95896170 | ***************
     18 | [2884.2500000000,7646.2000000000]     |   143095 | 
     19 | [7649.8000000000,12412.6000000000]    |    32357 | 
     20 | [12412.8000000000,17001.0000000000]   |     1642 | 
     21 | [17177.0000000000,17177.0000000000]   |        1 | 
```


#### The -9999 looks like a programmatic error; just erase the values
```
update bike set altitude = NULL where altitude = -9999;
```
```
UPDATE 1500
```

##### Everest is at 8,850 meters, and the most extreme high altitude ultra marathon only gets up to 5,300 meters. The lowest below sea level land is -413 meters. Probably a lot of errant sensor data

##### Let's look at workouts that include altitudes greater than 5300
```
-- Every sample of each workout that contains at least one altitude above 5300.
select b.altitude,
       b.time,
       b.latitude,
       b.longitude,
       b.workoutid
from bike as b
where exists (
    select 1
    from bike as hi
    where hi.workoutid = b.workoutid
      and hi.altitude > 5300
)
order by b.workoutid, b.time;
```

###### Small sample: look up the lat / lng and check the altitude. Spot-checking the lat / lngs suggests the data is bad.
```
     altitude     |    time    |   latitude    |    longitude    | workoutid 
------------------+------------+---------------+-----------------+-----------
 17177.0000000000 | 1313905119 | 59.1564850000 |   17.1899560000 |  23020821
  8427.2444384898 | 1340120087 | 56.4902350000 |    9.7499210000 |  65055568
  8427.2444384898 | 1340120091 | 56.4907710000 |    9.7499420000 |  65055568
  8427.2444384898 | 1340120094 | 56.4913020000 |    9.7500770000 |  65055568
  8427.2444384898 | 1340120099 | 56.4922300000 |    9.7502160000 |  65055568
  8427.2444384898 | 1340120103 | 56.4895590000 |    9.7532470000 |  65055568
  8427.2444384898 | 1340120104 | 56.4894250000 |    9.7534780000 |  65055568
 16671.0000000000 | 1340120252 | 56.5048690000 |    9.7914950000 |  65055568
 16691.0000000000 | 1340120256 | 56.5039620000 |    9.7916780000 |  65055568
 16704.0000000000 | 1340120260 | 56.5030400000 |    9.7917690000 |  65055568
 16707.0000000000 | 1340120264 | 56.5020200000 |    9.7918600000 |  65055568
 16707.0000000000 | 1340120266 | 56.5015380000 |    9.7918710000 |  65055568
 16840.0000000000 | 1340120276 | 56.5056150000 |    9.7922030000 |  65055568
 16857.0000000000 | 1340120280 | 56.5070410000 |    9.7928150000 |  65055568
 16979.0000000000 | 1340120284 | 56.4984370000 |    9.7930400000 |  65055568
 16981.0000000000 | 1340120288 | 56.4973320000 |    9.7932230000 |  65055568
 16981.0000000000 | 1340120290 | 56.4968540000 |    9.7932820000 |  65055568
```

###### How many workouts is it?
```
-- Number of workouts that contain at least one altitude above 5300.
select count(*)
from (
    select workoutid
    from bike
    where altitude > 5300
    group by workoutid
) as high_workouts;
```

###### 295 / 252,397. ~0.1%
```
select count(*) from bike where altitude > 5300;
```
##### 130,310 rows. We should probably just drop the altitude for those workouts, as that is ~441 rows per workout.
```
-- Discard the altitude of every sample belonging to a workout that contains
-- at least one altitude above 5300.
update bike
   set altitude = NULL
 where workoutid in (
       select workoutid
       from bike
       where altitude > 5300
 );
```
```
UPDATE 136245
```


#### Fix values too low - TBD
```
select altitude, heart_rate, latitude, longitude, speed, workoutid, time from bike where altitude < -450;
```
```
     altitude      | heart_rate |   latitude    |   longitude    |     speed      | workoutid |    time    
-------------------+------------+---------------+----------------+----------------+-----------+------------
   -923.5999557658 |  137.00000 | 55.6546670000 |   9.6786350000 |                | 612433343 | 1443847923
  -1611.4208294678 |            | 50.9478910000 |   3.1253680000 |   0.0000000000 | 567756888 | 1437621848
  -1611.4208294678 |            | 50.9478910000 |   3.1253680000 |   0.0000000000 | 567756888 | 1437621847
  -1702.1882519329 |            | 50.8999370000 |   3.1993520000 |   0.0000000000 | 525830418 | 1431521273
  -1702.1882519329 |            | 50.8999370000 |   3.1993520000 |   0.0000000000 | 525830418 | 1431521271
   -478.3569164498 |            | 37.5446620000 | 127.2225720000 |   2.1606700000 |  66083852 | 1340495854
   -478.3569164498 |            | 37.5446410000 | 127.2222030000 |   3.7657300000 |  66083852 | 1340495834
...
   -478.3569164498 |            | 37.5452990000 | 127.2223190000 |   3.6422700000 |  66083852 | 1340495141
   -478.3569164498 |            | 37.5451620000 | 127.2221180000 |   4.6917300000 |  66083852 | 1340495117
  -1639.7359207840 |            | 50.8999290000 |   3.1994100000 |   0.0000000000 | 566747289 | 1437567820
  -1639.7359207840 |            | 50.8999290000 |   3.1994100000 |   0.0000000000 | 566747289 | 1437567819
   -923.5999557658 |  137.00000 | 55.6546670000 |   9.6786350000 |                | 612437587 | 1443847923
  -1693.2380948209 |            | 50.8999520000 |   3.1993720000 |   0.0000000000 | 510935815 | 1429876768
  -1693.2380948209 |            | 50.8999520000 |   3.1993720000 |   0.0000000000 | 510935815 | 1429876767
  -1624.6988167984 |            | 50.9476780000 |   3.1254040000 |   0.0000000000 | 546303221 | 1434598011
  -1624.6988167984 |            | 50.9476780000 |   3.1254040000 |   0.0000000000 | 546303221 | 1434598010
  -6033.9358860479 |            | 50.9474790000 |   3.1287390000 |  19.0566300000 | 546303164 | 1434713355
  -1628.2727461786 |            | 50.9476130000 |   3.1254340000 |   0.0000000000 | 514002116 | 1430364612
  -1628.2727461786 |            | 50.9476130000 |   3.1254340000 |   0.0000000000 | 514002116 | 1430364611
  -1119.3825860397 |            | 50.8997760000 |   3.1998070000 |   0.0000000000 | 514002342 | 1430310748
  -1538.1392940937 |            | 50.8999750000 |   3.1993670000 |   0.0000000000 | 609318812 | 1442580068
  -1538.1392940937 |            | 50.8999750000 |   3.1993670000 |   0.0000000000 | 609318812 | 1442580067
   -912.7109130831 |  127.00000 | 51.2015500000 |   3.0055640000 |                | 507252615 | 1429017367
 -78108.3003654220 |            | 49.6204530000 |  -1.5991340000 |   0.0000000000 |  60039321 | 1338437610
   -490.2245031167 |   92.00000 | 52.3099169321 |  20.9573715739 |                | 610191113 | 1443522549
   -490.2245031167 |  102.00000 | 52.3094630521 |  20.9564794879 |                | 610191113 | 1443522537
...
```

#### Mixture of values



##### Taking a while...
```
-- Per workout: its series_length and how many of its samples have an altitude
-- of exactly -500, restricted to workouts with some altitude below -450.
select min(rbw.series_length),
       count(*),
       r1.workoutid
from bike as r1
inner join bike_by_workout as rbw
    on rbw.workoutid = r1.workoutid
where r1.workoutid in (select workoutid from bike where altitude < -450)
  and r1.altitude = -500
group by r1.workoutid
order by count(*);
```
```
 min | count | workoutid 
-----+-------+-----------
 500 |     1 |  36433108
 499 |     1 |  10441308
 500 |    75 | 372331356
 499 |   132 | 610774871
 500 |   132 | 540221483
 500 |   281 | 610191113
 500 |   325 | 551475883
 500 |   404 | 609609918
 430 |   430 | 207386913
 499 |   478 | 266898818
 486 |   480 | 645231060
 498 |   498 | 344771859
 499 |   499 | 200960009
 499 |   499 | 157081521
 499 |   499 | 201223275
 499 |   499 | 210069893
 499 |   499 | 211856479
 499 |   499 | 244306761
 500 |   500 | 207386951
```

 ```
 -- Discard the altitude data of the listed workouts in a single statement.
 update bike
    set altitude = NULL
  where workoutid in (
        610191113,
        551475883,
        609609918,
        207386913,
        266898818,
        645231060,
        344771859,
        200960009,
        157081521,
        201223275,
        210069893,
        211856479,
        244306761,
        207386951
  );
 ```
 
#### Let's see if there are a bunch that have garbage altitude data
```
-- Altitude spread (max - min) per workout, widest spread first.
select max(altitude) - min(altitude),
       max(altitude),
       min(altitude),
       workoutid
from bike
where altitude is not null
group by workoutid
order by max(altitude) - min(altitude) desc;
```

```
     ?column?     |        max        |        min        | workoutid 
------------------+-------------------+-------------------+-----------
 13076.5003654220 | -65031.8000000000 | -78108.3003654220 |  60039321
  6102.9358860479 |     69.0000000000 |  -6033.9358860479 | 546303164
  4522.0000000000 |   4022.0000000000 |   -500.0000000000 | 372331356
  4306.5527194342 |   3823.3527194342 |   -483.2000000000 | 551476143
  4002.8386644268 |   4139.4386644268 |    136.6000000000 | 559628632
  3988.4984976875 |   4099.0984976875 |    110.6000000000 | 558068675
  3751.3950351665 |   4484.7679932697 |    733.3729581032 | 522026632
```


 
``` 
-- Each pair below first lists a workout's samples for inspection and then
-- applies the correction chosen for that workout.

-- 60039321: discard the altitude of the whole workout.
select altitude, heart_rate, latitude, longitude, speed, workoutid, time from bike where workoutid = 60039321;
update bike set altitude = NULL where workoutid = 60039321;

-- 546303164: overwrite only the sample at time 1434713355 with 60.
select altitude, heart_rate, latitude, longitude, speed, workoutid, time from bike where workoutid = 546303164;
update bike set altitude = 60 where workoutid = 546303164 and time = 1434713355;

-- 372331356: discard the altitude of the whole workout.
select altitude, heart_rate, latitude, longitude, speed, workoutid, time from bike where workoutid = 372331356;
update bike set altitude = NULL where workoutid = 372331356;

-- 551476143: discard the altitude of the whole workout.
select altitude, heart_rate, latitude, longitude, speed, workoutid, time from bike where workoutid = 551476143;
update bike set altitude = NULL where workoutid = 551476143;
```

#### This is taking too long; let's see if we can do it in bulk. Probably group by lat / lng and use the median.

```
-- Altitude summary per coordinate pair, most frequently recorded coordinates
-- first. Fixes the original `group latitude, longitude`, which is a syntax
-- error (missing BY).
-- NOTE(review): median() is not a built-in PostgreSQL aggregate; this assumes
-- a user-defined one is installed. If not, use
-- percentile_cont(0.5) within group (order by altitude).
select max(altitude),
       min(altitude),
       median(altitude),
       count(*),
       latitude,
       longitude
from bike
group by latitude, longitude
order by count(*) desc;
```

```
select altitude, heart_rate, latitude, longitude, speed, workoutid, time from bike where workoutid = 559628632;
```

Unfinished inspection queries (the workoutid was never filled in):

```
select altitude, heart_rate, latitude, longitude, speed, workoutid, time from bike where workoutid = ;

select altitude, heart_rate, latitude, longitude, speed, workoutid, time from bike where workoutid = ;
```


 ```
 update bike set altitude = 144 where workoutid = 415904865 and altitude = -500;
 ```

```
select count(*), workoutid from bike where workoutid in (select workoutid from bike where altitude < -450) group by workoutid;
 -------+-----------
   486 |  16834745
   500 | 464120412
   463 |  14961570
    46 | 631126216
   361 | 469000118
   500 | 468416193
   499 | 435760340
   333 | 163748603
   322 | 100775522
   183 | 216962176
   490 |  94523030
   318 |  10914476
   399 | 150153106
```

```
-- Hand-picked altitude corrections, one workout at a time.

-- 468416193: the two statements are order-dependent. The first replaces one
-- specific reading; the second then shifts every altitude of the workout
-- (including the replaced one) by 386 + 131.
update bike set altitude = -386.8000000000 where workoutid = 468416193 and altitude = 1008.8365313163;
update bike set altitude = altitude + 386 + 131 where workoutid = 468416193;

-- Remaining workouts are independent of each other: altitude is either
-- discarded (NULL), rescaled and/or sign-flipped, shifted by a constant, or a
-- single reading is overwritten.
update bike set altitude = NULL where workoutid = 16834745;
update bike set altitude = altitude * -0.1 where workoutid = 464120412;
update bike set altitude = altitude * 0.25 where workoutid = 631126216;
update bike set altitude = altitude + 595 where workoutid = 469000118;
update bike set altitude = 20 where workoutid = 435760340 and altitude = -493.9756638241;
update bike set altitude = altitude * -1 where workoutid = 14961570;
update bike set altitude = altitude * -0.5 where workoutid = 163748603;
update bike set altitude = NULL where workoutid = 100775522;
-- Only the negative readings of this workout are rewritten.
update bike set altitude = altitude * -0.5 where workoutid = 216962176 and altitude < 0;
update bike set altitude = NULL where workoutid = 94523030;
update bike set altitude = altitude * -1 where workoutid = 10914476;
update bike set altitude = NULL where workoutid = 150153106;
```



##### (Re)Generate first derivative
```
vacuum full verbose bike;
-- First derivative of altitude for the lower half of the workoutid range:
-- change in altitude divided by change in time between consecutive samples
-- (ordered by time) of the same workout, rounded to 5 decimals.
-- The first sample of a workout has no predecessor, so its derivative is NULL.
with dev_list as (
    select round(alt_difference / (case when time_difference = 0 then 1 else time_difference end), 5) as deriv,
           time,
           workoutid
    from (
        select altitude - lag(altitude) over w as alt_difference,
               time - lag(time) over w as time_difference,
               time,
               workoutid
        from bike
        -- workoutid is the partition key, so filtering here leaves every
        -- window intact and avoids computing the half that is not updated.
        where workoutid <= 326636695
        window w as (partition by workoutid order by time)
    ) as deltas
    -- No ORDER BY: row order in the CTE cannot influence UPDATE ... FROM and
    -- would only add full-table sorts. A zero time step is replaced by 1
    -- above so the division cannot fail.
)
update bike r1
  set altitude_first = d1.deriv
  from dev_list as d1
  where d1.workoutid = r1.workoutid and d1.time = r1.time and r1.workoutid <= 326636695;
```
```
vacuum full verbose bike;
-- First derivative of altitude for the upper half of the workoutid range:
-- change in altitude divided by change in time between consecutive samples
-- (ordered by time) of the same workout, rounded to 5 decimals.
-- The first sample of a workout has no predecessor, so its derivative is NULL.
with dev_list as (
    select round(alt_difference / (case when time_difference = 0 then 1 else time_difference end), 5) as deriv,
           time,
           workoutid
    from (
        select altitude - lag(altitude) over w as alt_difference,
               time - lag(time) over w as time_difference,
               time,
               workoutid
        from bike
        -- workoutid is the partition key, so filtering here leaves every
        -- window intact and avoids computing the half that is not updated.
        where workoutid > 326636695
        window w as (partition by workoutid order by time)
    ) as deltas
    -- No ORDER BY: row order in the CTE cannot influence UPDATE ... FROM and
    -- would only add full-table sorts. A zero time step is replaced by 1
    -- above so the division cannot fail.
)
update bike r1
  set altitude_first = d1.deriv
  from dev_list as d1
  where d1.workoutid = r1.workoutid and d1.time = r1.time and r1.workoutid > 326636695;
```

##### (Re)Generate second derivative
```
vacuum full verbose bike;
-- Second derivative of altitude for the lower half of the workoutid range:
-- change in altitude_first divided by change in time between consecutive
-- samples (ordered by time) of the same workout, rounded to 5 decimals.
-- Reads altitude_first, so the first-derivative updates must have run already.
with dev_list as (
    select round(alt_difference / (case when time_difference = 0 then 1 else time_difference end), 5) as deriv,
           time,
           workoutid
    from (
        select altitude_first - lag(altitude_first) over w as alt_difference,
               time - lag(time) over w as time_difference,
               time,
               workoutid
        from bike
        -- workoutid is the partition key, so filtering here leaves every
        -- window intact and avoids computing the half that is not updated.
        where workoutid <= 326636695
        window w as (partition by workoutid order by time)
    ) as deltas
    -- No ORDER BY: row order in the CTE cannot influence UPDATE ... FROM and
    -- would only add full-table sorts. A zero time step is replaced by 1
    -- above so the division cannot fail.
)
update bike r1
  set altitude_second = d1.deriv
  from dev_list as d1
  where d1.workoutid = r1.workoutid and d1.time = r1.time and r1.workoutid <= 326636695;
```

```
vacuum full verbose bike;
-- Second derivative of altitude for the upper half of the workoutid range:
-- change in altitude_first divided by change in time between consecutive
-- samples (ordered by time) of the same workout, rounded to 5 decimals.
-- Reads altitude_first, so the first-derivative updates must have run already.
with dev_list as (
    select round(alt_difference / (case when time_difference = 0 then 1 else time_difference end), 5) as deriv,
           time,
           workoutid
    from (
        select altitude_first - lag(altitude_first) over w as alt_difference,
               time - lag(time) over w as time_difference,
               time,
               workoutid
        from bike
        -- workoutid is the partition key, so filtering here leaves every
        -- window intact and avoids computing the half that is not updated.
        where workoutid > 326636695
        window w as (partition by workoutid order by time)
    ) as deltas
    -- No ORDER BY: row order in the CTE cannot influence UPDATE ... FROM and
    -- would only add full-table sorts. A zero time step is replaced by 1
    -- above so the division cannot fail.
)
update bike r1
  set altitude_second = d1.deriv
  from dev_list as d1
  where d1.workoutid = r1.workoutid and d1.time = r1.time and r1.workoutid > 326636695;
```


## Series length in bike_by_workout for use in elapsed distance calculation
```
-- Deliberately unfiltered: every bike_by_workout row gets the number of bike
-- samples recorded for its workout (0 when the workout has none).
update bike_by_workout as rbw
   set series_length = (
           select count(*)
           from bike as r
           where r.workoutid = rbw.workoutid
       );
```

```
select count(*) from bike_by_workout where series_length < 2;
 count 
-------
  3182
```

#### 3,182 / 347,556. ~0.9%.  Just ignore them when updating

## Elapsed distance


```
vacuum full verbose bike;
-- Elapsed distance for the lower half of the workoutid range. The workout's
-- distance is spread evenly over its samples: sample k (0-based, ordered by
-- time) receives k * distance / (series_length - 1).
-- Workouts with fewer than two samples are skipped, which also rules out a
-- division by zero.
-- NOTE(review): if distance and series_length are both integer columns the
-- division truncates; confirm the column types.
with elap_dist as (
    select time,
           r1.workoutid,
           (row_number() over (partition by r1.workoutid order by time) - 1) * (distance / (series_length - 1)) as elapsed_distance
    from bike r1
    join bike_by_workout r2 on (r1.workoutid = r2.workoutid)
    where series_length > 1
      -- workoutid is the partition key, so this filter leaves every window
      -- intact while skipping the half of the table that is not updated here.
      -- The former `order by time` was dropped: it cannot affect the update.
      and r1.workoutid <= 326636695
)
update bike as r1
  set elapsed_distance = round(d1.elapsed_distance,10)
  from elap_dist as d1
  where d1.workoutid = r1.workoutid and d1.time = r1.time and r1.workoutid <= 326636695;
```
```
vacuum full verbose bike;
-- Elapsed distance for the upper half of the workoutid range. The workout's
-- distance is spread evenly over its samples: sample k (0-based, ordered by
-- time) receives k * distance / (series_length - 1).
-- Workouts with fewer than two samples are skipped, which also rules out a
-- division by zero.
-- NOTE(review): if distance and series_length are both integer columns the
-- division truncates; confirm the column types.
with elap_dist as (
    select time,
           r1.workoutid,
           (row_number() over (partition by r1.workoutid order by time) - 1) * (distance / (series_length - 1)) as elapsed_distance
    from bike r1
    join bike_by_workout r2 on (r1.workoutid = r2.workoutid)
    where series_length > 1
      -- workoutid is the partition key, so this filter leaves every window
      -- intact while skipping the half of the table that is not updated here.
      -- The former `order by time` was dropped: it cannot affect the update.
      and r1.workoutid > 326636695
)
update bike as r1
  set elapsed_distance = round(d1.elapsed_distance,10)
  from elap_dist as d1
  where d1.workoutid = r1.workoutid and d1.time = r1.time and r1.workoutid > 326636695;
```

## Elapsed time

```
vacuum full verbose bike;
-- Elapsed time for the lower half of the workoutid range: each sample's time
-- minus the earliest time of its workout.
with elap_time as (
    select time,
           workoutid,
           time - first_value(time) over (partition by workoutid order by time) as time_elap
    from bike
    -- workoutid is the partition key, so this filter leaves every window
    -- intact while skipping the half of the table that is not updated here.
    -- The former `order by time` was dropped: it cannot affect the update.
    where workoutid <= 326636695
)
update bike as r1
  set elapsed_time = d1.time_elap
  from elap_time as d1
  where r1.workoutid = d1.workoutid and r1.time = d1.time and r1.workoutid <= 326636695;
```
```
vacuum full verbose bike;
-- Elapsed time for the upper half of the workoutid range: each sample's time
-- minus the earliest time of its workout.
with elap_time as (
    select time,
           workoutid,
           time - first_value(time) over (partition by workoutid order by time) as time_elap
    from bike
    -- workoutid is the partition key, so this filter leaves every window
    -- intact while skipping the half of the table that is not updated here.
    -- The former `order by time` was dropped: it cannot affect the update.
    where workoutid > 326636695
)
update bike as r1
  set elapsed_time = d1.time_elap
  from elap_time as d1
  where r1.workoutid = d1.workoutid and r1.time = d1.time and r1.workoutid > 326636695;
```


## Heart Rate

```
select * from histogram('heart_rate', 'bike');
```
```
 bucket |           range           |   freq   |       bar
--------+---------------------------+----------+-----------------
      1 | [-2604.00000,-2390.00000] |       26 |
      2 | [-2381.00000,-2168.00000] |       27 |
      3 | [-2160.00000,-1947.00000] |       27 |
      4 | [-1939.00000,-1729.00000] |       27 |
      5 | [-1721.00000,-1508.00000] |       27 |
      6 | [-1499.00000,-1289.00000] |       27 |
      7 | [-1281.00000,-1068.00000] |       27 |
      8 | [-1060.00000,-848.00000]  |       27 |
      9 | [-840.00000,-626.00000]   |       30 |
     10 | [-619.00000,-404.00000]   |       49 |
     11 | [-399.00000,-184.00000]   |      170 |
     12 | [-183.00000,36.00000]     |   147780 |
     13 | [37.00000,255.00000]      | 67328561 | ***************
     14 | [257.00000,473.00000]     |      187 |
     15 | [478.00000,693.00000]     |       84 |
     16 | [700.00000,913.00000]     |       73 |
     17 | [918.00000,1135.00000]    |       66 |
     18 | [1142.00000,1351.00000]   |       35 |
     19 | [1362.00000,1572.00000]   |       31 |
     20 | [1584.00000,1787.00000]   |       18 |
     21 | [1797.00000,1797.00000]   |        1 |
```

##### Lots of garbage data. Heart rate should never be above 240 or below 40. Even those are uncommon. Use the same two standard deviation logic as altitude. 

##### Fix values too high
```
-- Clamp high outliers: a heart_rate more than two standard deviations above
-- its workout's mean is pulled down to exactly mean + 2 * stddev.
with workout_stats as (
  select
    workoutid,
    avg(heart_rate) as mean_hr,
    stddev(heart_rate) as sd_hr
  from bike
  group by workoutid
)
update bike as target
  set heart_rate = stats.mean_hr + (stats.sd_hr * 2)
  from workout_stats as stats
  where stats.workoutid = target.workoutid
    and target.heart_rate > stats.mean_hr + (stats.sd_hr * 2);
```
```
UPDATE 495953
```


##### Fix values too low
```
-- Clamp low outliers: a heart_rate more than two standard deviations below
-- its workout's mean is raised to exactly mean - 2 * stddev.
-- (The assignment previously used mean + 2 * stddev, which moved low readings
-- to the top of the range instead of the bottom.)
with dev_list as (
  select avg(heart_rate), stddev(heart_rate), workoutid
  from bike
  group by workoutid
)
update bike as r1
  set heart_rate = d1.avg - (d1.stddev * 2)
  from dev_list as d1
  where d1.workoutid = r1.workoutid
    and r1.heart_rate < d1.avg - (d1.stddev * 2);
```
```
UPDATE 2305944
```

#### Better, but still too many below 40 and above 250
```
select * from histogram('heart_rate', 'bike');
```
```
      1 | [-2578.00000,-2390.00000] |       24 | 
      2 | [-2381.00000,-2185.00000] |       25 | 
      3 | [-2176.00000,-1996.00000] |       23 | 
      4 | [-1988.00000,-1792.00000] |       25 | 
      5 | [-1785.00000,-1598.00000] |       24 | 
      6 | [-1590.00000,-1402.00000] |       24 | 
      7 | [-1394.00000,-1207.00000] |       24 | 
      8 | [-1200.00000,-1011.00000] |       24 | 
      9 | [-1003.00000,-816.00000]  |       24 | 
     10 | [-807.00000,-619.00000]   |       24 | 
     11 | [-611.00000,-421.00000]   |       24 | 
     12 | [-413.00000,-227.00000]   |       24 | 
     13 | [-218.00000,-23.00000]    |      536 | 
     14 | [-19.00000,173.78621]     | 61414170 | ***************
     15 | [173.78703,368.00000]     |  6061960 | *
     16 | [371.00000,564.00000]     |      108 | 
     17 | [573.00000,762.00000]     |       84 | 
     18 | [764.00000,954.00000]     |       69 | 
     19 | [964.00000,1152.00000]    |       24 | 
     20 | [1162.00000,1350.00000]   |       17 | 
     21 | [1353.12415,1353.12415]   |       43 | 
```

##### Let's look at workouts that include heart_rates greater than 370 to find outliers
```
select heart_rate, time, workoutid from bike where workoutid in (select distinct(workoutid) from bike where heart_rate > 370) order by workoutid, time;
```


###### Looks like some workouts have garbage data or other units.

###### How many workouts is it?
```
select count(*) from  (select distinct(workoutid) from bike where heart_rate > 370) as foo;
```
```
 count 
-------
   6
```

##### 6 impacted workouts, so just drop the heart_rate for those workouts.
```
-- Discard heart_rate for every sample of any workout that contains at least
-- one reading above 370.
update bike
  set heart_rate = null
  where workoutid in (
    select workoutid
    from bike
    where heart_rate > 370
  );
```
```
UPDATE 2588
```


##### Lets look at workouts that include heart_rates less than 40 to find outliers
```
select heart_rate, time, workoutid from bike where workoutid in (select distinct(workoutid) from bike where heart_rate < 40 and heart_rate != 0) order by workoutid, time;
```
```
```


###### Found 21948, Looks like some workouts have garbage data or other units.

###### How many workouts is it?
```
select count(*) from  (select distinct(workoutid) from bike where heart_rate < 40) as foo;
```
```
 count 
-------
   386
```

##### 386 impacted workouts, so just drop the heart_rate for those workouts.
```
-- Discard heart_rate for every sample of any workout that contains at least
-- one reading below 40.
update bike
  set heart_rate = null
  where workoutid in (
    select workoutid
    from bike
    where heart_rate < 40
  );
```
```
UPDATE 165896
```


```
select * from histogram('heart_rate', 'bike');
```
```
 bucket |         range         |   freq   |       bar       
--------+-----------------------+----------+-----------------
      1 | [40.00000,53.01118]   |     7917 | 
      2 | [53.70816,66.00000]   |    30898 | 
      3 | [66.26454,79.31991]   |   132706 | 
      4 | [79.51359,92.34629]   |   394801 | 
      5 | [92.45286,105.53394]  |   891418 | *
      6 | [105.60919,118.63141] |  2348922 | **
      7 | [118.64713,131.74957] |  6796043 | *****
      8 | [131.75328,144.85615] | 15256396 | ************
      9 | [144.85843,157.96446] | 19151395 | ***************
     10 | [157.96517,171.07162] | 14849415 | ************
     11 | [171.07201,184.17886] |  5632441 | ****
     12 | [184.17908,197.28610] |  1243028 | *
     13 | [197.28629,210.38916] |   316159 | 
     14 | [210.39501,223.46777] |   133062 | 
     15 | [223.50918,236.56273] |    80148 | 
     16 | [236.62393,249.70201] |    35657 | 
     17 | [249.84802,262.80312] |     5539 | 
     18 | [262.82886,275.81747] |     2236 | 
     19 | [275.94231,288.05198] |      577 | 
     20 | [289.16319,297.90789] |       41 | 
     21 | [302.14367,302.14367] |       17 | 
```

#### Count those greater than 250.
```
select count(*) from bike where heart_rate > 250;
```
```
 count 
-------
  7730
```

```
-- How many readings are both above 250 and more than two standard deviations
-- above their own workout's mean?
with workout_stats as (
  select
    workoutid,
    avg(heart_rate) as mean_hr,
    stddev(heart_rate) as sd_hr
  from bike
  group by workoutid
)
select count(*)
from bike as sample
join workout_stats as stats
  on stats.workoutid = sample.workoutid
where sample.heart_rate > 250
  and sample.heart_rate > stats.mean_hr + stats.sd_hr * 2;
```
```
 count 
-------
  3368
```

#### Let's smooth values that are above both 250 and the workout's mean + 2 std devs down to mean + 2 std devs, then cap whatever remains at 250.
```
-- Readings above 250 that also exceed their workout's mean + 2 * stddev are
-- lowered to that mean + 2 * stddev threshold.
with workout_stats as (
  select
    workoutid,
    avg(heart_rate) as mean_hr,
    stddev(heart_rate) as sd_hr
  from bike
  group by workoutid
)
update bike as target
  set heart_rate = stats.mean_hr + (stats.sd_hr * 2)
  from workout_stats as stats
  where target.heart_rate > 250
    and stats.workoutid = target.workoutid
    and target.heart_rate > stats.mean_hr + (stats.sd_hr * 2);
```

```
UPDATE 3368
```

```
update bike 
  set heart_rate = 250 where heart_rate > 250;
```
```

```


```
vacuum full verbose bike;
```

#### (Re)Generate moving heart rate average over last 25 points

```
-- Recompute heart_rate_ma_25 for workoutid > 326636695: a trailing average of
-- heart_rate within each workout, ordered by time.
-- NOTE(review): the frame "25 preceding and current row" spans up to 26 rows
-- (25 earlier samples plus the current one). If exactly 25 points are wanted,
-- the frame should start at 24 preceding -- confirm against the other *_ma_25
-- columns before changing.
vacuum full verbose bike;
with dev_list as (
	select time,
	       workoutid, 
	       avg(heart_rate) over (partition by workoutid order by time rows between 25 preceding and current row) as mavg 
	from bike 
	order by time
)
update bike r1 set heart_rate_ma_25 = d1.mavg from dev_list as d1 where d1.workoutid = r1.workoutid and d1.time = r1.time  and r1.workoutid > 326636695;
```

```
```

```
-- Recompute heart_rate_ma_25 for workoutid <= 326636695: a trailing average of
-- heart_rate within each workout, ordered by time.
-- NOTE(review): the frame "25 preceding and current row" spans up to 26 rows
-- (25 earlier samples plus the current one). If exactly 25 points are wanted,
-- the frame should start at 24 preceding -- confirm against the other *_ma_25
-- columns before changing.
vacuum full verbose bike;
with dev_list as (
	select time,
	       workoutid, 
	       avg(heart_rate) over (partition by workoutid order by time rows between 25 preceding and current row) as mavg 
	from bike 
	order by time
)
update bike r1 set heart_rate_ma_25 = d1.mavg from dev_list as d1 where d1.workoutid = r1.workoutid and d1.time = r1.time  and r1.workoutid <= 326636695;
```


```
```

```
select * from histogram('heart_rate_ma_25', 'bike');
```

```

```

### geo_distance
```
alter table bike add column geo_distance numeric(20,10);
```

```
vacuum full verbose bike;
-- geo_distance is the Euclidean length of the step from the previous sample
-- of the same workout, computed directly on the latitude/longitude
-- differences. The first sample of a workout has no predecessor, so its
-- value is NULL. This statement covers workoutid > 326636695.
with point_steps as (
  select
    time,
    workoutid,
    sqrt(power(lat_difference, 2) + power(long_difference, 2)) as geo_distance
  from (
    select
      time,
      workoutid,
      latitude - lag(latitude) over w as lat_difference,
      longitude - lag(longitude) over w as long_difference
    from bike
    window w as (partition by workoutid order by time)
    order by time
  ) as deltas
)
update bike as target
  set geo_distance = round(step.geo_distance, 10)
  from point_steps as step
  where step.workoutid = target.workoutid
    and step.time = target.time
    and target.workoutid > 326636695;
```

```
vacuum full verbose bike;
-- Same step-length computation as the previous statement (Euclidean length of
-- the latitude/longitude difference to the preceding sample in the workout),
-- applied to workoutid <= 326636695.
with point_steps as (
  select
    time,
    workoutid,
    sqrt(power(lat_difference, 2) + power(long_difference, 2)) as geo_distance
  from (
    select
      time,
      workoutid,
      latitude - lag(latitude) over w as lat_difference,
      longitude - lag(longitude) over w as long_difference
    from bike
    window w as (partition by workoutid order by time)
    order by time
  ) as deltas
)
update bike as target
  set geo_distance = round(step.geo_distance, 10)
  from point_steps as step
  where step.workoutid = target.workoutid
    and step.time = target.time
    and target.workoutid <= 326636695;
```


#### speed_by_geo
```
alter table bike add column speed_by_geo numeric(20,10);
```

```
vacuum full verbose bike;
-- speed_by_geo = geo_distance divided by the time gap to the previous sample
-- of the same workout. A gap of exactly 0 is replaced by 1 to avoid division
-- by zero; the first sample of a workout has a NULL gap and therefore a NULL
-- speed. This statement covers workoutid <= 326636695.
with raw_gaps as (
  select
    workoutid,
    time,
    geo_distance,
    time - lag(time) over (partition by workoutid order by time) as time_difference
  from bike
  order by time
),
safe_gaps as (
  select
    workoutid,
    time,
    geo_distance,
    case when time_difference = 0 then 1 else time_difference end as time_difference
  from raw_gaps
)
update bike as target
  set speed_by_geo = gap.geo_distance / gap.time_difference
  from safe_gaps as gap
  where target.workoutid = gap.workoutid
    and target.time = gap.time
    and target.workoutid <= 326636695;
```

```
vacuum full verbose bike;
-- Same computation as the previous statement (geo_distance over the time gap
-- to the preceding sample, with a 0 gap treated as 1), applied to
-- workoutid > 326636695.
with raw_gaps as (
  select
    workoutid,
    time,
    geo_distance,
    time - lag(time) over (partition by workoutid order by time) as time_difference
  from bike
  order by time
),
safe_gaps as (
  select
    workoutid,
    time,
    geo_distance,
    case when time_difference = 0 then 1 else time_difference end as time_difference
  from raw_gaps
)
update bike as target
  set speed_by_geo = gap.geo_distance / gap.time_difference
  from safe_gaps as gap
  where target.workoutid = gap.workoutid
    and target.time = gap.time
    and target.workoutid > 326636695;
```



## BIKE_BY_WORKOUT derived fields

#### altitude_max. Not derived, worth checking if our changes impact this.

```
with alt as (select max(altitude) as mx, min(altitude) as mn, workoutid from bike group by workoutid)
select rbw.workoutid, altitude_max - mx from bike_by_workout rbw join alt on (rbw.workoutid = alt.workoutid) where round(mx) != round(rbw.altitude_max) order by mx - rbw.altitude_max; 
```
```
 330516190 | 11122.7073674802
 329706837 | 10809.7602832570
 587186790 | 10015.0780456665
 555614260 |  9929.2362597201
...
 ```

#### altitude_min. Not derived, worth checking if our changes impact this.

```
with alt as (select max(altitude) as mx, min(altitude) as mn, workoutid from bike group by workoutid)
select rbw.workoutid, altitude_min - mn from bike_by_workout rbw join alt on (rbw.workoutid = alt.workoutid) where round(mn) != round(rbw.altitude_min) order by mn - rbw.altitude_min asc; 
```
```
...
 361394650 | -2655.7451394907
 303853730 | -2664.3155222158
 416246180 | -2734.7073505368
 497010281 | -2745.7315130195
 589267802 | -2749.5074751865
 250964111 | -3519.4471997389
 303011701 | -4610.1849504942
  10914476 | -5466.2500000000
  14961570 | -8336.7500000000
```

## Should not use those, derive from workout series and use those instead.

#### Add altitude_max2 & altitude_min2.
```
-- Columns that will hold the per-workout altitude extremes derived from bike.
alter table bike_by_workout
  add column altitude_max2 numeric(20,10),
  add column altitude_min2 numeric(20,10);
```

```
-- Fill altitude_max2 / altitude_min2 with the max and min altitude found in
-- each workout's samples.
with altitude_range as (
  select
    workoutid,
    max(altitude) as highest,
    min(altitude) as lowest
  from bike
  group by workoutid
)
update bike_by_workout as summary
  set altitude_max2 = rng.highest,
      altitude_min2 = rng.lowest
  from altitude_range as rng
  where rng.workoutid = summary.workoutid;
```


### heart_rate_avg, heart_rate_max
```
alter table bike_by_workout add column heart_rate_avg numeric(20,10);
alter table bike_by_workout add column heart_rate_max numeric(20,10);
```

```
with alt as (select max(heart_rate) as mx, avg(heart_rate) as av, workoutid from bike group by workoutid)
update bike_by_workout as r1
  set heart_rate_avg = d1.av, heart_rate_max = d1.mx
  from alt as d1 where d1.workoutid = r1.workoutid;
```



### speed_avg, speed_max
```
alter table bike_by_workout add column speed_avg2 numeric(20,10);
alter table bike_by_workout add column speed_max2 numeric(20,10);
```

```
with alt as (select max(speed) as mx, avg(speed) as av, workoutid from bike group by workoutid)
update bike_by_workout as r1
  set speed_avg2 = d1.av, speed_max2 = d1.mx
  from alt as d1 where d1.workoutid = r1.workoutid;
```


### geo_distance
```
-- Holds the SUM of bike.geo_distance per workout. bike.geo_distance is
-- numeric(20,10), so the total needs at least the same precision; the
-- narrower numeric(15,10) could raise a numeric overflow during the update.
alter table bike_by_workout add column geo_distance numeric(20,10);
```

```
with alt as (select sum(geo_distance) as su, workoutid from bike group by workoutid)
update bike_by_workout as r1
  set geo_distance = d1.su
  from alt as d1 where d1.workoutid = r1.workoutid;
```


### series_length
```
-- Number of samples in the workout (filled by the update that follows).
-- Table name corrected from the misspelled bike_by_woerkout.
alter table bike_by_workout add column series_length integer;
```

```
-- series_length = number of bike rows belonging to the workout.
with sample_counts as (
  select
    workoutid,
    count(*) as samples
  from bike
  group by workoutid
)
update bike_by_workout as summary
  set series_length = cnt.samples
  from sample_counts as cnt
  where cnt.workoutid = summary.workoutid;
```


### Elapsed time
```
-- Span between the first and last sample time of the workout (filled by the
-- update that follows). Table name corrected from the misspelled
-- bike_by_woerkout.
alter table bike_by_workout add column elapsed_time integer;
```
```
-- elapsed_time = latest sample time minus earliest sample time per workout.
with time_spans as (
  select
    workoutid,
    max(time) - min(time) as span
  from bike
  group by workoutid
)
update bike_by_workout as summary
  set elapsed_time = spans.span
  from time_spans as spans
  where spans.workoutid = summary.workoutid;
```

#### Lots of elapsed times > 4 hours. Seems unlikely
```
select workoutid, elapsed_time / 60 / 60 from bike_by_workout where elapsed_time is not null order by elapsed_time desc limit 10;
```

```
 597722810 |    59808
 112339593 |    49897
 379758644 |    41328
 372999554 |    40193
 418433931 |    39641
 360494498 |    39617
 358411686 |    39521
 469082111 |    36623
 313885817 |    35126
 291721039 |    34352
 ```
 
#### Each one is a bit different, first one looks like good data up to a point and then became garbage. Drop the garbage

```
delete from bike where workoutid = 597722810 and time > 1441685063;
```

#### Second starts with garbage data
```
delete from bike where workoutid = 112339593 and time < 1355052658;
```

#### Try a few more manually to find a trend
```
delete from bike where workoutid = 291721039 and time > 1391089574;
```

```
delete from bike where workoutid = 379758644  and time > 1406301674 ;
```

```
delete from bike where workoutid = 372999554 and time < 1405347145 ;
```

```
delete from bike where workoutid = 418433931 and time > 1411875066 ;
```

```
delete from bike where workoutid = 360494498;
```

```
delete from bike where workoutid = 358411686;
```

```
delete from bike where workoutid = 469082111;
```

```
delete from bike where workoutid = 313885817 and time > 1388301792;
```

#### Update elapsed_time

```
with alt as (select max(time) - min(time) as m, workoutid from bike group by workoutid)
update bike_by_workout as r1
  set elapsed_time = d1.m
  from alt as d1 where d1.workoutid = r1.workoutid;
```

#### Update series_length. A few of them had only a small number of data points; we can just drop those altogether

```
with alt as (select count(*) as su, workoutid from bike group by workoutid)
update bike_by_workout as r1
  set series_length = d1.su
  from alt as d1 where d1.workoutid = r1.workoutid;
```

```
select elapsed_time, series_length from bike_by_workout where elapsed_time > 60*60*8 and series_length < 30;
```

##### Found 29

```
delete from bike where workoutid in (select workoutid from bike_by_workout where elapsed_time > 60*60*8 and series_length < 30);
```

```
DELETE 278
```

##### Cleanup
```
with alt as (select max(time) - min(time) as m, workoutid from bike group by workoutid)
update bike_by_workout as r1
  set elapsed_time = d1.m
  from alt as d1 where d1.workoutid = r1.workoutid;
```
```
with alt as (select count(*) as su, workoutid from bike group by workoutid)
update bike_by_workout as r1
  set series_length = d1.su
  from alt as d1 where d1.workoutid = r1.workoutid;
```

```
-- Remove summary rows for workouts that no longer have any samples.
-- The series_length refresh only updates workouts that still have rows in
-- bike, so a fully deleted workout keeps its old non-zero series_length and
-- "series_length = 0" alone never matches it. Check for missing rows instead.
delete from bike_by_workout where series_length = 0;
delete from bike_by_workout as rbw
  where not exists (select 1 from bike as b where b.workoutid = rbw.workoutid);
```

##### If a workout is really long it should have a lot of data points, so let's see how many actually have a large number
```
select elapsed_time, series_length from bike_by_workout where elapsed_time > 60*60*8 and series_length > 300 order by series_length;
```


##### Found 1575 / 1780

```
select altitude, heart_rate, latitude, longitude, speed, workoutid from bike where workoutid in (select workoutid from bike_by_workout where elapsed_time > 60*60*8) and latitude is not null and longitude is not null group by altitude, heart_rate, latitude, longitude, speed, workoutid having count(*) > 100;
```

#### Remove workouts that have a lot of repeat data and a big time jump...

```
delete from bike where workoutid in (select workoutid from bike where workoutid in (select workoutid from bike_by_workout where elapsed_time > 60*60*8) and  latitude is not null and longitude is not null group by altitude, heart_rate, latitude, longitude, speed, workoutid having count(*) > 100);
```

##### Cleanup
```
with alt as (select max(time) - min(time) as m, workoutid from bike group by workoutid)
update bike_by_workout as r1
  set elapsed_time = d1.m
  from alt as d1 where d1.workoutid = r1.workoutid;
with alt as (select count(*) as su, workoutid from bike group by workoutid)
update bike_by_workout as r1
  set series_length = d1.su
  from alt as d1 where d1.workoutid = r1.workoutid;
```

```
-- Remove summary rows whose workout has no samples left in bike.
-- NOT EXISTS is used instead of NOT IN: NOT IN deletes nothing at all if the
-- subquery returns even one NULL workoutid, and NOT EXISTS lets the planner
-- use an anti-join rather than probing a large distinct list.
delete from bike_by_workout where series_length = 0;
delete from bike_by_workout as rbw
  where not exists (select 1 from bike as b where b.workoutid = rbw.workoutid);
```

#### Lets see how many remain.

```
select count(*) from bike_by_workout where elapsed_time > 60*60*8 and series_length < 300;
```

#### Found 205

```
delete from bike where workoutid in (select workoutid from bike_by_workout where elapsed_time > 60*60*8 and series_length < 300);
```

```
with alt as (select max(time) - min(time) as m, workoutid from bike group by workoutid)
update bike_by_workout as r1
  set elapsed_time = d1.m
  from alt as d1 where d1.workoutid = r1.workoutid;
with alt as (select count(*) as su, workoutid from bike group by workoutid)
update bike_by_workout as r1
  set series_length = d1.su
  from alt as d1 where d1.workoutid = r1.workoutid;
```

```
-- Remove summary rows whose workout has no samples left in bike.
-- NOT EXISTS is used instead of NOT IN: NOT IN deletes nothing at all if the
-- subquery returns even one NULL workoutid, and NOT EXISTS lets the planner
-- use an anti-join rather than probing a large distinct list.
delete from bike_by_workout where series_length = 0;
delete from bike_by_workout as rbw
  where not exists (select 1 from bike as b where b.workoutid = rbw.workoutid);
```

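#### Taking too long. Let's find the median time value of each workout and then remove the points that are more than 8 hours away from it. Postgres has no built-in `median` aggregate (on 9.4+ `percentile_cont(0.5) within group (order by ...)` is an alternative), so define one.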

```
-- Final function for the median aggregate below.
-- Takes the array of collected values, sorts them, and averages the middle
-- element (odd-length array: LIMIT 1) or the two middle elements
-- (even-length array: LIMIT 2). OFFSET skips to the first middle element.
-- NOTE(review): array_upper counts every array element, including NULLs
-- appended by the aggregate; confirm the aggregated column has no NULLs or
-- the middle position will be shifted.
CREATE OR REPLACE FUNCTION _final_median(NUMERIC[])
   RETURNS NUMERIC AS
$$
   SELECT AVG(val)
   FROM (
     SELECT val
     FROM unnest($1) val
     ORDER BY 1
     LIMIT  2 - MOD(array_upper($1, 1), 2)
     OFFSET CEIL(array_upper($1, 1) / 2.0) - 1
   ) sub;
$$
LANGUAGE 'sql' IMMUTABLE;
 
-- median(NUMERIC): accumulates every input value into a NUMERIC[] (starting
-- from an empty array) via array_append, then hands the array to
-- _final_median to produce the result.
CREATE AGGREGATE median(NUMERIC) (
  SFUNC=array_append,
  STYPE=NUMERIC[],
  FINALFUNC=_final_median,
  INITCOND='{}'
);

```

```
-- For workouts whose elapsed_time exceeds 8 hours, drop samples that lie more
-- than 8 hours after the workout's median sample time.
-- The median is computed once per affected workout in the CTE; the previous
-- correlated subquery re-ran the median for every candidate row.
with workout_median as (
  select b.workoutid, median(b.time) as median_time
  from bike as b
  join bike_by_workout as rbw on rbw.workoutid = b.workoutid
  where rbw.elapsed_time > 60*60*8
  group by b.workoutid
)
delete from bike as r1
  using workout_median as wm
  where r1.workoutid = wm.workoutid
    and r1.time > 60*60*8 + wm.median_time;
```
  

```
-- For workouts whose elapsed_time exceeds 8 hours, drop samples that lie more
-- than 8 hours before the workout's median sample time.
-- The median is computed once per affected workout in the CTE; the previous
-- correlated subquery re-ran the median for every candidate row.
with workout_median as (
  select b.workoutid, median(b.time) as median_time
  from bike as b
  join bike_by_workout as rbw on rbw.workoutid = b.workoutid
  where rbw.elapsed_time > 60*60*8
  group by b.workoutid
)
delete from bike as r1
  using workout_median as wm
  where r1.workoutid = wm.workoutid
    and r1.time < -60*60*8 + wm.median_time;
```

```
--commit;
--rollback;
```

## Time since last workout

```
alter table bike_by_workout add column time_since_last_workout integer;
```
```
-- time_since_last_workout = this workout's start_time minus the start_time of
-- the same user's previous workout (NULL for a user's first workout).
with workout_gaps as (
  select
    workoutid,
    start_time - lag(start_time) over (partition by userid order by start_time) as time_since
  from bike_by_workout
  order by start_time
)
update bike_by_workout as summary
  set time_since_last_workout = gap.time_since
  from workout_gaps as gap
  where gap.workoutid = summary.workoutid;
```


### series_time_delta

```
alter table bike_by_workout add column series_time_delta integer[];
```

```
-- series_time_delta = the workout's sample-to-sample time gaps, in time order.
-- The ordering is stated inside array_agg: an ORDER BY in the subquery is not
-- guaranteed to survive the GROUP BY, so the array could otherwise come out
-- in arbitrary order. The first sample has no predecessor (NULL gap) and is
-- left out of the array.
with deltas as (
  select
    workoutid,
    time,
    time - lag(time) over (partition by workoutid order by time) as tdelta
  from bike
),
alt as (
  select workoutid, array_agg(tdelta order by time) as tdelta_array
  from deltas
  where tdelta is not null
  group by workoutid
)
update bike_by_workout as r1
  set series_time_delta = d1.tdelta_array
  from alt as d1 where d1.workoutid = r1.workoutid;
```



### series_time_delta_average
```
alter table bike_by_workout add column series_time_delta_average numeric(20,10);
```

```
-- array_avg(double precision[]): arithmetic mean of the array's elements,
-- computed by unnesting the array and applying avg(). An empty or NULL array
-- yields no rows for avg(), so the result is NULL.
-- NOTE(review): the caller passes an integer[] column; this presumably relies
-- on array coercion to double precision[] -- verify.
CREATE OR REPLACE FUNCTION array_avg(double precision[])
RETURNS double precision AS $$
SELECT avg(v) FROM unnest($1) g(v)
$$ LANGUAGE sql;
```

```
update bike_by_workout
  set series_time_delta_average = array_avg(series_time_delta);
```

# '

# '