## RUN
```
select count(*) from run;
```
```
   count   
-----------
 140,057,634
```


### Optimal ordering for most operations
```
cluster verbose run using run_workoutid_idx;
```

### REMOVE DUPES.

#### Count the number of dupes

```
-- Number of distinct (altitude, heart_rate, latitude, longitude, speed,
-- workoutid, time) combinations that occur more than once in run.
with dupe_groups as (
    select count(*) as copies
    from run
    group by altitude, heart_rate, latitude, longitude, speed, workoutid, time
    having count(*) > 1
)
select count(*)
from dupe_groups;
```
```
 count  
--------
 136326
```

#### Remove the dupes

```
-- Add a throwaway surrogate key so otherwise-identical rows can be told apart.
alter table run add column id serial primary key;

-- Within each group of identical readings keep the row with the lowest id
-- and delete every later copy.
with ranked as (
    select id,
           row_number() over (
               partition by altitude, heart_rate, latitude, longitude, speed, workoutid, time
               order by id
           ) as copy_no
    from run
)
delete from run
using ranked
where run.id = ranked.id
  and ranked.copy_no > 1;

-- The surrogate key was only needed for the delete.
alter table run drop column id;
```
```
DELETE 136868
```

```
select count(*) from (select count(*) from run group by altitude, heart_rate, latitude, longitude, speed, workoutid, time having count(*) > 1) as foo;
```

```
 count 
-------
     0
```

### Vacuum
```
vacuum full run;
```

## Speed

### Value given in kph. Range should be from 0-50.

```
select * from histogram('speed', 'run');
```
```
 bucket |                range                |   freq   |       bar
--------+-------------------------------------+----------+-----------------
      1 | [-1056.5865000000,1951.4200000000]  | 62909689 | ***************
      2 | [2209.0300000000,4585.4600000000]   |        5 |
      3 | [5865.0600000000,5865.0600000000]   |        1 |
      4 | [8118.3600000000,8118.3600000000]   |        1 |
      5 | [12446.2000000000,13525.5000000000] |        5 |
      6 | [14311.3000000000,14311.3000000000] |        1 |
     10 | [27818.9000000000,27818.9000000000] |        1 |
     17 | [47554.0000000000,47554.0000000000] |        1 |
     21 | [59500.2000000000,59500.2000000000] |        1 |
```

#### Simply removing data has implications; there is a relatively small number of outliers, so we will just smooth them.

#### Let's find the average speed and smooth them to that.

```
-- Mean of the plausible speeds only (strictly between 0 and 50).
select avg(speed) as average
from run
where speed > 0
  and speed < 50;
```
```
       average       
---------------------
 11.1156684465806305
```

### Still too many > 50 kph (about 31 mph; no one runs that fast).

```
-- Replace implausibly fast readings with 11, the rounded mean of the
-- in-range speeds.
update run
set speed = 11
where speed > 50;
```
```
UPDATE 28903
```

```
select * from histogram('speed', 'run');
```
```
 bucket |                range                |   freq   |       bar       
--------+-------------------------------------+----------+-----------------
      1 | [-1056.5865000000,-1012.0039000000] |        2 | 
      3 | [-909.6679700000,-909.6679700000]   |        1 | 
     13 | [-365.4199200000,-365.4199200000]   |        1 | 
     16 | [-218.5532200000,-188.3548100000]   |        3 | 
     17 | [-142.0299200000,-142.0299200000]   |        1 | 
     18 | [-107.4442140000,-72.7328700000]    |        3 | 
     19 | [-60.1526680000,-5.3368900000]      |      544 | 
     20 | [-5.3074465000,49.9968000000]       | 62909148 | ***************
     21 | [50.0000000000,50.0000000000]       |        2 | 
```

### Still too many < 0kph.

```
-- Replace negative readings with 11, the rounded mean of the in-range speeds.
update run
set speed = 11
where speed < 0;
```
```
UPDATE 28903
```

```
select * from histogram('speed', 'run');
```
```
 bucket |                range                |   freq   |       bar       
--------+-------------------------------------+----------+-----------------
      1 | [-1056.5865000000,-1012.0039000000] |        2 | 
      3 | [-909.6679700000,-909.6679700000]   |        1 | 
     13 | [-365.4199200000,-365.4199200000]   |        1 | 
     16 | [-218.5532200000,-188.3548100000]   |        3 | 
     17 | [-142.0299200000,-142.0299200000]   |        1 | 
     18 | [-107.4442140000,-72.7328700000]    |        3 | 
     19 | [-60.1526680000,-5.3368900000]      |      544 | 
     20 | [-5.3074465000,49.9968000000]       | 62909148 | ***************
     21 | [50.0000000000,50.0000000000]       |        2 | 
```


#### Much better. Let's generate the first derivative of speed.
##### The update has to be split into two halves by workoutid in order to finish.

```
vacuum full verbose run;

-- First derivative of speed: change in speed divided by change in time
-- between consecutive samples of the same workout (ordered by time),
-- rounded to 5 decimals and written to run.speed_first.
-- This statement handles the upper half of the workoutid range. The same
-- filter is applied inside the CTE so the window pass only reads the
-- workouts that are actually updated; because the window is partitioned by
-- workoutid, filtering on workoutid first does not change any result.
with deltas as (
    select workoutid,
           time,
           speed - lag(speed) over w as speed_difference,
           time - lag(time) over w as time_difference
    from run
    where workoutid > 674628540 / 2
    window w as (partition by workoutid order by time)
),
dev_list as (
    select workoutid,
           time,
           -- A zero time step is treated as 1 to avoid dividing by zero.
           -- The first sample of a workout has no predecessor, so lag()
           -- yields NULL and the derivative is NULL.
           round(speed_difference
                 / case when time_difference = 0 then 1 else time_difference end, 5) as deriv
    from deltas
)
update run r1
  set speed_first = d1.deriv
  from dev_list as d1
  where d1.workoutid = r1.workoutid and
        d1.time = r1.time and
        r1.workoutid > 674628540 / 2;
```
```
vacuum full verbose run;

-- First derivative of speed: change in speed divided by change in time
-- between consecutive samples of the same workout (ordered by time),
-- rounded to 5 decimals and written to run.speed_first.
-- This statement handles the lower half of the workoutid range. The same
-- filter is applied inside the CTE so the window pass only reads the
-- workouts that are actually updated; because the window is partitioned by
-- workoutid, filtering on workoutid first does not change any result.
with deltas as (
    select workoutid,
           time,
           speed - lag(speed) over w as speed_difference,
           time - lag(time) over w as time_difference
    from run
    where workoutid <= 674628540 / 2
    window w as (partition by workoutid order by time)
),
dev_list as (
    select workoutid,
           time,
           -- A zero time step is treated as 1 to avoid dividing by zero.
           -- The first sample of a workout has no predecessor, so lag()
           -- yields NULL and the derivative is NULL.
           round(speed_difference
                 / case when time_difference = 0 then 1 else time_difference end, 5) as deriv
    from deltas
)
update run r1
  set speed_first = d1.deriv
  from dev_list as d1
  where d1.workoutid = r1.workoutid and
        d1.time = r1.time and
        r1.workoutid <= 674628540 / 2;
```


### Sanity check values
```
select * from histogram('speed_first', 'run');
```
```
 bucket |         range         |   freq   |       bar       
--------+-----------------------+----------+-----------------
      1 | [-47.73240,-42.98813] |      234 | 
      2 | [-42.96152,-38.20114] |      868 | 
      3 | [-38.19191,-33.42419] |      937 | 
      4 | [-33.41972,-28.65955] |      609 | 
      5 | [-28.64880,-23.89011] |      601 | 
      6 | [-23.87152,-19.11500] |     1339 | 
      7 | [-19.11278,-14.34612] |     4012 | 
      8 | [-14.34480,-9.57550]  |    13702 | 
      9 | [-9.57527,-4.80565]   |    66119 | 
     10 | [-4.80561,-0.03601]   | 21209799 | ********
     11 | [-0.03600,4.73362]    | 41337319 | ***************
     12 | [4.73367,9.50317]     |    74429 | 
     13 | [9.50350,14.27250]    |    15557 | 
     14 | [14.27307,19.03956]   |     6717 | 
     15 | [19.04283,23.80343]   |     3242 | 
     16 | [23.81516,28.58019]   |     1401 | 
     17 | [28.58248,33.34351]   |     1072 | 
     18 | [33.35423,38.11226]   |      963 | 
     19 | [38.12325,42.88530]   |      912 | 
     20 | [42.89200,46.93680]   |     3453 | 
     21 | [47.66040,47.66040]   |        1 | 
```

#### (Re)Generate moving averages

##### Over 50 points
```
vacuum full verbose run;

-- Trailing moving average of speed per workout (ordered by time), written
-- to run.speed_ma_50 for the lower half of the workoutid range.
-- NOTE: the frame "50 preceding and current row" spans up to 51 samples.
with dev_list as (
    select time,
           workoutid,
           avg(speed) over (
               partition by workoutid
               order by time
               rows between 50 preceding and current row
           ) as mavg
    from run
    -- Same half as the update below, so the window pass skips workouts this
    -- statement never writes to. The window is partitioned by workoutid, so
    -- the averages of the remaining workouts are unaffected.
    where workoutid <= 674628540 / 2
)
update run r1
  set speed_ma_50 = d1.mavg
  from dev_list as d1
  where d1.workoutid = r1.workoutid and
        d1.time = r1.time and
        r1.workoutid <= 674628540 / 2;
```

```
vacuum full verbose run;

-- Trailing moving average of speed per workout (ordered by time), written
-- to run.speed_ma_50 for the upper half of the workoutid range.
-- NOTE: the frame "50 preceding and current row" spans up to 51 samples.
with dev_list as (
    select time,
           workoutid,
           avg(speed) over (
               partition by workoutid
               order by time
               rows between 50 preceding and current row
           ) as mavg
    from run
    -- Same half as the update below, so the window pass skips workouts this
    -- statement never writes to. The window is partitioned by workoutid, so
    -- the averages of the remaining workouts are unaffected.
    where workoutid > 674628540 / 2
)
update run r1
  set speed_ma_50 = d1.mavg
  from dev_list as d1
  where d1.workoutid = r1.workoutid and
        d1.time = r1.time and
        r1.workoutid > 674628540 / 2;
    
```


```
select * from histogram('speed_ma_50', 'run');
```
```
 bucket |        range        |   freq   |       bar
--------+---------------------+----------+-----------------
      1 | [0.00000,2.49974]   |   723959 |
      2 | [2.49975,4.99948]   |  1065301 | *
      3 | [4.99949,7.49923]   |  3474418 | **
      4 | [7.49924,9.99897]   | 15263434 | ********
      5 | [9.99898,12.49872]  | 28202884 | ***************
      6 | [12.49873,14.99846] | 11414483 | ******
      7 | [14.99847,17.49821] |  1854616 | *
      8 | [17.49823,19.99793] |   318389 |
      9 | [19.99796,22.49767] |   154493 |
     10 | [22.49772,24.99744] |   116133 |
     11 | [24.99746,27.49719] |    91617 |
     12 | [27.49720,29.99693] |    79134 |
     13 | [29.99694,32.49668] |    65711 |
     14 | [32.49669,34.99622] |    38678 |
     15 | [34.99645,37.49591] |    19543 |
     16 | [37.49623,39.99531] |     9560 |
     17 | [39.99690,42.49418] |     5338 |
     18 | [42.49669,44.99466] |     5570 |
     19 | [44.99922,47.47320] |     6339 |
     20 | [47.49917,49.85640] |      103 |
     21 | [49.99490,49.99490] |        2 |
```




##### Over 100 points
```
vacuum full verbose run;

-- Trailing moving average of speed per workout (ordered by time), written
-- to run.speed_ma_100 for the upper half of the workoutid range.
-- NOTE: the frame "100 preceding and current row" spans up to 101 samples.
with dev_list as (
    select time,
           workoutid,
           avg(speed) over (
               partition by workoutid
               order by time
               rows between 100 preceding and current row
           ) as mavg
    from run
    -- Same half as the update below, so the window pass skips workouts this
    -- statement never writes to. The window is partitioned by workoutid, so
    -- the averages of the remaining workouts are unaffected.
    where workoutid > 674628540 / 2
)
update run r1
  set speed_ma_100 = d1.mavg
  from dev_list as d1
  where d1.workoutid = r1.workoutid and
        d1.time = r1.time and
        r1.workoutid > 674628540 / 2;
```

```
vacuum full verbose run;

-- Trailing moving average of speed per workout (ordered by time), written
-- to run.speed_ma_100 for the lower half of the workoutid range.
-- NOTE: the frame "100 preceding and current row" spans up to 101 samples.
with dev_list as (
    select time,
           workoutid,
           avg(speed) over (
               partition by workoutid
               order by time
               rows between 100 preceding and current row
           ) as mavg
    from run
    -- Same half as the update below, so the window pass skips workouts this
    -- statement never writes to. The window is partitioned by workoutid, so
    -- the averages of the remaining workouts are unaffected.
    where workoutid <= 674628540 / 2
)
update run r1
  set speed_ma_100 = d1.mavg
  from dev_list as d1
  where d1.workoutid = r1.workoutid and
        d1.time = r1.time and
        r1.workoutid <= 674628540 / 2;
```

```
select * from histogram('speed_ma_100', 'run');
```
```
 bucket |        range        |   freq   |       bar
--------+---------------------+----------+-----------------
      1 | [0.00000,2.49974]   |   685770 |
      2 | [2.49975,4.99948]   |   989860 | *
      3 | [4.99949,7.49923]   |  3381266 | **
      4 | [7.49924,9.99897]   | 15627434 | ********
      5 | [9.99898,12.49872]  | 28507149 | ***************
      6 | [12.49873,14.99846] | 11105122 | ******
      7 | [14.99847,17.49821] |  1722350 | *
      8 | [17.49823,19.99795] |   308469 |
      9 | [19.99796,22.49770] |   161878 |
     10 | [22.49771,24.99742] |   112155 |
     11 | [24.99749,27.49717] |    88815 |
     12 | [27.49720,29.99688] |    81664 |
     13 | [29.99695,32.49654] |    67296 |
     14 | [32.49679,34.99641] |    34831 |
     15 | [34.99649,37.49596] |    14112 |
     16 | [37.49623,39.99556] |     6428 |
     17 | [39.99609,42.49466] |     4578 |
     18 | [42.49733,44.99406] |     5261 |
     19 | [44.99608,47.47320] |     5162 |
     20 | [47.49917,49.85640] |      103 |
     21 | [49.99490,49.99490] |        2 |
```

## Altitude

### Big range of altitudes, some too low, some too high


```
select * from histogram('altitude', 'run');
```
```

```

#### Let's see how many rows are more than two standard deviations from the mean

```
-- Count samples whose altitude lies more than two standard deviations away
-- from the mean altitude of their own workout.
with workout_stats as (
    select workoutid,
           avg(altitude) as mean_alt,
           stddev(altitude) as sd_alt
    from run
    group by workoutid
)
select count(*)
from run as r
join workout_stats as s
  on s.workoutid = r.workoutid
where r.altitude < s.mean_alt - s.sd_alt * 2
   or r.altitude > s.mean_alt + s.sd_alt * 2;
```

##### 323,310 / 12,493,175 => ~2.5%
##### We should just smooth them to be two standard deviations from the mean

##### Fix values too low
```
-- Raise altitudes that fall more than two standard deviations below their
-- workout's mean up to that lower bound (mean - 2 * stddev).
with workout_stats as (
    select workoutid,
           avg(altitude) as mean_alt,
           stddev(altitude) as sd_alt
    from run
    group by workoutid
)
update run as r
set altitude = s.mean_alt - (s.sd_alt * 2)
from workout_stats as s
where s.workoutid = r.workoutid
  and r.altitude < s.mean_alt - (s.sd_alt * 2);
  ```
```
UPDATE 113289
```

##### Fix values too high
```
-- Lower altitudes that rise more than two standard deviations above their
-- workout's mean down to that upper bound (mean + 2 * stddev).
-- The per-workout statistics are computed from the table as it stands when
-- this statement runs.
with workout_stats as (
    select workoutid,
           avg(altitude) as mean_alt,
           stddev(altitude) as sd_alt
    from run
    group by workoutid
)
update run as r
set altitude = s.mean_alt + (s.sd_alt * 2)
from workout_stats as s
where s.workoutid = r.workoutid
  and r.altitude > s.mean_alt + (s.sd_alt * 2);
```
```
UPDATE 218677
```

#### Sanity check

```
select * from histogram('altitude', 'run');
```
```

```

##### Everest is at 8,850 meters, and the most extreme high-altitude ultramarathon only gets up to 5,300 meters. The lowest land below sea level is at -413 meters. There is probably a lot of errant sensor data.

##### Let's look at workouts that include altitudes greater than 5300
```
-- Every sample of any workout that contains at least one altitude above 5300.
select altitude,
       time,
       latitude,
       longitude,
       workoutid
from run
where workoutid in (
    select workoutid
    from run
    where altitude > 5300
)
order by workoutid, time;
```

###### Small sample: look up the lat / lng and check the altitude. Spot-checking the lat / lngs suggests the data is bad.
```
     altitude     |    time    |   latitude    |   longitude    | workoutid 
------------------+------------+---------------+----------------+-----------
  4717.7463628016 | 1359891816 | 54.5475614000 |  -1.9052789000 | 156382546
...
  6868.2000000000 | 1359891836 | 54.5479799000 |  -1.9053959000 | 156382546
```

###### How many workouts is it?
```
-- Number of distinct workouts with at least one altitude above 5300.
with high_workouts as (
    select distinct workoutid
    from run
    where altitude > 5300
)
select count(*)
from high_workouts;
```

###### 87 / 25,000.
```
select count(*) from run where altitude > 5300;
```
##### 42,172 rows; we should probably just drop the altitude for those workouts, as that is ~484 per workout.
```
-- Discard altitude for every sample of any workout that contains at least
-- one reading above 5300.
update run
set altitude = null
where workoutid in (
    select workoutid
    from run
    where altitude > 5300
);
```


##### (Re)Generate first derivative
```
-- First derivative of altitude: change in altitude divided by change in
-- time between consecutive samples of the same workout (ordered by time),
-- rounded to 5 decimals and written to run.altitude_first.
-- The intermediate step selects only the columns the window pass produces;
-- the earlier version also selected altitude_first and altitude, which that
-- subquery does not expose.
with deltas as (
    select workoutid,
           time,
           altitude - lag(altitude) over w as alt_difference,
           time - lag(time) over w as time_difference
    from run
    window w as (partition by workoutid order by time)
),
dev_list as (
    select workoutid,
           time,
           -- A zero time step is treated as 1 to avoid dividing by zero.
           -- The first sample of a workout has no predecessor, so lag()
           -- yields NULL and the derivative is NULL.
           round(alt_difference
                 / case when time_difference = 0 then 1 else time_difference end, 5) as deriv
    from deltas
)
update run r1
  set altitude_first = d1.deriv
  from dev_list as d1
  where d1.workoutid = r1.workoutid and d1.time = r1.time;
```
```
UPDATE 12493175
```


##### (Re)Generate second derivative
```
-- Second derivative of altitude: change in altitude_first divided by change
-- in time between consecutive samples of the same workout (ordered by time),
-- rounded to 5 decimals.
-- The result is written to altitude_second so the first derivative it is
-- computed from is not overwritten.
-- NOTE(review): the column name altitude_second is assumed by analogy with
-- altitude_first -- confirm it against the run table definition.
with deltas as (
    select workoutid,
           time,
           altitude_first - lag(altitude_first) over w as alt_difference,
           time - lag(time) over w as time_difference
    from run
    window w as (partition by workoutid order by time)
),
dev_list as (
    select workoutid,
           time,
           -- A zero time step is treated as 1 to avoid dividing by zero.
           round(alt_difference
                 / case when time_difference = 0 then 1 else time_difference end, 5) as deriv
    from deltas
)
update run r1
  set altitude_second = d1.deriv
  from dev_list as d1
  where d1.workoutid = r1.workoutid and d1.time = r1.time;
```
```
UPDATE 12493175

```


#### Vacuum
```
-- Rewrite the table to reclaim the space left behind by the updates above.
vacuum full run;
```

## Elapsed distance


```
-- Spread each workout's total distance evenly over its samples: the k-th
-- sample (0-based, ordered by time) gets k * distance / (series_length - 1),
-- rounded to 10 decimals and written to run.elapsed_distance.
-- NOTE(review): distance and series_length are unqualified; presumably they
-- come from run_by_workout -- confirm.
with elap_dist as (
    select time,
           r1.workoutid,
           -- nullif guards the one-sample case (series_length = 1), which
           -- would otherwise abort the statement with a division by zero;
           -- such rows get NULL instead.
           (row_number() over (partition by r1.workoutid order by time) - 1)
               * (distance / nullif(series_length - 1, 0)) as elapsed_distance
    from run r1
    join run_by_workout r2 on (r1.workoutid = r2.workoutid)
)
update run as r1
  set elapsed_distance = round(d1.elapsed_distance,10)
  from elap_dist as d1
  where d1.workoutid = r1.workoutid and d1.time = r1.time;
```


## Elapsed time

```
-- Time elapsed since the first sample of the same workout, written to
-- run.elapsed_time.
with elap_time as (
    select time,
           workoutid,
           time - first_value(time) over per_workout as time_elap
    from run
    window per_workout as (partition by workoutid order by time)
    order by time
)
update run as r
set elapsed_time = e.time_elap
from elap_time as e
where r.workoutid = e.workoutid
  and r.time = e.time;
```


## Heart Rate

```
select * from histogram('heart_rate', 'run');
```
```

```

##### Lots of garbage data. Heart rate should never be above 240 or below 40. Even those are uncommon. Use the same two standard deviation logic as altitude. 

##### Fix values too high
```
-- Lower heart rates that rise more than two standard deviations above their
-- workout's mean down to that upper bound (mean + 2 * stddev).
with workout_stats as (
    select workoutid,
           avg(heart_rate) as mean_hr,
           stddev(heart_rate) as sd_hr
    from run
    group by workoutid
)
update run as r
set heart_rate = s.mean_hr + (s.sd_hr * 2)
from workout_stats as s
where s.workoutid = r.workoutid
  and r.heart_rate > s.mean_hr + (s.sd_hr * 2);
```
```
UPDATE 77518
```


##### Fix values too low
```
-- Raise heart rates that fall more than two standard deviations below their
-- workout's mean up to that lower bound (mean - 2 * stddev).
-- The assigned value uses the same lower bound as the filter; assigning
-- mean + 2 * stddev here would turn low outliers into high ones.
with dev_list as (
    select avg(heart_rate), stddev(heart_rate), workoutid
    from run
    group by workoutid
)
update run as r1
  set heart_rate = d1.avg - (d1.stddev * 2)
  from dev_list as d1
  where d1.workoutid = r1.workoutid and r1.heart_rate < d1.avg - (d1.stddev * 2);
```
```
UPDATE 302130
```

#### Better, but still too many below 40 and above 250
```
select * from histogram('heart_rate', 'run');
```
```
 bucket |         range         |  freq   |       bar       
--------+-----------------------+---------+-----------------
      1 | [0.00000,0.00000]     |   20130 | 
      3 | [30.00000,41.00000]   |     659 | 
      4 | [42.00000,55.00000]   |    1664 | 
      5 | [55.69926,69.00000]   |    4291 | 
      6 | [70.00000,83.05090]   |   24722 | 
      7 | [83.74898,97.21050]   |   72291 | 
      8 | [97.69388,111.13301]  |  191009 | *
      9 | [111.20287,125.00000] |  592089 | ***
     10 | [125.10133,138.93941] | 1517767 | ********
     11 | [138.96311,152.84465] | 2953818 | ***************
     12 | [152.85173,166.73934] | 2536751 | *************
     13 | [166.74265,180.63194] | 1201726 | ******
     14 | [180.63728,194.52560] |  266629 | *
     15 | [194.53101,208.35552] |   55031 | 
     16 | [208.44704,222.28627] |   22529 | 
     17 | [222.36136,236.20774] |   14470 | 
     18 | [236.28783,250.07760] |    6902 | 
     19 | [250.28778,263.28305] |     813 | 
     20 | [266.16942,277.41503] |     368 | 
     21 | [277.90138,277.90138] |      19 | 
```

#### (Re)Generate moving heart rate average over last 25 points

```
-- Trailing moving average of heart rate per workout (ordered by time),
-- written to run.heart_rate_ma_25.
-- NOTE: the frame "25 preceding and current row" spans up to 26 samples.
with trailing_avg as (
    select time,
           workoutid,
           avg(heart_rate) over (
               per_workout rows between 25 preceding and current row
           ) as mavg
    from run
    window per_workout as (partition by workoutid order by time)
    order by time
)
update run as r
set heart_rate_ma_25 = t.mavg
from trailing_avg as t
where t.workoutid = r.workoutid
  and t.time = r.time;
```

```
select * from histogram('heart_rate_ma_25', 'run');
```

```

```