From a554a4f1066b3d338af1a003b72bef743478e218 Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Tue, 2 May 2017 12:36:36 -0400
Subject: [PATCH 01/15] Split choosing_a_dataset into separate files

---
 SUMMARY.md                                    |  16 +-
 concepts/choosing_a_dataset.md                | 212 +-----------------
 datasets/batch_view/{ => addons}/Addons.md    |   0
 datasets/batch_view/addons/intro.md           |   0
 datasets/batch_view/addons/reference.md       |  11 +
 datasets/batch_view/client_count/intro.md     |  35 +++
 datasets/batch_view/client_count/reference.md |  11 +
 .../CrashAggregateView.md                     |   0
 datasets/batch_view/crash_aggregates/intro.md |  26 +++
 .../batch_view/crash_aggregates/reference.md  |  11 +
 .../{ => crash_summary}/CrashSummary.md       |   0
 datasets/batch_view/crash_summary/intro.md    |   0
 .../batch_view/crash_summary/reference.md     |  11 +
 .../{ => cross_sectional}/cross_sectional.md  |   0
 datasets/batch_view/cross_sectional/intro.md  |  52 +++++
 .../batch_view/cross_sectional/reference.md   |  11 +
 datasets/batch_view/{ => events}/Events.md    |   0
 datasets/batch_view/events/intro.md           |   0
 datasets/batch_view/events/reference.md       |  11 +
 .../{ => longitudinal}/Longitudinal.md        |   0
 datasets/batch_view/longitudinal/intro.md     |  38 ++++
 datasets/batch_view/longitudinal/reference.md |  11 +
 datasets/batch_view/ls                        |  11 +
 .../{ => main_summary}/MainSummary.md         |   0
 datasets/batch_view/main_summary/intro.md     |  43 ++++
 datasets/batch_view/main_summary/reference.md |  11 +
 datasets/batch_view/reference.md              |  11 +
 .../{ => sync_summary}/SyncSummary.md         |   0
 datasets/batch_view/sync_summary/intro.md     |   0
 datasets/batch_view/sync_summary/reference.md |  11 +
 30 files changed, 328 insertions(+), 215 deletions(-)
 rename datasets/batch_view/{ => addons}/Addons.md (100%)
 create mode 100644 datasets/batch_view/addons/intro.md
 create mode 100644 datasets/batch_view/addons/reference.md
 create mode 100644 datasets/batch_view/client_count/intro.md
 create mode 100644 datasets/batch_view/client_count/reference.md
 rename datasets/batch_view/{ => crash_aggregates}/CrashAggregateView.md (100%)
 create mode 100644 datasets/batch_view/crash_aggregates/intro.md
 create mode 100644 datasets/batch_view/crash_aggregates/reference.md
 rename datasets/batch_view/{ => crash_summary}/CrashSummary.md (100%)
 create mode 100644 datasets/batch_view/crash_summary/intro.md
 create mode 100644 datasets/batch_view/crash_summary/reference.md
 rename datasets/batch_view/{ => cross_sectional}/cross_sectional.md (100%)
 create mode 100644 datasets/batch_view/cross_sectional/intro.md
 create mode 100644 datasets/batch_view/cross_sectional/reference.md
 rename datasets/batch_view/{ => events}/Events.md (100%)
 create mode 100644 datasets/batch_view/events/intro.md
 create mode 100644 datasets/batch_view/events/reference.md
 rename datasets/batch_view/{ => longitudinal}/Longitudinal.md (100%)
 create mode 100644 datasets/batch_view/longitudinal/intro.md
 create mode 100644 datasets/batch_view/longitudinal/reference.md
 create mode 100644 datasets/batch_view/ls
 rename datasets/batch_view/{ => main_summary}/MainSummary.md (100%)
 create mode 100644 datasets/batch_view/main_summary/intro.md
 create mode 100644 datasets/batch_view/main_summary/reference.md
 create mode 100644 datasets/batch_view/reference.md
 rename datasets/batch_view/{ => sync_summary}/SyncSummary.md (100%)
 create mode 100644 datasets/batch_view/sync_summary/intro.md
 create mode 100644 datasets/batch_view/sync_summary/reference.md

diff --git a/SUMMARY.md b/SUMMARY.md
index bd612dc20..af4225da2 100644
--- a/SUMMARY.md
+++ b/SUMMARY.md
@@ -15,14 +15,14 @@
   * [Dataset Reference](datasets/reference.md)
     * [Pings](datasets/pings.md)
     * [Derived Datasets](datasets/derived.md)
-      * [Longitudinal](datasets/batch_view/Longitudinal.md)
-      * [Cross Sectional](datasets/batch_view/cross_sectional.md)
-      * [Main Summary](datasets/batch_view/MainSummary.md)
-      * [Crash Summary](datasets/batch_view/CrashSummary.md)
-      * [Crash Aggregate](datasets/batch_view/CrashAggregateView.md)
-      * [Events](datasets/batch_view/Events.md)
-      * [Sync Summary](datasets/batch_view/SyncSummary.md)
-      * [Addons](datasets/batch_view/Addons.md)
+      * [Longitudinal](datasets/batch_view/longitudinal/reference.md)
+      * [Cross Sectional](datasets/batch_view/cross_sectional/reference.md)
+      * [Main Summary](datasets/batch_view/main_summary/reference.md)
+      * [Crash Summary](datasets/batch_view/crash_summary/reference.md)
+      * [Crash Aggregate](datasets/batch_view/crash_aggregates/reference.md)
+      * [Events](datasets/batch_view/events/reference.md)
+      * [Sync Summary](datasets/batch_view/sync_summary/reference.md)
+      * [Addons](datasets/batch_view/addons/reference.md)
     * [Experimental Datasets](tools/experiments.md)
       * [Accessing Shield Study data](datasets/shield.md)
   * [Collecting New Data](datasets/new_data.md)
diff --git a/concepts/choosing_a_dataset.md b/concepts/choosing_a_dataset.md
index a148901c7..7e65f7a05 100644
--- a/concepts/choosing_a_dataset.md
+++ b/concepts/choosing_a_dataset.md
@@ -61,195 +61,19 @@ This section describes the derived datasets we provide to make analyzing this da
 
 ## longitudinal
 
-The `longitudinal` dataset is a 1% sample of main ping data
-organized so that each row corresponds to a client_id.
-If you're not sure which dataset to use for your analysis,
-this is probably what you want.
-
-#### Contents
-Each row in the `longitudinal` dataset represents one `client_id`,
-which is approximately a user.
-Each column represents a field from the main ping.
-Most fields contain **arrays of values**, with one value for each ping associated with a client_id.
-Using arrays give you access to the raw data from each ping,
-but can be difficult to work with from SQL.
-Here's a [query showing some sample data](https://sql.telemetry.mozilla.org/queries/4188#table)
-to help illustrate.
-Take a look at the [longitudinal examples](/cookbooks/longitudinal.md) if you get stuck.
-
-#### Background and Caveats
-Think of the longitudinal table as wide and short.
-The dataset contains more columns than `main_summary`
-and down-samples to 1% of all clients to reduce query computation time and save resources.
-
-In summary, the longitudinal table differs from `main_summary` in two important ways:
-
-* The longitudinal dataset groups all data so that one row represents a client_id
-* The longitudinal dataset samples to 1% of all client_ids
-
-#### Accessing the Data
-
-The `longitudinal` is available in re:dash,
-though it can be difficult to work with the array values in SQL.
-Take a look at this [example query](https://sql.telemetry.mozilla.org/queries/4189/source).
-
-The data is stored as a parquet table in S3 at the following address.
-See [this cookbook](/cookbooks/parquet.md) to get started working with the data
-in [Spark](http://spark.apache.org/docs/latest/quick-start.html).
-```
-s3://telemetry-parquet/longitudinal/
-```
-
-#### Further Reading
-
-The technical documentation for the `longitudinal` dataset is located in the
-[telemetry-batch-view documentation](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/Longitudinal.md).
-
-We also have a set of examples in the [longitudinal cookbook](/cookbooks/longitudinal.md)
-
-The code that generates this dataset is [here](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/Longitudinal.scala)
+{% include "/datasets/batch_view/longitudinal/intro.md" %}
 
 ## main_summary
 
-The `main_summary` table is the most direct representation of a main ping
-but can be difficult to work with due to its size. 
-Prefer the `longitudinal` dataset unless using the sampled data is prohibitive.
-
-#### Contents
-
-The `main_summary` table contains one row for each ping.
-Each column represents one field from the main ping payload,
-though only a subset of all main ping fields are included.
-This dataset **does not include histograms**.
-
-#### Background and Caveats
-This table is massive, and due to it's size, it can be difficult to work with.
-You should **avoid querying `main_summary`** from [re:dash](https://sql.telemetry.mozilla.org).
-Your queries will be **slow to complete** and can **impact performance for other users**,
-since re:dash on a shared cluster.
-
-Instead, we recommend using the `longitudinal` or `cross_sectional` dataset where possible.
-If these datasets do not suffice, consider using Spark on an
-[ATMO](https://analysis.telemetry.mozilla.org) cluster.
-In the odd case where these queries are necessary,
-make use of the `sample_id` field and limit to a short submission date range.
-
-#### Accessing the Data
-
-The data is stored as a parquet table in S3 at the following address.
-See [this cookbook](/cookbooks/parquet.md) to get started working with the data in Spark.
-```
-s3://telemetry-parquet/main_summary/v3/
-```
-
-Though **not recommended** `main_summary` is accessible through re:dash. 
-Here's an [example query](https://sql.telemetry.mozilla.org/queries/4201/source).
-Your queries will be slow to complete and can **impact performance for other users**,
-since re:dash is on a shared cluster.
-
-#### Further Reading
-
-The technical documentation for `main_summary` is located in the
-[telemetry-batch-view documentation](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/MainSummary.md).
-
-The code responsible for generating this dataset is 
-[here](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/MainSummaryView.scala)
-
+{% include "/datasets/batch_view/main_summary/intro.md" %}
 
 ## cross_sectional
 
-The `cross_sectional` dataset provides descriptive statistics
-for each client_id in a 1% sample of main ping data.
-This dataset simplifies the longitudinal table by replacing 
-the longitudinal arrays with summary statistics. 
-This is the most useful dataset for describing our user base.
-
-#### Content
-Each row in the `cross_sectional` dataset represents one `client_id`,
-which is approximately a user.
-Each column is a summary statistic describing the client_id.
-
-For example, the longitudinal table has a row called `geo_country` 
-which contains an array of country codes.
-For the same `client_id` the `cross_sectional` table 
-has columns called `geo_country_mode` and `geo_country_configs` 
-containing single summary statistics for 
-the modal country and the number of distinct countries in the array.
-
-| `client_id` | `geo_country`          | `geo_country_mode` | `geo_country_configs`|
-| ----------- |:----------------------:|:------------------:|:--------------------:|
-| 1           | array<"US">            | "US"               | 1                    |
-| 2           | array<"DE", "DE" "US"> | "DE"               | 2                    |
-
-#### Background and Caveats
-
-This table is much easier to work with than the longitudinal dataset because
-you don't need to work with arrays.
-This table has a limited number of pre-computed summary statistics
-so you're metric may not be included.
-
-Note that this dataset is a summary of the longitudinal dataset,
-so it is also a 1% sample of all client_ids.
-
-All summary statistics are computed over the last 180 days,
-so this dataset can be insensitive to changes over time.
-
-#### Accessing the Data
-
-The cross_sectional dataset is available in re:dash.
-Here's an [example query](https://sql.telemetry.mozilla.org/queries/4202/source).
-
-The data is stored as a parquet table in S3 at the following address.
-See [this cookbook](/cookbooks/parquet.md) to get started working with the data in Spark.
-```
-s3://telemetry-parquet/cross_sectional/v1/
-```
-
-#### Further Reading
-
-The `cross_sectional` dataset is generated by 
-[this code](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/CrossSectionalView.scala).
-Take a look at [this query](https://sql.telemetry.mozilla.org/queries/4203/source) for a schema.
-
+{% include "/datasets/batch_view/cross_sectional/intro.md" %}
 
 ## client_count
 
-The `client_count` dataset is useful for estimating user counts over a few 
-[pre-defined dimensions](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/ClientCountView.scala#L22).
-
-#### Content
-
-This dataset includes columns for a dozen factors and an HLL variable.
-The `hll` column contains a
-[HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog)
-variable, which is an approximation to the exact count.
-The factor columns include activity date and the dimensions listed
-[here](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/ClientCountView.scala#L22).
-Each row represents one combinations of the factor columns.
-
-#### Background and Caveats
-
-It's important to understand that the `hll` column is **not a standard count**.
-The `hll` variable avoids double-counting users when aggregating over multiple days.
-The HyperLogLog variable is a far more efficient way to count distinct elements of a set,
-but comes with some complexity.
-To find the cardinality of an HLL use `cardinality(cast(hll AS HLL))`.
-To find the union of two HLL's over different dates, use `merge(cast(hll AS HLL))`.
-The [Firefox ER Reporting Query](https://sql.telemetry.mozilla.org/queries/81/source#129)
-is a good example to review.
-Finally, Roberto has a relevant writeup
-[here](https://robertovitillo.com/2016/04/12/measuring-product-engagment-at-scale/).
-
-#### Accessing the Data
-
-The data is available in re:dash.
-Take a look at this 
-[example query](https://sql.telemetry.mozilla.org/queries/81/source#129).
-
-I don't recommend accessing this data from ATMO.
-
-#### Further Reading
-
+{% include "/datasets/batch_view/client_count/intro.md" %}
 
 # Crash Ping Derived Datasets
 
@@ -262,33 +86,7 @@ This section describes the derived datasets we provide to make analyzing this da
 
 ## crash_aggregates
 
-The `crash_aggregates` dataset compiles crash statistics over various dimensions for each day.
-
-#### Rows and Columns
-
-There's one column for each of the stratifying dimensions and the crash statistics.
-Each row is a distinct set of dimensions, along with their associated crash stats.
-Example stratifying dimensions include channel and country,
-example statistics include usage hours and plugin crashes.
-See the [complete documentation](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/CrashAggregateView.md)
-for all available dimensions
-and statistics.
-
-#### Accessing the Data
-
-This dataset is accessible via re:dash.
-
-The data is stored as a parquet table in S3 at the following address.
-See [this cookbook](/cookbooks/parquet.md) to get started working with the data in Spark.
-```
-s3://telemetry-parquet/crash_aggregates/v1/
-```
-
-#### Further Reading
-
-The technical documentation for this dataset can be found in the
-[telemetry-batch-view documentation](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/CrashAggregateView.md)
-
+{% include "/datasets/batch_view/crash_aggregates/intro.md" %}
 
 # Appendix
 
diff --git a/datasets/batch_view/Addons.md b/datasets/batch_view/addons/Addons.md
similarity index 100%
rename from datasets/batch_view/Addons.md
rename to datasets/batch_view/addons/Addons.md
diff --git a/datasets/batch_view/addons/intro.md b/datasets/batch_view/addons/intro.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/datasets/batch_view/addons/reference.md b/datasets/batch_view/addons/reference.md
new file mode 100644
index 000000000..b6dbfde9e
--- /dev/null
+++ b/datasets/batch_view/addons/reference.md
@@ -0,0 +1,11 @@
+# Introduction
+
+{% include "./intro.md" %}
+
+# Data Reference
+
+% include "./data_reference.md" %
+
+# Data Reference
+
+% include "./code_reference.md" %
diff --git a/datasets/batch_view/client_count/intro.md b/datasets/batch_view/client_count/intro.md
new file mode 100644
index 000000000..5fcb79768
--- /dev/null
+++ b/datasets/batch_view/client_count/intro.md
@@ -0,0 +1,35 @@
+The `client_count` dataset is useful for estimating user counts over a few 
+[pre-defined dimensions](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/ClientCountView.scala#L22).
+
+#### Content
+
+This dataset includes columns for a dozen factors and an HLL variable.
+The `hll` column contains a
+[HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog)
+variable, which is an approximation to the exact count.
+The factor columns include activity date and the dimensions listed
+[here](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/ClientCountView.scala#L22).
+Each row represents one combination of the factor columns.
+
+#### Background and Caveats
+
+It's important to understand that the `hll` column is **not a standard count**.
+The `hll` variable avoids double-counting users when aggregating over multiple days.
+The HyperLogLog variable is a far more efficient way to count distinct elements of a set,
+but comes with some complexity.
+To find the cardinality of an HLL use `cardinality(cast(hll AS HLL))`.
+To find the union of two HLL's over different dates, use `merge(cast(hll AS HLL))`.
+The [Firefox ER Reporting Query](https://sql.telemetry.mozilla.org/queries/81/source#129)
+is a good example to review.
+Finally, Roberto has a relevant writeup
+[here](https://robertovitillo.com/2016/04/12/measuring-product-engagment-at-scale/).
+
+#### Accessing the Data
+
+The data is available in re:dash.
+Take a look at this 
+[example query](https://sql.telemetry.mozilla.org/queries/81/source#129).
+
+I don't recommend accessing this data from ATMO.
+
+#### Further Reading
diff --git a/datasets/batch_view/client_count/reference.md b/datasets/batch_view/client_count/reference.md
new file mode 100644
index 000000000..b6dbfde9e
--- /dev/null
+++ b/datasets/batch_view/client_count/reference.md
@@ -0,0 +1,11 @@
+# Introduction
+
+{% include "./intro.md" %}
+
+# Data Reference
+
+% include "./data_reference.md" %
+
+# Data Reference
+
+% include "./code_reference.md" %
diff --git a/datasets/batch_view/CrashAggregateView.md b/datasets/batch_view/crash_aggregates/CrashAggregateView.md
similarity index 100%
rename from datasets/batch_view/CrashAggregateView.md
rename to datasets/batch_view/crash_aggregates/CrashAggregateView.md
diff --git a/datasets/batch_view/crash_aggregates/intro.md b/datasets/batch_view/crash_aggregates/intro.md
new file mode 100644
index 000000000..53965cf99
--- /dev/null
+++ b/datasets/batch_view/crash_aggregates/intro.md
@@ -0,0 +1,26 @@
+The `crash_aggregates` dataset compiles crash statistics over various dimensions for each day.
+
+#### Rows and Columns
+
+There's one column for each of the stratifying dimensions and the crash statistics.
+Each row is a distinct set of dimensions, along with their associated crash stats.
+Example stratifying dimensions include channel and country,
+example statistics include usage hours and plugin crashes.
+See the [complete documentation](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/CrashAggregateView.md)
+for all available dimensions
+and statistics.
+
+#### Accessing the Data
+
+This dataset is accessible via re:dash.
+
+The data is stored as a parquet table in S3 at the following address.
+See [this cookbook](/cookbooks/parquet.md) to get started working with the data in Spark.
+```
+s3://telemetry-parquet/crash_aggregates/v1/
+```
+
+#### Further Reading
+
+The technical documentation for this dataset can be found in the
+[telemetry-batch-view documentation](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/CrashAggregateView.md)
diff --git a/datasets/batch_view/crash_aggregates/reference.md b/datasets/batch_view/crash_aggregates/reference.md
new file mode 100644
index 000000000..b6dbfde9e
--- /dev/null
+++ b/datasets/batch_view/crash_aggregates/reference.md
@@ -0,0 +1,11 @@
+# Introduction
+
+{% include "./intro.md" %}
+
+# Data Reference
+
+% include "./data_reference.md" %
+
+# Data Reference
+
+% include "./code_reference.md" %
diff --git a/datasets/batch_view/CrashSummary.md b/datasets/batch_view/crash_summary/CrashSummary.md
similarity index 100%
rename from datasets/batch_view/CrashSummary.md
rename to datasets/batch_view/crash_summary/CrashSummary.md
diff --git a/datasets/batch_view/crash_summary/intro.md b/datasets/batch_view/crash_summary/intro.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/datasets/batch_view/crash_summary/reference.md b/datasets/batch_view/crash_summary/reference.md
new file mode 100644
index 000000000..b6dbfde9e
--- /dev/null
+++ b/datasets/batch_view/crash_summary/reference.md
@@ -0,0 +1,11 @@
+# Introduction
+
+{% include "./intro.md" %}
+
+# Data Reference
+
+% include "./data_reference.md" %
+
+# Data Reference
+
+% include "./code_reference.md" %
diff --git a/datasets/batch_view/cross_sectional.md b/datasets/batch_view/cross_sectional/cross_sectional.md
similarity index 100%
rename from datasets/batch_view/cross_sectional.md
rename to datasets/batch_view/cross_sectional/cross_sectional.md
diff --git a/datasets/batch_view/cross_sectional/intro.md b/datasets/batch_view/cross_sectional/intro.md
new file mode 100644
index 000000000..b973e0bd8
--- /dev/null
+++ b/datasets/batch_view/cross_sectional/intro.md
@@ -0,0 +1,52 @@
+The `cross_sectional` dataset provides descriptive statistics
+for each client_id in a 1% sample of main ping data.
+This dataset simplifies the longitudinal table by replacing 
+the longitudinal arrays with summary statistics. 
+This is the most useful dataset for describing our user base.
+
+#### Content
+Each row in the `cross_sectional` dataset represents one `client_id`,
+which is approximately a user.
+Each column is a summary statistic describing the client_id.
+
+For example, the longitudinal table has a column called `geo_country` 
+which contains an array of country codes.
+For the same `client_id` the `cross_sectional` table 
+has columns called `geo_country_mode` and `geo_country_configs` 
+containing single summary statistics for 
+the modal country and the number of distinct countries in the array.
+
+| `client_id` | `geo_country`          | `geo_country_mode` | `geo_country_configs`|
+| ----------- |:----------------------:|:------------------:|:--------------------:|
+| 1           | array<"US">            | "US"               | 1                    |
+| 2           | array<"DE", "DE", "US"> | "DE"               | 2                    |
+
+#### Background and Caveats
+
+This table is much easier to work with than the longitudinal dataset because
+you don't need to work with arrays.
+This table has a limited number of pre-computed summary statistics
+so your metric may not be included.
+
+Note that this dataset is a summary of the longitudinal dataset,
+so it is also a 1% sample of all client_ids.
+
+All summary statistics are computed over the last 180 days,
+so this dataset can be insensitive to changes over time.
+
+#### Accessing the Data
+
+The cross_sectional dataset is available in re:dash.
+Here's an [example query](https://sql.telemetry.mozilla.org/queries/4202/source).
+
+The data is stored as a parquet table in S3 at the following address.
+See [this cookbook](/cookbooks/parquet.md) to get started working with the data in Spark.
+```
+s3://telemetry-parquet/cross_sectional/v1/
+```
+
+#### Further Reading
+
+The `cross_sectional` dataset is generated by 
+[this code](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/CrossSectionalView.scala).
+Take a look at [this query](https://sql.telemetry.mozilla.org/queries/4203/source) for a schema.
diff --git a/datasets/batch_view/cross_sectional/reference.md b/datasets/batch_view/cross_sectional/reference.md
new file mode 100644
index 000000000..b6dbfde9e
--- /dev/null
+++ b/datasets/batch_view/cross_sectional/reference.md
@@ -0,0 +1,11 @@
+# Introduction
+
+{% include "./intro.md" %}
+
+# Data Reference
+
+% include "./data_reference.md" %
+
+# Data Reference
+
+% include "./code_reference.md" %
diff --git a/datasets/batch_view/Events.md b/datasets/batch_view/events/Events.md
similarity index 100%
rename from datasets/batch_view/Events.md
rename to datasets/batch_view/events/Events.md
diff --git a/datasets/batch_view/events/intro.md b/datasets/batch_view/events/intro.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/datasets/batch_view/events/reference.md b/datasets/batch_view/events/reference.md
new file mode 100644
index 000000000..b6dbfde9e
--- /dev/null
+++ b/datasets/batch_view/events/reference.md
@@ -0,0 +1,11 @@
+# Introduction
+
+{% include "./intro.md" %}
+
+# Data Reference
+
+% include "./data_reference.md" %
+
+# Data Reference
+
+% include "./code_reference.md" %
diff --git a/datasets/batch_view/Longitudinal.md b/datasets/batch_view/longitudinal/Longitudinal.md
similarity index 100%
rename from datasets/batch_view/Longitudinal.md
rename to datasets/batch_view/longitudinal/Longitudinal.md
diff --git a/datasets/batch_view/longitudinal/intro.md b/datasets/batch_view/longitudinal/intro.md
new file mode 100644
index 000000000..5d603367b
--- /dev/null
+++ b/datasets/batch_view/longitudinal/intro.md
@@ -0,0 +1,38 @@
+The `longitudinal` dataset is a 1% sample of main ping data
+organized so that each row corresponds to a client_id.
+If you're not sure which dataset to use for your analysis,
+this is probably what you want.
+
+#### Contents
+Each row in the `longitudinal` dataset represents one `client_id`,
+which is approximately a user.
+Each column represents a field from the main ping.
+Most fields contain **arrays of values**, with one value for each ping associated with a client_id.
+Using arrays gives you access to the raw data from each ping,
+but can be difficult to work with from SQL.
+Here's a [query showing some sample data](https://sql.telemetry.mozilla.org/queries/4188#table)
+to help illustrate.
+Take a look at the [longitudinal examples](/cookbooks/longitudinal.md) if you get stuck.
+
+#### Background and Caveats
+Think of the longitudinal table as wide and short.
+The dataset contains more columns than `main_summary`
+and down-samples to 1% of all clients to reduce query computation time and save resources.
+
+In summary, the longitudinal table differs from `main_summary` in two important ways:
+
+* The longitudinal dataset groups all data so that one row represents a client_id
+* The longitudinal dataset samples to 1% of all client_ids
+
+#### Accessing the Data
+
+The `longitudinal` dataset is available in re:dash,
+though it can be difficult to work with the array values in SQL.
+Take a look at this [example query](https://sql.telemetry.mozilla.org/queries/4189/source).
+
+The data is stored as a parquet table in S3 at the following address.
+See [this cookbook](/cookbooks/parquet.md) to get started working with the data
+in [Spark](http://spark.apache.org/docs/latest/quick-start.html).
+```
+s3://telemetry-parquet/longitudinal/
+```
diff --git a/datasets/batch_view/longitudinal/reference.md b/datasets/batch_view/longitudinal/reference.md
new file mode 100644
index 000000000..b6dbfde9e
--- /dev/null
+++ b/datasets/batch_view/longitudinal/reference.md
@@ -0,0 +1,11 @@
+# Introduction
+
+{% include "./intro.md" %}
+
+# Data Reference
+
+% include "./data_reference.md" %
+
+# Data Reference
+
+% include "./code_reference.md" %
diff --git a/datasets/batch_view/ls b/datasets/batch_view/ls
new file mode 100644
index 000000000..b6dbfde9e
--- /dev/null
+++ b/datasets/batch_view/ls
@@ -0,0 +1,11 @@
+# Introduction
+
+{% include "./intro.md" %}
+
+# Data Reference
+
+% include "./data_reference.md" %
+
+# Data Reference
+
+% include "./code_reference.md" %
diff --git a/datasets/batch_view/MainSummary.md b/datasets/batch_view/main_summary/MainSummary.md
similarity index 100%
rename from datasets/batch_view/MainSummary.md
rename to datasets/batch_view/main_summary/MainSummary.md
diff --git a/datasets/batch_view/main_summary/intro.md b/datasets/batch_view/main_summary/intro.md
new file mode 100644
index 000000000..62139b83b
--- /dev/null
+++ b/datasets/batch_view/main_summary/intro.md
@@ -0,0 +1,43 @@
+The `main_summary` table is the most direct representation of a main ping
+but can be difficult to work with due to its size. 
+Prefer the `longitudinal` dataset unless using the sampled data is prohibitive.
+
+#### Contents
+
+The `main_summary` table contains one row for each ping.
+Each column represents one field from the main ping payload,
+though only a subset of all main ping fields are included.
+This dataset **does not include histograms**.
+
+#### Background and Caveats
+This table is massive, and due to its size, it can be difficult to work with.
+You should **avoid querying `main_summary`** from [re:dash](https://sql.telemetry.mozilla.org).
+Your queries will be **slow to complete** and can **impact performance for other users**,
+since re:dash is on a shared cluster.
+
+Instead, we recommend using the `longitudinal` or `cross_sectional` dataset where possible.
+If these datasets do not suffice, consider using Spark on an
+[ATMO](https://analysis.telemetry.mozilla.org) cluster.
+In the odd case where these queries are necessary,
+make use of the `sample_id` field and limit to a short submission date range.
+
+#### Accessing the Data
+
+The data is stored as a parquet table in S3 at the following address.
+See [this cookbook](/cookbooks/parquet.md) to get started working with the data in Spark.
+```
+s3://telemetry-parquet/main_summary/v3/
+```
+
+Though **not recommended** `main_summary` is accessible through re:dash. 
+Here's an [example query](https://sql.telemetry.mozilla.org/queries/4201/source).
+Your queries will be slow to complete and can **impact performance for other users**,
+since re:dash is on a shared cluster.
+
+#### Further Reading
+
+The technical documentation for `main_summary` is located in the
+[telemetry-batch-view documentation](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/MainSummary.md).
+
+The code responsible for generating this dataset is 
+[here](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/MainSummaryView.scala)
diff --git a/datasets/batch_view/main_summary/reference.md b/datasets/batch_view/main_summary/reference.md
new file mode 100644
index 000000000..b6dbfde9e
--- /dev/null
+++ b/datasets/batch_view/main_summary/reference.md
@@ -0,0 +1,11 @@
+# Introduction
+
+{% include "./intro.md" %}
+
+# Data Reference
+
+% include "./data_reference.md" %
+
+# Data Reference
+
+% include "./code_reference.md" %
diff --git a/datasets/batch_view/reference.md b/datasets/batch_view/reference.md
new file mode 100644
index 000000000..b6dbfde9e
--- /dev/null
+++ b/datasets/batch_view/reference.md
@@ -0,0 +1,11 @@
+# Introduction
+
+{% include "./intro.md" %}
+
+# Data Reference
+
+% include "./data_reference.md" %
+
+# Data Reference
+
+% include "./code_reference.md" %
diff --git a/datasets/batch_view/SyncSummary.md b/datasets/batch_view/sync_summary/SyncSummary.md
similarity index 100%
rename from datasets/batch_view/SyncSummary.md
rename to datasets/batch_view/sync_summary/SyncSummary.md
diff --git a/datasets/batch_view/sync_summary/intro.md b/datasets/batch_view/sync_summary/intro.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/datasets/batch_view/sync_summary/reference.md b/datasets/batch_view/sync_summary/reference.md
new file mode 100644
index 000000000..b6dbfde9e
--- /dev/null
+++ b/datasets/batch_view/sync_summary/reference.md
@@ -0,0 +1,11 @@
+# Introduction
+
+{% include "./intro.md" %}
+
+# Data Reference
+
+% include "./data_reference.md" %
+
+# Data Reference
+
+% include "./code_reference.md" %

From da7be85b0781ca8d518d08876ec8844d6f86852f Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Tue, 2 May 2017 13:21:32 -0400
Subject: [PATCH 02/15] Add Longitudinal reference material

---
 datasets/batch_view/addons/reference.md       |  9 ++-
 datasets/batch_view/client_count/reference.md |  9 ++-
 .../batch_view/crash_aggregates/reference.md  |  9 ++-
 .../batch_view/crash_summary/reference.md     |  9 ++-
 .../batch_view/cross_sectional/reference.md   |  9 ++-
 datasets/batch_view/events/reference.md       |  9 ++-
 .../batch_view/longitudinal/Longitudinal.md   |  5 --
 datasets/batch_view/longitudinal/reference.md | 55 ++++++++++++++++++-
 datasets/batch_view/main_summary/reference.md |  9 ++-
 datasets/batch_view/sync_summary/reference.md |  9 ++-
 10 files changed, 84 insertions(+), 48 deletions(-)
 delete mode 100644 datasets/batch_view/longitudinal/Longitudinal.md

diff --git a/datasets/batch_view/addons/reference.md b/datasets/batch_view/addons/reference.md
index b6dbfde9e..83306a198 100644
--- a/datasets/batch_view/addons/reference.md
+++ b/datasets/batch_view/addons/reference.md
@@ -4,8 +4,7 @@
 
 # Data Reference
 
-% include "./data_reference.md" %
-
-# Data Reference
-
-% include "./code_reference.md" %
+## Making Queries
+## Sampling
+## Scheduling
+## Schema
diff --git a/datasets/batch_view/client_count/reference.md b/datasets/batch_view/client_count/reference.md
index b6dbfde9e..83306a198 100644
--- a/datasets/batch_view/client_count/reference.md
+++ b/datasets/batch_view/client_count/reference.md
@@ -4,8 +4,7 @@
 
 # Data Reference
 
-% include "./data_reference.md" %
-
-# Data Reference
-
-% include "./code_reference.md" %
+## Making Queries
+## Sampling
+## Scheduling
+## Schema
diff --git a/datasets/batch_view/crash_aggregates/reference.md b/datasets/batch_view/crash_aggregates/reference.md
index b6dbfde9e..83306a198 100644
--- a/datasets/batch_view/crash_aggregates/reference.md
+++ b/datasets/batch_view/crash_aggregates/reference.md
@@ -4,8 +4,7 @@
 
 # Data Reference
 
-% include "./data_reference.md" %
-
-# Data Reference
-
-% include "./code_reference.md" %
+## Making Queries
+## Sampling
+## Scheduling
+## Schema
diff --git a/datasets/batch_view/crash_summary/reference.md b/datasets/batch_view/crash_summary/reference.md
index b6dbfde9e..83306a198 100644
--- a/datasets/batch_view/crash_summary/reference.md
+++ b/datasets/batch_view/crash_summary/reference.md
@@ -4,8 +4,7 @@
 
 # Data Reference
 
-% include "./data_reference.md" %
-
-# Data Reference
-
-% include "./code_reference.md" %
+## Making Queries
+## Sampling
+## Scheduling
+## Schema
diff --git a/datasets/batch_view/cross_sectional/reference.md b/datasets/batch_view/cross_sectional/reference.md
index b6dbfde9e..83306a198 100644
--- a/datasets/batch_view/cross_sectional/reference.md
+++ b/datasets/batch_view/cross_sectional/reference.md
@@ -4,8 +4,7 @@
 
 # Data Reference
 
-% include "./data_reference.md" %
-
-# Data Reference
-
-% include "./code_reference.md" %
+## Making Queries
+## Sampling
+## Scheduling
+## Schema
diff --git a/datasets/batch_view/events/reference.md b/datasets/batch_view/events/reference.md
index b6dbfde9e..83306a198 100644
--- a/datasets/batch_view/events/reference.md
+++ b/datasets/batch_view/events/reference.md
@@ -4,8 +4,7 @@
 
 # Data Reference
 
-% include "./data_reference.md" %
-
-# Data Reference
-
-% include "./code_reference.md" %
+## Making Queries
+## Sampling
+## Scheduling
+## Schema
diff --git a/datasets/batch_view/longitudinal/Longitudinal.md b/datasets/batch_view/longitudinal/Longitudinal.md
deleted file mode 100644
index 0c66184a6..000000000
--- a/datasets/batch_view/longitudinal/Longitudinal.md
+++ /dev/null
@@ -1,5 +0,0 @@
-{% include "git+https://github.com/mozilla/telemetry-batch-view.git/docs/Longitudinal.md" %}
-
-# Appendix
-
-This documentation is taken from the [telemetry-batch-view documentation](https://github.com/mozilla/telemetry-batch-view/tree/master/docs)
diff --git a/datasets/batch_view/longitudinal/reference.md b/datasets/batch_view/longitudinal/reference.md
index b6dbfde9e..300078261 100644
--- a/datasets/batch_view/longitudinal/reference.md
+++ b/datasets/batch_view/longitudinal/reference.md
@@ -1,11 +1,60 @@
+# Longitudinal Reference
+
+<!-- toc -->
+
 # Introduction
 
 {% include "./intro.md" %}
 
 # Data Reference
 
-% include "./data_reference.md" %
+## Making Queries
 
-# Data Reference
+Take a look at the 
+[Longitudinal Examples Cookbook](/cookbooks/longitudinal_examples.md).
+
+## Sampling
+
+### Pings Within Last 6 Months
+
+The `longitudinal` dataset filters to `main` pings from within the last 6 months.
+
+### 1% Sample
+
+The longitudinal dataset samples down to 1% of all clients in the above sample.
+The sample is generated by the following process:
+
+* hash the client_id for each ping from the last 6 months.
+* project that hash onto an integer from 1:100, inclusive
+* filter to pings with client_ids matching a 'magic number' (in this case 42)
+
+This process has a couple of nice properties:
+
+* The sample is consistent over time. 
+  The `longitudinal` dataset is regenerated weekly.
+  The clients included in each run are very similar with this process.
+  The only change will come from never-before-seen clients,
+  or clients without a ping in the last 180 days.
+* We don't need to adjust the sample as new clients enter or exit our pool.
+
+More practically,
+the sample is created by filtering to pings with `main_summary.sample_id == 42`.
+If you're working with `main_summary`,
+you can recreate this sample by doing this filter manually.
+
+## Scheduling
+
+The `longitudinal` job is run weekly, early on Sunday morning UTC.
+The job is scheduled on [Airflow](https://github.com/mozilla/telemetry-airflow).
+The DAG is [here](https://github.com/mozilla/telemetry-airflow/blob/master/dags/longitudinal.py).
+
+## Schema
+
+TODO(harter): Coming soon.
+
+# Code Reference
+
+This dataset is generated by 
+[telemetry-batch-view](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/Longitudinal.scala).
+Refer to this repository for information on how to run or augment the dataset.
 
-% include "./code_reference.md" %
diff --git a/datasets/batch_view/main_summary/reference.md b/datasets/batch_view/main_summary/reference.md
index b6dbfde9e..83306a198 100644
--- a/datasets/batch_view/main_summary/reference.md
+++ b/datasets/batch_view/main_summary/reference.md
@@ -4,8 +4,7 @@
 
 # Data Reference
 
-% include "./data_reference.md" %
-
-# Data Reference
-
-% include "./code_reference.md" %
+## Making Queries
+## Sampling
+## Scheduling
+## Schema
diff --git a/datasets/batch_view/sync_summary/reference.md b/datasets/batch_view/sync_summary/reference.md
index b6dbfde9e..83306a198 100644
--- a/datasets/batch_view/sync_summary/reference.md
+++ b/datasets/batch_view/sync_summary/reference.md
@@ -4,8 +4,7 @@
 
 # Data Reference
 
-% include "./data_reference.md" %
-
-# Data Reference
-
-% include "./code_reference.md" %
+## Making Queries
+## Sampling
+## Scheduling
+## Schema

From 920dcaf08b113292b2eb41ae702a9bd6ea3e46f6 Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Wed, 3 May 2017 18:07:06 -0400
Subject: [PATCH 03/15] Add crash_aggregates reference material

---
 .../batch_view/crash_aggregates/reference.md  | 93 ++++++++++++++++++-
 datasets/batch_view/longitudinal/reference.md |  2 +-
 2 files changed, 93 insertions(+), 2 deletions(-)

diff --git a/datasets/batch_view/crash_aggregates/reference.md b/datasets/batch_view/crash_aggregates/reference.md
index 83306a198..95f6b8151 100644
--- a/datasets/batch_view/crash_aggregates/reference.md
+++ b/datasets/batch_view/crash_aggregates/reference.md
@@ -4,7 +4,98 @@
 
 # Data Reference
 
-## Making Queries
+## Example Queries
+
+Here's an example query that computes crash rates
+for each channel (sorted by number of usage hours):
+
+```sql
+SELECT dimensions['channel'] AS channel,
+       sum(stats['usage_hours']) AS usage_hours,
+       1000 * sum(stats['main_crashes']) / sum(stats['usage_hours']) AS main_crash_rate,
+       1000 * sum(stats['content_crashes']) / sum(stats['usage_hours']) AS content_crash_rate,
+       1000 * sum(stats['plugin_crashes']) / sum(stats['usage_hours']) AS plugin_crash_rate,
+       1000 * sum(stats['gmplugin_crashes']) / sum(stats['usage_hours']) AS gmplugin_crash_rate,
+       1000 * sum(stats['gpu_crashes']) / sum(stats['usage_hours']) AS gpu_crash_rate
+FROM crash_aggregates
+GROUP BY dimensions['channel']
+ORDER BY -sum(stats['usage_hours'])
+```
+
+Main process crashes by build date and E10S cohort.
+
+```sql
+WITH channel_rates AS (
+  SELECT dimensions['build_id'] AS build_id,
+         SUM(stats['main_crashes']) AS main_crashes, -- total number of crashes
+         SUM(stats['usage_hours']) / 1000 AS usage_kilohours, -- thousand hours of usage
+         dimensions['e10s_cohort'] AS e10s_cohort -- e10s cohort
+   FROM crash_aggregates
+   WHERE dimensions['experiment_id'] is null -- not in an experiment
+     AND regexp_like(dimensions['build_id'], '^\d{14}$') -- validate build IDs
+     AND dimensions['build_id'] > '20160201000000' -- only in the date range that we care about
+   GROUP BY dimensions['build_id'], dimensions['e10s_cohort']
+)
+SELECT cast(parse_datetime(build_id, 'yyyyMMddHHmmss') as date) as build_id, -- program build date
+       usage_kilohours, -- thousands of usage hours
+       e10s_cohort, -- e10s cohort
+       main_crashes / usage_kilohours AS main_crash_rate -- crash rate being defined as crashes per thousand usage hours
+FROM channel_rates
+WHERE usage_kilohours > 100 -- only aggregates that have statistically significant usage hours
+ORDER BY build_id ASC
+```
+
 ## Sampling
+
+### Invalid Pings
+
+We ignore invalid pings in our processing. A ping is considered invalid if any of the following is true:
+
+* The submission dates or activity dates are invalid or missing.
+* The build ID is malformed.
+* The docType field is missing or unknown.
+* The ping is a main ping without usage hours or a crash ping with usage hours.
+
+
 ## Scheduling
+
+The `crash_aggregates` job is run daily, at midnight UTC.
+The job is scheduled on [Airflow](https://github.com/mozilla/telemetry-airflow).
+The DAG is [here](https://github.com/mozilla/telemetry-airflow/blob/master/dags/crash_aggregates.py).
+
 ## Schema
+
+The `crash_aggregates` table has 4 commonly-used columns:
+
+* `submission_date` is the date pings were submitted for a particular aggregate.
+    * For example, `select sum(stats['usage_hours']) from crash_aggregates where submission_date = '2016-03-15'` will give the total number of user hours represented by pings submitted on March 15, 2016.
+    * The dataset is partitioned by this field. Queries that limit the possible values of `submission_date` can run significantly faster.
+* `activity_date` is the day when the activity being recorded took place.
+    * For example, `select sum(stats['usage_hours']) from crash_aggregates where activity_date = '2016-03-15'` will give the total number of user hours represented by activities that took place on March 15, 2016.
+    * This can be several days before the pings are actually submitted, so it will always be before or on its corresponding `submission_date`.
+    * Therefore, queries that are sensitive to when measurements were taken on the client should prefer this field over `submission_date`.
+* `dimensions` is a map of all the other dimensions that we currently care about. These fields include:
+    * `dimensions['build_version']` is the program version, like `46.0a1`.
+    * `dimensions['build_id']` is the YYYYMMDDhhmmss timestamp the program was built, like `20160123180541`. This is also known as the "build ID" or "buildid".
+    * `dimensions['channel']` is the channel, like `release` or `beta`.
+    * `dimensions['application']` is the program name, like `Firefox` or `Fennec`.
+    * `dimensions['os_name']` is the name of the OS the program is running on, like `Darwin` or `Windows_NT`.
+    * `dimensions['os_version']` is the version of the OS the program is running on.
+    * `dimensions['architecture']` is the architecture that the program was built for (not necessarily the one it is running on).
+    * `dimensions['country']` is the country code for the user (determined using geoIP), like `US` or `GB`.
+    * `dimensions['experiment_id']` is the identifier of the experiment being participated in, such as `e10s-beta46-noapz@experiments.mozilla.org`, or null if no experiment.
+    * `dimensions['experiment_branch']` is the branch of the experiment being participated in, such as `control` or `experiment`, or null if no experiment.
+    * `dimensions['e10s_enabled']` is whether E10S is enabled.
+    * `dimensions['e10s_cohort']` is the E10S cohort the user is part of, such as `control`, `test`, or `disqualified`.
+    * `dimensions['gfx_compositor']` is the graphics backend compositor used by the program, such as `d3d11`, `opengl` and `simple`. Null values may be reported as `none` as well.
+    * All of the above fields can potentially be blank, which means "not present". That means that in the actual pings, the corresponding fields were null.
+* `stats` contains the aggregate values that we care about:
+    * `stats['usage_hours']` is the number of user-hours represented by the aggregate.
+    * `stats['main_crashes']` is the number of main process crashes represented by the aggregate (or just program crashes, in the non-E10S case).
+    * `stats['content_crashes']` is the number of content process crashes represented by the aggregate.
+    * `stats['plugin_crashes']` is the number of plugin process crashes represented by the aggregate.
+    * `stats['gmplugin_crashes']` is the number of Gecko media plugin (often abbreviated GMPlugin) process crashes represented by the aggregate.
+    * `stats['content_shutdown_crashes']` is the number of content process crashes that were caused by failure to shut down in a timely manner.
+    * `stats['gpu_crashes']` is the number of gpu process crashes represented by the aggregate.
+
+TODO(harter): https://bugzilla.mozilla.org/show_bug.cgi?id=1361862
diff --git a/datasets/batch_view/longitudinal/reference.md b/datasets/batch_view/longitudinal/reference.md
index 300078261..1b4e4cd0e 100644
--- a/datasets/batch_view/longitudinal/reference.md
+++ b/datasets/batch_view/longitudinal/reference.md
@@ -50,7 +50,7 @@ The DAG is [here](https://github.com/mozilla/telemetry-airflow/blob/master/dags/
 
 ## Schema
 
-TODO(harter): Coming soon.
+TODO(harter): https://bugzilla.mozilla.org/show_bug.cgi?id=1361862
 
 # Code Reference
 

From bef3b204d022a8810cd41e1cea58f90819fd12a5 Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Wed, 3 May 2017 18:21:45 -0400
Subject: [PATCH 04/15] Add cross_sectional reference material

---
 .../batch_view/cross_sectional/reference.md   | 43 ++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/datasets/batch_view/cross_sectional/reference.md b/datasets/batch_view/cross_sectional/reference.md
index 83306a198..8473b0876 100644
--- a/datasets/batch_view/cross_sectional/reference.md
+++ b/datasets/batch_view/cross_sectional/reference.md
@@ -1,10 +1,51 @@
+# Cross Sectional Reference
+
+<!-- toc -->
+
 # Introduction
 
 {% include "./intro.md" %}
 
 # Data Reference
 
-## Making Queries
+## Example Queries
+
+This query calculates OS frequencies for the `release` channel:
+
+```sql
+SELECT
+    os_name_mode,
+    COUNT(1)
+FROM cross_sectional
+-- Can't limit by date or app_name in the cross_sectional
+WHERE os_name_mode IN ('Windows_NT', 'Darwin', 'Linux')
+  AND normalized_channel = 'release'
+GROUP BY 1
+ORDER BY 2 DESC
+    
+```
+
 ## Sampling
+
+The `cross_sectional` dataset is derived from the `longitudinal` dataset,
+and uses the exact same sampling.
+See the [longitudinal documentation](../longitudinal/reference.md#sampling)
+for details.
+
 ## Scheduling
+
+The `cross_sectional` dataset is generated shortly after the `longitudinal`
+dataset every Sunday.
+The job is scheduled on Airflow and can be found 
+[here](https://github.com/mozilla/telemetry-airflow/blob/master/dags/longitudinal.py#L64).
+
 ## Schema
+
+TODO(harter): https://bugzilla.mozilla.org/show_bug.cgi?id=1361862
+
+# Code Reference
+
+This dataset is generated by 
+[telemetry-batch-view](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/CrossSectionalView.scala).
+Refer to this repository for information on how to run or augment the dataset.
+

From 025f41a952a2a65c6b167d6cbef552689f973fd5 Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Wed, 3 May 2017 18:39:39 -0400
Subject: [PATCH 05/15] Add main_summary reference material

---
 datasets/batch_view/main_summary/reference.md | 243 +++++++++++++++++-
 1 file changed, 242 insertions(+), 1 deletion(-)

diff --git a/datasets/batch_view/main_summary/reference.md b/datasets/batch_view/main_summary/reference.md
index 83306a198..477feeeef 100644
--- a/datasets/batch_view/main_summary/reference.md
+++ b/datasets/batch_view/main_summary/reference.md
@@ -1,10 +1,251 @@
+# Main Summary
+
+<!-- toc -->
+
 # Introduction
 
 {% include "./intro.md" %}
 
 # Data Reference
 
-## Making Queries
+## Example Queries
+
+We recommend working with this dataset via Spark rather than sql.t.m.o.
+Due to the large number of records, 
+queries can consume a lot of resources on the 
+**shared cluster and impact other users**.
+Queries via sql.t.m.o should limit to a short `submission_date_s3` range,
+and ideally make use of the `sample_id` field.
+
+When using Presto to query the data from sql.t.m.o,
+you can use the UNNEST feature to access items in the 
+`search_counts`, `popup_notification_stats` and `active_addons` fields.
+
+For example, to compare the search volume for different search source values,
+you could use:
+```sql
+WITH search_data AS
+  (SELECT s.source AS search_source,
+          s.count AS search_count
+   FROM main_summary CROSS JOIN UNNEST(search_counts) AS t(s)
+   WHERE submission_date_s3 = '20160510'
+     AND search_counts IS NOT NULL)
+SELECT search_source, sum(search_count) as total_searches
+FROM search_data
+GROUP BY search_source
+ORDER BY sum(search_count) DESC
+```
+
 ## Sampling
+
+The `main_summary` dataset contains one record for each `main` ping
+as long as the record contains a non-null value for 
+`documentId`, `submissionDate`, and `Timestamp`.
+We do not ever expect nulls for these fields.
+
 ## Scheduling
+
+This dataset is updated daily via the [telemetry-airflow](https://github.com/mozilla/telemetry-airflow) infrastructure.
+The job DAG runs every day shortly after midnight UTC.
+You can find the job definition
+[here](https://github.com/mozilla/telemetry-airflow/blob/master/dags/main_summary.py).
+
 ## Schema
+
+As of 2016-07-05, the current version of the `main_summary` dataset is `v3`, and has a schema as follows:
+
+```
+root
+ |-- document_id: string (nullable = true)
+ |-- client_id: string (nullable = true)
+ |-- channel: string (nullable = true)
+ |-- normalized_channel: string (nullable = true)
+ |-- country: string (nullable = true)
+ |-- city: string (nullable = true)
+ |-- os: string (nullable = true)
+ |-- os_version: string (nullable = true)
+ |-- os_service_pack_major: long (nullable = true)
+ |-- os_service_pack_minor: long (nullable = true)
+ |-- windows_build_number: long (nullable = true)
+ |-- windows_ubr: long (nullable = true)
+ |-- install_year: long (nullable = true)
+ |-- profile_creation_date: long (nullable = true)
+ |-- subsession_start_date: string (nullable = true)
+ |-- subsession_length: long (nullable = true)
+ |-- distribution_id: string (nullable = true)
+ |-- submission_date: string (nullable = true)
+ |-- sync_configured: boolean (nullable = true)
+ |-- sync_count_desktop: integer (nullable = true)
+ |-- sync_count_mobile: integer (nullable = true)
+ |-- app_build_id: string (nullable = true)
+ |-- app_display_version: string (nullable = true)
+ |-- app_name: string (nullable = true)
+ |-- app_version: string (nullable = true)
+ |-- timestamp: long (nullable = true)
+ |-- env_build_id: string (nullable = true)
+ |-- env_build_version: string (nullable = true)
+ |-- env_build_arch: string (nullable = true)
+ |-- e10s_enabled: boolean (nullable = true)
+ |-- e10s_cohort: string (nullable = true)
+ |-- locale: string (nullable = true)
+ |-- active_experiment_id: string (nullable = true)
+ |-- active_experiment_branch: string (nullable = true)
+ |-- reason: string (nullable = true)
+ |-- timezone_offset: integer (nullable = true)
+ |-- plugin_hangs: integer (nullable = true)
+ |-- aborts_plugin: integer (nullable = true)
+ |-- aborts_content: integer (nullable = true)
+ |-- aborts_gmplugin: integer (nullable = true)
+ |-- crashes_detected_plugin: integer (nullable = true)
+ |-- crashes_detected_content: integer (nullable = true)
+ |-- crashes_detected_gmplugin: integer (nullable = true)
+ |-- crash_submit_attempt_main: integer (nullable = true)
+ |-- crash_submit_attempt_content: integer (nullable = true)
+ |-- crash_submit_attempt_plugin: integer (nullable = true)
+ |-- crash_submit_success_main: integer (nullable = true)
+ |-- crash_submit_success_content: integer (nullable = true)
+ |-- crash_submit_success_plugin: integer (nullable = true)
+ |-- shutdown_kill: integer (nullable = true)
+ |-- active_addons_count: long (nullable = true)
+ |-- flash_version: string (nullable = true)
+ |-- vendor: string (nullable = true)
+ |-- is_default_browser: boolean (nullable = true)
+ |-- default_search_engine_data_name: string (nullable = true)
+ |-- default_search_engine: string (nullable = true)
+ |-- loop_activity_counter: struct (nullable = true)
+ |    |-- open_panel: integer (nullable = true)
+ |    |-- open_conversation: integer (nullable = true)
+ |    |-- room_open: integer (nullable = true)
+ |    |-- room_share: integer (nullable = true)
+ |    |-- room_delete: integer (nullable = true)
+ |-- devtools_toolbox_opened_count: integer (nullable = true)
+ |-- client_submission_date: string (nullable = true)
+ |-- places_bookmarks_count: integer (nullable = true)
+ |-- places_pages_count: integer (nullable = true)
+ |-- push_api_notify: integer (nullable = true)
+ |-- web_notification_shown: integer (nullable = true)
+ |-- popup_notification_stats: map (nullable = true)
+ |    |-- key: string
+ |    |-- value: struct (valueContainsNull = true)
+ |    |    |-- offered: integer (nullable = true)
+ |    |    |-- action_1: integer (nullable = true)
+ |    |    |-- action_2: integer (nullable = true)
+ |    |    |-- action_3: integer (nullable = true)
+ |    |    |-- action_last: integer (nullable = true)
+ |    |    |-- dismissal_click_elsewhere: integer (nullable = true)
+ |    |    |-- dismissal_leave_page: integer (nullable = true)
+ |    |    |-- dismissal_close_button: integer (nullable = true)
+ |    |    |-- dismissal_not_now: integer (nullable = true)
+ |    |    |-- open_submenu: integer (nullable = true)
+ |    |    |-- learn_more: integer (nullable = true)
+ |    |    |-- reopen_offered: integer (nullable = true)
+ |    |    |-- reopen_action_1: integer (nullable = true)
+ |    |    |-- reopen_action_2: integer (nullable = true)
+ |    |    |-- reopen_action_3: integer (nullable = true)
+ |    |    |-- reopen_action_last: integer (nullable = true)
+ |    |    |-- reopen_dismissal_click_elsewhere: integer (nullable = true)
+ |    |    |-- reopen_dismissal_leave_page: integer (nullable = true)
+ |    |    |-- reopen_dismissal_close_button: integer (nullable = true)
+ |    |    |-- reopen_dismissal_not_now: integer (nullable = true)
+ |    |    |-- reopen_open_submenu: integer (nullable = true)
+ |    |    |-- reopen_learn_more: integer (nullable = true)
+ |-- search_counts: array (nullable = true)
+ |    |-- element: struct (containsNull = true)
+ |    |    |-- engine: string (nullable = true)
+ |    |    |-- source: string (nullable = true)
+ |    |    |-- count: long (nullable = true)
+ |-- active_addons: array (nullable = true)
+ |    |-- element: struct (containsNull = true)
+ |    |    |-- addon_id: string (nullable = true)
+ |    |    |-- blocklisted: boolean (nullable = true)
+ |    |    |-- name: string (nullable = true)
+ |    |    |-- user_disabled: boolean (nullable = true)
+ |    |    |-- app_disabled: boolean (nullable = true)
+ |    |    |-- version: string (nullable = true)
+ |    |    |-- scope: integer (nullable = true)
+ |    |    |-- type: string (nullable = true)
+ |    |    |-- foreign_install: boolean (nullable = true)
+ |    |    |-- has_binary_components: boolean (nullable = true)
+ |    |    |-- install_day: integer (nullable = true)
+ |    |    |-- update_day: integer (nullable = true)
+ |    |    |-- signed_state: integer (nullable = true)
+ |    |    |-- is_system: boolean (nullable = true)
+ |-- active_theme: struct (nullable = true)
+ |    |-- addon_id: string (nullable = true)
+ |    |-- blocklisted: boolean (nullable = true)
+ |    |-- name: string (nullable = true)
+ |    |-- user_disabled: boolean (nullable = true)
+ |    |-- app_disabled: boolean (nullable = true)
+ |    |-- version: string (nullable = true)
+ |    |-- scope: integer (nullable = true)
+ |    |-- type: string (nullable = true)
+ |    |-- foreign_install: boolean (nullable = true)
+ |    |-- has_binary_components: boolean (nullable = true)
+ |    |-- install_day: integer (nullable = true)
+ |    |-- update_day: integer (nullable = true)
+ |    |-- signed_state: integer (nullable = true)
+ |    |-- is_system: boolean (nullable = true)
+ |-- blocklist_enabled: boolean (nullable = true)
+ |-- addon_compatibility_check_enabled: boolean (nullable = true)
+ |-- telemetry_enabled: boolean (nullable = true)
+ |-- user_prefs: struct (nullable = true)
+ |    |-- dom_ipc_process_count: integer (nullable = true)
+ |-- max_concurrent_tab_count: integer (nullable = true)
+ |-- tab_open_event_count: integer (nullable = true)
+ |-- max_concurrent_window_count: integer (nullable = true)
+ |-- window_open_event_count: integer (nullable = true)
+ |-- total_uri_count: integer (nullable = true)
+ |-- unfiltered_uri_count: integer (nullable = true)
+ |-- unique_domains_count: integer (nullable = true)
+ |-- submission_date_s3: string (nullable = true)
+ |-- sample_id: string (nullable = true)
+ |-- events: array (nullable = true)
+ |    |-- element: struct (containsNull = true)
+ |    |    |-- timestamp: long (nullable = false)
+ |    |    |-- category: string (nullable = false)
+ |    |    |-- method: string (nullable = false)
+ |    |    |-- object: string (nullable = false)
+ |    |    |-- string_value: string (nullable = true)
+ |    |    |-- map_values: map (nullable = true)
+ |    |    |    |-- key: string
+ |    |    |    |-- value: string
+```
+
+For more detail on where these fields come from in the
+[raw data](https://gecko.readthedocs.io/en/latest/toolkit/components/telemetry/telemetry/data/main-ping.html),
+please look 
+[in the MainSummaryView code](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/MainSummaryView.scala)
+in the `buildSchema` function.
+
+Most of the fields are simple scalar values, with a few notable exceptions:
+
+* The `search_counts` field is an array of structs, each item in the array representing
+  a 3-tuple of (`engine`, `source`, `count`). The `engine` field represents the name of
+  the search engine against which the searches were done. The `source` field represents
+  the part of the Firefox UI that was used to perform the search. It contains values
+  such as "abouthome", "urlbar", and "searchbar". The `count` field contains the number
+  of searches performed against this engine+source combination during that subsession.
+  Any of the fields in the struct may be null (for example if the search key did not
+  match the expected pattern, or if the count was non-numeric).
+* The `loop_activity_counter` field is a simple struct containing inner fields for each
+  expected value of the `LOOP_ACTIVITY_COUNTER` Enumerated Histogram. Each inner field
+  is a count for that histogram bucket.
+* The `popup_notification_stats` field is a map of `String` keys to struct values,
+  each field in the struct being a count for the expected values of the
+  `POPUP_NOTIFICATION_STATS` Keyed Enumerated Histogram.
+* The `places_bookmarks_count` and `places_pages_count` fields contain the **mean**
+  value of the corresponding Histogram, which can be interpreted as the average number
+  of bookmarks or pages in a given subsession.
+* The `active_addons` field contains an array of structs, one for each entry in
+  the `environment.addons.activeAddons` section of the payload. More detail in
+  [Bug 1290181](https://bugzilla.mozilla.org/show_bug.cgi?id=1290181).
+* The `active_theme` field contains a single struct in the same shape as the items in the
+  `active_addons` array. It contains information about the currently active browser
+  theme.
+* The `user_prefs` field contains a struct with values for preferences of interest.
+
+# Code Reference
+
+This dataset is generated by 
+[telemetry-batch-view](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/MainSummaryView.scala).
+Refer to this repository for information on how to run or augment the dataset.

From b2319bca2162305d627c9b6a2cb89dadc77f2939 Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Wed, 3 May 2017 18:54:18 -0400
Subject: [PATCH 06/15] Add reference material for Addons

---
 datasets/batch_view/addons/reference.md | 55 ++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/datasets/batch_view/addons/reference.md b/datasets/batch_view/addons/reference.md
index 83306a198..c5c8d80df 100644
--- a/datasets/batch_view/addons/reference.md
+++ b/datasets/batch_view/addons/reference.md
@@ -1,10 +1,63 @@
+# Addons Datasets
+
+<!-- toc -->
+
 # Introduction
 
 {% include "./intro.md" %}
 
 # Data Reference
 
-## Making Queries
+## Example Queries
+
 ## Sampling
+
+The `addons` dataset contains one or more records for every [Main Summary](../main_summary/reference.md) record
+that contains a non-null value for `client_id`.
+Each Addons record contains info for a single addon,
+or if the main ping did not contain any active addons,
+there will be a row with nulls for all the addon fields
+(to identify client_ids/records without any addons).
+
+Like the Main Summary dataset, no attempt is made to de-duplicate submissions by `documentId`, so any analysis that could be affected by duplicate records should take care to remove duplicates using the `documentId` field.
+
 ## Scheduling
+
+This dataset is updated daily via the 
+[telemetry-airflow](https://github.com/mozilla/telemetry-airflow) infrastructure.
+The job DAG runs every day after the Main Summary data has been generated.
+The DAG is [here](https://github.com/mozilla/telemetry-airflow/blob/master/dags/main_summary.py#L36).
+
 ## Schema
+
+As of 2017-03-16, the current version of the `addons` dataset is `v2`,
+ and has a schema as follows:
+```
+root
+ |-- document_id: string (nullable = true)
+ |-- client_id: string (nullable = true)
+ |-- subsession_start_date: string (nullable = true)
+ |-- normalized_channel: string (nullable = true)
+ |-- addon_id: string (nullable = true)
+ |-- blocklisted: boolean (nullable = true)
+ |-- name: string (nullable = true)
+ |-- user_disabled: boolean (nullable = true)
+ |-- app_disabled: boolean (nullable = true)
+ |-- version: string (nullable = true)
+ |-- scope: integer (nullable = true)
+ |-- type: string (nullable = true)
+ |-- foreign_install: boolean (nullable = true)
+ |-- has_binary_components: boolean (nullable = true)
+ |-- install_day: integer (nullable = true)
+ |-- update_day: integer (nullable = true)
+ |-- signed_state: integer (nullable = true)
+ |-- is_system: boolean (nullable = true)
+ |-- submission_date_s3: string (nullable = true)
+ |-- sample_id: string (nullable = true)
+```
+For more detail on where these fields come from in the
+[raw data](https://gecko.readthedocs.io/en/latest/toolkit/components/telemetry/telemetry/data/environment.html#addons),
+please look 
+[in the AddonsView code](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/views/AddonsView.scala).
+
+The fields are all simple scalar values.

From 35c120e723701e7a021aba839ea8f23bdac39d75 Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Thu, 11 May 2017 12:06:45 -0400
Subject: [PATCH 07/15] Add crash_aggregates reference material

---
 datasets/batch_view/crash_aggregates/reference.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/datasets/batch_view/crash_aggregates/reference.md b/datasets/batch_view/crash_aggregates/reference.md
index 95f6b8151..34c49b3e8 100644
--- a/datasets/batch_view/crash_aggregates/reference.md
+++ b/datasets/batch_view/crash_aggregates/reference.md
@@ -1,3 +1,7 @@
+# Crash Aggregates Reference
+
+<!-- toc -->
+
 # Introduction
 
 {% include "./intro.md" %}

From f5fbfddee1535e438681ef0cc1d3f2db5d4cffd8 Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Thu, 11 May 2017 12:16:11 -0400
Subject: [PATCH 08/15] Add crash_summary reference material

---
 .../batch_view/crash_summary/reference.md     | 63 ++++++++++++++++++-
 datasets/batch_view/longitudinal/reference.md |  2 +-
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/datasets/batch_view/crash_summary/reference.md b/datasets/batch_view/crash_summary/reference.md
index 83306a198..2752c6f60 100644
--- a/datasets/batch_view/crash_summary/reference.md
+++ b/datasets/batch_view/crash_summary/reference.md
@@ -1,10 +1,71 @@
+# Crash Summary Reference
+
+<!-- toc -->
+
 # Introduction
 
 {% include "./intro.md" %}
 
 # Data Reference
 
-## Making Queries
+## Example Queries
+
+Here is an example query to get the total number of main crashes by gfx_compositor:
+
+```sql
+select gfx_compositor, count(*)
+from crash_summary
+where application = 'Firefox'
+and (payload.processType IS NULL OR payload.processType = 'main') 
+group by gfx_compositor
+```
+ 
 ## Sampling
+
+CrashSummary contains one record for every 
+[crash ping](https://gecko.readthedocs.io/en/latest/toolkit/components/telemetry/telemetry/data/crash-ping.html)
+submitted by Firefox.
+It was built with the long-term goal of providing a base for 
+[CrashAggregates](../crash_aggregates/reference.md).
+
 ## Scheduling
+
+This dataset is updated daily, shortly after midnight UTC.
+The job is scheduled on 
+[telemetry-airflow](https://github.com/mozilla/telemetry-airflow).
+The DAG is [here](https://github.com/mozilla/telemetry-airflow/blob/master/dags/crash_summary.py).
+
 ## Schema
+
+```
+root
+ |-- client_id: string (nullable = true)
+ |-- normalized_channel: string (nullable = true)
+ |-- build_version: string (nullable = true)
+ |-- build_id: string (nullable = true)
+ |-- channel: string (nullable = true)
+ |-- application: string (nullable = true)
+ |-- os_name: string (nullable = true)
+ |-- os_version: string (nullable = true)
+ |-- architecture: string (nullable = true)
+ |-- country: string (nullable = true)
+ |-- experiment_id: string (nullable = true)
+ |-- experiment_branch: string (nullable = true)
+ |-- e10s_enabled: boolean (nullable = true)
+ |-- e10s_cohort: string (nullable = true)
+ |-- gfx_compositor: string (nullable = true)
+ |-- payload: struct (nullable = true)
+ |    |-- crashDate: string (nullable = true)
+ |    |-- processType: string (nullable = true)
+ |    |-- hasCrashEnvironment: boolean (nullable = true)
+ |    |-- metadata: map (nullable = true)
+ |    |    |-- key: string
+ |    |    |-- value: string (valueContainsNull = true)
+ |    |-- version: integer (nullable = true)
+```
+
+For more detail on where these fields come from in the
+[raw data](https://gecko.readthedocs.io/en/latest/toolkit/components/telemetry/telemetry/data/crash-ping.html),
+please look at the case classes 
+[in the CrashSummaryView code](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/CrashSummaryView.scala).
+
diff --git a/datasets/batch_view/longitudinal/reference.md b/datasets/batch_view/longitudinal/reference.md
index 1b4e4cd0e..a15d8962a 100644
--- a/datasets/batch_view/longitudinal/reference.md
+++ b/datasets/batch_view/longitudinal/reference.md
@@ -8,7 +8,7 @@
 
 # Data Reference
 
-## Making Queries
+## Example Queries
 
 Take a look at the 
 [Longitudinal Examples Cookbook](/cookbooks/longitudinal_examples.md).

From 7a0ccc540cf9735c46720fa445f1a68d83b06b1a Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Thu, 11 May 2017 12:36:59 -0400
Subject: [PATCH 09/15] Add events reference material

---
 datasets/batch_view/events/reference.md | 52 ++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/datasets/batch_view/events/reference.md b/datasets/batch_view/events/reference.md
index 83306a198..518014e2c 100644
--- a/datasets/batch_view/events/reference.md
+++ b/datasets/batch_view/events/reference.md
@@ -4,7 +4,57 @@
 
 # Data Reference
 
-## Making Queries
+## Example Queries
+
+
 ## Sampling
+
+The events dataset contains one row for each event in a main ping.
+This dataset is derived from [main_summary](../main_summary/reference.md)
+so any of `main_summary`'s filters affect this dataset as well.
+
+Data is currently available from 2017-01-05 on.
+
 ## Scheduling
+
+The events dataset is updated daily, shortly after 
+[main_summary](../main_summary/reference.md) is updated.
+The job is scheduled on [Airflow](https://github.com/mozilla/telemetry-airflow).
+The DAG is [here](https://github.com/mozilla/telemetry-airflow/blob/master/dags/main_summary.py#L63).
+
+
 ## Schema
+
+As of 2017-01-26, the current version of the `events` dataset is `v1`, and has a schema as follows:
+```
+root
+ |-- document_id: string (nullable = true)
+ |-- client_id: string (nullable = true)
+ |-- normalized_channel: string (nullable = true)
+ |-- country: string (nullable = true)
+ |-- locale: string (nullable = true)
+ |-- app_name: string (nullable = true)
+ |-- app_version: string (nullable = true)
+ |-- os: string (nullable = true)
+ |-- os_version: string (nullable = true)
+ |-- subsession_start_date: string (nullable = true)
+ |-- subsession_length: long (nullable = true)
+ |-- sync_configured: boolean (nullable = true)
+ |-- sync_count_desktop: integer (nullable = true)
+ |-- sync_count_mobile: integer (nullable = true)
+ |-- timestamp: long (nullable = true)
+ |-- sample_id: string (nullable = true)
+ |-- event_timestamp: long (nullable = false)
+ |-- event_category: string (nullable = false)
+ |-- event_method: string (nullable = false)
+ |-- event_object: string (nullable = false)
+ |-- event_string_value: string (nullable = true)
+ |-- event_map_values: map (nullable = true)
+ |    |-- key: string
+ |    |-- value: string
+ |-- submission_date_s3: string (nullable = true)
+ |-- doc_type: string (nullable = true)
+```
+
+Currently, client-side event telemetry is undocumented.
+This doc will link to that documentation once it is published.

From f1b6ab7705f9d03e095e6f7eb60d09dfb2b253c6 Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Thu, 11 May 2017 12:45:38 -0400
Subject: [PATCH 10/15] Add sync_summary reference material

---
 datasets/batch_view/sync_summary/reference.md | 70 ++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/datasets/batch_view/sync_summary/reference.md b/datasets/batch_view/sync_summary/reference.md
index 83306a198..89293b24b 100644
--- a/datasets/batch_view/sync_summary/reference.md
+++ b/datasets/batch_view/sync_summary/reference.md
@@ -1,10 +1,78 @@
+# Sync Summary Reference
+
+<!-- toc -->
+
 # Introduction
 
 {% include "./intro.md" %}
 
 # Data Reference
 
-## Making Queries
+## Example Queries
+
+TODO
+
 ## Sampling
+
+TODO
+
 ## Scheduling
+
+This dataset is updated daily, shortly after midnight UTC.
+The job is scheduled on [Airflow](https://github.com/mozilla/telemetry-airflow).
+The DAG is [here](https://github.com/mozilla/telemetry-airflow/blob/master/dags/sync_view.py).
+
 ## Schema
+
+```
+root
+ |-- app_build_id: string (nullable = true)
+ |-- app_display_version: string (nullable = true)
+ |-- app_name: string (nullable = true)
+ |-- app_version: string (nullable = true)
+ |-- app_channel: string (nullable = true)
+ |-- uid: string
+ |-- device_id: string (nullable = true)
+ |-- when: integer
+ |-- took: integer
+ |-- why: string (nullable = true)
+ |-- failure_reason: struct (nullable = true)
+ |    |-- name: string
+ |    |-- value: string (nullable = true)
+ |-- status: struct (nullable = true)
+ |    |-- sync: string (nullable = true)
+ |    |-- status: string (nullable = true)
+ |-- devices: array (nullable = true)
+ |    |-- element: struct (containsNull = false)
+ |    |    |-- id: string
+ |    |    |-- os: string
+ |    |    |-- version: string
+ |-- engines: array (nullable = true)
+ |    |-- element: struct (containsNull = false)
+ |    |    |-- name: string
+ |    |    |-- took: integer
+ |    |    |-- status: string (nullable = true)
+ |    |    |-- failure_reason: struct (nullable = true)
+ |    |    |    |-- name: string
+ |    |    |    |-- value: string (nullable = true)
+ |    |    |-- incoming: struct (nullable = true)
+ |    |    |    |-- applied: integer
+ |    |    |    |-- failed: integer
+ |    |    |    |-- new_failed: integer
+ |    |    |    |-- reconciled: integer
+ |    |    |-- outgoing: array (nullable = true)
+ |    |    |    |-- element: struct (containsNull = false)
+ |    |    |    |    |-- sent: integer
+ |    |    |    |    |-- failed: integer
+ |    |    |-- validation: struct (containsNull = false)
+ |    |    |    |-- version: integer
+ |    |    |    |-- checked: integer
+ |    |    |    |-- took: integer
+ |    |    |    |-- failure_reason: struct (nullable = true)
+ |    |    |    |    |-- name: string
+ |    |    |    |    |-- value: string (nullable = true)
+ |    |    |    |-- problems: array (nullable = true)
+ |    |    |    |    |-- element: struct (containsNull = false)
+ |    |    |    |    |    |-- name: string
+ |    |    |    |    |    |-- count: integer
+```

From f859e130d259103f892b73151845e512bf13e55c Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Thu, 11 May 2017 14:09:02 -0400
Subject: [PATCH 11/15] Update the standard format in the reference docs

---
 datasets/reference.md | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/datasets/reference.md b/datasets/reference.md
index fd30e2215..f758e59e0 100644
--- a/datasets/reference.md
+++ b/datasets/reference.md
@@ -1,17 +1,20 @@
 # Dataset Reference
 
-This section contains tutorials that focus on a single dataset each.
+This section contains references that focus on a single dataset each.
 After reading the tutorial, you should know all you need about the dataset.
 Each tutorial should include:
 
-* A short overview of why we built the dataset and what need it's meant to solve
-* An example query to give the reader an idea of what the data looks like
-  and how it is meant to be used
-* What datasource the data is collected from
-* How frequently it's updated
-* How it is processed and sampled
-* How it is stored and how to access the data including
-  * whether the data is available in re:dash
-  * s3 paths
-  * snippets for loading the parquet tables with spark
-* How to augment or modify the dataset
+* Introduction
+  * A short overview of why we built the dataset and what need it's meant to solve
+  * What datasource the data is collected from,
+    and a high-level overview of how the data is organized
+  * How it is stored and how to access the data including
+    * whether the data is available in re:dash
+    * s3 paths
+* Reference
+  * An example query to give the reader an idea of what the data looks like
+    and how it is meant to be used
+  * How the data is processed and sampled
+  * How frequently it's updated, and how it's scheduled
+  * An up-to-date schema for the dataset
+  * How to augment or modify the dataset

From 0ab16e9740831f1f01b8bba72b5204a23d876165 Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Thu, 11 May 2017 14:18:09 -0400
Subject: [PATCH 12/15] Remove unused files

---
 .../batch_view/crash_aggregates/CrashAggregateView.md |  5 -----
 .../batch_view/cross_sectional/cross_sectional.md     |  5 -----
 datasets/batch_view/ls                                | 11 -----------
 datasets/batch_view/main_summary/MainSummary.md       |  5 -----
 datasets/batch_view/reference.md                      | 11 -----------
 5 files changed, 37 deletions(-)
 delete mode 100644 datasets/batch_view/crash_aggregates/CrashAggregateView.md
 delete mode 100644 datasets/batch_view/cross_sectional/cross_sectional.md
 delete mode 100644 datasets/batch_view/ls
 delete mode 100644 datasets/batch_view/main_summary/MainSummary.md
 delete mode 100644 datasets/batch_view/reference.md

diff --git a/datasets/batch_view/crash_aggregates/CrashAggregateView.md b/datasets/batch_view/crash_aggregates/CrashAggregateView.md
deleted file mode 100644
index c70f640ad..000000000
--- a/datasets/batch_view/crash_aggregates/CrashAggregateView.md
+++ /dev/null
@@ -1,5 +0,0 @@
-{% include "git+https://github.com/mozilla/telemetry-batch-view.git/docs/CrashAggregateView.md" %}
-
-# Appendix
-
-This documentation is taken from the [telemetry-batch-view documentation](https://github.com/mozilla/telemetry-batch-view/tree/master/docs)
diff --git a/datasets/batch_view/cross_sectional/cross_sectional.md b/datasets/batch_view/cross_sectional/cross_sectional.md
deleted file mode 100644
index c2b0c5fbd..000000000
--- a/datasets/batch_view/cross_sectional/cross_sectional.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Work in Progress
-
-This article is still a work in progress.
-The work is being tracked in
-[this bug](https://bugzilla.mozilla.org/show_bug.cgi?id=1341912)
diff --git a/datasets/batch_view/ls b/datasets/batch_view/ls
deleted file mode 100644
index b6dbfde9e..000000000
--- a/datasets/batch_view/ls
+++ /dev/null
@@ -1,11 +0,0 @@
-# Introduction
-
-{% include "./intro.md" %}
-
-# Data Reference
-
-% include "./data_reference.md" %
-
-# Data Reference
-
-% include "./code_reference.md" %
diff --git a/datasets/batch_view/main_summary/MainSummary.md b/datasets/batch_view/main_summary/MainSummary.md
deleted file mode 100644
index 13f5be06c..000000000
--- a/datasets/batch_view/main_summary/MainSummary.md
+++ /dev/null
@@ -1,5 +0,0 @@
-{% include "git+https://github.com/mozilla/telemetry-batch-view.git/docs/MainSummary.md" %}
-
-# Appendix
-
-This documentation is taken from the [telemetry-batch-view documentation](https://github.com/mozilla/telemetry-batch-view/tree/master/docs)
diff --git a/datasets/batch_view/reference.md b/datasets/batch_view/reference.md
deleted file mode 100644
index b6dbfde9e..000000000
--- a/datasets/batch_view/reference.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# Introduction
-
-{% include "./intro.md" %}
-
-# Data Reference
-
-% include "./data_reference.md" %
-
-# Data Reference
-
-% include "./code_reference.md" %

From 71f9b59635981c2e9934751b24f40db3e2fbb730 Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Thu, 11 May 2017 14:45:11 -0400
Subject: [PATCH 13/15] Add links to lingering bugs

---
 datasets/batch_view/addons/intro.md           |  4 +++
 datasets/batch_view/client_count/reference.md | 30 ++++++++++++++++++-
 datasets/batch_view/crash_summary/intro.md    |  3 ++
 datasets/batch_view/events/intro.md           |  4 +++
 datasets/batch_view/events/reference.md       |  3 ++
 datasets/batch_view/sync_summary/intro.md     |  4 +++
 datasets/batch_view/sync_summary/reference.md |  8 +++--
 7 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/datasets/batch_view/addons/intro.md b/datasets/batch_view/addons/intro.md
index e69de29bb..38095e879 100644
--- a/datasets/batch_view/addons/intro.md
+++ b/datasets/batch_view/addons/intro.md
@@ -0,0 +1,4 @@
+
+This is a work in progress.
+The work is being tracked
+[here](https://bugzilla.mozilla.org/show_bug.cgi?id=1364172).
diff --git a/datasets/batch_view/client_count/reference.md b/datasets/batch_view/client_count/reference.md
index 83306a198..307ce4ad5 100644
--- a/datasets/batch_view/client_count/reference.md
+++ b/datasets/batch_view/client_count/reference.md
@@ -1,10 +1,38 @@
+# Client Count Reference
+
+This document is a work in progress.
+The work is being tracked
+[here](https://bugzilla.mozilla.org/show_bug.cgi?id=1364175).
+
+<!-- toc -->
+
 # Introduction
 
 {% include "./intro.md" %}
 
 # Data Reference
 
-## Making Queries
+## Example Queries
+
+This document is a work in progress.
+The work is being tracked
+[here](https://bugzilla.mozilla.org/show_bug.cgi?id=1364175).
+
 ## Sampling
+
+This document is a work in progress.
+The work is being tracked
+[here](https://bugzilla.mozilla.org/show_bug.cgi?id=1364175).
+
 ## Scheduling
+
+This document is a work in progress.
+The work is being tracked
+[here](https://bugzilla.mozilla.org/show_bug.cgi?id=1364175).
+
 ## Schema
+
+This document is a work in progress.
+The work is being tracked
+[here](https://bugzilla.mozilla.org/show_bug.cgi?id=1364175).
+
diff --git a/datasets/batch_view/crash_summary/intro.md b/datasets/batch_view/crash_summary/intro.md
index e69de29bb..4de235c8c 100644
--- a/datasets/batch_view/crash_summary/intro.md
+++ b/datasets/batch_view/crash_summary/intro.md
@@ -0,0 +1,3 @@
+Work in progress.
+Work is being tracked
+[here](https://bugzilla.mozilla.org/show_bug.cgi?id=1364174).
diff --git a/datasets/batch_view/events/intro.md b/datasets/batch_view/events/intro.md
index e69de29bb..a8fd77a57 100644
--- a/datasets/batch_view/events/intro.md
+++ b/datasets/batch_view/events/intro.md
@@ -0,0 +1,4 @@
+
+This is a work in progress.
+The work is being tracked
+[here](https://bugzilla.mozilla.org/show_bug.cgi?id=1364170).
diff --git a/datasets/batch_view/events/reference.md b/datasets/batch_view/events/reference.md
index 518014e2c..ae385cc4e 100644
--- a/datasets/batch_view/events/reference.md
+++ b/datasets/batch_view/events/reference.md
@@ -6,6 +6,9 @@
 
 ## Example Queries
 
+This is a work in progress.
+The work is being tracked
+[here](https://bugzilla.mozilla.org/show_bug.cgi?id=1364170).
 
 ## Sampling
 
diff --git a/datasets/batch_view/sync_summary/intro.md b/datasets/batch_view/sync_summary/intro.md
index e69de29bb..2e4038376 100644
--- a/datasets/batch_view/sync_summary/intro.md
+++ b/datasets/batch_view/sync_summary/intro.md
@@ -0,0 +1,4 @@
+
+Work in progress.
+Work is being tracked 
+[here](https://bugzilla.mozilla.org/show_bug.cgi?id=1364171).
diff --git a/datasets/batch_view/sync_summary/reference.md b/datasets/batch_view/sync_summary/reference.md
index 89293b24b..81b15a456 100644
--- a/datasets/batch_view/sync_summary/reference.md
+++ b/datasets/batch_view/sync_summary/reference.md
@@ -10,11 +10,15 @@
 
 ## Example Queries
 
-TODO
+Work in progress.
+Work is being tracked
+[here](https://bugzilla.mozilla.org/show_bug.cgi?id=1364171)
 
 ## Sampling
 
-TODO
+Work in progress.
+Work is being tracked
+[here](https://bugzilla.mozilla.org/show_bug.cgi?id=1364171)
 
 ## Scheduling
 

From 60a98d49925dce976e6fafa06c46a3159b9194c2 Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Thu, 11 May 2017 14:48:38 -0400
Subject: [PATCH 14/15] Remove outdated documentation

---
 datasets/batch_view/addons/Addons.md              | 5 -----
 datasets/batch_view/crash_summary/CrashSummary.md | 5 -----
 datasets/batch_view/events/Events.md              | 5 -----
 datasets/batch_view/sync_summary/SyncSummary.md   | 5 -----
 4 files changed, 20 deletions(-)
 delete mode 100644 datasets/batch_view/addons/Addons.md
 delete mode 100644 datasets/batch_view/crash_summary/CrashSummary.md
 delete mode 100644 datasets/batch_view/events/Events.md
 delete mode 100644 datasets/batch_view/sync_summary/SyncSummary.md

diff --git a/datasets/batch_view/addons/Addons.md b/datasets/batch_view/addons/Addons.md
deleted file mode 100644
index 673118b2a..000000000
--- a/datasets/batch_view/addons/Addons.md
+++ /dev/null
@@ -1,5 +0,0 @@
-{% include "git+https://github.com/mozilla/telemetry-batch-view.git/docs/Addons.md" %}
-
-# Appendix
-
-This documentation is taken from the [telemetry-batch-view documentation](https://github.com/mozilla/telemetry-batch-view/tree/master/docs)
diff --git a/datasets/batch_view/crash_summary/CrashSummary.md b/datasets/batch_view/crash_summary/CrashSummary.md
deleted file mode 100644
index 5b43cc371..000000000
--- a/datasets/batch_view/crash_summary/CrashSummary.md
+++ /dev/null
@@ -1,5 +0,0 @@
-{% include "git+https://github.com/mozilla/telemetry-batch-view.git/docs/CrashSummary.md" %}
-
-# Appendix
-
-This documentation is taken from the [telemetry-batch-view documentation](https://github.com/mozilla/telemetry-batch-view/tree/master/docs)
diff --git a/datasets/batch_view/events/Events.md b/datasets/batch_view/events/Events.md
deleted file mode 100644
index 6ad29169f..000000000
--- a/datasets/batch_view/events/Events.md
+++ /dev/null
@@ -1,5 +0,0 @@
-{% include "git+https://github.com/mozilla/telemetry-batch-view.git/docs/Events.md" %}
-
-# Appendix
-
-This documentation is taken from the [telemetry-batch-view documentation](https://github.com/mozilla/telemetry-batch-view/tree/master/docs)
diff --git a/datasets/batch_view/sync_summary/SyncSummary.md b/datasets/batch_view/sync_summary/SyncSummary.md
deleted file mode 100644
index 6132dbba6..000000000
--- a/datasets/batch_view/sync_summary/SyncSummary.md
+++ /dev/null
@@ -1,5 +0,0 @@
-{% include "git+https://github.com/mozilla/telemetry-batch-view.git/docs/SyncSummary.md" %}
-
-# Appendix
-
-This documentation is taken from the [telemetry-batch-view documentation](https://github.com/mozilla/telemetry-batch-view/tree/master/docs)

From 418553d49a45c8b4022e82478daa0564c7ef7090 Mon Sep 17 00:00:00 2001
From: Ryan Harter <harterrt@gmail.com>
Date: Thu, 11 May 2017 17:52:57 -0400
Subject: [PATCH 15/15] Add client count reference docs to summary

---
 SUMMARY.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/SUMMARY.md b/SUMMARY.md
index af4225da2..9ce630f24 100644
--- a/SUMMARY.md
+++ b/SUMMARY.md
@@ -23,6 +23,7 @@
       * [Events](datasets/batch_view/events/reference.md)
       * [Sync Summary](datasets/batch_view/sync_summary/reference.md)
       * [Addons](datasets/batch_view/addons/reference.md)
+      * [Client Count](datasets/batch_view/client_count/reference.md)
     * [Experimental Datasets](tools/experiments.md)
       * [Accessing Shield Study data](datasets/shield.md)
   * [Collecting New Data](datasets/new_data.md)