From 25dee0451e9b6da33626d5a917f0f02247f2ed59 Mon Sep 17 00:00:00 2001 From: Daniel Sakuma Date: Wed, 5 Oct 2016 00:49:54 -0300 Subject: [PATCH 1/8] Add first version for Dockerfile and compose --- docker-compose.yml | 8 ++++++++ docker/Dockerfile | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 docker-compose.yml create mode 100644 docker/Dockerfile diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 000000000..9fd3a31ab --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,8 @@ +jupyter: + build: . + dockerfile: docker/Dockerfile + ports: + - 8888:8888 + volumes: + - .:/notebook + working_dir: /notebook diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 000000000..108fce423 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,19 @@ +FROM jupyter/datascience-notebook:latest +MAINTAINER Serenata de Amor "datasciencebr@gmail.com" + +USER root + +RUN apt-get update && apt-get install -y \ + unzip + +USER jovyan + +COPY requirements.txt ./ +COPY conda_requirements.txt ./ + +RUN pip install --upgrade pip +RUN pip install -r requirements.txt + +RUN conda config --add channels Rufone +RUN conda config --add channels conda-forge +RUN conda install --yes --file conda_requirements.txt From 04681788ecae0d5ccf8fa588ed05e23364f4ac5d Mon Sep 17 00:00:00 2001 From: Daniel Sakuma Date: Wed, 5 Oct 2016 21:40:48 -0300 Subject: [PATCH 2/8] Move compose syntax from v1 to v2 --- docker-compose.yml | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 9fd3a31ab..2e9b8841a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,8 +1,12 @@ -jupyter: - build: . - dockerfile: docker/Dockerfile - ports: - - 8888:8888 - volumes: - - .:/notebook - working_dir: /notebook +version: '2' + +services: + jupyter: + build: + context: . 
+ dockerfile: docker/Dockerfile + ports: + - 8888:8888 + volumes: + - .:/notebook + working_dir: /notebook From 1de30f48b9bcfffd5e1c6ac8c67e3344368acc14 Mon Sep 17 00:00:00 2001 From: Daniel Sakuma Date: Mon, 10 Oct 2016 21:26:33 -0300 Subject: [PATCH 3/8] Add docker environment documentation --- CONTRIBUTING.md | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 519cc9211..27d655d57 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -17,6 +17,8 @@ A lot of discussions about ideas take place in the [Issues](https://github.com/d ## Environment +##### Local Installation Environment + The recommended way of setting your environment up is with [Anaconda](https://www.continuum.io/), a Python distribution with useful packages for Data Science. [Download it](https://www.continuum.io/downloads) and create an _environment_ for the project. ```console @@ -29,14 +31,39 @@ $ ./setup The `activate serenata_de_amor` command must be run every time you enter in the project folder to start working. -### Pyenv users - -If you installed Anaconda via [pyenv](https://github.com/yyuu/pyenv) probably `source activate serenata_de_amor` will fail _unless_ you explicitly use the path to the Anaconta `activate` script. For example: +**For Pyenv users:** If you installed Anaconda via [pyenv](https://github.com/yyuu/pyenv) probably `source activate serenata_de_amor` will fail _unless_ you explicitly use the path to the Anaconta `activate` script. 
For example: ```console $ source /usr/local/var/pyenv/versions/anaconda3-4.1.1/bin/activate serenata_de_amor ``` +##### Docker Installation Environment + +Requirements: + +* [Docker](https://docs.docker.com/engine/installation/) +* [Docker-compose](https://docs.docker.com/compose/install/) + +Start the environment (maybe it will take some time, the docker image has 4GB): + +```console +$ docker-compose up -d +``` + +Run the script to fetch Quota for Exercising Parliamentary Activity (CEAP) datasets: + +```console +$ docker-compose run --rm jupyter python src/fetch_datasets.py +``` + +If you want to access the console: + +```console +$ docker-compose run --rm jupyter bash +``` + +And access Jupyter Notebook here: [localhost:8888](http://localhost:8888) + ## Best practices In order to avoid tons of conflicts when trying to merge [Jupyter Notebooks](http://jupyter.org), there are some [guidelines we follow](http://www.svds.com/jupyter-notebook-best-practices-for-data-science/). From b205891592e8efa5d727a55e7c15e198180c33dc Mon Sep 17 00:00:00 2001 From: Daniel Sakuma Date: Mon, 10 Oct 2016 21:38:36 -0300 Subject: [PATCH 4/8] Fix typo --- CONTRIBUTING.md | 10 +++++----- README.md | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 27d655d57..9cc27a1b6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -73,7 +73,7 @@ Basically we have four big directories with different purposes: | Directory | Purpose | File naming | |-----------|---------|-------------| | **`develop/`** | This is where we _explore_ data, feel free to create your own notebook for your exploration. | `[ISO 8601 date]-[author-initials]-[2-4 word description].ipynb` (e.g. `2016-05-13-ec-air-tickets.ipynb`) | -|**`report/`** | This is where we write up the findings and results, here is where we put together different data, analysis and strategies to make a point, feel free to jump in. | Meaninful title for the report (e.g. 
`Transport-allowances.ipybn` | +|**`report/`** | This is where we write up the findings and results, here is where we put together different data, analysis and strategies to make a point, feel free to jump in. | Meaningful title for the report (e.g. `Transport-allowances.ipynb`) | | **`src/`** | This is where our auxiliary scripts lies, code to scrap data, to convert stuff etc. | Small caps, no special character, `-` instead of spaces. | | **`data/`** | This is not supposed to be committed, but it is where saved databases will be stored locally (scripts from `src/` should be able to get this data for you); a copy of this data will be available elsewhere (_just in case_). | Small caps, no special character, `-` instead of spaces. | @@ -83,13 +83,13 @@ Here we explain what each script from `src/` does for you: ##### One script to rule them all -1. `src/fetch_datasets.py` dowloads all the available datasets to `data/` is `.xz` compressed CSV format with headers translated to English. +1. `src/fetch_datasets.py` downloads all the available datasets to `data/` in `.xz` compressed CSV format with headers translated to English. ##### Quota for Exercising Parliamentary Activity (CEAP) -1. `src/fetch_datasets.py --from-source` dowloads all CEAP datasets to `data/` from the official source (in XML format in Portuguese) . -1. `src/fetch_datasets.py` dowloads the CEAP datasets into `data/`; it can download them from the official source (in XML format in Portuguese) or from our backup server (`.xz` compressed CSV format, with headers translated to English). +1. `src/fetch_datasets.py --from-source` downloads all CEAP datasets to `data/` from the official source (in XML format in Portuguese). +1. `src/fetch_datasets.py` downloads the CEAP datasets into `data/`; it can download them from the official source (in XML format in Portuguese) or from our backup server (`.xz` compressed CSV format, with headers translated to English). 1. 
`src/xml2csv.py` converts the original XML datasets to `.xz` compressed CSV format. 1. `src/translate_datasets.py` translates the datasets file names and the labels of the variables within these files. 1. `src/translation_table.py` creates a `data/YYYY-MM-DD-ceap-datasets.md` file with details of the meaning and of the translation of each variable from the _Quota for Exercising Parliamentary Activity_ datasets. @@ -126,6 +126,6 @@ The project basically happens in four moments, and contributions are welcomed in ## Jarbas -As soon as we started _Serenata de Amor_ [we felt the need for a simple webservice](https://github.com/datasciencebr/serenata-de-amor/issues/34) to browse our data and refer to documents we analize. This is how [Jarbas](https://github.com/datasciencebr/jarbas) was created. +As soon as we started _Serenata de Amor_ [we felt the need for a simple webservice](https://github.com/datasciencebr/serenata-de-amor/issues/34) to browse our data and refer to documents we analyze. This is how [Jarbas](https://github.com/datasciencebr/jarbas) was created. If you fancy web development, feel free to check Jarbas' source code, to check [Jarbas' own Issues](https://github.com/datasciencebr/jarbas/issues) and to contribute there too. diff --git a/README.md b/README.md index 4a2dbc27d..9b6017932 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,9 @@ The Serenata de Amor Operation arose from a combination of needs, from many peop We are building an intelligence capable of analyzing public spending and saying, with reliability, the possibility of each receipt being unlawful. This information will be used beyond the code, in the world outside of GitHub. Everything is open source from the beginning, allowing others to fork the project when their ideas diverge from the Operation Serenata de Amor. -Our current milestone is to create the means for this kind of automation with the Quota for Exercising Parliamentary Activity (CEAP), from the Brazilian Chamber of Deputies. 
This job includes the development of APIs, data cleaning and analyses, conception and validation of scientific hyphotheses, confirmation of illicit acts via investigation and reports - to the population and to legal authorities. +Our current milestone is to create the means for this kind of automation with the Quota for Exercising Parliamentary Activity (CEAP), from the Brazilian Chamber of Deputies. This job includes the development of APIs, data cleaning and analyses, conception and validation of scientific hypotheses, confirmation of illicit acts via investigation and reports - to the population and to legal authorities. -To achieve this goal, unprecedented, we invite everyone to train the intelligence, collect information, cross databases, validate hyphotheses and apply Machine Learning with models competing against each other and getting combined in ensembles with higher precision than any previous option. +To achieve this goal, unprecedented, we invite everyone to train the intelligence, collect information, cross databases, validate hypotheses and apply Machine Learning with models competing against each other and getting combined in ensembles with higher precision than any previous option. 
## Before contributing From d7898529e1ec4090c72ed854c6ce8d5de0d7cdc6 Mon Sep 17 00:00:00 2001 From: Daniel Sakuma Date: Thu, 13 Oct 2016 20:30:31 -0300 Subject: [PATCH 5/8] Add instruction to create config.ini file --- CONTRIBUTING.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9cc27a1b6..3fc006867 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -50,6 +50,12 @@ Start the environment (maybe it will take some time, the docker image has 4GB): $ docker-compose up -d ``` +Create your config.ini file from the example: + +```console +$ cp config.ini.example config.ini +``` + Run the script to fetch Quota for Exercising Parliamentary Activity (CEAP) datasets: ```console From c21d7beb16b83245e55219f0119ccdf9279cad35 Mon Sep 17 00:00:00 2001 From: Daniel Sakuma Date: Thu, 13 Oct 2016 20:56:04 -0300 Subject: [PATCH 6/8] Fix typo --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9b6017932..a2133c91f 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,9 @@ The Serenata de Amor Operation arose from a combination of needs, from many peop We are building an intelligence capable of analyzing public spending and saying, with reliability, the possibility of each receipt being unlawful. This information will be used beyond the code, in the world outside of GitHub. Everything is open source from the beginning, allowing others to fork the project when their ideas diverge from the Operation Serenata de Amor. -Our current milestone is to create the means for this kind of automation with the Quota for Exercising Parliamentary Activity (CEAP), from the Brazilian Chamber of Deputies. This job includes the development of APIs, data cleaning and analyses, conception and validation of scientific hypotheses, confirmation of illicit acts via investigation and reports - to the population and to legal authorities. 
+Our current milestone is to create the means for this kind of automation with the Quota for Exercising Parliamentary Activity (CEAP), from the Brazilian Chamber of Deputies. This job includes the development of APIs, data cleaning and analyses, conception and validation of scientific hypothesis, confirmation of illicit acts via investigation and reports - to the population and to legal authorities. -To achieve this goal, unprecedented, we invite everyone to train the intelligence, collect information, cross databases, validate hypotheses and apply Machine Learning with models competing against each other and getting combined in ensembles with higher precision than any previous option. +To achieve this goal, unprecedented, we invite everyone to train the intelligence, collect information, cross databases, validate hypothesis and apply Machine Learning with models competing against each other and getting combined in ensembles with higher precision than any previous option. ## Before contributing From fff1395a5651b423a0b2061d0387a8849ae54e5b Mon Sep 17 00:00:00 2001 From: Daniel Sakuma Date: Fri, 21 Oct 2016 21:53:08 -0200 Subject: [PATCH 7/8] Add environment activation --- CONTRIBUTING.md | 2 +- docker/Dockerfile | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3fc006867..2230c29e6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -59,7 +59,7 @@ $ cp config.ini.example config.ini Run the script to fetch Quota for Exercising Parliamentary Activity (CEAP) datasets: ```console -$ docker-compose run --rm jupyter python src/fetch_datasets.py +$ docker-compose run --rm jupyter bash -c "source activate serenata_de_amor && python src/fetch_datasets.py" ``` If you want to access the console: diff --git a/docker/Dockerfile b/docker/Dockerfile index 108fce423..0ac5cb52f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -14,6 +14,12 @@ COPY conda_requirements.txt ./ RUN pip install --upgrade pip RUN pip 
install -r requirements.txt +RUN conda update --yes conda RUN conda config --add channels Rufone RUN conda config --add channels conda-forge RUN conda install --yes --file conda_requirements.txt +RUN conda install --yes --file conda_requirements.txt +RUN conda create --yes --name serenata_de_amor python=3 + +RUN echo 'source activate serenata_de_amor' >> /home/jovyan/.bashrc + From ee6f9f93e58dc17b09dd1c5fb26243c5bc734565 Mon Sep 17 00:00:00 2001 From: Daniel Sakuma Date: Sun, 23 Oct 2016 23:59:56 -0200 Subject: [PATCH 8/8] Remove environment creation --- CONTRIBUTING.md | 2 +- docker/Dockerfile | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2230c29e6..3fc006867 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -59,7 +59,7 @@ $ cp config.ini.example config.ini Run the script to fetch Quota for Exercising Parliamentary Activity (CEAP) datasets: ```console -$ docker-compose run --rm jupyter bash -c "source activate serenata_de_amor && python src/fetch_datasets.py" +$ docker-compose run --rm jupyter python src/fetch_datasets.py ``` If you want to access the console: diff --git a/docker/Dockerfile b/docker/Dockerfile index 0ac5cb52f..5da756a0e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -18,8 +18,3 @@ RUN conda update --yes conda RUN conda config --add channels Rufone RUN conda config --add channels conda-forge RUN conda install --yes --file conda_requirements.txt -RUN conda install --yes --file conda_requirements.txt -RUN conda create --yes --name serenata_de_amor python=3 - -RUN echo 'source activate serenata_de_amor' >> /home/jovyan/.bashrc -