From 6ebbbbb02e3133566da825db1ae415e5020cfe42 Mon Sep 17 00:00:00 2001 From: Marco Moretto Date: Wed, 10 Jun 2020 16:04:21 +0200 Subject: [PATCH] delete jupyter use case --- docs/VESPUCCI_with_pyCOMPASS_use_case_1.ipynb | 1530 ----------------- 1 file changed, 1530 deletions(-) delete mode 100644 docs/VESPUCCI_with_pyCOMPASS_use_case_1.ipynb diff --git a/docs/VESPUCCI_with_pyCOMPASS_use_case_1.ipynb b/docs/VESPUCCI_with_pyCOMPASS_use_case_1.ipynb deleted file mode 100644 index b5e7644..0000000 --- a/docs/VESPUCCI_with_pyCOMPASS_use_case_1.ipynb +++ /dev/null @@ -1,1530 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "VESPUCCI with pyCOMPASS: use case 1", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "WTcil1UqkSYo", - "colab_type": "text" - }, - "source": [ - "# VESPUCCI use case n° 1\n", - "\n", - "[VESPUCCI](https://vespucci.readthedocs.io/) is the gene expression database for grapevine and we can access it via its GraphQL interface, called [COMPASS](https://compass-.readthedocs.io/). The [pyCOMPASS](https://pycompass.readthedocs.io/) package is a Python package that wraps some functionalities to simplify communication with the [COMPASS](https://compass-.readthedocs.io/) interface.\n", - "\n", - "In this first exercise we will perform a few basic operations with [VESPUCCI](https://vespucci.readthedocs.io/). We will create a Module starting from a few genes and then automatically extend it by adding more genes. We will also have a look at gene and sample annotations."
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "TShLgSf8GH-F", - "colab_type": "code", - "outputId": "dcdbc21e-abf8-4c72-da91-1f07db20cb23", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 595 - } - }, - "source": [ - "!pip install pycompass==0.5.13" - ], - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Collecting pycompass==0.5.13\n", - " Downloading https://files.pythonhosted.org/packages/68/c7/a2ae02e4685faf133e0f2b4d1df16e34e9aea32e36535f849b90a9d0774d/pyCOMPASS-0.5.13-py3-none-any.whl\n", - "Collecting requests==2.21.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7d/e3/20f3d364d6c8e5d2353c72a67778eb189176f08e873c9900e10c0287b84b/requests-2.21.0-py2.py3-none-any.whl (57kB)\n", - "\u001b[K |████████████████████████████████| 61kB 4.4MB/s \n", - "\u001b[?25hCollecting numpy==1.16.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7b/74/54c5f9bb9bd4dae27a61ec1b39076a39d359b3fb7ba15da79ef23858a9d8/numpy-1.16.0-cp36-cp36m-manylinux1_x86_64.whl (17.3MB)\n", - "\u001b[K |████████████████████████████████| 17.3MB 237kB/s \n", - "\u001b[?25hRequirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests==2.21.0->pycompass==0.5.13) (1.24.3)\n", - "Collecting idna<2.9,>=2.5\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/14/2c/cd551d81dbe15200be1cf41cd03869a46fe7226e7450af7a6545bfc474c9/idna-2.8-py2.py3-none-any.whl (58kB)\n", - "\u001b[K |████████████████████████████████| 61kB 7.1MB/s \n", - "\u001b[?25hRequirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests==2.21.0->pycompass==0.5.13) (2020.4.5.1)\n", - "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests==2.21.0->pycompass==0.5.13) (3.0.4)\n", - "\u001b[31mERROR: umap-learn 0.4.4 has requirement numpy>=1.17, but you'll have numpy 
1.16.0 which is incompatible.\u001b[0m\n", - "\u001b[31mERROR: google-colab 1.0.0 has requirement requests~=2.23.0, but you'll have requests 2.21.0 which is incompatible.\u001b[0m\n", - "\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n", - "\u001b[31mERROR: albumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.9 which is incompatible.\u001b[0m\n", - "Installing collected packages: idna, requests, numpy, pycompass\n", - " Found existing installation: idna 2.9\n", - " Uninstalling idna-2.9:\n", - " Successfully uninstalled idna-2.9\n", - " Found existing installation: requests 2.23.0\n", - " Uninstalling requests-2.23.0:\n", - " Successfully uninstalled requests-2.23.0\n", - " Found existing installation: numpy 1.18.5\n", - " Uninstalling numpy-1.18.5:\n", - " Successfully uninstalled numpy-1.18.5\n", - "Successfully installed idna-2.8 numpy-1.16.0 pycompass-0.5.13 requests-2.21.0\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - "application/vnd.colab-display-data+json": { - "pip_warning": { - "packages": [ - "idna", - "numpy", - "requests" - ] - } - } - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ngT091vOlWus", - "colab_type": "text" - }, - "source": [ - "### Import the pyCOMPASS module and classes" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "q8uzKfVLGpsP", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from pycompass import Compendium, Connect, BiologicalFeature, Module, SampleSet, Plot, Annotation, Experiment, Sample, Platform, Ontology\n", - "from IPython.core.display import HTML\n", - "import numpy as np\n", - "import pandas as pd" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ErnegR7JllhT", - "colab_type": "text" - }, - "source": [ - "### The Connect object represent 
our connection to the COMPASS GraphQL endpoint" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "vW0FtdOlG3pk", - "colab_type": "code", - "colab": {} - }, - "source": [ - "connect = Connect('http://compass.fmach.it/graphql')" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eDAVKmmIlz0C", - "colab_type": "text" - }, - "source": [ - "### The COMPASS GraphQL endpoint might host different compendia\n", - "\n", - "At the moment there is only the VESPUCCI compendium, but there are different versions of VESPUCCI, and each version might have data normalized in different ways. In this case there are 2 versions of VESPUCCI, version 1.0 (legacy) and version 2.0 (latest). The latter has data normalized in 2 different ways, TPM normalization and LIMMA (the default one) while the legacy version has the legacy normalization only (i.e. per-sample logratios).\n", - "For every query we will need to indicate the compendium we want to use; if no version, no normalization and no database is specified, the default values will be used."
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "pF1pr5RumTS_", - "colab_type": "code", - "outputId": "11e1549e-266d-4411-eef7-0a0f9093e3f3", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 255 - } - }, - "source": [ - "connect.describe_compendia()" - ], - "execution_count": 4, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'compendia': [{'defaultVersion': '2.0',\n", - " 'description': 'The Vitis gene expression compendium',\n", - " 'fullName': 'VESPUCCI',\n", - " 'name': 'vespucci',\n", - " 'versions': [{'databases': [{'name': 'vitis_vinifera',\n", - " 'normalizations': ['legacy (default)']}],\n", - " 'defaultDatabase': 'vitis_vinifera',\n", - " 'versionAlias': 'legacy',\n", - " 'versionNumber': '1.0'},\n", - " {'databases': [{'name': 'vitis_vinifera',\n", - " 'normalizations': ['tpm_sample', 'limma (default)']}],\n", - " 'defaultDatabase': 'vitis_vinifera',\n", - " 'versionAlias': 'latest',\n", - " 'versionNumber': '2.0'}]}]}" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 4 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "277mqicxowq1", - "colab_type": "text" - }, - "source": [ - "### Let's use the vespucci compendium (of course) but the legacy version" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "t-DBJSj7HK1T", - "colab_type": "code", - "colab": {} - }, - "source": [ - "compendium = connect.get_compendium('vespucci', version='legacy')" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LYzlyXZppeNO", - "colab_type": "text" - }, - "source": [ - "### The compendium object has the same fields as the JSON object seen above" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "J3ceMq8aHbIs", - "colab_type": "code", - "outputId": "3e1cdd39-db2d-4882-8ef0-33e71939861f", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ -
"compendium.description" - ], - "execution_count": 6, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "'The Vitis gene expression compendium'" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 6 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "W_OxcaQ8p4t_", - "colab_type": "text" - }, - "source": [ - "### Genes\n", - "\n", - "Let's build our module starting from a bunch of genes that might come from a previous analysis. The gene names used are the VIT_ ids." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "t92FxW2sHdip", - "colab_type": "code", - "colab": {} - }, - "source": [ - "gene_names = ['VIT_00s0246g00220','VIT_00s0332g00060','VIT_00s0332g00110','VIT_00s0332g00160','VIT_00s0396g00010','VIT_00s0505g00030','VIT_00s0505g00060','VIT_00s0873g00020','VIT_00s0904g00010']" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "se0cxaoorRuM", - "colab_type": "text" - }, - "source": [ - "We can query the compendium with the list of gene names and get a list of BiologicalFeature objects that represent our genes of interest. The easiest way to know which are the valid *filter* values to be used is to use the autocompletion (ALT + SPACEBAR) in the COMPASS GraphiQL interface at http://compass.fmach.it/graphql." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "QktAL-KHICP-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "bf = BiologicalFeature.using(compendium).get(filter={'name_In': gene_names})" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0KrTKIg5r9EM", - "colab_type": "text" - }, - "source": [ - "Genes might have different aliases, all of which are defined in VESPUCCI as annotations.
Since annotations (for both genes and samples) are represented as RDF triples using different ontologies, we can perform a [SPARQL](https://en.wikipedia.org/wiki/SPARQL) query to retrieve all the genes that have an ***alias*** ([NCIT_C41095](https://www.ebi.ac.uk/ols/ontologies/ncit/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FNCIT_C41095) term on the NCIT Ontology, http://purl.obolibrary.org/obo/) equal to the ones specified in the list." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "c9hxfIi5ON3v", - "colab_type": "code", - "colab": {} - }, - "source": [ - "alias = []\n", - "for n in ['B9S8R7','Q7M2G6','D7SZ93','B8XIJ8','Vv00s0125g00280','Vv00s0187g00140','Vv00s0246g00010','Vv00s0246g00080','Vv00s0438g00020','Vv00s0246g00200']:\n", - " alias.append(\"{{?s '{n}'}}\".format(n=n))\n", - "sparql = 'SELECT ?s ?p ?o WHERE {{ {alias} }}'.format(alias=' UNION '.join(alias))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4QODSLxwxHl0", - "colab_type": "text" - }, - "source": [ - "This is the resulting query" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "0GZzakvoxL6X", - "colab_type": "code", - "outputId": "e6c015b3-b884-4892-db39-980bcd6e81bd", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 54 - } - }, - "source": [ - "print(sparql)" - ], - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "text": [ - "SELECT ?s ?p ?o WHERE { {?s 'B9S8R7'} UNION {?s 'Q7M2G6'} UNION {?s 'D7SZ93'} UNION {?s 'B8XIJ8'} UNION {?s 'Vv00s0125g00280'} UNION {?s 'Vv00s0187g00140'} UNION {?s 'Vv00s0246g00010'} UNION {?s 'Vv00s0246g00080'} UNION {?s 'Vv00s0438g00020'} UNION {?s 'Vv00s0246g00200'} }\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mikfuoKrt0wH", - "colab_type": "text" - }, - "source": [ - "Once the SPARQL query is defined we can pass it to the BiologicalFeature object to get all the BiologicalFeature objects
linked to the SPARQL query response. In this case we are using the ***by*** method instead of the ***get*** method. The latter is used to pass constructs as you would with the COMPASS GraphQL interface (so it has the same parameter names and fields). The former instead is used to create objects from other higher-level objects; for example, we might want to get Sample objects out of Experiment objects." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "-Ff1pA4ttzmS", - "colab_type": "code", - "colab": {} - }, - "source": [ - "for _bf in BiologicalFeature.using(compendium).by(sparql=sparql):\n", - " bf.append(_bf)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oyw-ZvgLvgAZ", - "colab_type": "text" - }, - "source": [ - "Each BiologicalFeature object has several fields. In our case, since the BiologicalFeature represents a gene, we have ***sequence***. The ***id*** is the identifier used internally in the database."
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "E14L6QMIvXWF", - "colab_type": "code", - "outputId": "5fd1ebf9-7742-4ed9-97d1-02e79c868d77", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 54 - } - }, - "source": [ - "print(bf[0].id, bf[0].sequence)" - ], - "execution_count": 12, - "outputs": [ - { - "output_type": "stream", - "text": [ - "QmlvRmVhdHVyZVR5cGU6Mjc0Mzk= ATGCCTCAACTGGATAAATTCACTTATTTCACACAATTCTTCTGGTCATGCCTTTTCCTCTTTACTTTCTATATTCCCATATGCAATGATGGAGATGGAGTACTTGGGATCAGCAGAATTCTAAAACTACGGAACCAACTGGTTTCACACCGGGGGAACAACATCCGGAGCAACGACCCCAACAGTTTGGAAGATATCTTGAGAAAAGGTTTTAGCACCGGTGTATCCTATATGTACTCTAGTTTATTCGAAGTATCCCAATGGTGTAACGTCGTCGACTTATTGGGAAAAAGGAGTCAAATAACATTGATCTCTTGTTTCGGAGAAATAAGTGGCTCACGAGGAATGGAAAGAAACATATTCTATTTGATCTCGAAGTCCTCATATAGCACTTCTTCCAATCCTGGATGGGGGATCACTTGTAGGAATGACATAATGCTAATCCATGTTCCACACGGCCAAGGAAACATCGTTTTTTAATCTCATTATGACCGGAAGAAATTCTTTCTCGATCAGAAAAGTGGAATGGAAGGCACTACAAGCAAGAAAGATATACTCCTTTTCAAATCGCCCCGCGAAGACGGACGTTCAGAAAGGTTCTCGAGATTCTCAATCGCTCTAGTGGGGAGGGATAGTGCTTGAAGGGAGCGAGACCAAGCCGAGCCACTAGGAGAGTAAGCCCTTCCCACGTGTCAAGTTGAATAAATGAATGCAGCCTCAACCAGAGAGATCAACCAGCGCCTTTTATTTTAGGCCGTCGGCGGTGCAGAACGCACTAATCCGCGACCAACAAGTTTTCCGAAACCGAACTGAATGGAATTCTTACTTTAAAAAAATGCTTGCTGAAAATCAAAGAAAGAAGGTCCATTTTCCCACGTAGTTCGTCGGTCAAACCAACGATTCTCAAAGTAATAGAGAGATCTTTTTCTAGTTAGACTTCTATCAATGCAATGAAAGAACCATCCCTTCCTATTTGTTTGTCCTGTCAGATATAAAGAAAATTAGACCCCAAAGAGCCCTTCTTTACCACTTTAGGGGGTGGGGGTGAAGGGGGGGTTTACATACAACCAAGGCAAAGTGGTTTATGATTGAATCTCAGAGGCATTCTTTTCATTTGGTAGATCCAAGTCCATGGCCTATTTCGGGTTCACTCGGAGCTTTGGCAACCACCGTAGAGGTGTGATGTACATGCACCCATTTCAAGGGGGTGCAACACTTCTCAGTTTGGCATCATATTTATCCTATATACCATGTTCGTATGCTGGCGCGATGTTATACGTGAATCAACGTTGGGAGGACATCATACCAAAGTCGTACA\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZuuhXXGYwQr1", - "colab_type": "text" - }, - "source": [ - "### Create a Module\n", - "We are now ready to create our first module. 
If we pass a list of genes (BiologicalFeature objects) it will automatically retrieve the \"best\" conditions (SampleSet objects)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "q5PfRO7IIK88", - "colab_type": "code", - "colab": {} - }, - "source": [ - "module_1 = Module.using(compendium).create(biofeatures=bf)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "x36_gcX5xOKk", - "colab_type": "text" - }, - "source": [ - "A module can be thought of as a matrix composed of BiologicalFeature objects (genes in our case) and SampleSet objects (condition contrasts in our case). Each SampleSet, as the name suggests, is composed of 1 or more Sample objects. Since we are using the legacy normalization, each SampleSet is composed of 2 samples, one Test sample and one Reference sample, and the value associated is the logratio between the Test and Reference for a specific gene (BiologicalFeature)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9FJb003mw4ZN", - "colab_type": "code", - "outputId": "78f5ae41-4e0d-46f2-b239-3fbca41a4e69", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "print(module_1.sample_sets[0].id, module_1.sample_sets[0].name, module_1.sample_sets[0].normalization)" - ], - "execution_count": 14, - "outputs": [ - { - "output_type": "stream", - "text": [ - "U2FtcGxlU2V0VHlwZTo1NDc= GSM605594.ch1-vs-GSM605578.ch1 legacy\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6dzez2f6yuD5", - "colab_type": "code", - "outputId": "5f4350ae-9d73-46ea-e338-06c8ebe44760", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - } - }, - "source": [ - "for sample in module_1.sample_sets[0]:\n", - " print(sample.id, sample.sampleName, sample.description)" - ], - "execution_count": 15, - "outputs": [ - { - "output_type": "stream", - "text": [ - "U2FtcGxlVHlwZTo0NDA= GSM605578.ch1 Berry skin
stage33|**|Berry skin stage33 Nor33A|**|Berry skins at stage 33|**|ref_GSE24561_EL33\n", - "U2FtcGxlVHlwZTo0NTY= GSM605594.ch1 Berry skin stage 38|**|Berry skin stage 38 Nor38B|**|Berry skins at stage 38|**|test_GSE24561_EL38\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oIeNtdyt1Aag", - "colab_type": "text" - }, - "source": [ - "### Plot a module\n", - "The easiest way to visualize a module is using a heatmap. The Plot object wraps a module and can return different plot types in different formats (the Plotly JSON format or HTML). To see a heatmap embedded in a Jupyter Notebook we can retrieve the HTML code of the Plot (that uses Plotly) and display it directly in the notebook." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Hm32_AYsLEJM", - "colab_type": "code", - "outputId": "8a8b0e55-739c-4faf-e099-54861a11d2b8", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 467 - } - }, - "source": [ - "module_1_heatmap, sorted_bf, sorted_ss = Plot(module_1).plot_heatmap(output_format='html')\n", - "display(HTML(module_1_heatmap))" - ], - "execution_count": 16, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KogozEo-23WG", - "colab_type": "text" - }, - "source": [ - "As we said, a module is a matrix so the values are a Numpy 2-D array" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "DaVT-90WqpHB", - "colab_type": "code", - "outputId": "1d4cd231-2465-4292-e1c8-0e8c261b57ae", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 102 - } - }, - "source": [ - "module_1.values[:5, :5]" - ], - "execution_count": 17, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([[ nan, 3.8915e-03, -8.3787e-01, -9.9756e-01, -4.2721e+00],\n", - " [ nan, 6.9422e-01, 6.0731e-01, 8.2324e-01, -5.3546e-01],\n", - " [-3.3190e+00, -1.0721e+00, -1.7566e+00, -1.5403e+00, -3.5507e+00],\n", - " [ nan, 9.6475e-01, 5.1048e-01, 1.5966e+00, -8.6214e-01],\n", - " [ nan, 3.4092e+00, 2.1634e+00, 4.0767e+00, -2.3451e+00]])" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 17 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OjE4nDUFCXik", - "colab_type": "text" - }, - "source": [ - "Biological Features (genes) and Sample Sets (contrasts) have no particular order, in a module. But since the heatmap is clustered we can retrieve the sorted list of Biological Features and Sample Sets respectively.\n", - "An easy way is to use a DataFrame to represent our module and then sort it according to the new gene and contrast lists." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "sg8c8FSGOgIo", - "colab_type": "code", - "outputId": "a9c0424a-dd9e-421f-f254-32e6ff626c8b", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 221 - } - }, - "source": [ - "module_1_df = pd.DataFrame(module_1.values, columns=[ss.name for ss in module_1.sample_sets], index=[bf.name for bf in module_1.biological_features])\n", - "module_1_df[module_1_df.columns[:7]].head()" - ], - "execution_count": 18, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
GSM605594.ch1-vs-GSM605578.ch1GSM881592.ch1-vs-GSM881586.ch1GSM881593.ch1-vs-GSM881586.ch1GSM881594.ch1-vs-GSM881586.ch1GSM1020450.ch1-vs-GSM1020442.ch1GSM1020468.ch1-vs-GSM1020460.ch1GSM1020501.ch1-vs-GSM1020496.ch1
VIT_00s0125g00280NaN0.003891-0.83787-0.99756-4.27210-4.11030-2.21780
VIT_00s0187g00140NaN0.6942200.607310.82324-0.535461.41150-0.70432
VIT_00s0207g00210-3.319-1.072100-1.75660-1.54030-3.55070-3.70420-1.65390
VIT_00s0246g00010NaN0.9647500.510481.59660-0.86214-0.79772-1.15580
VIT_00s0246g00080NaN3.4092002.163404.07670-2.34510-2.00910-2.39680
\n", - "
" - ], - "text/plain": [ - " GSM605594.ch1-vs-GSM605578.ch1 ... GSM1020501.ch1-vs-GSM1020496.ch1\n", - "VIT_00s0125g00280 NaN ... -2.21780\n", - "VIT_00s0187g00140 NaN ... -0.70432\n", - "VIT_00s0207g00210 -3.319 ... -1.65390\n", - "VIT_00s0246g00010 NaN ... -1.15580\n", - "VIT_00s0246g00080 NaN ... -2.39680\n", - "\n", - "[5 rows x 7 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 18 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "h9zDTL3MQYZh", - "colab_type": "text" - }, - "source": [ - "Now let's sort it" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "2AWITV0EPaLd", - "colab_type": "code", - "outputId": "a6cea33c-9d61-4744-95cd-b3badfa064a5", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 221 - } - }, - "source": [ - "sorted_module_1_df = module_1_df.loc[[bf.name for bf in sorted_bf]][[ss.name for ss in sorted_ss]]\n", - "sorted_module_1_df[sorted_module_1_df.columns[:7]].head()" - ], - "execution_count": 19, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
GSM605594.ch1-vs-GSM605578.ch1GSM2627741.ch1-vs-GSM2627691.ch1GSM2627738.ch1-vs-GSM2627691.ch1GSM2627736.ch1-vs-GSM2627691.ch1GSM2627717.ch1-vs-GSM2627691.ch1GSM2627843.ch1-vs-GSM2627823.ch1GSM2572390.ch1-vs-GSM2572412.ch1
VIT_00s0187g00140NaN-0.61135-0.70493-0.88252-0.44565-0.028106-2.6088
VIT_00s0332g00110NaN-1.62490-2.21220-2.06070-2.70300-1.822800-3.4858
VIT_00s0246g00010NaN-0.69742-0.84062-0.61406-1.12830-0.651370-2.1068
VIT_00s0873g000200.14833-1.57320-1.78040-1.59330-2.67070-2.403300-3.1754
VIT_00s0733g00010NaN-1.08320-1.53010-1.19040-2.09160-1.580100-2.1800
\n", - "
" - ], - "text/plain": [ - " GSM605594.ch1-vs-GSM605578.ch1 ... GSM2572390.ch1-vs-GSM2572412.ch1\n", - "VIT_00s0187g00140 NaN ... -2.6088\n", - "VIT_00s0332g00110 NaN ... -3.4858\n", - "VIT_00s0246g00010 NaN ... -2.1068\n", - "VIT_00s0873g00020 0.14833 ... -3.1754\n", - "VIT_00s0733g00010 NaN ... -2.1800\n", - "\n", - "[5 rows x 7 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 19 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pSoguiCZX20", - "colab_type": "text" - }, - "source": [ - "### Plot correlation network\n", - "Given a module we can plot the correlation network between Biological Features (genes) by looking at the Pearson correlation coefficient of the module's Sample Sets (contrasts). The default threshold is 0.7, but it can be changed.\n", - "Edges in orange represent anti-correlation." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "MDkEMczxrLFZ", - "colab_type": "code", - "outputId": "8bda1697-f1c5-4212-a908-9d3312cc9624", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 467 - } - }, - "source": [ - "module_1_network = Plot(module_1).plot_network(output_format='html', threshold=0.6)\n", - "display(HTML(module_1_network))" - ], - "execution_count": 20, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sG1eDW_waqAr", - "colab_type": "text" - }, - "source": [ - "### Automatically expand a module\n", - "One of the classic ways to use VESPUCCI is to start with a set of genes selected from previous analysis, have a look at the top conditions in which they show a consistent (and strong) behaviour and then look for other genes that have the same behavior across those same conditions.\n", - "\n", - "The way to do it is to plot the distribution of genes that show a behaviour similar to that of the module's genes across the same conditions, in order to pick a suitable threshold." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "FbxoFz6buI_B", - "colab_type": "code", - "outputId": "96c0ed13-12f0-4d63-d5cf-b2061710298b", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 467 - } - }, - "source": [ - "module_1_bf_distribution, ranked_bf = Plot(module_1).plot_distribution(output_format='html', plot_type='biological_features_uncentered_correlation_distribution', get_rank=True)\n", - "display(HTML(module_1_bf_distribution))" - ], - "execution_count": 21, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YJvnQa_d0BhH", - "colab_type": "text" - }, - "source": [ - "For example, if we pick a threshold around 0.65 (looking at the top plot) we might expect to have around 30 Biological Features (genes) with a score equal to or greater than that, to be added to our module. In this case let's just pick the top 30 genes and create a new module. " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "THSf5LrYPlCV", - "colab_type": "code", - "colab": {} - }, - "source": [ - "top_30_bf_ids = ranked_bf['id'][:30]\n", - "top_30_bf = BiologicalFeature.using(compendium).get(filter={'id_In': top_30_bf_ids})" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "106IfAXh2W4D", - "colab_type": "text" - }, - "source": [ - "The new module is created by adding Biological Feature objects to the old module. This operation will modify the module object in-place, so we might want to clone the **module_1** object first, if we don't want to lose it, or we might want to save it in a local file and then create a new module by simply loading it."
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "vca3sjR32C13", - "colab_type": "code", - "colab": {} - }, - "source": [ - "module_1.write_to_file('module_1')\n", - "module_2 = Module.read_from_file('module_1')\n", - "module_2.add_biological_features(top_30_bf)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GgjV6cCw1zE9", - "colab_type": "text" - }, - "source": [ - "We can have a look at the heatmap of the new module" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZAd-oxtLHG6t", - "colab_type": "code", - "outputId": "d59531d6-be36-4d00-eaa7-934b3620cfc7", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 467 - } - }, - "source": [ - "module_2_heatmap, sorted_bf, sorted_ss = Plot(module_2).plot_heatmap(output_format='html')\n", - "display(HTML(module_2_heatmap))" - ], - "execution_count": 24, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZUDifhPvDcb_", - "colab_type": "text" - }, - "source": [ - "### Annotations\n", - "\n", - "In VESPUCCI, both Biological Feature (gene) and Sample annotations are represented using [RDF](https://en.wikipedia.org/wiki/Resource_Description_Framework). We can visualize annotation as a set of RDF triples." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "MI4INR-519jo", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 799 - }, - "outputId": "681038dd-cda5-406a-daed-9584b8caff7a" - }, - "source": [ - "# Annotation of sample of the first Sample Sets of the heatmap\n", - "for sample in sorted_ss[0]:\n", - " for annotation in Annotation(sample).get_triples():\n", - " print(sample.sampleName, ' '.join(annotation))" - ], - "execution_count": 28, - "outputs": [ - { - "output_type": "stream", - "text": [ - "GSM605578.ch1 GSM605578.ch1 Year of Onset N5591c638667a4f02b75d517297598c9a\n", - "GSM605578.ch1 N5591c638667a4f02b75d517297598c9a value 2008\n", - "GSM605578.ch1 UO_0000036 name year\n", - "GSM605578.ch1 N5591c638667a4f02b75d517297598c9a unit UO_0000036\n", - "GSM605578.ch1 GSM605578.ch1 whole plant development stage 33\n", - "GSM605578.ch1 NCBITaxon_3605 name Vitis aestivalis\n", - "GSM605578.ch1 GSM605578.ch1 Genotype NCBITaxon_3605\n", - "GSM605578.ch1 GSM605578.ch1 cultivar Norton\n", - "GSM605578.ch1 PO_0009085 name exocarp\n", - "GSM605578.ch1 GSM605578.ch1 type PO_0009085\n", - "GSM605578.ch1 GSM605578.ch1 is grown in N8488c41d24f14bb78ff7427ff580d91a\n", - "GSM605578.ch1 N8488c41d24f14bb78ff7427ff580d91a http://purl.obolibrary.org/obo/GAZ_00000448 N58707b8a270245769beb36e5ab1f16f8\n", - "GSM605578.ch1 ENVO_00000116 name vineyard\n", - "GSM605578.ch1 N8488c41d24f14bb78ff7427ff580d91a type ENVO_00000116\n", - "GSM605578.ch1 N58707b8a270245769beb36e5ab1f16f8 Latitude 
N4199cfef29ad4de281238f5653fdc8ec\n", - "GSM605578.ch1 N58707b8a270245769beb36e5ab1f16f8 Description Festus Missouri US\n", - "GSM605578.ch1 N58707b8a270245769beb36e5ab1f16f8 Longitude Nd87c74318b724609aec57030de4f2eee\n", - "GSM605578.ch1 UO_0000185 name degree\n", - "GSM605578.ch1 Nd87c74318b724609aec57030de4f2eee unit UO_0000185\n", - "GSM605578.ch1 Nd87c74318b724609aec57030de4f2eee value -90.396814\n", - "GSM605578.ch1 N4199cfef29ad4de281238f5653fdc8ec value 38.222284\n", - "GSM605578.ch1 UO_0000185 name degree\n", - "GSM605578.ch1 N4199cfef29ad4de281238f5653fdc8ec unit UO_0000185\n", - "GSM605594.ch1 GSM605594.ch1 Year of Onset Nb72063e7ed2e4035b27d48983b3afe88\n", - "GSM605594.ch1 Nb72063e7ed2e4035b27d48983b3afe88 value 2008\n", - "GSM605594.ch1 UO_0000036 name year\n", - "GSM605594.ch1 Nb72063e7ed2e4035b27d48983b3afe88 unit UO_0000036\n", - "GSM605594.ch1 NCBITaxon_3605 name Vitis aestivalis\n", - "GSM605594.ch1 GSM605594.ch1 Genotype NCBITaxon_3605\n", - "GSM605594.ch1 GSM605594.ch1 cultivar Norton\n", - "GSM605594.ch1 PO_0009085 name exocarp\n", - "GSM605594.ch1 GSM605594.ch1 type PO_0009085\n", - "GSM605594.ch1 GSM605594.ch1 is grown in Nd6a3100f100047f08c32062d06493323\n", - "GSM605594.ch1 Nd6a3100f100047f08c32062d06493323 http://purl.obolibrary.org/obo/GAZ_00000448 Nf22c5b2ec54847ecafdd5c4253565733\n", - "GSM605594.ch1 ENVO_00000116 name vineyard\n", - "GSM605594.ch1 Nd6a3100f100047f08c32062d06493323 type ENVO_00000116\n", - "GSM605594.ch1 Nf22c5b2ec54847ecafdd5c4253565733 Longitude N0bc8aaab043a47868c384ac19bdd59a6\n", - "GSM605594.ch1 Nf22c5b2ec54847ecafdd5c4253565733 Description Festus Missouri US\n", - "GSM605594.ch1 Nf22c5b2ec54847ecafdd5c4253565733 Latitude Ne99e7fabfebc45c3ba84d7efc8ffb81e\n", - "GSM605594.ch1 UO_0000185 name degree\n", - "GSM605594.ch1 Ne99e7fabfebc45c3ba84d7efc8ffb81e unit UO_0000185\n", - "GSM605594.ch1 Ne99e7fabfebc45c3ba84d7efc8ffb81e value 38.222284\n", - "GSM605594.ch1 UO_0000185 name degree\n", - "GSM605594.ch1 
N0bc8aaab043a47868c384ac19bdd59a6 unit UO_0000185\n", - "GSM605594.ch1 N0bc8aaab043a47868c384ac19bdd59a6 value -90.396814\n", - "GSM605594.ch1 GSM605594.ch1 whole plant development stage 38\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ioiWUfzwGHRo", - "colab_type": "text" - }, - "source": [ - "Another way to visualize the annotation is as an RDF graph." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "_uextG7DE_ti", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "outputId": "7a11d641-f7e7-4fca-a73c-b63748108f0b" - }, - "source": [ - "# Annotation of sample of the first Sample Sets of the heatmap\n", - "for sample in sorted_ss[0]:\n", - " rdf_graph = Annotation(sample).plot_network()\n", - " display(HTML(rdf_graph))" - ], - "execution_count": 37, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " \n", - "
\n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " \n", - "
\n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBLYpZocTGPr", - "colab_type": "text" - }, - "source": [ - "Annotations can also be used to retrieve Sample or Biological Features (gene) starting from a SPARQL query. For example, we might want to select all seed Samples." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "UxSVxRoiTWVz", - "colab_type": "code", - "colab": {} - }, - "source": [ - "sparql = \"SELECT ?s ?p ?o \" + \\\n", - " \"WHERE { ?s . \" + \\\n", - " \"?s }\"" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "whHK9n6WTgi0", - "colab_type": "text" - }, - "source": [ - "The term http://purl.obolibrary.org/obo/NCIT_C19157 indicates ***specimen*** while http://purl.obolibrary.org/obo/PO_0009010 is the Plant Ontology term for ***seed***. To indicate a gene, the term http://purl.obolibrary.org/obo/NCIT_C16612 is used instead." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "CUrt5zxDTbu5", - "colab_type": "code", - "colab": {} - }, - "source": [ - "seed_samples = Sample.using(compendium).by(sparql=sparql)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "90uPkI_9VkRH", - "colab_type": "text" - }, - "source": [ - "Now we can collect all the Sample Sets in which these samples are used and then create a new module based on Sample Sets only, leaving VESPUCCI to figure out which genes (Biological Feature objects) show a consistent up or down-regulated profile." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PqtZm-nMUhpG", - "colab_type": "code", - "colab": {} - }, - "source": [ - "seed_ss = SampleSet.using(compendium).by(samples=seed_samples)\n", - "module_3 = Module.using(compendium).create(samplesets=seed_ss)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "W5EK2vNRX3m1", - "colab_type": "text" - }, - "source": [ - "And now we can plot the heatmap" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "a4AFVF1HWA1x", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 467 - }, - "outputId": "8a22b626-1d05-471e-a72e-f39818a5a957" - }, - "source": [ - "module_3_heatmap, sorted_bf, sorted_ss = Plot(module_3).plot_heatmap(output_format='html')\n", - "display(HTML(module_3_heatmap))" - ], - "execution_count": 51, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - } - ] - } - ] -} \ No newline at end of file