## GEO

In [36]:
# Debugging aid: display the path of this kernel's connection file.
# NOTE(review): this cell carries execution count 36 although it sits before
# cell 1, i.e. it was run out of order; nothing later in the notebook uses it.
from IPython.lib import kernel
kernel.get_connection_file()


'/run/user/1000/jupyter/kernel-e484d883-b710-40f9-8fbf-eaa0f3d93366.json'

In [1]:
import pandas as pd
from elasticsearch import Elasticsearch
from pymongo import MongoClient

# Both clients are created without arguments, i.e. with the libraries'
# default connection settings.
es = Elasticsearch()
# MongoDB database that holds the scraped platform documents.
db = MongoClient()["scraper_test_dev"]

### Серии, у которых не более одной платформы и не более одного типа эксперимента

In [2]:
# Filter clauses, all of which must hold for a series to be counted:
# exactly one experiment type, exactly one platform, GEO as the data source,
# Homo sapiens as the organism, and an expression-profiling type.
single_platform_single_type_filters = [
    {"script": {"script": "doc['type'].size() == 1"}},
    {"script": {"script": "doc['platforms'].size() == 1"}},
    {"term": {"data_source": "geo"}},
    {"term": {"organism": "Homo sapiens"}},
    {
        "terms": {
            "type": [
                "Expression profiling by array",
                "Expression profiling by high throughput sequencing",
            ]
        }
    },
]

query = {
    "query": {
        "filtered": {
            "filter": {"bool": {"must": single_platform_single_type_filters}}
        }
    },
    # No hits are requested; only the per-platform aggregation is of interest.
    "size": 0,
    # One bucket per platform, capped at 7000 buckets.
    "aggs": {"platforms": {"terms": {"field": "platforms", "size": 7000}}},
    "fields": ["accession", "platforms", "type", "organism"],
}




In [3]:
# Run the query against the 'series_dev' index; `res` holds the raw response.
res = es.search(index='series_dev', body=query)

In [4]:
# Turn the "platforms" aggregation buckets into a table, one row per bucket.
# The resulting columns are `doc_count` and `key` (the platform accession).
platforms_homo_expr_uncleaned = pd.DataFrame(
    res['aggregations']['platforms']['buckets']
)

In [35]:
# (rows, columns) of the bucket table; the row count is the number of platform buckets.
platforms_homo_expr_uncleaned.shape

(1119, 2)

In [5]:
# Preview the first rows of the per-platform series counts.
platforms_homo_expr_uncleaned.head()

Unnamed: 0,doc_count,key
0,3375,GPL570
1,966,GPL6244
2,827,GPL10558
3,733,GPL96
4,518,GPL6480


In [6]:
# Load accession and organism of every GEO platform document from MongoDB.
# NOTE(review): despite the variable name, the filter is only on data_source,
# so platforms of every organism are returned — confirm this is intended.
platform_cursor = db.platforms.find(
    {'data_source': "geo"},
    {'_id': 0, 'accession': 1, 'organism': 1},
)
platforms_homo = pd.DataFrame(list(platform_cursor))

In [7]:
# Preview the platform records loaded from MongoDB.
platforms_homo.head()

Unnamed: 0,accession,organism
0,GPL15373,Homo sapiens
1,GPL15374,Thunnus thynnus
2,GPL15377,Alternaria brassicicola
3,GPL15378,Cottus
4,GPL15379,Canis lupus familiaris


In [8]:
# Persist only the platform accessions, one per line.
# header=False is passed explicitly: the default of Series.to_csv's `header`
# argument differs between pandas versions, and the file is meant to contain
# accessions only (no "key" header line).
platforms_homo_expr_uncleaned.key.to_csv(
    '../data/platforms_homo_expr_uncleaned.csv', index=False, header=False
)

In [54]:
# Sanity check: print the first lines of the exported CSV via the shell.
! head ../data/platforms_homo_expr_uncleaned.csv

GPL570
GPL6244
GPL10558
GPL96
GPL6480
GPL4133
GPL571
GPL11154
GPL6947
GPL8300


In [9]:
# Right-join the MongoDB platform records onto the Elasticsearch buckets so
# that every bucket is kept even when no platform document matches it.
_t = (
    platforms_homo
    .merge(platforms_homo_expr_uncleaned, left_on='accession', right_on='key', how='right')
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    .sort_values('doc_count', ascending=False)
)
# Buckets whose platform has no record in MongoDB (accession is missing after the join).
_t[_t.accession.isnull()]

Unnamed: 0,accession,organism,doc_count,key
1095,,,5,GPL15034
1096,,,4,GPL17425
1097,,,4,GPL18734
1098,,,3,GPL15048
1100,,,3,GPL16230
1099,,,3,GPL15069
1101,,,2,GPL16221
1102,,,2,GPL17328
1103,,,1,GPL14963
1104,,,1,GPL15017


### Серии, у которых больше одной платформы

In [10]:
# Filter clauses, all of which must hold: two or more platforms, GEO as the
# data source, and an expression-profiling experiment type.
multi_platform_filters = [
    {"script": {"script": "doc['platforms'].size() >=2"}},
    {"term": {"data_source": "geo"}},
    {
        "terms": {
            "type": [
                "Expression profiling by array",
                "Expression profiling by high throughput sequencing",
            ]
        }
    },
]

query = {
    "query": {
        "filtered": {"filter": {"bool": {"must": multi_platform_filters}}}
    },
    # Upper bound on the number of returned hits.
    "size": 5000,
    "fields": ["accession", "platforms", "type", "organism"],
}

# Fetch the matching series from 'series_dev'; the query's "size" caps the
# response at 5000 hits.
res = es.search(index='series_dev', body=query)

In [59]:
# Inspect the first two raw hits to see the structure of the returned fields.
res['hits']['hits'][:2]

[{'_id': 'AVC0hG5iW9INVF-Z1EJ7',
  '_index': 'series_dev',
  '_score': 1.0,
  '_type': 'series_dev',
  'fields': {'accession': ['GSE6969'],
   'organism': ['Homo sapiens'],
   'platforms': ['GPL570', 'GPL2507', 'GPL2700'],
   'type': ['Expression profiling by array']}},
 {'_id': 'AVC0hG5iW9INVF-Z1EMz',
  '_index': 'series_dev',
  '_score': 1.0,
  '_type': 'series_dev',
  'fields': {'accession': ['GSE7171'],
   'organism': ['Rattus norvegicus'],
   'platforms': ['GPL4733', 'GPL4917'],
   'type': ['Expression profiling by array']}}]

In [11]:
# Accession of every returned multi-platform series; each returned field is a
# list, so the first element is taken.
series_with_many_platforms = [
    hit['fields']['accession'][0]
    for hit in res['hits']['hits']
]

In [12]:
# Preview the first ten multi-platform series accessions.
series_with_many_platforms[:10]

['GSE6969',
 'GSE7171',
 'GSE7238',
 'GSE7260',
 'GSE7295',
 'GSE7298',
 'GSE7300',
 'GSE7374',
 'GSE7473',
 'GSE7497']

In [13]:
# Every series whose organism is Homo sapiens; only accession and platforms
# are requested for each hit.
homo_sapiens_only = {"term": {"organism": "Homo sapiens"}}

query = {
    "query": {
        "filtered": {
            "query": {"match_all": {}},
            "filter": homo_sapiens_only,
        }
    },
    "fields": ["accession", "platforms"],
    # Upper bound on the number of returned hits.
    "size": 40000,
}

# Fetch the Homo sapiens series from 'series_dev'; the query's "size" caps the
# response at 40000 hits.
res = es.search(index='series_dev', body=query)

In [14]:
# Number of hits actually returned (bounded by the query's "size").
len(res['hits']['hits'])

23274

In [15]:
from collections import defaultdict

# Map each platform accession to the set of series accessions that use it.
platform_series = defaultdict(set)

for hit in res['hits']['hits']:
    hit_fields = hit['fields']
    # Hits without a 'platforms' field contribute nothing.
    for platform in hit_fields.get('platforms', []):
        platform_series[platform].add(hit_fields['accession'][0])

In [16]:
# Number of distinct platforms referenced by the returned series.
len(platform_series)

3875

In [18]:
# Platforms for which the experiment type cannot be decided, because every
# series that uses them has more than one platform.

# Set copy of the accession list: membership tests against the list made the
# original check quadratic in the number of series.
multi_platform_series = set(series_with_many_platforms)

platforms_with_series_with_many_platforms = [
    (platform, series_list)
    for platform, series_list in platform_series.items()
    # Each value of platform_series is a set, so "all members are
    # multi-platform series" is a plain subset test.
    if series_list <= multi_platform_series
]

In [19]:
# How many platforms occur only in multi-platform series.
len(platforms_with_series_with_many_platforms)

391

In [20]:
# Preview the first ten (platform, set of series) pairs.
platforms_with_series_with_many_platforms[:10]

[('GPL14356', {'GSE31740'}),
 ('GPL8040', {'GSE14067', 'GSE14356'}),
 ('GPL1825', {'GSE2193'}),
 ('GPL351', {'GSE1030', 'GSE522', 'GSE524', 'GSE525', 'GSE526'}),
 ('GPL13219', {'GSE27479'}),
 ('GPL2706', {'GSE3034'}),
 ('GPL9871', {'GSE19821'}),
 ('GPL6765', {'GSE11235'}),
 ('GPL10661', {'GSE22834'}),
 ('GPL14593', {'GSE32245'})]

In [26]:
# Mean number of series per platform among the platforms selected above.
(
    sum(len(series_set) for _platform, series_set in platforms_with_series_with_many_platforms)
    / len(platforms_with_series_with_many_platforms)
)

1.7672634271099745

In [28]:
# Largest number of series attached to a single such platform.
max(len(series_set) for _platform, series_set in platforms_with_series_with_many_platforms)

14

In [29]:
# Smallest number of series attached to a single such platform.
min(len(series_set) for _platform, series_set in platforms_with_series_with_many_platforms)

1

In [21]:
# These platforms do not overlap with the platforms that have single-platform
# series: the selection below comes back empty.
_t = [platform for platform, _series in platforms_with_series_with_many_platforms]
platforms_homo_expr_uncleaned[platforms_homo_expr_uncleaned.key.isin(_t)]

Unnamed: 0,doc_count,key


## Expression platforms

In [22]:
# Preview of the per-platform table whose `key` column is used as the
# expression-platform list in the sample count that follows.
platforms_homo_expr_uncleaned.head()

Unnamed: 0,doc_count,key
0,3375,GPL570
1,966,GPL6244
2,827,GPL10558
3,733,GPL96
4,518,GPL6480


## Samples with gene expression

### Samples (Homo sapiens)

In [25]:
# Count the samples whose platform is one of the expression platforms found
# in the series aggregation.
# NOTE(review): the filter is on platform only; no organism or data_source
# clause is applied here — confirm that is intended for this section.
expression_platforms = platforms_homo_expr_uncleaned['key'].tolist()

query = {
    "query": {
        "filtered": {
            "filter": {"terms": {"platform": expression_platforms}}
        }
    }
}

res = es.count(index='samples_dev', body=query)
res

{'_shards': {'failed': 0, 'successful': 5, 'total': 5}, 'count': 593610}

## Samples all (GEO, Homo sapiens)

In [34]:
# Both clauses must hold: organism is Homo sapiens and the data source is GEO.
geo_homo_sapiens_filters = [
    {"term": {"organism": "Homo sapiens"}},
    {"term": {"data_source": "geo"}},
]

query = {
    "query": {
        "filtered": {"filter": {"bool": {"must": geo_homo_sapiens_filters}}}
    }
}
# Count the documents in 'samples_dev' that match the query and display the
# raw count response.
res = es.count(index='samples_dev', body=query)
res

{'_shards': {'failed': 0, 'successful': 5, 'total': 5}, 'count': 940114}