In [1]:
import agate

In [3]:
table = agate.Table.from_csv('test_gene2pubmed', delimiter='\t')

In [4]:
print(table)

| column    | data_type |
| --------- | --------- |
| #tax_id   | Number    |
| GeneID    | Number    |
| PubMed_ID | Number    |



The data is numeric, but we want it read as categorical, because these are ID's.

In [5]:
tester = agate.TypeTester(force={
    '#tax_id': agate.Text(),
    'GeneID': agate.Text(),
    'PubMed_ID': agate.Text()
})

table = agate.Table.from_csv('test_gene2pubmed', delimiter='\t', column_types=tester)

In [6]:
print(table)

| column    | data_type |
| --------- | --------- |
| #tax_id   | Text      |
| GeneID    | Text      |
| PubMed_ID | Text      |



In [14]:
#table_bins = table.bins('GeneID')
#table_bins.print_bars('GeneID','Count')

GeneID                     Count
[0 - 4,000,000)           14,538 ▓░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░                                            
[4,000,000 - 8,000,000)   16,317 ▓░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░                                       
[8,000,000 - 12,000,000)   5,219 ▓░░░░░░░░░░░░░░░                                                                       
[12,000,000 - 16,000,000)  9,097 ▓░░░░░░░░░░░░░░░░░░░░░░░░░░                                                            
[16,000,000 - 20,000,000)  5,517 ▓░░░░░░░░░░░░░░░░                                                                      
[20,000,000 - 24,000,000)  5,834 ▓░░░░░░░░░░░░░░░░░                                                                     
[24,000,000 - 28,000,000)     69 ▓                                                                                      
[28,000,000 - 32,000,000) 26,993 ▓░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░         

**Q** What Gene is associated by most publications ?

In [34]:
# trying agate way
#geneId_dist = (table.pivot('PubMed_ID', 'GeneID' ))
#geneId_dist = geneId_dist.order_by('Count', reverse=True)
geneId_dist.print_table(max_rows=5) # not quite what I wanted

| PubMed_ID | 1246500 | 1246501 | 1246502 | 1246503 | 1246504 | ... |
| --------- | ------- | ------- | ------- | ------- | ------- | --- |
| 9873079   |       1 |       1 |       1 |       1 |       1 | ... |
| 9812361   |       0 |       0 |       1 |       0 |       0 | ... |
| 10984505  |       0 |       0 |       0 |       0 |       0 | ... |
| 7608990   |       0 |       0 |       0 |       0 |       0 | ... |
| 16413149  |       0 |       0 |       0 |       0 |       0 | ... |
| ...       |     ... |     ... |     ... |     ... |     ... | ... |


In [5]:
# trying using pandas
import pandas as pd
gene2pubmed = pd.read_table("test_gene2pubmed", sep = "\t")
#gene2pubmed = pd.read_table("data/genbank-data/gene2pubmed", sep = "\t")
#gene2pubmed.groupby(['GeneID'])['PubMed_ID'].count().head(n=10) # can we sort this to get a better idea ?

In [11]:
gene_gp_pubmedCount = gene2pubmed.groupby('GeneID', as_index=False)['PubMed_ID'].count() # because pandas preserves the row index and we need it reset to be sorted
gene_gp_pubmedCount.head(n=10)

Unnamed: 0,GeneID,PubMed_ID
0,874673,1
1,874677,3
2,874678,3
3,874679,3
4,874680,3
5,874681,3
6,874682,3
7,874683,3
8,874684,3
9,874685,3


**NOTE** another way to reset index is using `reset_index()` like so

`gene2pubmed.groupby('GeneID')['PubMed_ID'].count().reset_index()`

In [12]:
gene_gp_pubmedCount.sort_values('PubMed_ID', ascending=False).head(n=10)

Unnamed: 0,GeneID,PubMed_ID
2729,2716540,50
6119,3244915,42
6637,3722457,42
1998,1446560,36
6379,3399421,34
7247,4246763,33
8825,6276088,32
7659,4924737,32
7602,4924680,32
1997,1446559,31


Lets change the column header so that the misleading `PubMedID` name is now `PubMed_pubs` to show that the column shows the number of PubMed publications associated with the gene

In [6]:
gene_gp_pubmedCount = gene2pubmed.groupby('GeneID', as_index=False)['PubMed_ID'].agg({'PubMed_pubs': "count"}) # because pandas preserves the row index and we need it reset to be sorted
gene_gp_pubmedCount.sort_values('PubMed_pubs', ascending=False).head(n=10)

Unnamed: 0,GeneID,PubMed_pubs
2729,2716540,50
6119,3244915,42
6637,3722457,42
1998,1446560,36
6379,3399421,34
7247,4246763,33
8825,6276088,32
7659,4924737,32
7602,4924680,32
1997,1446559,31


In [18]:
#tx_gp_genepubmedCount = gene2pubmed.groupby('#tax_id', as_index=False)['GeneID','PubMed_ID'].agg({'GeneID_count': "count",'PubMed_pubs': "count"}) # because pandas preserves the row index and we need it reset to be sorted
tx_gp_genepubmedCount = gene2pubmed.groupby('#tax_id', as_index=False)['GeneID','PubMed_ID'].count()
tx_gp_genepubmedCount.sort_values('GeneID', ascending=False).head()
#tx_gp_genepubmedCount.sort_values('PubMed_pubs', ascending=False).head(n=10)

Unnamed: 0,#tax_id,GeneID,PubMed_ID
47,562,17035,17035
27,394,9164,9164
50,573,8502,8502
90,813,8150,8150
31,470,7023,7023


In [24]:
#grouping
#by_gene = table.group_by('GeneID')
#gene_totals = by_gene.aggregate([
#    ('count', agate.Count())
#])

#sorted_gene_totals = gene_totals.order_by('count', reverse=True)

#sorted_gene_totals.print_table(max_rows=5)



| GeneID  | count |
| ------- | ----- |
| 2716540 |    50 |
| 3244915 |    42 |
| 3722457 |    42 |
| 1446560 |    36 |
| 3399421 |    34 |
| ...     |   ... |


# Testing area

## Testing ETE package

In [3]:
from ete3 import NCBITaxa
ncbi = NCBITaxa()

NCBI database not present yet (first time used?)
Downloading taxdump.tar.gz from NCBI FTP site (via HTTP)...
Done. Parsing...


Loading node names...
1700922 names loaded.
197991 synonyms loaded.
Loading nodes...
1700922 nodes loaded.
Linking nodes...
Tree is loaded.




Updating database: /Users/nsarode/.etetoolkit/taxa.sqlite ...
 1484000 generating entries... generating entries... 284000 generating entries...  generating entries...   generating entries...  generating entries... 

Inserting synonyms:          0 

 1700000 generating entries... 
Uploading to /Users/nsarode/.etetoolkit/taxa.sqlite



Inserting taxid merges:  40000 




Inserting taxids:       20000 









In [7]:
lineage = ncbi.get_lineage(9606)
type(lineage)

131567

## NCBI taxonomy sqlite3 database

In [13]:
import sqlite3
#conn = sqlite3.connect('/home/neha/Documents/whale/metagenome/HiSeq/whaleScripts/NameNode.sqlite')
conn = sqlite3.connect('NameNode.sqlite')
c = conn.cursor()
# list tables. Equivalent to dbListTables(conn) 
c.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(c.fetchall())
# cur.fetchone() # print the first retrieved result only

[('NcbiNameNode',)]


In [14]:
#print information about the table. Equivalent to dbListFields(conn,"NcbiNameNode") 
c.execute("PRAGMA table_info(NcbiNameNode)")
print(c.fetchall())
c.close()
conn.close()

[(0, 'tax_id', 'INTEGER', 0, None, 0), (1, 'name_txt', 'TEXT', 0, None, 0), (2, 'unique_name', 'TEXT', 0, None, 0), (3, 'name_class', 'TEXT', 0, None, 0), (4, 'parent_tax_id', 'INTEGER', 0, None, 0), (5, 'rank', 'TEXT', 0, None, 0)]


Another efficient way is using pandas. You can save results from queries directly into dataframe

In [24]:
import sqlite3
import pandas as pd
# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect("NameNode.sqlite")
#df = pd.read_sql_query("SELECT * from NcbiNameNode", con) 
df = pd.read_sql_query("SELECT tax_id, name_txt,parent_tax_id, rank FROM NcbiNameNode WHERE tax_id == 175128", con)
con.close()
df
#df.loc[0]["name_txt"] # 'Clostridium sp. JCC'

Unnamed: 0,tax_id,name_txt,parent_tax_id,rank


In [27]:
if df.size == 0:
    print("Check !!")

Check !!


In [144]:
Dummymaster = pd.DataFrame(columns=("sciname","species","genus","family","order","class","phylum","superkingdom","norank"))
classi = {'sciname': 'unclassified','species': 'unclassified','genus': 'unclassified','family': 'unclassified','order': 'unclassified','class': 'unclassified','phylum': 'unclassified','superkingdom': 'unclassified','norank': 'unclassified'}
level = df['rank'][0]
parent = df['parent_tax_id'][0]
sciname = df['name_txt'][0]
taxid = int(df['tax_id'][0])

In [100]:
type(taxid)

int

In [145]:
classi[level] = sciname
classi

{'class': 'unclassified',
 'family': 'unclassified',
 'genus': 'unclassified',
 'norank': 'unclassified',
 'order': 'unclassified',
 'phylum': 'unclassified',
 'sciname': 'unclassified',
 'species': 'Clostridium sp. JCC',
 'superkingdom': 'unclassified'}

In [146]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

while parent != 1:
    print(taxid)
    con = sqlite3.connect("NameNode.sqlite")
    df = pd.read_sql_query("SELECT tax_id, name_txt,parent_tax_id, rank FROM NcbiNameNode WHERE tax_id == ? AND name_class == 'scientific name'", con = con, params=(taxid,))
    con.close()
    df
    level = df['rank'][0]
    parent = df['parent_tax_id'][0]
    sciname = df['name_txt'][0]
    #tax_id = df['tax_id'][0]
    taxid = int(df['parent_tax_id'][0])
    classi[level] = sciname
    classi

1414720


Unnamed: 0,tax_id,name_txt,parent_tax_id,rank
0,1414720,Clostridium sp. JCC,1485,species


{'class': 'unclassified',
 'family': 'unclassified',
 'genus': 'unclassified',
 'norank': 'unclassified',
 'order': 'unclassified',
 'phylum': 'unclassified',
 'sciname': 'unclassified',
 'species': 'Clostridium sp. JCC',
 'superkingdom': 'unclassified'}

1485


Unnamed: 0,tax_id,name_txt,parent_tax_id,rank
0,1485,Clostridium,31979,genus


{'class': 'unclassified',
 'family': 'unclassified',
 'genus': 'Clostridium',
 'norank': 'unclassified',
 'order': 'unclassified',
 'phylum': 'unclassified',
 'sciname': 'unclassified',
 'species': 'Clostridium sp. JCC',
 'superkingdom': 'unclassified'}

31979


Unnamed: 0,tax_id,name_txt,parent_tax_id,rank
0,31979,Clostridiaceae,186802,family


{'class': 'unclassified',
 'family': 'Clostridiaceae',
 'genus': 'Clostridium',
 'norank': 'unclassified',
 'order': 'unclassified',
 'phylum': 'unclassified',
 'sciname': 'unclassified',
 'species': 'Clostridium sp. JCC',
 'superkingdom': 'unclassified'}

186802


Unnamed: 0,tax_id,name_txt,parent_tax_id,rank
0,186802,Clostridiales,186801,order


{'class': 'unclassified',
 'family': 'Clostridiaceae',
 'genus': 'Clostridium',
 'norank': 'unclassified',
 'order': 'Clostridiales',
 'phylum': 'unclassified',
 'sciname': 'unclassified',
 'species': 'Clostridium sp. JCC',
 'superkingdom': 'unclassified'}

186801


Unnamed: 0,tax_id,name_txt,parent_tax_id,rank
0,186801,Clostridia,1239,class


{'class': 'Clostridia',
 'family': 'Clostridiaceae',
 'genus': 'Clostridium',
 'norank': 'unclassified',
 'order': 'Clostridiales',
 'phylum': 'unclassified',
 'sciname': 'unclassified',
 'species': 'Clostridium sp. JCC',
 'superkingdom': 'unclassified'}

1239


Unnamed: 0,tax_id,name_txt,parent_tax_id,rank
0,1239,Firmicutes,1783272,phylum


{'class': 'Clostridia',
 'family': 'Clostridiaceae',
 'genus': 'Clostridium',
 'norank': 'unclassified',
 'order': 'Clostridiales',
 'phylum': 'Firmicutes',
 'sciname': 'unclassified',
 'species': 'Clostridium sp. JCC',
 'superkingdom': 'unclassified'}

1783272


Unnamed: 0,tax_id,name_txt,parent_tax_id,rank
0,1783272,Terrabacteria group,2,norank


{'class': 'Clostridia',
 'family': 'Clostridiaceae',
 'genus': 'Clostridium',
 'norank': 'Terrabacteria group',
 'order': 'Clostridiales',
 'phylum': 'Firmicutes',
 'sciname': 'unclassified',
 'species': 'Clostridium sp. JCC',
 'superkingdom': 'unclassified'}

2


Unnamed: 0,tax_id,name_txt,parent_tax_id,rank
0,2,Bacteria,131567,superkingdom


{'class': 'Clostridia',
 'family': 'Clostridiaceae',
 'genus': 'Clostridium',
 'norank': 'Terrabacteria group',
 'order': 'Clostridiales',
 'phylum': 'Firmicutes',
 'sciname': 'unclassified',
 'species': 'Clostridium sp. JCC',
 'superkingdom': 'Bacteria'}

131567


Unnamed: 0,tax_id,name_txt,parent_tax_id,rank
0,131567,cellular organisms,1,norank


{'class': 'Clostridia',
 'family': 'Clostridiaceae',
 'genus': 'Clostridium',
 'norank': 'cellular organisms',
 'order': 'Clostridiales',
 'phylum': 'Firmicutes',
 'sciname': 'unclassified',
 'species': 'Clostridium sp. JCC',
 'superkingdom': 'Bacteria'}

In [23]:
# using dictionary
import time
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

start = time.time()
import sqlite3
import pandas as pd
# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect("NameNode.sqlite")
# 1414720 # 175139 # 175429 # 175439 # 175128
df = pd.read_sql_query("SELECT tax_id, name_txt,parent_tax_id, rank FROM NcbiNameNode WHERE tax_id == 175128 AND name_class == 'scientific name'", con)
con.close()
#df

Dummymaster = pd.DataFrame(columns=("species","genus","family","order","class","phylum","superkingdom","norank"))
classi = {'species': 'unclassified','genus': 'unclassified','family': 'unclassified','order': 'unclassified','class': 'unclassified','phylum': 'unclassified','superkingdom': 'unclassified','norank': 'unclassified'}
level = df['rank'][0]
parent = df['parent_tax_id'][0]
sciname = df['name_txt'][0]
taxid = int(df['tax_id'][0])

while parent != 1:
    print(taxid)
    con = sqlite3.connect("NameNode.sqlite")
    df = pd.read_sql_query("SELECT tax_id, name_txt,parent_tax_id, rank FROM NcbiNameNode WHERE tax_id == ? AND name_class == 'scientific name'", con = con, params=(taxid,))
    con.close()
    #df
    level = df['rank'][0]
    parent = df['parent_tax_id'][0]
    sciname = df['name_txt'][0]
    #tax_id = df['tax_id'][0]
    taxid = int(df['parent_tax_id'][0])
    classi[level] = sciname
    #classi
# df = pd.DataFrame.from_dict(sales)
temp = pd.DataFrame.from_dict([classi])
temp = temp[['species','genus','family','order','class','phylum','superkingdom','norank']]
temp
#Dummymaster = Dummymaster.append(temp)
Dummymaster = pd.concat([Dummymaster,temp],axis=0,join='outer')
Dummymaster
stop = time.time()
duration = stop - start
print(duration)

IndexError: index out of bounds

In [3]:
# using dataframe only. Takes longer (almost twice as long as the dictionary way based on time output)
import time
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

start = time.time()
import sqlite3
import pandas as pd
# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect("NameNode.sqlite")
df = pd.read_sql_query("SELECT tax_id, name_txt,parent_tax_id, rank FROM NcbiNameNode WHERE tax_id == 1414720", con)
con.close()
#df

Dummymaster = pd.DataFrame(columns=("species","genus","family","order","class","phylum","superkingdom","norank"))
classi = pd.DataFrame({'species': 'unclassified','genus': 'unclassified','family': 'unclassified','order': 'unclassified','class': 'unclassified','phylum': 'unclassified','superkingdom': 'unclassified','norank': 'unclassified'}, index=[0])
classi = classi[['species','genus','family','order','class','phylum','superkingdom','norank']]
level = df['rank'][0]
parent = df['parent_tax_id'][0]
sciname = df['name_txt'][0]
taxid = int(df['tax_id'][0])

while parent != 1:
    print(taxid)
    con = sqlite3.connect("NameNode.sqlite")
    df = pd.read_sql_query("SELECT tax_id, name_txt,parent_tax_id, rank FROM NcbiNameNode WHERE tax_id == ? AND name_class == 'scientific name'", con = con, params=(taxid,))
    con.close()
    #df
    level = df['rank'][0]
    parent = df['parent_tax_id'][0]
    sciname = df['name_txt'][0]
    #tax_id = df['tax_id'][0]
    taxid = int(df['parent_tax_id'][0])
    classi[level] = sciname
    #classi

Dummymaster = pd.concat([Dummymaster,classi],axis=0,join='outer')
Dummymaster

stop = time.time()
duration = stop - start
print(duration)

1414720
1485
31979
186802
186801
1239
1783272
2
131567


Unnamed: 0,species,genus,family,order,class,phylum,superkingdom,norank
0,Clostridium sp. JCC,Clostridium,Clostridiaceae,Clostridiales,Clostridia,Firmicutes,Bacteria,cellular organisms


0.09469199180603027


In [56]:
Dummymaster = pd.DataFrame(columns=("sciname","species","genus","family","order","class","phylum","superkingdom","norank"))

#if  df.shape[0] == 1:
def getlevel(Dummymaster,df,x):
    if(x == "species"):
        Dummymaster.loc[0]["species"] = df.loc[0]["name_txt"]
    return Dummymaster
x = df.loc[0]["rank"]
#x
#getlevel(Dummymaster,df,x)
#Dummymaster

Unnamed: 0,sciname,species,genus,family,order,class,phylum,superkingdom,norank


In [83]:
#import pandas as pd
#import sqlite3

# for single table
#conn = sqlite3.connect('/home/neha/Documents/whale/metagenome/HiSeq/whaleScripts/NameNode.sqlite')
#tab = pd.read_sql_query("SELECT * from NcbiNameNode", conn)
#conn.close()

Unnamed: 0,tax_id,name_txt,unique_name,name_class,parent_tax_id,rank
0,1,all,,synonym,1,norank
1,1,root,,scientific name,1,norank
2,2,Bacteria,Bacteria <prokaryote>,scientific name,131567,superkingdom
3,2,Monera,Monera <Bacteria>,in-part,131567,superkingdom
4,2,Procaryotae,Procaryotae <Bacteria>,in-part,131567,superkingdom


In [85]:
tab.head(n=10)

Unnamed: 0,tax_id,name_txt,unique_name,name_class,parent_tax_id,rank
0,1,all,,synonym,1,norank
1,1,root,,scientific name,1,norank
2,2,Bacteria,Bacteria <prokaryote>,scientific name,131567,superkingdom
3,2,Monera,Monera <Bacteria>,in-part,131567,superkingdom
4,2,Procaryotae,Procaryotae <Bacteria>,in-part,131567,superkingdom
5,2,Prokaryota,Prokaryota <Bacteria>,in-part,131567,superkingdom
6,2,Prokaryotae,Prokaryotae <Bacteria>,in-part,131567,superkingdom
7,2,bacteria,bacteria <blast2>,blast name,131567,superkingdom
8,2,eubacteria,,genbank common name,131567,superkingdom
9,2,not Bacteria Haeckel 1894,,synonym,131567,superkingdom


In [None]:
conn.close()