# Inserting the Ortholuge dump into the sbw25_pdc MongoDB database

In [1]:
%load_ext autoreload
%autoreload 2

In [27]:
import logging
import pandas
import numpy

from pymongo import MongoClient

## Connect to local mongodb server

In [3]:
uri = "localhost:27017"

# MongoClient accepts many arguments that configure how the driver connects to MongoDB. Here we use the most basic form: the "host:port" address of the local MongoDB server held in `uri`.

client = MongoClient(uri)

### Connect to database

In [4]:
sbw25_pdc = client.sbw25_pdc

In [6]:
logger = logging.getLogger()

In [7]:
logger.setLevel(logging.INFO)

## Load table of features in PDC.
Data kindly provided by Geoff Winsor @ SFU (Brinkman Lab). Dropbox link: https://www.dropbox.com/s/wvfr4p5tn8ob1ym/sbw25_orthologs.csv.gz?dl=1

In [7]:
# NOTE(review): absolute, machine-specific path -- adjust it to your local copy of the data.
# The file name differs from the one in the Dropbox link (sbw25_orthologs.csv.gz); presumably
# this is the same table, decompressed and renamed -- TODO confirm.
orthologs = pandas.read_csv('/home/grotec/data/sbw25/sbw25_orthologs_vs_all_in_pdc.csv')

In [10]:
orthologs.head()

Unnamed: 0,sbw25_locus_tag,sbw25_protein_accnum,sbw25_gene_product,sbw25_paralog_locus_tag,sbw25_paralog_protein_accnum,sbw25_paralog_name,locus_tag_hit,protein_accnum_hit,gene_product_hit,locus_tag_hit_paralog,protein_accnum_hit_paralog,gene_product_hit_paralog,ncbi_taxonomy_id_for_hit,strain_name_hit,analysis_type,ortholuge_classification
0,PFLU1813,WP_012723085.1,hypothetical protein,,,,Cp162_0611,WP_014366690.1,hypothetical protein,,,,1161911.0,Corynebacterium pseudotuberculosis Cp162,ortholuge,
1,PFLU4841,WP_015885550.1,hypothetical protein,PFLU6051,WP_043206069.1,putative aldose 1-epimerase,Cp162_2051,WP_041481467.1,aldose 1-epimerase,,,,1161911.0,Corynebacterium pseudotuberculosis Cp162,ortholuge,
2,PFLU6051,WP_043206069.1,putative aldose 1-epimerase,,,,Cp162_2051,WP_041481467.1,aldose 1-epimerase,,,,1161911.0,Corynebacterium pseudotuberculosis Cp162,ortholuge,
3,PFLU4038,WP_015884830.1,putative tartrate dehydrogenase,,,,Cp162_0895,WP_014800345.1,3-isopropylmalate dehydrogenase,,,,1161911.0,Corynebacterium pseudotuberculosis Cp162,ortholuge,
4,PFLU0420,WP_012721824.1,3-oxoacyl-(acyl carrier protein) synthase II,PFLU4703,WP_015885431.1,3-oxoacyl-(acyl carrier protein) synthase II,Cp162_1927,WP_041481433.1,phthiocerol synthesis polyketide synthase type...,,,,1161911.0,Corynebacterium pseudotuberculosis Cp162,ortholuge,


### Explore

In [61]:
hits_in_all = orthologs["strain_name_hit"]

Pseudomonas strains are included

In [65]:
hits_in_all[2300000]

'Pseudomonas aeruginosa LESB58'

## Insert as collection into db

### Convert into dict

In [12]:
# Build one dict per DataFrame row for insertion into MongoDB.
# Missing values are NaN in the DataFrame; left as-is they would be stored as the
# floating-point value NaN (even in text fields) instead of null. Casting to object
# first lets `where` substitute None, which PyMongo encodes as BSON null.
orthologs_records = (
    orthologs.astype(object)
    .where(orthologs.notna(), None)
    .to_dict(orient='records')
)

### Insert all records at once, unordered.

In [13]:
# ordered=False: per the PyMongo documentation, documents may be inserted in arbitrary
# order and every insert is attempted even if some fail (the default, ordered=True,
# aborts the remaining inserts at the first error).
# NOTE(review): re-running this cell will presumably either add the records a second
# time or fail on duplicate _id values -- confirm before re-executing.
results = sbw25_pdc.orthologs.insert_many(orthologs_records, ordered=False)

<pymongo.results.InsertManyResult at 0x7f4770c56688>

In [None]:
results

## Compare to dataset downloaded from pseudomonas.com/downloads

In [14]:
orthologs_from_pdc_download = pandas.read_csv('/home/grotec/data/sbw25/Pseudomonas_fluorescens_SBW25_116_orthologs.csv')

In [35]:
hits = orthologs_from_pdc_download["Strain(Hit)"].to_list()

In [39]:
# Check that every hit strain name in the downloaded table contains "Pseudomonas".
# A generator expression lets all() stop at the first non-matching name instead of
# building a full list of booleans first.
all("Pseudomonas" in hit for hit in hits)

True

In [19]:
orthologs_from_pdc_download[orthologs_from_pdc_download["Locus Tag(Query)"]=="PFLU0003"]

Unnamed: 0,Strain(Query),Locus Tag(Query),Description(Query),Strain(Hit),Locus Tag(Hit),Description(Hit),Percent Identity,Alignment Length
0,Pseudomonas fluorescens SBW25,PFLU0003,recombination protein F,Pseudomonas brassicacearum subsp. brassicacear...,PSEBR_a3,DNA replication and repair protein,94.4,357
6239,Pseudomonas fluorescens SBW25,PFLU0003,recombination protein F,Pseudomonas chlororaphis PA23,EY04_RS28850,DNA recombination protein RecF,96.7,367
10390,Pseudomonas fluorescens SBW25,PFLU0003,recombination protein F,Pseudomonas protegens Pf-5,PFL_0003,recombination protein F,95.4,367
12515,Pseudomonas fluorescens SBW25,PFLU0003,recombination protein F,Pseudomonas savastanoi pv. phaseolicola 1448A,PSPPH_0003,recombination protein F,92.1,367
17855,Pseudomonas fluorescens SBW25,PFLU0003,recombination protein F,Pseudomonas putida F1,Pput_0003,recombination protein F,88.6,367
20943,Pseudomonas fluorescens SBW25,PFLU0003,recombination protein F,Pseudomonas stutzeri A1501,PST_0003,recombination protein F,79.6,363
23956,Pseudomonas fluorescens SBW25,PFLU0003,recombination protein F,Pseudomonas putida KT2440 (TIGR),,,88.8,367
25706,Pseudomonas fluorescens SBW25,PFLU0003,recombination protein F,Pseudomonas syringae pv. tomato DC3000,PSPTO_0003,DNA replication and repair protein RecF,92.1,367
29337,Pseudomonas fluorescens SBW25,PFLU0003,recombination protein F,Pseudomonas aeruginosa PA7,PSPA7_0003,recombination protein F,83.1,362
32846,Pseudomonas fluorescens SBW25,PFLU0003,recombination protein F,Pseudomonas aeruginosa LESB58,PALES_00021,recombination protein F,82.9,362


In [17]:
orthologs[orthologs['sbw25_locus_tag']=='PFLU0003']

Unnamed: 0,sbw25_locus_tag,sbw25_protein_accnum,sbw25_gene_product,sbw25_paralog_locus_tag,sbw25_paralog_protein_accnum,sbw25_paralog_name,locus_tag_hit,protein_accnum_hit,gene_product_hit,locus_tag_hit_paralog,protein_accnum_hit_paralog,gene_product_hit_paralog,ncbi_taxonomy_id_for_hit,strain_name_hit,analysis_type,ortholuge_classification
898,PFLU0003,WP_012721452.1,recombination protein F,,,,Cp162_0003,WP_014522920.1,Recombination protein F,,,,1161911.0,Corynebacterium pseudotuberculosis Cp162,ortholuge,
1899,PFLU0003,WP_012721452.1,recombination protein F,,,,HMPREF0772_10464,WP_000775113.1,recombination protein F,,,,548473.0,Staphylococcus aureus subsp. aureus TCH60,ortholuge,
3310,PFLU0003,WP_012721452.1,recombination protein F,,,,CPS0B_0487,WP_014518588.1,recombination protein F,,,,1027845.0,Chlamydophila psittaci 02DC15,rbb,
4241,PFLU0003,WP_012721452.1,recombination protein F,,,,R2846_1320,WP_014550978.1,DNA replication and repair protein RecF,,,,262727.0,Haemophilus influenzae R2846,ortholuge,SSD
5283,PFLU0003,WP_012721452.1,recombination protein F,,,,BJ6T_08330,WP_014491011.1,DNA replication and repair protein,,,,1037409.0,Bradyrhizobium japonicum USDA 6,ortholuge,SSD
8078,PFLU0003,WP_012721452.1,recombination protein F,,,,SAOV_0004,WP_000775113.1,recF protein,,,,685039.0,Staphylococcus aureus subsp. aureus ED133,ortholuge,
9245,PFLU0003,WP_012721452.1,recombination protein F,,,,H9401_0004,WP_000470753.1,DNA replication and repair protein RecF,,,,768494.0,Bacillus anthracis str. H9401,rbb,
12848,PFLU0003,WP_012721452.1,recombination protein F,,,,MYY_2150,WP_000266662.1,recombination protein F,,,,1130804.0,Streptococcus pneumoniae ST556,ortholuge,
13044,PFLU0003,WP_012721452.1,recombination protein F,,,,Fleli_0469,WP_014796405.1,DNA replication and repair protein RecF,,,,880071.0,Flexibacter litoralis DSM 6794,ortholuge,
14307,PFLU0003,WP_012721452.1,recombination protein F,,,,lmo4a_0005,WP_012582236.1,DNA replication and repair protein,,,,563174.0,Listeria monocytogenes L99,rbb,
