# Load Library

In [1]:
from pandas import DataFrame
import pandas as pd
import numpy as np
from Bio import Entrez 
import pycurl
import matplotlib.pyplot as plt
import os, sys
from tqdm import tqdm_notebook

In [2]:
def get_GenomeAssembly(G, S):
    F = G+'_'+S+'.csv'
    if not os.path.isfile(F):
        with open(F, 'wb') as f:
            c = pycurl.Curl()
            c.setopt(c.URL, 'https://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?action=download&orgn=%22'
                     +G+'%20'+S+
                     '%22[orgn]&status=50|40||30|20&report=proks&group=--%20All%20Prokaryotes%20--&subgroup=--%20All%20Prokaryotes%20--&format=csv')
            c.setopt(c.WRITEDATA, f)
            c.perform()
            c.close()
    return F

# Load Dataset to Scrap

In [3]:
#Pilih data dari Sebaceous
df_Byrd = pd.read_csv("Byrd_2018_Bacteria.csv")
df_Sebaceous = df_Byrd.loc[:,'Sebaceous']
df_Sebaceous

0               Propionibacterium acnes
1            Staphylococcus epidermidis
2    Corynebacterium tuberculostearicum
3                Staphylococcus capitis
4              Corynebacterium simulans
5                   Streptococcus mitis
6                Staphylococcus hominis
7           Corynebacterium aurimucosum
8        Corynebacterium kroppenstedtii
9            Corynebacterium amycolatum
Name: Sebaceous, dtype: object

# Iteration Test

In [8]:
#Email for NCBI
Entrez.email = "matin_nuhamunada@mail.ugm.ac.id"

#Pilih data dari Sebaceous
df_Byrd = pd.read_csv("Byrd_2018_Bacteria.csv")
df_Sebaceous = df_Byrd.loc[:,'Sebaceous']
df_Sebaceous

#Add information on biome
df_NCBI = DataFrame(columns=("IdList","QueryTranslation","Genus","Species","NoStrain","Size","GC","Genes","Proteins"))
#df2.index.name = 'No'

for i in tqdm_notebook(range(len(df_Sebaceous))):
    handle = Entrez.esearch(db="genome", term=df_Sebaceous[i])
    record = Entrez.read(handle)
    df_NCBI.loc[i, "IdList"] = record['IdList']
    df_NCBI.loc[i, "QueryTranslation"] = record['QueryTranslation']
    string = df_NCBI.loc[i, "QueryTranslation"]
    string = string.replace('[Organism]', '')
    string = string.replace('"', '')
    string = string.split()
    df_NCBI.loc[i, "Genus"] = string[0]
    df_NCBI.loc[i, "Species"] = string[1]
    #print(record)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [9]:
for i in tqdm_notebook(range(len(df_NCBI))):
    get_GenomeAssembly(df_NCBI.loc[i, "Genus"], df_NCBI.loc[i, "Species"])

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [11]:
for i in tqdm_notebook(range(len(df_NCBI))):
    df = pd.read_csv(df_NCBI.loc[i, 'Genus']+'_'+df_NCBI.loc[i, 'Species']+'.csv')
    df_Complete = df
    #df_Complete = df[df['Level'].str.contains("Complete Genome|chromosome")]
    df_Complete = df_Complete[['#Organism/Name','Strain', 'Size (Mb)', 'GC%', 'Replicons', 'Level', 'Genes', 'Proteins']]
    NoStrain = df_Complete["Strain"].count()
    Size = df_Complete["Size (Mb)"].mean()
    GC = df_Complete["GC%"].mean()

    try:
        Genes = df_Complete["Genes"].apply(int)
        Genes = Genes.mean()
        Proteins = df_Complete["Proteins"].apply(int)  
        Proteins = Proteins.mean()
        df_NCBI.loc[i, "NoStrain"] = NoStrain
        df_NCBI.loc[i, "Size"] = Size
        df_NCBI.loc[i, "GC"] = GC
        df_NCBI.loc[i, "Genes"] = Genes
        df_NCBI.loc[i, "Proteins"] = Proteins
    except ValueError:
        pass      # or whatever
    
df_NCBI

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




Unnamed: 0,IdList,QueryTranslation,Genus,Species,NoStrain,Size,GC,Genes,Proteins
0,[1140],"""Cutibacterium acnes""[Organism]",Cutibacterium,acnes,189.0,2.51243,59.9714,,
1,[155],"""Staphylococcus epidermidis""[Organism] OR Stap...",Staphylococcus,epidermidis,,,,,
2,[2024],"""Corynebacterium tuberculostearicum""[Organism]",Corynebacterium,tuberculostearicum,1.0,2.37226,60.0,2271.0,2171.0
3,[2054],"""Staphylococcus capitis""[Organism]",Staphylococcus,capitis,68.0,2.47772,32.8599,2408.74,2288.41
4,[43748],"""Corynebacterium simulans""[Organism]",Corynebacterium,simulans,3.0,2.6633,59.1667,2510.0,2358.33
5,[530],"""Streptococcus mitis""[Organism]",Streptococcus,mitis,,,,,
6,[2014],"""Staphylococcus hominis""[Organism]",Staphylococcus,hominis,65.0,2.25821,31.4626,2252.98,2130.86
7,[2077],"""Corynebacterium aurimucosum""[Organism]",Corynebacterium,aurimucosum,,,,,
8,[2146],"""Corynebacterium kroppenstedtii""[Organism]",Corynebacterium,kroppenstedtii,6.0,2.5158,56.95,2163.67,2042.67
9,[2023],"""Corynebacterium amycolatum""[Organism]",Corynebacterium,amycolatum,6.0,2.5245,58.8,2218.17,2104.17


In [12]:
for i in tqdm_notebook(range(len(df_NCBI))):
    df = pd.read_csv(df_NCBI.loc[i, 'Genus']+'_'+df_NCBI.loc[i, 'Species']+'.csv')
    #df_Complete = df
    df_Complete = df[df['Level'].str.contains("Complete Genome")]
    df_Complete = df_Complete[['#Organism/Name','Strain', 'Size (Mb)', 'GC%', 'Replicons', 'Level', 'Genes', 'Proteins']]
    NoStrain = df_Complete["Strain"].count()
    Size = df_Complete["Size (Mb)"].mean()
    GC = df_Complete["GC%"].mean()

    try:
        Genes = df_Complete["Genes"].apply(int)
        Genes = Genes.mean()
        Proteins = df_Complete["Proteins"].apply(int)  
        Proteins = Proteins.mean()
    except ValueError:
        pass      # or whatever

    df_NCBI.loc[i, "NoStrain"] = NoStrain
    df_NCBI.loc[i, "Size"] = Size
    df_NCBI.loc[i, "GC"] = GC
    df_NCBI.loc[i, "Genes"] = Genes
    df_NCBI.loc[i, "Proteins"] = Proteins
df_NCBI

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




Unnamed: 0,IdList,QueryTranslation,Genus,Species,NoStrain,Size,GC,Genes,Proteins
0,[1140],"""Cutibacterium acnes""[Organism]",Cutibacterium,acnes,18,2.52176,60.0389,2537.0,2381.0
1,[155],"""Staphylococcus epidermidis""[Organism] OR Stap...",Staphylococcus,epidermidis,12,2.59849,32.0975,2580.83,2414.75
2,[2024],"""Corynebacterium tuberculostearicum""[Organism]",Corynebacterium,tuberculostearicum,0,,,,
3,[2054],"""Staphylococcus capitis""[Organism]",Staphylococcus,capitis,3,2.49173,32.991,2425.0,2289.67
4,[43748],"""Corynebacterium simulans""[Organism]",Corynebacterium,simulans,2,2.66833,59.15,2503.5,2351.0
5,[530],"""Streptococcus mitis""[Organism]",Streptococcus,mitis,3,2.07362,40.0667,2047.67,1903.67
6,[2014],"""Staphylococcus hominis""[Organism]",Staphylococcus,hominis,1,2.25341,31.4,2258.0,2070.0
7,[2077],"""Corynebacterium aurimucosum""[Organism]",Corynebacterium,aurimucosum,1,2.81923,60.5248,2663.0,2525.0
8,[2146],"""Corynebacterium kroppenstedtii""[Organism]",Corynebacterium,kroppenstedtii,1,2.4468,57.5,2088.0,1999.0
9,[2023],"""Corynebacterium amycolatum""[Organism]",Corynebacterium,amycolatum,0,,,,


In [13]:
for a in tqdm_notebook(range(len(df_NCBI))):
    df = pd.read_csv(df_NCBI.loc[a, 'Genus']+'_'+df_NCBI.loc[a, 'Species']+'.csv')
    df_Complete = df[df['Level'].str.contains("Complete Genome")]
    df_Complete = df_Complete.reset_index()
    
    df_Refseq = DataFrame(columns=("Organism/Name","Strain","Refseq"))   
    
    for i in range(len(df_Complete)):
        try:
            df_Refseq.loc[i, "Organism/Name"] = df_Complete.loc[i, "#Organism/Name"]
            df_Refseq.loc[i, "Strain"] = df_Complete.loc[i, "Strain"]
            string = df_Complete.loc[i, "Replicons"]
            string = string.replace('chromosome:', '')
            string = string.replace('chromosome 1:', '')
            string = string.replace('chromosome I:', '')
            string = string.replace('/', ' ')
            string = string.split()
            df_Refseq.loc[i, "Refseq"] = string[0]
            df_Refseq.to_csv('Refseq_'+df_NCBI.loc[a, 'Genus']+'_'+df_NCBI.loc[a, 'Species']+'.csv')
        except:
            print('error '+df_NCBI.loc[a, 'Genus']+'_'+df_NCBI.loc[a, 'Species'])
            pass           

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [15]:
a = 1
pd.read_csv("Refseq_"+df_NCBI.loc[a, 'Genus']+'_'+df_NCBI.loc[a, 'Species']+'.csv')
#x = pd.read_csv(df_NCBI.loc[a, 'Genus']+'_'+df_NCBI.loc[a, 'Species']+'.csv')
#x["Replicons"][4]

Unnamed: 0.1,Unnamed: 0,Organism/Name,Strain,Refseq
0,0,Staphylococcus epidermidis ATCC 12228,ATCC 12228,NC_004461.1
1,1,Staphylococcus epidermidis RP62A,RP62A,NC_002976.3
2,2,Staphylococcus epidermidis PM221,PM221,NZ_HG813242.1
3,3,Staphylococcus epidermidis,SEI,NZ_CP009046.1
4,4,Staphylococcus epidermidis,14.1.R1,CP018841.1;
5,5,Staphylococcus epidermidis,1457,NZ_CP020463.1
6,6,Staphylococcus epidermidis,ATCC 12228,NZ_CP022247.1
7,7,Staphylococcus epidermidis,DAR1907,NZ_CP013943.1
8,8,Staphylococcus epidermidis,FDAARGOS_153,NZ_CP014119.1
9,9,Staphylococcus epidermidis,FDAARGOS_161,NZ_CP014132.1
