In [2]:
# Load Library
from pandas import DataFrame
import pandas as pd
from alive_progress import alive_bar
from pathlib import Path

from Bio import Entrez 
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

In [3]:
# IPython shell escape: asks Jupyter to enable the `widgetsnbextension`
# notebook extension for this environment.
# NOTE(review): presumably needed for widget-based progress rendering; the
# progress bar below is created with force_tty=True, so confirm it is required.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [4]:
# Resume from the cached result table when it exists; otherwise start from the
# raw NCBI export and add the (still empty) spike-annotation columns that the
# fetch cell fills in.
try:
    df = pd.read_csv("output/ncov2019_ncbi_spike.csv", index_col=0)
except FileNotFoundError:
    # Only a missing cache triggers the fallback. Any other failure (corrupt
    # or unreadable cache, interrupt) is raised instead of silently starting
    # over and later overwriting the cache.
    df = pd.read_csv("dataset/ncov2019_ncbi08052020.csv")
    df["s-gene_location"] = ""
    df["gene_length"] = ""
    df["protein_id"] = ""
    df["protein_length"] = ""

In [5]:
#Email for NCBI
Entrez.email = "matin_nuhamunada@ugm.ac.id"

# The per-record FASTA files are written into these folders; create them up
# front so the first open(..., "w") does not fail on a fresh checkout.
Path("output/gene").mkdir(parents=True, exist_ok=True)
Path("output/protein").mkdir(parents=True, exist_ok=True)

with alive_bar(len(df), force_tty=True) as bar:
    for i in df.index:
        # Fetch only rows that have no spike annotation yet. An unfilled cell
        # is '' on a freshly built frame but NaN after a CSV round-trip, so
        # both count as "not done". (The previous `!= ''` test skipped every
        # row on a fresh frame and refetched every row from the cache.)
        location = df.loc[i, "s-gene_location"]
        if pd.isna(location) or location == '':
            handle = Entrez.efetch(db="nucleotide", id=df.Accession[i], rettype="gb", retmode="text")
            try:
                seq_record = SeqIO.read(handle, "genbank")
            finally:
                # Release the connection even when parsing fails.
                handle.close()

            for feature in seq_record.features:
                # Each feature is handled on its own so one malformed feature
                # does not abort the whole record. `Exception` (not a bare
                # except) lets Ctrl-C / kernel interrupts through.
                try:
                    if feature.type == 'CDS' and feature.qualifiers['gene'][0] == 'S':
                        start = feature.location.start.position
                        end = feature.location.end.position
                        protein_id = str(feature.qualifiers['protein_id'][0])
                        gene_seq = seq_record.seq[start:end]

                        df.loc[i, "s-gene_location"] = str(feature.location)
                        df.loc[i, "gene_length"] = len(gene_seq)
                        df.loc[i, "protein_id"] = protein_id
                        df.loc[i, "protein_length"] = len(feature.qualifiers['translation'][0])

                        # write spike gene fasta
                        record = SeqRecord(gene_seq, id=seq_record.id)
                        with open("output/gene/s_gene_"+str(i)+"_"+record.id+".fasta", "w") as output_handle:
                            SeqIO.write(record, output_handle, "fasta")

                        # write spike protein fasta
                        # NOTE(review): Bio.Alphabet (IUPAC) was removed in
                        # Biopython 1.78; on newer releases drop the alphabet
                        # argument here and the matching import.
                        protein = feature.qualifiers['translation'][0]
                        protein_record = SeqRecord(Seq(protein, IUPAC.protein), id= protein_id, description=feature.qualifiers['product'][0])
                        with open("output/protein/spike_"+str(i)+"_"+protein_id+".fasta", "w") as output_handle:
                            SeqIO.write(protein_record, output_handle, "fasta")
                except Exception:
                    print(df.loc[i, "Accession"]+' error')
        bar()
df.to_csv('output/ncov2019_ncbi_spike.csv')

on 2022: MT263381 error                                                                                                 
on 2023: MT263382 error                                                                                                 
on 2024: MT263383 error                                                                                                 
on 2025: MT263384 error                                                                                                 
on 2026: MT263385 error                                                                                                 
on 2028: MT263387 error                                                                                                 
on 2029: MT263388 error                                                                                                 
on 2030: MT263389 error                                                                                                 
on 2031: MT263390 error         

In [6]:
# Reduce the table to the accession, its location and the spike annotations,
# then discard every row that has a missing value in any of those columns.
spike_columns = [
    'Accession',
    'Geo_Location',
    's-gene_location',
    'gene_length',
    'protein_id',
    'protein_length',
]
df_simple = df[spike_columns].dropna()
df_simple

Unnamed: 0,Accession,Geo_Location,s-gene_location,gene_length,protein_id,protein_length
0,NC_045512,China,[21562:25384](+),3822.0,YP_009724390.1,1273.0
1,MT444148,USA: CA,[21549:25371](+),3822.0,QJQ82624.1,1273.0
2,MT444515,USA,[21530:25352](+),3822.0,QJQ82636.1,1273.0
3,MT444516,USA,[21512:25334](+),3822.0,QJQ82648.1,1273.0
4,MT444517,USA,[21523:25345](+),3822.0,QJQ82660.1,1273.0
...,...,...,...,...,...,...
2427,MN975266,China,[<0:>107](+),107.0,QHN73822.1,35.0
2428,MN975267,China,[<0:>107](+),107.0,QHN73823.1,35.0
2429,MN975268,China,[<0:>107](+),107.0,QHN73824.1,35.0
2430,MN985325,USA,[21562:25384](+),3822.0,QHO60594.1,1273.0


In [7]:
# Show how the annotated S-gene and spike-protein lengths are distributed.
print('gene length')
print(df_simple['gene_length'].value_counts())

print('protein length')
print(df_simple['protein_length'].value_counts())

# Keep only records whose S gene is 3822 nt long (the dominant length in the
# counts printed above). `.copy()` makes the result an independent frame, so
# later `.loc` assignments on it do not raise SettingWithCopyWarning.
df_simple = df_simple[df_simple['gene_length'] == 3822.0].copy()

gene length
3822.0    2284
107.0        7
3819.0       3
3718.0       2
491.0        2
3785.0       1
158.0        1
157.0        1
493.0        1
3807.0       1
3691.0       1
3395.0       1
3719.0       1
3702.0       1
3786.0       1
3818.0       1
3768.0       1
3696.0       1
3777.0       1
3386.0       1
Name: gene_length, dtype: int64
protein length
1273.0    2284
35.0         7
1272.0       4
1238.0       3
163.0        3
52.0         2
1233.0       1
1259.0       1
1231.0       1
1255.0       1
1262.0       1
1127.0       1
1130.0       1
1260.0       1
1230.0       1
1269.0       1
Name: protein_length, dtype: int64


In [8]:
# List the distinct raw location labels before they are normalised.
df_simple.Geo_Location.unique()

array(['China', 'USA: CA', 'USA', 'China: Guangdong, Guangzhou',
       'USA: MARINGOUIN, LA', 'USA: KENNER, LA', 'USA: KILLONA, LA',
       'USA: SAINT ROSE, LA', 'USA: New Orleans, LA', 'USA: LOCKPORT, LA',
       'USA: NEW ORLEANS, LA', 'USA: LULING, LA', 'USA: RACELAND, LA',
       'USA: GHEENS, LA', 'USA: THIBODAUX,LA', 'USA: Slidell, LA',
       'USA: Lacombe, LA', 'USA: HOUMA, LA', 'USA: SLIDELL, LA',
       'USA: SLIDELL LA', 'USA: DELACROIX, LA', 'Thailand', 'Iran',
       'USA: MI', 'India', 'USA: NY', 'Serbia', 'India: Ahmedabad',
       'India: Gandhinagar', 'India: Mansa', 'Hong Kong', 'Kazakhstan',
       'USA: Wisconsin', 'USA: East Feliciana Parish, Louisiana',
       'USA: FL', 'USA: VA', 'Puerto Rico', 'USA: Michigan',
       'USA: Illinois', 'USA: WA', 'USA: CT', 'USA: ID', 'USA: OR',
       'Germany: Dusseldorf', 'Netherlands: Milheeze', 'USA: New York',
       'Taiwan', 'USA: NJ', 'Sri Lanka', 'Czech Republic', 'Malaysia',
       'Japan', 'India: Rajkot', 'Spain', 

In [9]:
# Collapse region-qualified labels (e.g. 'USA: CA') to a bare country name.
# Each pair is (pattern searched for in Geo_Location, label written back);
# the pairs are applied one after another, in this order.
country_labels = [
    ('USA', 'USA'),
    ('China', 'China'),
    ('India', 'India'),
    ('Germany', 'Germany'),
    ('Netherlands', 'Netherlands'),
    ('Pakistan', 'Pakistan'),
    ('South Africa', 'South Africa'),
    ('Colombia', 'Colombia'),
    ('Spain', 'Spain'),
    ('Viet Nam', 'Vietnam'),
    ('Australia', 'Australia'),
]
for pattern, country in country_labels:
    # Re-evaluate the mask every round so it sees the labels already rewritten.
    matches = df_simple['Geo_Location'].str.contains(pattern)
    df_simple.loc[matches, 'Geo_Location'] = country
df_simple['Geo_Location'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


array(['China', 'USA', 'Thailand', 'Iran', 'India', 'Serbia', 'Hong Kong',
       'Kazakhstan', 'Puerto Rico', 'Germany', 'Netherlands', 'Taiwan',
       'Sri Lanka', 'Czech Republic', 'Malaysia', 'Japan', 'Spain',
       'Brazil', 'South Africa', 'Turkey', 'Greece', 'Italy', 'France',
       'South Korea', 'Israel', 'Pakistan', 'Peru', 'Colombia', 'Vietnam',
       'Sweden', 'Nepal', 'Finland', 'Australia'], dtype=object)

In [10]:
# Number of retained records per normalised country label.
df_simple.Geo_Location.value_counts()

USA               2032
China               67
India               28
Thailand            23
Taiwan              22
Hong Kong           21
Spain               21
Puerto Rico         12
Czech Republic       7
Japan                5
Kazakhstan           4
South Korea          4
Malaysia             4
Greece               4
Sri Lanka            4
Iran                 3
Brazil               2
Italy                2
Vietnam              2
Serbia               2
Israel               2
Pakistan             2
Colombia             1
Finland              1
Nepal                1
Sweden               1
Germany              1
Peru                 1
France               1
Netherlands          1
Turkey               1
South Africa         1
Australia            1
Name: Geo_Location, dtype: int64

In [11]:
# Persist the filtered, country-normalised table next to the full result file.
simple_csv_path = 'output/ncov2019_ncbi_spike_simple.csv'
df_simple.to_csv(simple_csv_path)