In [1]:
!pip install biopython



In [2]:
from Bio import SeqIO
import pandas as pd
import re

In [3]:
def _split_description(description):
    """Split one FASTA description line into ``(annotation, species)``.

    The species is the content of the last ``[...]`` group in the line, or
    ``None`` when the line contains no bracketed group. The annotation is the
    text before the first ``[`` with its leading token (the record identifier)
    and any ``'MULTISPECIES: '`` marker removed.
    """
    # With one capture group, re.split returns text, group, text, ..., text,
    # so the bracket contents sit at the odd positions.
    parts = re.split(r'\[(.*?)\]', description)
    bracketed = parts[1::2]
    species = bracketed[-1] if bracketed else None

    # Drop the first whitespace-separated token and normalise inner spacing.
    annotation = ' '.join(parts[0].split()[1:])
    annotation = re.sub(r'MULTISPECIES: ', '', annotation)
    return annotation, species


def fasta2df(path):
    """Load a FASTA file into a DataFrame indexed by sequence id.

    Parameters
    ----------
    path : str or file-like
        Passed unchanged to ``SeqIO.parse(path, 'fasta')``.

    Returns
    -------
    pandas.DataFrame
        One row per record, indexed by ``id``, with the columns
        ``annotation``, ``species`` and ``sequence``. ``species`` is missing
        (``None``) for records whose header has no ``[species]`` group; such
        headers previously raised ``IndexError``.
    """
    rows = []
    for record in SeqIO.parse(path, 'fasta'):
        annotation, species = _split_description(record.description)
        rows.append((record.id, annotation, species, str(record.seq)))

    # Explicit column names keep the frame well-formed even for an empty file.
    columns = ['id', 'annotation', 'species', 'sequence']
    return pd.DataFrame(rows, columns=columns).set_index('id')

In [4]:
path = 'fasta_sample.txt' # path to the file holding BLAST search results exported in FASTA format

In [5]:
# Parse the FASTA file at `path` into a DataFrame.
df = fasta2df(path)

In [6]:
# Show the parsed table as the cell output.
df

Unnamed: 0_level_0,annotation,species,sequence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1DHF_A,CRYSTAL STRUCTURES OF RECOMBINANT HUMAN DIHYDR...,Homo sapiens,VGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEGK...
NP_000782.1,dihydrofolate reductase isoform 1,Homo sapiens,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...
EAW95859.1,"dihydrofolate reductase, isoform CRA_a",Homo sapiens,MAARRQGPARSANPRPQFPGVCGREHAATLRAPGRGGGASPAQIGT...
6VCJ_A,Crystal structure of hsDHFR in complex with NA...,Homo sapiens,GMVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVE...
7ESE_A,"Chain A, Dihydrofolate reductase",Homo sapiens,MGGSHHHHHHENLYFQGMVGSLNCIVAVSQNMGIGKNGDLPWPPLR...
3NXO_A,Perferential Selection of Isomer Binding from ...,Homo sapiens,VGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEGK...
AAH70280.1,Dihydrofolate reductase,Homo sapiens,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...
XP_009447211.1,dihydrofolate reductase isoform X1,Pan troglodytes,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...
3F8Y_A,"Chain A, Dihydrofolate reductase",Homo sapiens,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFKRMTTTSSVEG...
XP_003822315.2,dihydrofolate reductase,Pan paniscus,MAARRQGPARSASPRPQFPGVCGREHAATLRAPGRGGGASPAQIGT...


In [7]:
que_col = 'species'
que = 'Homo sapiens'
# Keep only the rows whose `que_col` value equals `que` exactly (no partial matching).
df.loc[df[que_col].eq(que)]

Unnamed: 0_level_0,annotation,species,sequence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1DHF_A,CRYSTAL STRUCTURES OF RECOMBINANT HUMAN DIHYDR...,Homo sapiens,VGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEGK...
NP_000782.1,dihydrofolate reductase isoform 1,Homo sapiens,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...
EAW95859.1,"dihydrofolate reductase, isoform CRA_a",Homo sapiens,MAARRQGPARSANPRPQFPGVCGREHAATLRAPGRGGGASPAQIGT...
6VCJ_A,Crystal structure of hsDHFR in complex with NA...,Homo sapiens,GMVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVE...
7ESE_A,"Chain A, Dihydrofolate reductase",Homo sapiens,MGGSHHHHHHENLYFQGMVGSLNCIVAVSQNMGIGKNGDLPWPPLR...
3NXO_A,Perferential Selection of Isomer Binding from ...,Homo sapiens,VGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEGK...
AAH70280.1,Dihydrofolate reductase,Homo sapiens,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...
3F8Y_A,"Chain A, Dihydrofolate reductase",Homo sapiens,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFKRMTTTSSVEG...
3L3R_A,"Chain A, Dihydrofolate reductase",Homo sapiens,VGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFKRMTTTSSVEGK...
1DLR_A,Methotrexate-Resistant Variants Of Human Dihyd...,Homo sapiens,VGSLNCIVAVSQNMGIGKNGDFPWPPLRNEFRYFQRMTTTSSVEGK...


In [8]:
que = 'species'
# Collapse the column to its distinct values (duplicates removed).
set(df[que].tolist())

{'Cercocebus atys',
 'Chlorocebus sabaeus',
 'Cloning vector pPL5618_pUG_MtxR',
 'Colobus angolensis palliatus',
 'Expression vector pTGPI-GFP',
 'Gorilla gorilla gorilla',
 'Homo sapiens',
 'Macaca fascicularis',
 'Macaca mulatta',
 'Macaca nemestrina',
 'Macaca thibetana thibetana',
 'Mandrillus leucophaeus',
 'Nomascus leucogenys',
 'Pan paniscus',
 'Pan troglodytes',
 'Papio anubis',
 'Piliocolobus tephrosceles',
 'Pongo abelii',
 'Recombinant plasmid p6cys A-B-dk',
 'Rhinopithecus roxellana',
 'Theropithecus gelada',
 'Trachypithecus francoisi'}