# Bitácora para el manejo de secuencias FASTA y búsqueda básica con *blastx*

## Para el siguiente ejercicio es necesario tener BLAST+ instalado en la computadora
https://www.ncbi.nlm.nih.gov/guide/data-software/

In [1]:
cd ~/Desktop/data/exp710/

/home/user1/Desktop/data/exp710


In [2]:
ls

710_transcritos.fasta  710_transcritos.tab  core.29269


In [11]:
%%bash
# Run blastx for 710_transcritos.fasta against the swissprot database and write
# the hits to 710_transcritos.tab.
#   -outfmt "6 std stitle" : tabular output, standard columns plus the subject title
#   -evalue 1E-6           : e-value threshold for reported hits
#   -max_target_seqs 1     : limit on target sequences per query; one subject can
#                            still produce several rows (one per alignment)
# `date` is printed before and after, and `time` reports the blastx run time.
export BLASTDB=/home/user1/DATA/swiss/

date
time blastx \
    -query 710_transcritos.fasta \
    -db /home/user1/DATA/swiss/swissprot \
    -out 710_transcritos.tab \
    -evalue 1E-6 \
    -max_target_seqs 1 \
    -num_threads 2 \
    -outfmt "6 std stitle"
date

Process is interrupted.


In [3]:
!head 710_transcritos.tab

1001070759	P46595.1	87.611	113	14	0	7	345	1	113	1.86e-66	199	RecName: Full=Ubiquitin-conjugating enzyme E2 4; AltName: Full=E2 ubiquitin-conjugating enzyme 4; AltName: Full=Ubiquitin carrier protein 4; AltName: Full=Ubiquitin-protein ligase 4
1001070758	Q9Y4A8.1	43.243	111	61	1	1031	1357	536	646	1.81e-14	80.1	RecName: Full=Nuclear factor erythroid 2-related factor 3; Short=NF-E2-related factor 3; Short=NFE2-related factor 3; AltName: Full=Nuclear factor, erythroid derived 2, like 3
1001070756	P22813.1	44.286	70	39	0	621	830	164	233	8.29e-14	75.1	RecName: Full=Heat shock factor protein; Short=HSF; AltName: Full=Heat shock transcription factor; Short=HSTF
1001070755	P02833.1	90.361	83	4	1	1185	1421	283	365	3.54e-41	157	RecName: Full=Homeotic protein antennapedia
1001070754	G5E8K5.1	25.131	382	203	13	434	1498	352	677	1.58e-12	76.3	RecName: Full=Ankyrin-3; Short=ANK-3; AltName: Full=Ankyrin-G
1001070754	G5E8K5.1	33.077	130	78	3	386	775	469	589	1.26e-10	70.1	RecName: Full=Ankyrin-3; Sh

In [4]:
from pandas import Series, DataFrame
import pandas as pd
from Bio import SeqIO, AlignIO, SeqRecord
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import matplotlib.pyplot as plt 
import os
from matplotlib_venn import venn3_unweighted

### Blastx entrega los resultados sin nombres de columna, por lo que estos se definen en la variable "encabezado".
### *NOTA:* el blastx contra la base de datos Swiss-Prot da como segunda columna el identificador de UniProt y no el de GenBank, como ocurre con blastn contra las bases de datos 16S microbial o nt.

In [5]:
# observe el nombre de la segunda columna
# Column names for the BLAST tabular report, which is written without a header.
# Note the name chosen for the second column: "uniprotid".
encabezado = (
    "qseqid",
    "uniprotid",
    "pident",
    "length",
    "mismatch",
    "gapopen",
    "qstart",
    "qend",
    "sstart",
    "send",
    "evalue",
    "bitscore",
    "stitle",
)

### Se lee el archivo de salida y se asigna a la variable "ftab", con ello se pueden ver los resultados

In [6]:
# Load the tab-separated BLAST report; the file has no header row, so the
# column names are taken from `encabezado`.
ftab = pd.read_csv(
    "710_transcritos.tab",
    sep="\t",
    header=None,
    names=encabezado,
)
ftab.head()


Unnamed: 0,qseqid,uniprotid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,stitle
0,1001070759,P46595.1,87.611,113,14,0,7,345,1,113,1.8600000000000002e-66,199.0,RecName: Full=Ubiquitin-conjugating enzyme E2 ...
1,1001070758,Q9Y4A8.1,43.243,111,61,1,1031,1357,536,646,1.81e-14,80.1,RecName: Full=Nuclear factor erythroid 2-relat...
2,1001070756,P22813.1,44.286,70,39,0,621,830,164,233,8.29e-14,75.1,RecName: Full=Heat shock factor protein; Short...
3,1001070755,P02833.1,90.361,83,4,1,1185,1421,283,365,3.5399999999999997e-41,157.0,RecName: Full=Homeotic protein antennapedia
4,1001070754,G5E8K5.1,25.131,382,203,13,434,1498,352,677,1.58e-12,76.3,RecName: Full=Ankyrin-3; Short=ANK-3; AltName:...


In [7]:
# Count the hit rows per subject title (stitle) and show them as a
# one-column DataFrame.
ftab1 = ftab.groupby("stitle")["qseqid"].count().to_frame()
ftab1

Unnamed: 0_level_0,qseqid
stitle,Unnamed: 1_level_1
"RecName: Full=6-phosphofructo-2-kinase/fructose-2,6-bisphosphatase 1; Short=6PF-2-K/Fru-2,6-P2ase 1; Short=PFK/FBPase 1; AltName: Full=6PF-2-K/Fru-2,6-P2ase liver isozyme; Includes: RecName: Full=6-phosphofructo-2-kinase; Includes: RecName: Full=Fructose-2,6-bisphosphatase",1
RecName: Full=Akirin; AltName: Full=Protein bhringi,2
RecName: Full=Alpha-crystallin A chain,2
RecName: Full=Alpha-crystallin B chain; AltName: Full=Alpha(B)-crystallin,5
RecName: Full=Ankyrin repeat and KH domain-containing protein 1; AltName: Full=HIV-1 Vpr-binding ankyrin repeat protein; AltName: Full=Multiple ankyrin repeats single KH domain; Short=hMASK,7
RecName: Full=Ankyrin repeat and KH domain-containing protein mask; AltName: Full=Multiple ankyrin repeat single KH domain-containing protein,68
RecName: Full=Ankyrin repeat domain-containing protein 17; AltName: Full=Ankyrin repeat domain-containing protein FOE; AltName: Full=Gene trap ankyrin repeat protein,13
RecName: Full=Ankyrin repeat domain-containing protein 17; AltName: Full=Gene trap ankyrin repeat protein; AltName: Full=Serologically defined breast cancer antigen NY-BR-16,116
RecName: Full=Ankyrin-1; Short=ANK-1; AltName: Full=Ankyrin-R; AltName: Full=Erythrocyte ankyrin,15
RecName: Full=Ankyrin-3; Short=ANK-3; AltName: Full=Ankyrin-G,29


In [9]:
# Keep the first row found for each query sequence (qseqid) and summarise the
# numeric columns, rounded to 3 decimals.
# NOTE: this rebinds `ftab1`, replacing the per-title counts computed earlier.
ftab1 = ftab.drop_duplicates(subset="qseqid")
ftab1.describe().round(3)

Unnamed: 0,qseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
count,230.0,230.0,230.0,230.0,230.0,230.0,230.0,230.0,230.0,230.0,230.0
mean,1001071000.0,51.128,285.765,127.791,3.47,841.191,748.113,238.73,515.396,0.0,264.723
std,105.511,16.756,248.37,119.647,4.199,1006.605,706.577,482.784,514.793,0.0,231.177
min,1001070000.0,25.103,37.0,4.0,0.0,1.0,1.0,1.0,49.0,0.0,50.4
25%,1001070000.0,36.352,110.0,45.25,1.0,123.75,276.0,11.0,196.0,0.0,81.6
50%,1001071000.0,49.184,192.0,94.0,2.0,457.5,575.0,58.0,343.0,0.0,175.5
75%,1001071000.0,63.09,353.5,157.5,4.0,1175.75,975.0,199.75,635.0,0.0,339.25
max,1001071000.0,90.361,1130.0,616.0,19.0,3575.0,4035.0,2302.0,2688.0,0.0,912.0


In [10]:
# Print the current system date and time.
!date

Thu Aug  2 20:33:29 UTC 2018


In [None]:
# Load spidgo.csv and preview its first two rows. It is later merged with the
# BLAST hits on the "uniprotid" column.
spidgo_csv = '~/Desktop/bigdata/spidgo.csv'
fspid = pd.read_csv(spidgo_csv, engine="python")
fspid.head(2)

In [None]:
# Print the current system date and time.
!date

In [None]:
# Inner join of the per-query BLAST hits with the spidgo table on "uniprotid";
# only identifiers present in both tables are kept.
# NOTE(review): the BLAST identifiers carry a version suffix (e.g. "P46595.1");
# confirm spidgo.csv uses the same form, otherwise the inner join drops rows.
f2 = ftab1.merge(fspid, on="uniprotid", how="inner")
f2.head(2)

In [None]:
# Rebind `fspid` to an empty string so the name no longer references the table
# loaded from spidgo.csv.
# NOTE: after this cell the merge that uses `fspid` cannot be re-run until the
# CSV is loaded again.
fspid = ''
fspid

In [None]:
# Print the current system date and time.
!date

In [None]:
# Print the current system date and time.
!date

In [None]:
!date
fgo = pd.read_csv('~/Desktop/bigdata/go_to_goslim.csv', engine="python")
fgo.head(2)

In [None]:
# Print the current system date and time.
!date

In [None]:
# Inner join of the annotated hits with the go_to_goslim table on "GO_id".
f3 = f2.merge(fgo, on="GO_id", how="inner")
f3.head()

In [None]:
# Print the current system date and time.
!date

In [None]:
# Keep a single row per (qseqid, aspect) pair, then show the summary statistics
# of the alignment length and e-value columns.
f4 = f3.drop_duplicates(subset=["qseqid", "aspect"])
f4.describe()[["length", "evalue"]]

In [None]:
# Save the de-duplicated annotation table as CSV, without the row index.
f4.to_csv("710_transcritos_goslim.csv", index=False)

In [None]:
# Pivot to one row per query sequence and one column per aspect; each cell
# holds the number of rows (len) for that pair and is NaN when there are none.
ftabpivot = f4.pivot_table(
    index=["qseqid"],
    columns="aspect",
    values="uniprotid",
    aggfunc=len,
)
ftabpivot.describe()

# Proceso para generar el diagrama de Venn con la información de
## Componentes celulares, funciones moleculares y procesos biológicos: C, F y P, respectivamente

In [None]:
# Collect, for each aspect column of `ftabpivot` (C, F, P), the query ids
# (index values) that have a non-missing count in that column.
#
# Only real ids are stored. No "" placeholder is added for a missing aspect,
# because such a placeholder would become an element shared by the Venn sets
# built from these lists. As a consequence the three lists can have different
# lengths.
#
# reindex makes sure the three aspect columns exist (all-NaN if absent) so the
# selection below cannot fail with a KeyError.
aspectos = ftabpivot.reindex(columns=["C", "F", "P"])

lineaC = aspectos.index[aspectos["C"].notna()].tolist()  # ids with data in C
lineaF = aspectos.index[aspectos["F"].notna()].tolist()  # ids with data in F
lineaP = aspectos.index[aspectos["P"].notna()].tolist()  # ids with data in P

len(lineaC), len(lineaF), len(lineaP)

In [None]:
# Turn the per-aspect id lists into sets and draw the three-way Venn diagram,
# saved as PNG and PDF.
# The lists may contain "" as a placeholder for "no data in this aspect"; it is
# removed here so it is not counted as an element shared between the sets.
lineaC = set(lineaC) - {""}
lineaF = set(lineaF) - {""}
lineaP = set(lineaP) - {""}

venn3_unweighted([lineaC, lineaF, lineaP], ('C', 'F', 'P'))
plt.savefig("710_transcritos_venn3_1.png", dpi=400, bbox_inches='tight')
plt.savefig("710_transcritos_venn3_1.pdf", dpi=400, bbox_inches='tight')
plt.show()

In [None]:
# Horizontal bar chart of the ten most frequent GOSlim bins plus one "Other"
# bar that adds up all remaining bins.
#
# The counts are kept in their own variable instead of overwriting `fgo`, so
# the table loaded from go_to_goslim.csv stays available and the merge cell can
# be re-run.
conteo_goslim = (
    f4.groupby('GOSlim_bin')["qseqid"]
    .count()
    .sort_values(ascending=False)
)

top10 = conteo_goslim.iloc[:10]
otro = conteo_goslim.iloc[10:].sum()

# pd.concat replaces Series.append, which was removed in pandas 2.0. The result
# is kept as a one-column DataFrame (column 0), the same shape the plot received
# before.
linea10 = pd.concat([top10, pd.Series({"Other": otro})]).to_frame(name=0)

fig, ax = plt.subplots()
linea10.plot(kind='barh', color=list('ybg'), legend=False, ax=ax)
# Fixed view limits: x from -1 to 500 (counts), y from -1 to 11 (bar positions).
# The former `label=None` argument was dropped: it is not an `axis` keyword.
ax.axis([-1, 500, -1, 11])
ax.set_xlabel("Count")
ax.set_ylabel("GOSlim bin")
#plt.savefig("710transcritos_blastx_GObar.png", dpi=400, bbox_inches='tight')

plt.show()

# Escriba qué es lo que aprendió en esta bitácora y en qué le podría ayudar en su trabajo, sea tan explícito como le sea posible