# Bitácora para el manejo de secuencias ensambladas y búsqueda con *Blastx*

## Para el siguiente ejercicio es necesario tener Blast+ instalado en la computadora
https://www.ncbi.nlm.nih.gov/guide/data-software/

## Se utilizarán los contigs formados por el ensamblaje que se localizan en 
`~/Desktop/data/ejercicio_ensamblaje/8_S356_contigs.fa`

In [None]:
from pandas import Series, DataFrame
import pandas as pd
from Bio import SeqIO, AlignIO, SeqRecord
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import matplotlib.pyplot as plt 
import os
from matplotlib_venn import venn3_unweighted

In [None]:
cd ~/Desktop/data/ejercicio_ensamblaje/

In [None]:
ls 

In [None]:
# Create the 'img' directory used for saved figures; exist_ok=True makes
# re-running this cell harmless when the directory already exists.
os.makedirs('img',exist_ok=True)

In [None]:
# Count the lines of the contigs FASTA file that contain ">" (its header
# lines), i.e. the number of sequences in the file.
!grep ">" 8_S356_contigs.fa |wc -l

# Los contigs obtenidos se analizarán con blastx contra la base de datos *Swiss-Prot*

In [None]:
%%bash
# Run blastx with the assembled contigs as query against the swissprot
# database, keeping hits with e-value <= 1E-6 and at most one target sequence
# per query (-max_target_seqs 1), using 2 threads.
# Output is tabular (-outfmt 6) with the standard columns plus the subject
# title (stitle), written to 8_S356_contigs_blastx.tab.
# `date` before/after the search and the `time` prefix report when it ran and
# how long it took.
export BLASTDB=/home/user1/DATA/swiss/

date  
time blastx -query 8_S356_contigs.fa  -db /home/user1/DATA/swiss/swissprot \
-out 8_S356_contigs_blastx.tab -evalue 1E-6 -max_target_seqs 1 \
-num_threads 2 -outfmt "6 std stitle" 
date

In [None]:
# Print the current date/time (presumably a manual timing checkpoint).
!date

In [None]:
# Peek at the first two lines of the blastx tabular output.
!head -2 8_S356_contigs_blastx.tab

In [None]:
# Column names for the headerless blastx table: the twelve standard tabular
# fields followed by the subject title ("stitle"), in output order.
encabezado = (
    "qseqid",
    "sseqid",
    "pident",
    "length",
    "mismatch",
    "gapopen",
    "qstart",
    "qend",
    "sstart",
    "send",
    "evalue",
    "bitscore",
    "stitle",
)

In [None]:
# Load the tab-separated blastx results; the file has no header row, so the
# columns are labelled with the names in `encabezado`.
ftab = pd.read_table(
    "8_S356_contigs_blastx.tab",
    names=encabezado,
    header=None,
)
ftab.head()

In [None]:
# Derive the UniProt accession of each hit from its subject id.
# UniProt accessions are 6 or 10 characters long, so a fixed 6-character slice
# truncates the longer ones (and they then fail to match in later joins).
# Extract the accession by its documented pattern instead; when no
# accession-like token is found, fall back to the first 6 characters so the
# previous behaviour is preserved for unexpected id formats.
_sseqid = ftab["sseqid"].astype(str)
_uniprot_acc = (
    r"((?:[OPQ][0-9][A-Z0-9]{3}[0-9]"
    r"|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}))"
)
ftab["uniprotid"] = (
    _sseqid.str.extract(_uniprot_acc, expand=False).fillna(_sseqid.str[:6])
)
ftab.head(2)

In [None]:
# Print the current date/time (presumably a manual timing checkpoint).
!date

In [None]:
# Load the lookup table that is joined to the BLAST hits on "uniprotid" in a
# later cell. NOTE(review): the file name suggests a Swiss-Prot id -> GO
# mapping; confirm its column layout.
fspid = pd.read_csv('/home/user1/DATA/spidgo.csv')
fspid.head(2)

In [None]:
# Print the current date/time (presumably a manual timing checkpoint).
!date

In [None]:
# Inner join on "uniprotid": only BLAST hits whose accession is present in the
# lookup table are kept.
f2 = ftab.merge(fspid, how="inner", on="uniprotid")
f2.head(2)

In [None]:
# Rebind `fspid` to an empty string so this name no longer references the
# table loaded earlier (presumably to release its memory after the merge);
# the second line just echoes the new value.
fspid = ''
fspid

In [None]:
# Print the current date/time (presumably a manual timing checkpoint).
!date

In [None]:
# Load the table that is joined on "GO_id" in a later cell.
# NOTE(review): the file name suggests a GO term -> GO Slim mapping; confirm
# its column layout.
fgo = pd.read_csv('~/Desktop/bigdata/go_to_goslim.csv')
fgo.head(2)

In [None]:
# Print the current date/time (presumably a manual timing checkpoint).
!date

In [None]:
# Inner join on "GO_id": attach the columns of the second lookup table to
# every annotated hit, dropping hits whose GO id is not in that table.
f3 = f2.merge(fgo, how="inner", on="GO_id")
f3.head()

In [None]:
# Keep a single row per (query id, aspect) pair, then show the summary
# statistics of the alignment length and e-value columns.
f4 = f3.drop_duplicates(subset=["qseqid", "aspect"])
f4.describe()[["length", "evalue"]]

In [None]:
# Persist the de-duplicated annotation table without writing the row index.
f4.to_csv("8_S356_contigs_blastx_goslim.csv", index=False)

In [None]:
# Reload the annotation table from the CSV written above, so the analysis can
# resume from this cell without repeating the earlier steps.
f4 = pd.read_csv('8_S356_contigs_blastx_goslim.csv')
f4.head(2)

In [None]:
# Cross-tabulate query ids (rows) against aspects (columns), counting the
# rows in each cell; combinations with no rows are left as NaN.
ftabpivot = f4.pivot_table(
    index=["qseqid"],
    columns="aspect",
    values="uniprotid",
    aggfunc=len,
)
ftabpivot.describe()

# Proceso para generar el diagrama de Venn con la información de
## Componentes celulares, funciones moleculares y procesos biológicos (C, F y P, respectivamente)

In [None]:
# Collect, for each aspect column of the pivot table (C, F, P), the query ids
# that have a non-missing count in that column.
#
# Ids lacking an aspect are simply left out of that aspect's list. Padding the
# lists with "" placeholders would put a phantom "" member into the sets that
# are built from them later and inflate the Venn diagram counts.
#
# reindex() guarantees the three columns exist even when an aspect has no hits
# at all (its list is then empty instead of raising KeyError).
has_aspect = ftabpivot.reindex(columns=["C", "F", "P"]).notna()
lineaC = has_aspect.index[has_aspect["C"]].tolist()  # data from C
lineaF = has_aspect.index[has_aspect["F"]].tolist()  # data from F
lineaP = has_aspect.index[has_aspect["P"]].tolist()  # data from P

len(lineaC), len(lineaF), len(lineaP)

In [None]:
# Convert the per-aspect id collections to sets and draw them with
# venn3_unweighted, labelling the circles C, F and P.
lineaC, lineaF, lineaP = (set(ids) for ids in (lineaC, lineaF, lineaP))
venn3_unweighted([lineaC, lineaF, lineaP], ('C', 'F', 'P'))
# To export the figure, call plt.savefig(...) here, before plt.show().
plt.show()

In [None]:
from matplotlib_venn import venn3

In [None]:
# Same three sets, drawn this time with venn3 instead of venn3_unweighted.
aspect_sets = [set(lineaC), set(lineaF), set(lineaP)]
lineaC, lineaF, lineaP = aspect_sets
venn3(aspect_sets, ('C', 'F', 'P'))
# To export the figure, call plt.savefig(...) here, before plt.show().
plt.show()

In [None]:
# Horizontal bar chart of the most frequent (GOSlim_bin, aspect) pairs, with
# every remaining pair pooled into a single "Other" bar.
TOP_N = 10  # number of individual bars shown before pooling into "Other"

# Number of rows per (GOSlim_bin, aspect) pair, most frequent first.
fgo = (
    f4.groupby(['GOSlim_bin', 'aspect'])["qseqid"]
    .count()
    .sort_values(ascending=False)
)

top = fgo.iloc[:TOP_N]
linea11 = fgo.iloc[TOP_N:]
otro = int(linea11.sum())

# Build the plotted Series directly instead of using Series.append(), which
# was removed in pandas 2.0. The (bin, aspect) index tuples are flattened to
# plain strings so they can share one index with the "Other" label.
linea10 = pd.Series(
    top.tolist() + [otro],
    index=[f"{goslim_bin} ({aspect})" for goslim_bin, aspect in top.index]
    + ["Other"],
)

# One colour for all bars; legend=False replaces creating a legend only to
# hide it afterwards.
linea10.plot(kind='barh', color='y', legend=False)
# plt.axis() is given the limits only: the stray `label=None` keyword is not a
# supported argument and is rejected by recent Matplotlib versions.
plt.axis([-1, linea10.max() + 5, -1, len(linea10)])
plt.xlabel("Count")
plt.ylabel("GOSlim bin")

# Ask before writing the figure to disk; anything other than "y" skips saving.
yes = input("save figure? ")
if yes.strip().lower() == "y":
    plt.savefig("img/8_S356_contigs_blastx_GObar.png", dpi=400, bbox_inches='tight')

plt.show()