In [3]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.context import SparkConf
from pyspark.sql import Row
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import pyspark.sql.types as T 
from pyspark.sql.functions import udf
from pyspark.sql.functions import col, size
from operator import add
from functools import reduce
from bio_spark.io.fasta_reader import FASTAReader, FASTAQReader
import collections
import numpy as np
import sys

from pathlib import Path

from operator import add

# Sobre este Notebook

Este notebook executa uma clusterização de seuência de Aminoácidos usando a ML lib dp Spark. Clustrização é um método que pode auxiliar os pesquisadores a descobrir relações filogenéticas e/ou relações de similaridade entre sequências sem a necessidade de comparar com uma base de referência. O fluxo é composto dos seguintes passos:

1. Leutra e parsing do arquivos fasta de entrada
2. Cálculo dos Kmers a partir das sequências encontradas nos arquivos de entrada
3. Uso do método de Elbow para encontrar clusters coesos.

___

## Cluster local

Para fins de desenvolvimento, utilizamos imagens Docker para criar um cluster spark local. Esse cluster deve estar rodadndo para que o notebook funcione como esperado. Na raiz do projeto:

```shell
docker-compose up
```

In [4]:
sConf = SparkConf("spark://localhost:7077")
sc = SparkContext(conf=sConf)
spark = SparkSession(sc)

## Data Input

Tdoso os arquivos de entrada serão tratados em único Dataframe

```shell
INPUT_DIR_PATH: caminho para o diretório com os arquivs .fna (FASTA)
```

In [5]:
INPUT_DIR_PATH = Path("/home/thiago/Dados/sparkAAI-1/data/genomes/")
OUTPUT_DIR_PATH = Path("/home/thiago/Dados/sparkAAI-1/output/")
files_to_process = [str(f) for f in INPUT_DIR_PATH.iterdir()]
print("Files to process :", len(files_to_process))

Files to process : 10


In [6]:
fasta_plain_df = sc.textFile(','.join(files_to_process))\
            .map(lambda x: Row(row=x))\
            .zipWithIndex()\
            .toDF(["row","idx"])

print("raw file lines to process", fasta_plain_df.count())

raw file lines to process 86243


inspecionando o dataframe lido

In [7]:
fasta_plain_df.show()

+--------------------+---+
|                 row|idx|
+--------------------+---+
|[>ALPH01000001.1 ...|  0|
|[TCTCCCAGCACTTAGG...|  1|
|[CAACCTCTTTAGAGTT...|  2|
|[ATATTAGAAAGTACTT...|  3|
|[AATTCCCGCACTTCTT...|  4|
|[CAGGACTTGTATCAAG...|  5|
|[CCTGCAGTAACACATG...|  6|
|[TCTTATTTCTCTCCAA...|  7|
|[ATTCTACTTCTTGAAT...|  8|
|[CAACCTCCTGTTTTTA...|  9|
|[CCACATTAAATCTATA...| 10|
|[AATCTTGATTCAATTT...| 11|
|[CCACCAAATCTCCTAT...| 12|
|[ATCCGTTATATAAATT...| 13|
|[GCAAGTCAGGATCTTG...| 14|
|[CCTGAGATTGACTTCC...| 15|
|[TGTAAATTGATCATTA...| 16|
|[CGCCAATAAATTTGAT...| 17|
|[AGAAATTTCACCTCTT...| 18|
|[TTTAGAAACTTTAATT...| 19|
+--------------------+---+
only showing top 20 rows



### Parse dos arquivos FASTA

os arquivos [FASTA]([FASTA](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp)), tem o seguinte formato:

```
>ID.CONTIG
ATTC....
GCG...
CCG...
>ID2.CONTIG
GGC...
...
```

nesta primeira sessão fazermos um parse desses arquivos para agrupar as sequẽncias por ID, calcular os kmers para esses contigs e obter um map com as freqências dos kmers em todos os contigs de uma sequẽncia.

In [8]:
def parse_fasta_id_line(l):
    """
    Desejamos extrair os IDs das sequências da linhas que começarem pelo caracter ''>'. Pelo padrão
    FASTA, o ID é a primeira palavra e é um campo composto por ID.CONTIG
    
    Input>
        l: Uma linha de um arquivo FASTA
    Return:
        ID: da sequência ignorando o número de contigs, ou None caso não seja uma linha de ID
    """
    if l[0][0] == ">":
        heaer_splits = l[0][1:].split(" ")[0]
        seq_id_split = heaer_splits.split(".")
        return seq_id_split[0]
    else:
        return None
seq2kmer_udf = udf(parse_fasta_id_line, T.StringType())

In [9]:
fasta_null_ids_df = fasta_plain_df.withColumn("seqID_wNull", seq2kmer_udf("row"))

inspecionar o resultado

In [10]:
fasta_null_ids_df.show()

+--------------------+---+------------+
|                 row|idx| seqID_wNull|
+--------------------+---+------------+
|[>ALPH01000001.1 ...|  0|ALPH01000001|
|[TCTCCCAGCACTTAGG...|  1|        null|
|[CAACCTCTTTAGAGTT...|  2|        null|
|[ATATTAGAAAGTACTT...|  3|        null|
|[AATTCCCGCACTTCTT...|  4|        null|
|[CAGGACTTGTATCAAG...|  5|        null|
|[CCTGCAGTAACACATG...|  6|        null|
|[TCTTATTTCTCTCCAA...|  7|        null|
|[ATTCTACTTCTTGAAT...|  8|        null|
|[CAACCTCCTGTTTTTA...|  9|        null|
|[CCACATTAAATCTATA...| 10|        null|
|[AATCTTGATTCAATTT...| 11|        null|
|[CCACCAAATCTCCTAT...| 12|        null|
|[ATCCGTTATATAAATT...| 13|        null|
|[GCAAGTCAGGATCTTG...| 14|        null|
|[CCTGAGATTGACTTCC...| 15|        null|
|[TGTAAATTGATCATTA...| 16|        null|
|[CGCCAATAAATTTGAT...| 17|        null|
|[AGAAATTTCACCTCTT...| 18|        null|
|[TTTAGAAACTTTAATT...| 19|        null|
+--------------------+---+------------+
only showing top 20 rows



In [11]:
num_ids = fasta_null_ids_df.where(F.col("seqID_wNull").isNotNull()).count()
print("número de seuências para serem processadas", num_ids)

número de seuências para serem processadas 1864


desejamos fazer um "fillna" com o último valor não nulo encontrado na coluna de sequência, para isso usaremos um operador de janela deslizante em cima do índice que serve para manter a ordem original das linhas

In [12]:
fasta_n_filter_df = fasta_null_ids_df.withColumn(
    "seqID", F.last('seqID_wNull', ignorenulls=True)\
    .over(Window\
    .orderBy('idx')\
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)))

A seguir devemos excluir as linhas de header e renomear as colunas excluíndo as que não foram utilizadas

In [13]:
fasta_n_filter_df.show()

+--------------------+---+------------+------------+
|                 row|idx| seqID_wNull|       seqID|
+--------------------+---+------------+------------+
|[>ALPH01000001.1 ...|  0|ALPH01000001|ALPH01000001|
|[TCTCCCAGCACTTAGG...|  1|        null|ALPH01000001|
|[CAACCTCTTTAGAGTT...|  2|        null|ALPH01000001|
|[ATATTAGAAAGTACTT...|  3|        null|ALPH01000001|
|[AATTCCCGCACTTCTT...|  4|        null|ALPH01000001|
|[CAGGACTTGTATCAAG...|  5|        null|ALPH01000001|
|[CCTGCAGTAACACATG...|  6|        null|ALPH01000001|
|[TCTTATTTCTCTCCAA...|  7|        null|ALPH01000001|
|[ATTCTACTTCTTGAAT...|  8|        null|ALPH01000001|
|[CAACCTCCTGTTTTTA...|  9|        null|ALPH01000001|
|[CCACATTAAATCTATA...| 10|        null|ALPH01000001|
|[AATCTTGATTCAATTT...| 11|        null|ALPH01000001|
|[CCACCAAATCTCCTAT...| 12|        null|ALPH01000001|
|[ATCCGTTATATAAATT...| 13|        null|ALPH01000001|
|[GCAAGTCAGGATCTTG...| 14|        null|ALPH01000001|
|[CCTGAGATTGACTTCC...| 15|        null|ALPH010

In [14]:
fasta_df = fasta_n_filter_df\
                .where(F.col("seqID_wNull").isNull())\
                .select("seqID","row")\
                .toDF("seqID","seq")

O Dataframe tratado tem o seguinte esquema

In [15]:
fasta_df.printSchema()

root
 |-- seqID: string (nullable = true)
 |-- seq: struct (nullable = true)
 |    |-- row: string (nullable = true)



inspeção do daframe

In [16]:
fasta_per_seq_df = fasta_df.rdd\
            .map(lambda r: (r.seqID, r.seq[0]))\
            .reduceByKey(lambda x,y:x+y)\
            .map(lambda x: Row(seqID=x[1],seq=x[0]))\
            .toDF(["seqID", "seq"])

In [17]:
fasta_per_seq_df.printSchema()

root
 |-- seqID: string (nullable = true)
 |-- seq: string (nullable = true)



In [18]:
fasta_per_seq_df.show()

+------------+--------------------+
|       seqID|                 seq|
+------------+--------------------+
|ALPH01000001|TCTCCCAGCACTTAGGC...|
|ALPH01000002|CCTTGCTTATTTAGAAA...|
|ALPH01000003|ATTCTTCTTCATCATCC...|
|ALPH01000004|AATATCATTTCTTACTT...|
|ALPH01000005|AACTTTTAATTGGCAAA...|
|ALPH01000006|CCACTACTAACAATTTC...|
|ALPH01000007|CTTGGCTTGTTTTTATC...|
|ALPH01000008|CTGAGTCCTATTTAAAT...|
|ALPH01000009|CGATGTAATGGCTATGC...|
|ALPH01000010|TCTCACTAGAAGAAAAT...|
|ALPH01000011|GTTTTTATCAGTAGCTT...|
|ALPH01000012|AGGGTGTCGGTTAAAAG...|
|ALPH01000013|TTTTCATCTAATAAGTA...|
|ALPH01000014|AATGTTGTGAGCTTTAA...|
|ALPH01000015|ACTGCAGCATTATTTAT...|
|ALPH01000016|GCAATACCTCCAACAAT...|
|ALPH01000017|GACTCTGAAAGTAAATA...|
|ALPH01000018|AGACTCATTGGACATAT...|
|ALPH01000019|CTTCTATATCACTAGCG...|
|ALPH01000020|AGGATTTTTTATTTTTA...|
+------------+--------------------+
only showing top 20 rows



### Calculate Kmers

Nesta sessão faremos o cálculo dos [kmers](https://en.wikipedia.org/wiki/K-mer) de tambo ```K```. O objetivo é associar cada ID de sequência ao conjunto de kmers distiontos presentes em todos os seus motifs

In [19]:
fasta_per_seq_df.cache()

DataFrame[seqID: string, seq: string]

In [20]:
K = 3

In [21]:
Seq2kmerTy = T.ArrayType(T.StringType())
def seq2kmer(seq_):
    global K
    value = seq_.strip()
    num_kmers = len(value) - K + 1
    kmers_list = [value[n:K+n] for n in range(0, num_kmers)]
    
    # return len(value)
    return kmers_list

seq2kmer_udf = udf(seq2kmer,Seq2kmerTy)

In [22]:
fasta_kmers_df = fasta_per_seq_df\
        .withColumn("kmers", seq2kmer_udf("seq"))\

In [23]:
fasta_kmers_df.printSchema()

root
 |-- seqID: string (nullable = true)
 |-- seq: string (nullable = true)
 |-- kmers: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [24]:
fasta_kmers_df.show()

+------------+--------------------+--------------------+
|       seqID|                 seq|               kmers|
+------------+--------------------+--------------------+
|ALPH01000001|TCTCCCAGCACTTAGGC...|[TCT, CTC, TCC, C...|
|ALPH01000002|CCTTGCTTATTTAGAAA...|[CCT, CTT, TTG, T...|
|ALPH01000003|ATTCTTCTTCATCATCC...|[ATT, TTC, TCT, C...|
|ALPH01000004|AATATCATTTCTTACTT...|[AAT, ATA, TAT, A...|
|ALPH01000005|AACTTTTAATTGGCAAA...|[AAC, ACT, CTT, T...|
|ALPH01000006|CCACTACTAACAATTTC...|[CCA, CAC, ACT, C...|
|ALPH01000007|CTTGGCTTGTTTTTATC...|[CTT, TTG, TGG, G...|
|ALPH01000008|CTGAGTCCTATTTAAAT...|[CTG, TGA, GAG, A...|
|ALPH01000009|CGATGTAATGGCTATGC...|[CGA, GAT, ATG, T...|
|ALPH01000010|TCTCACTAGAAGAAAAT...|[TCT, CTC, TCA, C...|
|ALPH01000011|GTTTTTATCAGTAGCTT...|[GTT, TTT, TTT, T...|
|ALPH01000012|AGGGTGTCGGTTAAAAG...|[AGG, GGG, GGT, G...|
|ALPH01000013|TTTTCATCTAATAAGTA...|[TTT, TTT, TTC, T...|
|ALPH01000014|AATGTTGTGAGCTTTAA...|[AAT, ATG, TGT, G...|
|ALPH01000015|ACTGCAGCATTATTTAT

inspeção do daframe

Para validação, podemos obter estatísticas básicas dso kmers obtidos. Para isso vamos contar o número de kmers por ID de sequência e obter um describe da coluna

### K-mers Tranformer

Desde que o o tamanho dos *k-mers* é um parâmetro passívelde otimização, Devemos criar um estimator que possa ser passdo para a MLlib e fazer parte do parameter sewwp

In [25]:
from pyspark.ml import Estimator
from pyspark.ml.evaluation import Evaluator
from pyspark.ml import Transformer
import uuid

from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable  

In [26]:
class KMersTranformer(
        Transformer, HasInputCol, HasOutputCol,
        # Credits https://stackoverflow.com/a/52467470
        # by https://stackoverflow.com/users/234944/benjamin-manns
        DefaultParamsReadable, DefaultParamsWritable):
    
    def __init__(self, inputCol="seq", outputCol="kmers", kmer_sz=3):

        super(KMersTranformer, self).__init__()
        self.kmer_sz = Param(self, "kmer_sz", 3)
        self.outputCol = Param(self, "outputCol", "kmers")
        self.inputCol = Param(self, "inputCol", "seq")
        
        self._setDefault(kmer_sz=kmer_sz)
        self._setDefault(outputCol=outputCol)
        self._setDefault(inputCol=inputCol)
    
    def setParams(self, inputCol=None, outputCol=None, kmer_sz=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)
    
    def setKmers_sz(self, value):
        return self._set(kmer_sz=value)
    
    def getKmers_sz(self):
        return self.getOrDefault(self.kmer_sz)
    
    def setInputCol(self, value):
        return self._set(inputCol=value)

    def getInputCol(self):
        return self.getOrDefault(self.inputCol)
    
    # Required in Spark >= 3.0
    def setOutputCol(self, value):
        return self._set(outputCol=value)
    
    def getOutputCol(self):
        return self.getOrDefault(self.outputCol)
    
    def transform(self, dataset, params=None):
#         self.set("Kmers_sz", params["Kmers_sz"])
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        kmers_sz = self.getKmers_sz()

        Seq2kmerTy = T.ArrayType(T.StringType())
        def seq2kmer(seq_, K):
            value = seq_.strip()
            num_kmers = len(value) - K + 1
            kmers_list = [value[n:K+n] for n in range(0, num_kmers)]            
            return kmers_list

        return dataset.withColumn(out_col, 
                                  udf(lambda r: seq2kmer(r,kmers_sz), Seq2kmerTy)(in_col))
#                                   udf(seq2kmer, kmers_sz, Seq2kmerTy)(in_col))



In [27]:
kmers_transformer = KMersTranformer(kmer_sz=10)

In [28]:
fasta_kmers_df = kmers_transformer.transform(fasta_per_seq_df)

In [29]:
fasta_kmers_df.show()

+------------+--------------------+--------------------+
|       seqID|                 seq|               kmers|
+------------+--------------------+--------------------+
|ALPH01000001|TCTCCCAGCACTTAGGC...|[TCTCCCAGCA, CTCC...|
|ALPH01000002|CCTTGCTTATTTAGAAA...|[CCTTGCTTAT, CTTG...|
|ALPH01000003|ATTCTTCTTCATCATCC...|[ATTCTTCTTC, TTCT...|
|ALPH01000004|AATATCATTTCTTACTT...|[AATATCATTT, ATAT...|
|ALPH01000005|AACTTTTAATTGGCAAA...|[AACTTTTAAT, ACTT...|
|ALPH01000006|CCACTACTAACAATTTC...|[CCACTACTAA, CACT...|
|ALPH01000007|CTTGGCTTGTTTTTATC...|[CTTGGCTTGT, TTGG...|
|ALPH01000008|CTGAGTCCTATTTAAAT...|[CTGAGTCCTA, TGAG...|
|ALPH01000009|CGATGTAATGGCTATGC...|[CGATGTAATG, GATG...|
|ALPH01000010|TCTCACTAGAAGAAAAT...|[TCTCACTAGA, CTCA...|
|ALPH01000011|GTTTTTATCAGTAGCTT...|[GTTTTTATCA, TTTT...|
|ALPH01000012|AGGGTGTCGGTTAAAAG...|[AGGGTGTCGG, GGGT...|
|ALPH01000013|TTTTCATCTAATAAGTA...|[TTTTCATCTA, TTTC...|
|ALPH01000014|AATGTTGTGAGCTTTAA...|[AATGTTGTGA, ATGT...|
|ALPH01000015|ACTGCAGCATTATTTAT

In [30]:
n_kmers_df = fasta_kmers_df\
                    .withColumn("n_kmers", size(col("kmers")))\
                    .select("n_kmers")\

In [31]:
kmers_pofile_df = fasta_kmers_df.select("seqID","kmers")

### Extração de features

O número de K que defie o tamanho dos k-mers define um espaço de features de dimensão $4^K$, para codificar essas features podemos usar a classe ```CountVectorizer```. Essa codificação atribui ordinais a cada kmer único e cria duas listas para representar a presença e o frequência absoluta dos mesmos

In [32]:
from pyspark.ml.feature import CountVectorizer

In [33]:
kmers_pofile_df.printSchema()

root
 |-- seqID: string (nullable = true)
 |-- kmers: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [34]:
%%time
cv = CountVectorizer(inputCol="kmers", outputCol="features")

model = cv.fit(kmers_pofile_df)

features_df = model.transform(kmers_pofile_df)

CPU times: user 9.84 ms, sys: 1.15 ms, total: 11 ms
Wall time: 28.9 s


In [35]:
## conferir resultado temporário
features_df.select("seqID","features").toPandas().to_csv('features.csv')

In [36]:
%%time
unique_features_count = features_df.select("features").distinct().count()
print("Número de features únicas ",unique_features_count )

Número de features únicas  1864
CPU times: user 14.6 ms, sys: 9.45 ms, total: 24.1 ms
Wall time: 22.1 s


In [37]:
print("%d das %d sequências tem features únicas" % (unique_features_count, num_ids))

1864 das 1864 sequências tem features únicas


In [38]:
features_df.printSchema()

root
 |-- seqID: string (nullable = true)
 |-- kmers: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)



In [39]:
features_df.show()

+------------+--------------------+--------------------+
|       seqID|               kmers|            features|
+------------+--------------------+--------------------+
|ALPH01000001|[TCTCCCAGCA, CTCC...|(262144,[18,35,46...|
|ALPH01000002|[CCTTGCTTAT, CTTG...|(262144,[38,260,3...|
|ALPH01000003|[ATTCTTCTTC, TTCT...|(262144,[176,349,...|
|ALPH01000004|[AATATCATTT, ATAT...|(262144,[19,20,39...|
|ALPH01000005|[AACTTTTAAT, ACTT...|(262144,[1385,184...|
|ALPH01000006|[CCACTACTAA, CACT...|(262144,[1,4,6,8,...|
|ALPH01000007|[CTTGGCTTGT, TTGG...|(262144,[71,82,13...|
|ALPH01000008|[CTGAGTCCTA, TGAG...|(262144,[43,101,1...|
|ALPH01000009|[CGATGTAATG, GATG...|(262144,[32,80,84...|
|ALPH01000010|[TCTCACTAGA, CTCA...|(262144,[5,10,39,...|
|ALPH01000011|[GTTTTTATCA, TTTT...|(262144,[4,8,46,8...|
|ALPH01000012|[AGGGTGTCGG, GGGT...|(262144,[16,20,29...|
|ALPH01000013|[TTTTCATCTA, TTTC...|(262144,[4,22,25,...|
|ALPH01000014|[AATGTTGTGA, ATGT...|(262144,[4,6,8,9,...|
|ALPH01000015|[ACTGCAGCAT, CTGC

salva arquivo temporário com os kmers e suas frequências

In [40]:
def sparseVectorToColumsn(seqID, vector):
    global vocab
    
    vector_as_dict = {vocab[k]:str(v) for k,v in zip(vector.indices, vector.values)}
    vector_as_dict["seqID"] = seqID
    vector_as_row = Row(**vector_as_dict)
    return vector_as_row

In [41]:
pivoted_kmers_rdd = features_df.rdd\
            .map(lambda r: sparseVectorToColumsn(r.seqID, r.features))

In [42]:
# pivoted_kmers_rdd.saveAsTextFile(str(OUTPUT_DIR_PATH.joinpath("kmers_freq")))

## Clustering

Para o ajuste dos hiperparâmetros da clusterização devemos fazer um parameter sweep para achar o número ideal de clusters. A avaliação da qualidade do cluster é dada pela [Métreica de Silhouette](https://spark.apache.org/docs/2.3.1/api/java/org/apache/spark/ml/evaluation/ClusteringEvaluator.html)

In [43]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [44]:
class CachingTranformer(
        Transformer, HasInputCol, HasOutputCol,
        # Credits https://stackoverflow.com/a/52467470
        # by https://stackoverflow.com/users/234944/benjamin-manns
        DefaultParamsReadable, DefaultParamsWritable):
    
    def __init__(self):

        super(CachingTranformer, self).__init__()
        
    def setParams(self, inputCol=None, outputCol=None, kmer_sz=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)
    
    def transform(self, dataset, params=None):
        dataset = dataset.cache()
        return dataset

In [46]:
kmers_calc = KMersTranformer()
cv = CountVectorizer(inputCol="kmers", outputCol="features")
ct = CachingTranformer()
bkm = BisectingKMeans(distanceMeasure='cosine')

clustering_pipeline = Pipeline(stages=[kmers_calc, ct, cv, ct, bkm])

In [47]:
%%time
paramGrid = ParamGridBuilder() \
    .addGrid(kmers_calc.kmer_sz, [3, 7, 12, 15])\
    .addGrid(bkm.k, [5, 7, 10, 15, 20, 27, 36, 50]) \
    .build()

cluster_eval = ClusteringEvaluator(distanceMeasure='cosine')

crossval = CrossValidator(estimator=clustering_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=cluster_eval,
                          numFolds=5)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
# cvModel= crossval.fit(features_df)
cvModel= crossval.fit(fasta_per_seq_df)

Py4JJavaError: An error occurred while calling o5123.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 7105.0 failed 1 times, most recent failure: Lost task 2.0 in stage 7105.0 (TID 7355, localhost, executor driver): java.lang.OutOfMemoryError: Java heap space
	at java.lang.reflect.Array.newInstance(Array.java:75)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1883)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1529)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2231)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2155)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2231)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2155)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2231)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2155)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1919)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1529)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2231)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2155)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1919)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1529)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2231)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2155)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1919)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1529)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2231)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2155)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2788)
	at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2788)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.collect(Dataset.scala:2788)
	at org.apache.spark.ml.clustering.ClusteringSummary.clusterSizes$lzycompute(ClusteringSummary.scala:51)
	at org.apache.spark.ml.clustering.ClusteringSummary.clusterSizes(ClusteringSummary.scala:49)
	at org.apache.spark.ml.clustering.BisectingKMeans$$anonfun$fit$1.apply(BisectingKMeans.scala:276)
	at org.apache.spark.ml.clustering.BisectingKMeans$$anonfun$fit$1.apply(BisectingKMeans.scala:257)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.clustering.BisectingKMeans.fit(BisectingKMeans.scala:257)
	at org.apache.spark.ml.clustering.BisectingKMeans.fit(BisectingKMeans.scala:213)
	at sun.reflect.GeneratedMethodAccessor134.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at java.lang.reflect.Array.newInstance(Array.java:75)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1883)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1529)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2231)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2155)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2231)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2155)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2231)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2155)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1919)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1529)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2231)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2155)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1919)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1529)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2231)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2155)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1919)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1529)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2231)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2155)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2013)


In [48]:
cluster_df = cvModel.transform(fasta_per_seq_df)

NameError: name 'cvModel' is not defined

In [None]:
cluster_eval.evaluate(cluster_df)

In [None]:
best_reg_param = cvModel.getEstimatorParamMaps()

In [None]:
best_reg_param

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 50998)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/thiago/anaconda3/envs/bio_env/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/thiago/anaconda3/envs/bio_env/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/home/thiago/anaconda3/envs/bio_env/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
Traceback (most recent call last):
ERROR:root:Exceptio

## Visualizing results

In [None]:
import seaborn as sns; sns.set(color_codes=True)

In [None]:
cluster_df_pd = cluster_df.toPandas()

In [None]:
cluster_df_pd.columns

In [None]:
cluster_df_pd.reset_index(inplace=True)  

In [None]:
len(cluster_df_pd)

In [59]:
feature_0 = cluster_df_pd.features.apply(lambda x: x[0])
feature_1 = cluster_df_pd.features.apply(lambda x: x[1])
clusters = cluster_df_pd.prediction

In [60]:
len(clusters)

1864

In [61]:
import matplotlib.pyplot as plt
aimport pandas as pd
import numpy as np

SyntaxError: invalid syntax (<ipython-input-61-f2cf5c9b3ec0>, line 2)

In [None]:
cluster_df_pd.prediction.hist()
plt.xlabel("cluster")
plt.ylabel("sequence freq")

In [None]:
sns.set(style="ticks")

# Initialize the figure with a logarithmic x axis
f, ax = plt.subplots(figsize=(7, 6))
ax.set_xscale("log")

# Plot the orbital period with horizontal boxes
sns.boxplot(x=feature_0, y=clusters,
            palette="vlag")

In [None]:
sns.jointplot(x=feature_0, y=feature_1);

## Saving result

In [None]:
cluster_pandasDF = cluster_df.toPandas()

In [None]:
cluster_pandasDF.to_csv(str(OUTPUT_DIR_PATH.joinpath("result_cluster.csv")))

In [None]:
import pickle

In [None]:
# with open(str(OUTPUT_DIR_PATH.joinpath("cluster_model")),"w") as f:
#     pickle.dumps(cvModel, f)

In [35]:
cluster_df.show()

NameError: name 'cluster_df' is not defined

In [None]:
cluster_df.select("prediction").describe().show()