# Secondary Structure Elements Word2Vec Encoder Demo

This demo creates a dataset by extracting secondary structure elements of type "H", then encodes each sequence as a Word2Vec feature vector built from overlapping n-grams.

## Imports

In [6]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.ml import proteinSequenceEncoder
from mmtfPyspark.mappers import structureToPolymerChains
from mmtfPyspark.filters import containsLProteinChain
from mmtfPyspark.datasets import secondaryStructureElementExtractor
from mmtfPyspark.webFilters import Pisces
from mmtfPyspark.io import mmtfReader

## Configure Spark Context

In [2]:
# Run Spark locally ("local[*]" asks for one worker thread per available core).
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("SecondaryStructureElementsWord2VecEncoderDemo")
)

# getOrCreate reuses an already-running context, so re-executing this cell in
# the same kernel does not fail with "Cannot run multiple SparkContexts at once".
# On a fresh kernel it creates the context from `conf`, exactly as before.
sc = SparkContext.getOrCreate(conf=conf)

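## Read MMTF Hadoop sequence file and sample L-protein chains

Split each structure into polymer chains, keep the chains that contain L-protein, and draw a random sample of them. Note: the original description promises a non-redundant set (<=20% seq. identity) of L-protein chains, but the code below applies no sequence-identity (Pisces) filter, so the sampled chains are not guaranteed to be non-redundant.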

In [3]:
# Location of the MMTF Hadoop sequence file (relative to this notebook).
path = "../../resources/mmtf_reduced_sample/"
# Fraction of chains to keep and the seed that makes the sample repeatable.
fraction = 0.05
seed = 123

# Read structures, split them into polymer chains, keep chains containing
# L-protein, then sample without replacement.
# NOTE(review): the (False, True) arguments to structureToPolymerChains are
# passed positionally; their meaning is not visible here - confirm against
# the mmtfPyspark documentation.
pdb = (
    mmtfReader
    .read_sequence_file(path, sc)
    .flatMap(structureToPolymerChains(False, True))
    .filter(containsLProteinChain())
    .sample(False, fraction, seed)
)

## Extract Element "H" from Secondary Structure

In [7]:
# Secondary structure element code to extract.
label = "H"

# Cache the extracted dataset: it is counted, displayed, and later encoded.
data = secondaryStructureElementExtractor.get_dataset(pdb, label)
data = data.cache()

row_count = data.count()
print(f"original data   : {row_count}")

# Show the first 10 rows without truncating long sequences.
data.show(n=10, truncate=False)

original data   : 3225
+-----------------------+-----+
|sequence               |label|
+-----------------------+-----+
|ACAGV                  |H    |
|GIGLHLAVRLA            |H    |
|RLWEAARAL              |H    |
|SKSVAAARE              |H    |
|EDAVASVLDVN            |H    |
|GTVRMLQAFLPDMKRR       |H    |
|VYCASKFALEGLCESLAVLLLPF|H    |
|HTFHRFYQYLALSKQVFREA   |H    |
|EEVAEVFLTALR           |H    |
|LPLLRMRL               |H    |
+-----------------------+-----+
only showing top 10 rows



## Word2Vec-encoded feature vector

In [10]:
# Encoding parameters.
segmentLength = 11
n = 2  # n-gram length (the output shows two-letter n-grams such as "AC", "CA")
# Integer division: in Python 3, `/` yields the float 5.0, whereas the window
# size is an integer count, so use `//` to get the int 5.
windowSize = (segmentLength - 1) // 2
vectorSize = 50  # dimensionality passed to the Word2Vec encoder

encoder = proteinSequenceEncoder(data)
# NOTE(review): `data` is rebound to the encoded dataset here, so re-running
# this cell without re-running the extraction cell would feed already-encoded
# data back into the encoder.
data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize)

data.show(5)

+-----------+-----+--------------------+--------------------+
|   sequence|label|               ngram|            features|
+-----------+-----+--------------------+--------------------+
|      ACAGV|    H|    [AC, CA, AG, GV]|[-0.5438956143334...|
|GIGLHLAVRLA|    H|[GI, IG, GL, LH, ...|[0.14872017204761...|
|  RLWEAARAL|    H|[RL, LW, WE, EA, ...|[0.28531974926590...|
|  SKSVAAARE|    H|[SK, KS, SV, VA, ...|[0.22657969500869...|
|EDAVASVLDVN|    H|[ED, DA, AV, VA, ...|[-0.4830648854374...|
+-----------+-----+--------------------+--------------------+
only showing top 5 rows



## Terminate Spark Context

In [11]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()