# Example: using PySpark DataFrames for data analysis

## Imports and variables

In [1]:
from pyspark import SparkConf, SparkContext                    
from mmtfPyspark.io import mmtfReader                                
from mmtfPyspark.webFilters import Pisces                        
from mmtfPyspark.datasets import groupInteractionExtractor
from mmtfPyspark.structureViewer import interaction_structure_viewer
import py3Dmol
import time
                                                               
# Create variables
APP_NAME = "MMTF_Spark"                 # name registered for this Spark application
path = "./resources/mmtf_full_sample/"  # input location handed to mmtfReader.read_sequence_file

# Configure Spark to run locally ("local[*]" = local mode, one worker thread per core)
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")

# getOrCreate() reuses a SparkContext that is already running in this kernel.
# Calling SparkContext(conf=conf) directly raises
# "Cannot run multiple SparkContexts at once" when this cell is executed twice.
sc = SparkContext.getOrCreate(conf=conf)

## Read PDB and create PISCES non-redundant set

In [2]:
# Read the structures found under `path`, then keep only the entries accepted
# by the PISCES filter (sequenceIdentity=20, resolution=2.0 -- presumably
# percent identity and Angstrom; confirm against the mmtfPyspark docs).
pdb = (
    mmtfReader.read_sequence_file(path, sc)
    .filter(Pisces(sequenceIdentity=20, resolution=2.0))
)

## Extract Zinc interactions

In [3]:
# Build an extractor for the "ZN" group with distance=3.0
# (presumably a cutoff in Angstrom -- confirm against the mmtfPyspark docs).
finder = groupInteractionExtractor("ZN", distance=3.0)

# Run the extractor over the filtered structures to obtain the interaction dataset.
interactions = finder.getDataset(pdb)

# Preview the first 10 rows of the result.
interactions.show(n=10)

+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|       1FN9|      ZN|   ZN|      Zn|   730|     CYS|   SG|       S|    50|2.3709755|
|       1FN9|      ZN|   ZN|      Zn|   730|     CYS|   SG|       S|    53|2.3940797|
|       1FN9|      ZN|   ZN|      Zn|   730|     HIS|  NE2|       N|    70|2.2196307|
|       1FN9|      ZN|   ZN|      Zn|   730|     CYS|   SG|       S|    72|2.3465357|
|       1FN9|      ZN|   ZN|      Zn|   731|     CYS|   SG|       S|   415|2.3747551|
|       1FN9|      ZN|   ZN|      Zn|   731|     CYS|   SG|       S|   418|2.3680198|
|       1FN9|      ZN|   ZN|      Zn|   731|     HIS|  NE2|       N|   435|2.1647959|
|       1FN9|      ZN|   ZN|      Zn|   731|     CYS|   SG|       S|   437|2.3763454|
|       1A73|      ZN|   ZN|      Zn|   366|     CYS| 

## Get unique PDBIds from result

In [4]:
# Get list of PDB IDs (one entry per interaction row). Only the structureId
# column is selected before collect(), so the remaining columns are not
# transferred to the driver.
pdbIds = [row.structureId for row in interactions.select("structureId").collect()]

# Get unique PDB IDs as a sorted list. Sorting makes the order reproducible
# between runs; list(set(...)) alone yields an arbitrary, run-dependent order.
unique_pdbIds = sorted(set(pdbIds))

## Visualize first hit

In [5]:
# Open the structure viewer for the structures that have zinc interactions,
# passing the unique PDB IDs and the 'ZN' group name.
interaction_structure_viewer(unique_pdbIds, 'ZN')

<function mmtfPyspark.structureViewer.interaction_structure_viewer.<locals>.view3d>

## Show top 10 interacting groups


In [6]:
# Count interactions per partner residue, most frequent first:
#   1. drop rows whose second atom's element is 'C'
#   2. group the remaining rows by residue2 and count them
#   3. sort by the count in descending order and print up to 10 rows
(
    interactions
    .filter("element2 != 'C'")
    .groupBy("residue2")
    .count()
    .sort("count", ascending=False)
    .show(10)
)

+--------+-----+
|residue2|count|
+--------+-----+
|     CYS|   19|
|     HIS|   14|
|     HOH|   10|
|     ASP|    5|
|     GLU|    5|
|     TRP|    2|
|     VAL|    2|
|      CL|    1|
+--------+-----+



In [7]:
# Shut down the SparkContext created in the first cell now that the analysis is done.
sc.stop()