In [1]:
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType,FloatType
import pyspark.sql.functions as f
import pandas as pd
import os

In [2]:
class PSTool:
    """
    Small helper around PySpark: creates a session and loads delimited files.

    Instantiating the class makes sure an ``output`` folder exists in the
    current working directory.
    """

    def __init__(self):
        print('Creating output folder')
        # Only create the folder when nothing with that name exists yet;
        # exist_ok guards against it appearing between the check and the call.
        if not os.path.exists('output'):
            os.makedirs('output', exist_ok=True)

    def pyspark_session(self, host_location):
        """
        Creates and returns spark session object

        Parameters
        ----------
        host_location : str
            Spark master URL passed to the session builder, e.g. 'local[16]'.

        Returns
        -------
        SparkSession
            The active session. If a session already exists in this process it
            is reused, so re-running the calling cell does not fail with a
            "multiple SparkContexts" error.
        """
        print('Starting session')
        # The builder creates the underlying SparkContext itself, so no
        # explicit SparkContext import is required.
        spark = SparkSession.builder.master(host_location).getOrCreate()
        return spark

    def file_loader(self, path, delim, spark_obj, schema):
        """
        Reads a header-less delimited text file into a Spark DataFrame.

        Parameters
        ----------
        path : str
            Location of the file to read.
        delim : str
            Field delimiter, e.g. a tab character.
        spark_obj : SparkSession
            Session whose reader is used to load the file.
        schema : StructType
            Column names and types applied to the file.

        Returns
        -------
        DataFrame
            The loaded data.
        """
        print('Loading in file')
        # header=False: the first line of the file is treated as data.
        reader = spark_obj.read.options(delimiter=delim).option("header", "False")
        data = reader.csv(path, schema=schema)

        print('File loaded')
        return data

    def get_questions(self, df):
        """Placeholder; not implemented yet and currently returns None."""
        pass

if __name__ == "__main__":
    # Instantiate the helper and start a local Spark session.
    pstool = PSTool()
    spk = pstool.pyspark_session('local[16]')

    # Input file. The full dataset lives at
    # '/data/dataprocessing/interproscan/all_bacilli.tsv'; the 1000-row subset
    # below is used instead.
    path = 'all_bacilli_subset_1000.tsv'

    # Column layout of the input table as (name, Spark type) pairs, in file
    # order. Every column is nullable.
    column_layout = [
        ("Protein_accession", StringType),
        ("Sequence_MD5_digest", StringType),
        ("Sequence_length", IntegerType),
        ("Analysis", StringType),
        ("Signature_accession", StringType),
        ("Signature_description", StringType),
        ("Start_location", IntegerType),
        ("Stop_location", IntegerType),
        ("Score", FloatType),
        ("Status", StringType),
        ("Date", StringType),
        ("InterPro_annotations_accession", StringType),
        ("InterPro_annotations_description", StringType),
        ("GO_annotations", StringType),
        ("Pathways_annotations", StringType),
    ]
    schema = StructType(
        [StructField(col_name, col_type(), True) for col_name, col_type in column_layout]
    )

    # Load the tab-separated file with the schema above.
    df = pstool.file_loader(path, '\t', spk, schema)

    # Disabled steps, kept for reference:
    #     pstool.get_questions(df)
    #     print('Closing spark session')
    #     spk.sparkContext.stop()
    #     df.printSchema()  # Shows column names and some info


Creating output folder
Starting session
Loading in file
File loaded


In [3]:
# Sanity check on the loaded frame: number of columns, then number of rows.
raw_column_total = len(df.columns)
print('len:', raw_column_total)
raw_row_total = df.count()
print('count:', raw_row_total)

len: 15
count: 1000


In [4]:
# Display the DataFrame's repr, which lists each column name with its type.
df

DataFrame[Protein_accession: string, Sequence_MD5_digest: string, Sequence_length: int, Analysis: string, Signature_accession: string, Signature_description: string, Start_location: int, Stop_location: int, Score: float, Status: string, Date: string, InterPro_annotations_accession: string, InterPro_annotations_description: string, GO_annotations: string, Pathways_annotations: string]

# Data munging

First, the data has to be cleaned up:
* Remove entries without an InterPro accession number.
* Note which proteins have features spanning >90% of the protein's length ("ProtID_select"), and remove those large features.
* Select the smaller features for the proteins found above. Proteins without a large feature are not of interest.

In [7]:
# Keep only the rows whose InterPro accession is something other than the '-' placeholder.
has_interpro_accession = f.col("InterPro_annotations_accession") != '-'
IPRO_filt = df.where(has_interpro_accession)

In [8]:
# Same sanity check after filtering: number of columns, then number of rows.
ipro_column_total = len(IPRO_filt.columns)
print('len:', ipro_column_total)
ipro_row_total = IPRO_filt.count()
print('count:', ipro_row_total)

len: 15
count: 519


In [14]:
# Add 'perc': the fraction of the protein covered by each feature,
# |start - stop| / sequence length, and sort ascending on it.
# f.abs is called explicitly because the bare `abs` only works on a Column
# while the star import shadows the builtin; columns are referenced by name so
# the expression belongs to IPRO_filt rather than to the unfiltered df.
df_sizes = IPRO_filt.withColumn(
    'perc',
    f.abs(f.col('Start_location') - f.col('Stop_location')) / f.col('Sequence_length'),
).sort('perc')

In [21]:
# Preview the first five values of every column, one column at a time.
# show() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" after each table).
for column_name in df_sizes.columns:
    df_sizes.select(column_name).show(5)

+--------------------+
|   Protein_accession|
+--------------------+
|gi|29893849|gb|AA...|
|gi|29899051|gb|AA...|
|gi|29893849|gb|AA...|
|gi|29893849|gb|AA...|
|gi|29893849|gb|AA...|
+--------------------+
only showing top 5 rows

None
+--------------------+
| Sequence_MD5_digest|
+--------------------+
|e54a5c888b8e04a19...|
|9d151ff160d221b4c...|
|e54a5c888b8e04a19...|
|e54a5c888b8e04a19...|
|e54a5c888b8e04a19...|
+--------------------+
only showing top 5 rows

None
+---------------+
|Sequence_length|
+---------------+
|            660|
|            613|
|            660|
|            660|
|            660|
+---------------+
only showing top 5 rows

None
+---------------+
|       Analysis|
+---------------+
|ProSitePatterns|
|         PRINTS|
|         PRINTS|
|         PRINTS|
|         PRINTS|
+---------------+
only showing top 5 rows

None
+-------------------+
|Signature_accession|
+-------------------+
|            PS00178|
|            PR00344|
|            PR01041|
|         