In [None]:
import sys

# Run pip and the corpus downloader through the interpreter that backs this
# kernel. A bare `!pip` / `!python` is resolved through PATH and may target a
# different environment than the one the notebook actually imports from.
!"{sys.executable}" -m pip install textblob
!"{sys.executable}" -m textblob.download_corpora lite

In [None]:
import json

import pyspark
import pyspark.sql
from pyspark.ml import Pipeline

from modules.posts import (
    SentenceTransformer, PostTransformer, TranslateTransformer,
    SpeechPartsTransformer, SentimentTransformer
)
from modules.features import (
    FeaturesTransformer
)


# Build (or reuse) the local Spark session. getOrCreate() keeps this cell safe
# to re-run in a live kernel: constructing a second SparkContext directly
# fails because only one context may be active at a time.
sess = (
    pyspark.sql.SparkSession.builder
    .master('local[*]')
    .appName('PipelineFlow')
    .getOrCreate()
)
# The transform_* helpers below take the underlying SparkContext.
sc = sess.sparkContext
    

In [None]:

def transform_features(spark_context, path='data/featuresample.json', features=None):
    """Run the feature-extraction pipeline over whole-text input files.

    Args:
        spark_context: Context used to read the input; it must provide
            ``wholeTextFiles``.
        path: Location handed to ``wholeTextFiles``. Defaults to the sample
            file, so existing one-argument calls behave as before.
        features: Iterable of feature names passed to ``FeaturesTransformer``.
            Defaults to ``["leaf", "has-attribute-class"]``.

    Returns:
        The result of fitting the single-stage pipeline on the input frame
        and transforming that same frame. The transformer is configured to
        read the ``content`` column and write the ``features`` column.
    """
    if features is None:
        features = ["leaf", "has-attribute-class"]

    # wholeTextFiles already yields (path, content) pairs, so the columns can
    # be named directly without an intermediate identity map.
    df = spark_context.wholeTextFiles(path).toDF(['file', 'content'])

    feature_transformer = FeaturesTransformer(features=list(features))
    feature_transformer.setInputCol('content').setOutputCol('features')

    pipeline = Pipeline(stages=[feature_transformer])
    return pipeline.fit(df).transform(df)


In [None]:

def transform_posts(spark_context, path='data/posts/*'):
    """Run the post-processing pipeline over JSON post files.

    Args:
        spark_context: Context used to read the input; it must provide
            ``wholeTextFiles``.
        path: Location or glob handed to ``wholeTextFiles``. Defaults to
            ``'data/posts/*'``, so existing one-argument calls behave as
            before.

    Returns:
        The result of fitting the five-stage pipeline on the input frame and
        transforming that same frame. The stages are configured to write the
        columns ``posts``, ``translated``, ``sentences``, ``speechParts`` and
        ``sentiments``.
    """
    # Every file is parsed as a single JSON document and paired with its path.
    raw = spark_context.wholeTextFiles(path)
    df = raw.map(lambda pair: (pair[0], json.loads(pair[1]))).toDF(['file', 'content'])

    # (transformer class, input column, output column), in execution order.
    # The last three stages all read from 'translated'.
    stage_specs = [
        (PostTransformer, 'content', 'posts'),
        (TranslateTransformer, 'posts', 'translated'),
        (SentenceTransformer, 'translated', 'sentences'),
        (SpeechPartsTransformer, 'translated', 'speechParts'),
        (SentimentTransformer, 'translated', 'sentiments'),
    ]
    stages = [
        transformer_cls().setInputCol(source).setOutputCol(target)
        for transformer_cls, source, target in stage_specs
    ]

    pipeline = Pipeline(stages=stages)
    return pipeline.fit(df).transform(df)


In [None]:
out = transform_posts(sc)

# Every .first() call is a separate Spark action that re-evaluates the
# (uncached) pipeline, so fetch the four inspected columns of the first row
# in a single pass instead of eight.
first_row = out.select('sentences', 'translated', 'speechParts', 'sentiments').first()

# NOTE: the fixed indices assume the first row holds at least three
# sentences and three sentiments.
a = first_row.sentences[0]
b = first_row.sentences[1]
c = first_row.sentences[2]
d = first_row.translated[0]
e = first_row.speechParts
f = first_row.sentiments[0]
g = first_row.sentiments[1]
h = first_row.sentiments[2]

# One value per paragraph, separated by blank lines.
scheme = '{}\n\n{}\n\n{}\n\n{}\n\n{}\n\n{}\n\n{}\n\n{}'
print(scheme.format(a, b, c, d, e, f, g, h))

In [None]:
# Run the feature pipeline with its default input.
result = transform_features(sc)

# Show only the 'features' value of the first row.
print(result.first()['features'])