In [1]:
# Importieren der notwendigen Bibliotheken
import sparknlp
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentimentDLModel, Tokenizer, Normalizer, StopWordsCleaner
from pyspark.ml import Pipeline
import pandas as pd

from dotenv import load_dotenv
import os
import sparknlp
spark = sparknlp.start()
# Starten einer Spark-Session
SparkSession.builder \
    .appName("Spark NLP") \
    .master("local[*]") \
    .config("spark.driver.memory", "16G") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "2000M") \
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1") \
    .getOrCreate()


In [4]:
# Review-Daten in ein Pandas DataFrame einfügen
# data = {
#     "image": ["https://images-na.ssl-images-amazon.com/images/I/71eG75FTJJL._SY88.jpg"],
#     "overall": [5.0],
#     "vote": ["2"],
#     "verified": [True],
#     "reviewTime": ["01 1, 2018"],
#     "reviewerID": ["AUI6WTTT0QZYS"],
#     "asin": ["5120053084"],
#     "style": [{"Size:": "Large", "Color:": "Charcoal"}],
#     "reviewerName": ["Abbey"],
#     "reviewText": ["I now have 4 of the 5 available colors of this shirt... "],
#     "summary": ["Comfy, flattering, discreet--highly recommended!"],
#     "unixReviewTime": [1514764800]
# }

load_dotenv("./env.env")
path = os.getenv("DATA_PATH")
reviews_df = spark.read.json(path)
valid_reviews_df = reviews_df.filter(reviews_df.overall.isNotNull())

# Umwandeln des Pandas DataFrame in ein Spark DataFrame
df = valid_reviews_df

# Anzeigen der Struktur
df.printSchema()


root
 |-- asin: string (nullable = true)
 |-- image: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- style: struct (nullable = true)
 |    |-- Color Name:: string (nullable = true)
 |    |-- Color:: string (nullable = true)
 |    |-- Configuration:: string (nullable = true)
 |    |-- Content:: string (nullable = true)
 |    |-- Denomination:: string (nullable = true)
 |    |-- Edition:: string (nullable = true)
 |    |-- Format:: string (nullable = true)
 |    |-- Item Package Quantity:: string (nullable = true)
 |    |-- Length:: string (nullable = true)
 |    |-- Offer Type:: string (nullable = true)
 |    |-- Package Quantity:: string (nullable = true)
 |    |-- Package Type:: string (nullable = true)
 |    |-- Pattern:: string (nullable = true)
 | 

In [13]:
from pyspark.sql import SparkSession
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql.functions import col

document_assembler = DocumentAssembler() \
    .setInputCol("reviewText") \
    .setOutputCol("document")

sentence_embeddings = UniversalSentenceEncoder.pretrained("tfhub_use", "en") \
    .setInputCols(["document"]) \
    .setOutputCol("sentence_embeddings")

sentiment_detector = SentimentDLModel.pretrained("sentimentdl_use_imdb", "en") \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("sentiment")

# Pipeline erstellen
pipeline = Pipeline(stages=[
    document_assembler,
    sentence_embeddings,
    sentiment_detector
])

# Pipeline auf den Datensatz anwenden
model = pipeline.fit(df)
result = model.transform(df)


tfhub_use download started this may take some time.
Approximate size to download 923,7 MB
[OK!]
sentimentdl_use_imdb download started this may take some time.
Approximate size to download 12 MB
[OK!]
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Ergebnisse anzeigen (Spalten "reviewText" und "sentiment.result" gruppiert)
result.select(
    col("reviewText").alias("Review Text"),
    col("sentiment.result").getItem(0).alias("Sentiment Result")
).show(truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------