In [1]:
# Importieren der notwendigen Bibliotheken
import sparknlp
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentimentDLModel, Tokenizer, Normalizer, StopWordsCleaner
from pyspark.ml import Pipeline
import pandas as pd

from dotenv import load_dotenv
import os

# Starten einer Spark-Session
spark = SparkSession.builder \
    .appName("SentimentAnalysisExample") \
    .getOrCreate()

# Initialisieren von Spark NLP
sparkNLP_version = sparknlp.version()
print(f"Spark NLP version: {sparkNLP_version}")


Spark NLP version: 5.5.1


In [2]:
# Review-Daten in ein Pandas DataFrame einfügen
data = {
    "image": ["https://images-na.ssl-images-amazon.com/images/I/71eG75FTJJL._SY88.jpg"],
    "overall": [5.0],
    "vote": ["2"],
    "verified": [True],
    "reviewTime": ["01 1, 2018"],
    "reviewerID": ["AUI6WTTT0QZYS"],
    "asin": ["5120053084"],
    "style": [{"Size:": "Large", "Color:": "Charcoal"}],
    "reviewerName": ["Abbey"],
    "reviewText": ["I now have 4 of the 5 available colors of this shirt... "],
    "summary": ["Comfy, flattering, discreet--highly recommended!"],
    "unixReviewTime": [1514764800]
}

load_dotenv("./env.env")
path = os.getenv("DATA_PATH")
reviews_df = spark.read.json(path)
valid_reviews_df = reviews_df.filter(reviews_df.overall.isNotNull())

# Umwandeln des Pandas DataFrame in ein Spark DataFrame
df = valid_reviews_df#spark.createDataFrame(pd.DataFrame(data))

# Anzeigen der Struktur
df.printSchema()


root
 |-- asin: string (nullable = true)
 |-- image: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- style: struct (nullable = true)
 |    |-- Color Name:: string (nullable = true)
 |    |-- Color:: string (nullable = true)
 |    |-- Configuration:: string (nullable = true)
 |    |-- Content:: string (nullable = true)
 |    |-- Denomination:: string (nullable = true)
 |    |-- Edition:: string (nullable = true)
 |    |-- Format:: string (nullable = true)
 |    |-- Item Package Quantity:: string (nullable = true)
 |    |-- Length:: string (nullable = true)
 |    |-- Offer Type:: string (nullable = true)
 |    |-- Package Quantity:: string (nullable = true)
 |    |-- Package Type:: string (nullable = true)
 |    |-- Pattern:: string (nullable = true)
 | 

In [3]:
# Erstellen eines DocumentAssemblers (um Text in ein annotiertes Format zu bringen)
document_assembler = DocumentAssembler() \
    .setInputCol("reviewText") \
    .setOutputCol("document")

# Tokenizer, Normalizer und Stopwords Cleaner
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("tokens")

normalizer = Normalizer() \
    .setInputCols(["tokens"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(["normalized"]) \
    .setOutputCol("clean_tokens")

# Sentiment-Analyse-Modell
sentiment_model = SentimentDLModel.pretrained("sentimentdl_use_twitter", "en") \
    .setInputCols(["clean_tokens", "document"]) \
    .setOutputCol("sentiment")

# Aufbau der Pipeline
pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    sentiment_model
])

# Anpassen der Pipeline an die Daten
model = pipeline.fit(df)

# Vorhersagen auf den Datensatz anwenden
result = model.transform(df)

# Ausgabe der Sentiment-Ergebnisse
result.select("reviewText", "sentiment.result").show(truncate=False)


TypeError: 'JavaPackage' object is not callable

In [None]:
# Konvertieren der Spark DataFrame zu Pandas DataFrame
result_df = result.select("reviewText", "sentiment.result").toPandas()

# Ausgabe der Pandas DataFrame
print(result_df)
