In [0]:
# Toy sales records, one tuple per row: (product, date as an ISO string, sales).
data = [
    ("Mobile", "2024-01-01", 1000),
    ("Laptop", "2024-01-01", 2000),
    ("Mobile", "2024-01-02", 1500),
    ("Laptop", "2024-01-02", 1800),
]

# Only column names are given; Spark infers each column's type from the tuples.
df = spark.createDataFrame(data, schema=["product", "date", "sales"])

# Persist the rows as a managed table; "overwrite" replaces whatever the
# table held before, so re-running this cell does not duplicate rows.
(
    df.write
    .mode("overwrite")
    .saveAsTable("default.sales_data")
)


In [0]:
# Show the sales DataFrame built in the previous cell as the cell's output.
# NOTE(review): `DataFrame.display()` is presumably supplied by the notebook
# runtime (e.g. Databricks) rather than by PySpark itself — confirm the target
# environment before running this elsewhere.
df.display()

In [0]:
import mlflow
from sklearn.linear_model import LinearRegression
import pandas as pd

# Collect the (tiny) Spark DataFrame to the driver as a pandas DataFrame.
pdf = df.toPandas()

# Spark does not guarantee row order without an explicit sort, while the
# labels below are matched to rows purely by position. Pin a deterministic
# order (date ascending, then product descending) so that the rows are always
# Mobile/01-01, Laptop/01-01, Mobile/01-02, Laptop/01-02 — the order the
# hard-coded labels were written against.
pdf = pdf.sort_values(
    ["date", "product"], ascending=[True, False]
).reset_index(drop=True)

# Single feature: the sales amount (kept 2-D, as scikit-learn expects).
X = pdf[["sales"]]

# Hand-written target, one label per row in the order fixed above.
y = [1, 0, 1, 0]
assert len(y) == len(X), "label list must have exactly one entry per row"

# NOTE(review): the target is 0/1 but the estimator is a plain linear
# regression — confirm whether a classifier was intended.
with mlflow.start_run():
    model = LinearRegression()
    model.fit(X, y)
    # Only the model type is recorded on the run.
    mlflow.log_param("model", "LinearRegression")


In [0]:
from pyspark.sql.functions import when, col, lower

# Sample free-text reviews; each row is a one-element tuple (single column).
reviews = [
    ("This product is very good",),
    ("Very bad experience",),
    ("Excellent quality",),
    ("Worst service",)
]

df_reviews = spark.createDataFrame(reviews, ["review"])

# Lower-case the text once so keyword matching is case-insensitive.
# Previously "good" was matched in lowercase only and "Excellent" in
# capitalised form only, so e.g. "Good value" or "excellent quality" were
# labelled "Negative".
review_text = lower(col("review"))

# Naive keyword rule: a review containing "good" or "excellent" is Positive,
# everything else (including a null review, where neither condition is true)
# is Negative. Substring matching means negations such as "not good" are
# still labelled Positive.
df_reviews = df_reviews.withColumn(
    "sentiment",
    when(review_text.contains("good"), "Positive")
    .when(review_text.contains("excellent"), "Positive")
    .otherwise("Negative")
)

df_reviews.show()


In [0]:
# Total sales per product, printed as a text table in the cell output.
(
    df.groupBy("product")
    .sum("sales")
    .show()
)
