In [0]:
import pandas as pd

# Small hand-entered dataset with a `views` column and a `purchases` column.
data = {
    "views": [10, 20, 30, 40, 50, 60, 70, 80],
    "purchases": [1, 2, 3, 4, 6, 7, 8, 9],
}

df = pd.DataFrame(data)

# A pandas DataFrame has no `.display()` method (calling it raises
# AttributeError). Ending the cell with the bare frame lets the notebook
# render it through its rich repr instead.
df


In [0]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


In [0]:
# Candidate regressors, keyed by the short name used for the MLflow run and
# the `model_type` parameter.
# The tree-based estimators are seeded so that repeated runs of the notebook
# fit identical models and log identical metrics (42 matches the seed used
# for the Spark random split further down).
models = {
    "linear": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "random_forest": RandomForestRegressor(n_estimators=100, random_state=42),
}


In [0]:
# Feature matrix: the list selector keeps `views` as a one-column DataFrame,
# the 2-D shape scikit-learn estimators expect for X.
X = df.loc[:, ["views"]]
# Target: a plain Series holding the `purchases` column.
y = df.loc[:, "purchases"]


In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# `random_state` pins the shuffle so the same rows land in the test set on
# every Restart & Run All; without it each run scores a different split and
# the logged metrics are not comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [0]:
import mlflow

# Fit each candidate on the training split and record it as its own MLflow
# run: one `model_type` param and one `r2_score` metric per run.
for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_model"):
        mlflow.log_param("model_type", name)

        model.fit(X_train, y_train)

        # Evaluated on the held-out split, not on the training data.
        score = model.score(X_test, y_test)
        mlflow.log_metric("r2_score", score)

        # The superscript two is written as an escape: the previous literal
        # had been corrupted into mojibake by an encoding round-trip.
        print(f"{name}: R\u00b2 = {score:.4f}")


In [0]:
# Re-score the random forest on the held-out split.
# The estimator is looked up by key instead of through the loop variable
# `model` left over from the training loop: that name is rebound to the
# Spark pipeline model later in this notebook, so relying on it makes the
# cell depend on execution order. The random forest is the last entry of
# `models`, i.e. the object `model` pointed at when the loop finished.
score = models["random_forest"].score(X_test, y_test)
score


In [0]:
# (views, purchases) rows for the Spark DataFrame, built by pairing the two
# columns positionally. Views run from 10 to 80 in steps of 10.
spark_data = list(
    zip(
        range(10, 90, 10),
        [1, 2, 3, 4, 6, 7, 8, 9],
    )
)

# Turn the row tuples into a Spark DataFrame with named columns.
# NOTE(review): `spark` is not created anywhere in the visible cells;
# presumably it is the session object supplied by the notebook runtime.
spark_df = spark.createDataFrame(spark_data, schema=["views", "purchases"])

spark_df.display()


In [0]:
from pyspark.ml.feature import VectorAssembler

# Packs the `views` column into the single vector column `features`, which
# is the column the regression stage is configured to read.
assembler = VectorAssembler(outputCol="features", inputCols=["views"])


In [0]:
from pyspark.ml.regression import LinearRegression as SparkLR
from pyspark.ml import Pipeline

# Spark linear regression: reads the assembled `features` vector and learns
# to predict the `purchases` column.
lr = SparkLR(labelCol="purchases", featuresCol="features")

# Two-stage pipeline: assemble the feature vector first, then regress on it.
pipeline = Pipeline(
    stages=[
        assembler,
        lr,
    ]
)


In [0]:
# Seeded random 80/20 split of the Spark DataFrame.
# NOTE(review): the weights are proportions for a random split, so with only
# eight rows the resulting sizes may not be exactly 6/2 - confirm the test
# split is non-empty.
train, test = spark_df.randomSplit(weights=[0.8, 0.2], seed=42)

# Fit the assembler + regression pipeline on the training rows only.
# This rebinds `model`, which the scikit-learn training loop earlier in the
# notebook also used as its loop variable.
model = pipeline.fit(train)
predictions = model.transform(test)

predictions.display()
