In [None]:
from pyspark.sql import SparkSession
import pandas as pd

In [None]:
# Start a Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("LinearRegression")
    .getOrCreate()
)

# Read the CSV with pandas first, then convert the pandas frame to a Spark DataFrame
df_pandas = pd.read_csv("kworb_popularity.csv")
df = spark.createDataFrame(df_pandas)

# Columns of interest. The regression cell uses the last entry as the label
# and every entry before it as a model input.
selected_cols = [
    "danceability",
    "energy",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
    "popularity",
]

In [5]:
# Imports needed by this cell, grouped up front instead of scattered between steps
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Fixed seed so the train/test partition (and therefore the reported RMSE)
# is repeatable across kernel restarts
SPLIT_SEED = 42

# The last entry of selected_cols is the regression label; the rest are inputs
feature_cols = selected_cols[:-1]
label_col = selected_cols[-1]

# Assemble the input columns into a single "features" vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data_with_features = assembler.transform(df).select("features", label_col)

# Split the data into training (70%) and test (30%) sets.
# Without a seed, randomSplit produces a different partition on every run.
train_data, test_data = data_with_features.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Create a LinearRegression estimator and fit it on the training data
lr = LinearRegression(
    labelCol=label_col,
    featuresCol="features",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
model = lr.fit(train_data)

# Score the held-out test data and report the root mean squared error
predictions = model.transform(test_data)
evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# k-means clustering