In [50]:
#import modules
import pyspark

In [51]:
#start a pyspark session
from pyspark.sql import SparkSession

In [52]:
spark=SparkSession.builder.appName('practiceML').getOrCreate()

In [53]:
training = spark.read.csv(r"C:\Users\User\Documents\spotify\top 100 streamed songs.csv", header = True, inferSchema = True)

In [54]:
training.show()

+--------------------+--------------------+--------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+------------+
|                  id|                name|duration|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|danceability|
+--------------------+--------------------+--------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+------------+
|4ZtFanR9U6ndgddUv...|Good 4 U Olivia R...|    2.97| 0.664|  9|  -5.044|   1|      0.154|       0.335|             0.0|  0.0849|  0.688|166.928|       0.563|
|5fxyZf6m2xHeSrOzU...|Stay The Kid LARO...|     2.3| 0.506|  8| -11.275|   1|     0.0589|       0.379|           0.868|    0.11|  0.454|170.054|       0.564|
|5nujrmhLynf4yMoMt...|Levitating Dua Li...|    3.38| 0.825|  6|  -3.787|   0|     0.0601|     0.00883|             0.0|  0.0674|  0.915|102.977|       0.702|
|4iJyoBOLtHqaGxP12...|Peaches Justin Bi...|     3.3|

In [55]:
training.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- danceability: double (nullable = true)



In [56]:
training.columns

['id',
 'name',
 'duration',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'danceability']

In [57]:
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=['tempo', 'danceability'])

In [58]:
output = featureassembler.transform(training)

In [59]:
output.show()

+--------------------+--------------------+--------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+------------+------------------------------------+
|                  id|                name|duration|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|danceability|VectorAssembler_f7f01a017ef1__output|
+--------------------+--------------------+--------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+------------+------------------------------------+
|4ZtFanR9U6ndgddUv...|Good 4 U Olivia R...|    2.97| 0.664|  9|  -5.044|   1|      0.154|       0.335|             0.0|  0.0849|  0.688|166.928|       0.563|                     [166.928,0.563]|
|5fxyZf6m2xHeSrOzU...|Stay The Kid LARO...|     2.3| 0.506|  8| -11.275|   1|     0.0589|       0.379|           0.868|    0.11|  0.454|170.054|       0.564|                     [170.054,0.564]|
|5nujrmhLynf4yMoMt...|Lev

In [60]:
output.columns

['id',
 'name',
 'duration',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'danceability',
 'VectorAssembler_f7f01a017ef1__output']

In [63]:
finalized_data = output.select('VectorAssembler_f7f01a017ef1__output', 'danceability')

In [None]:
finalized_data.show()

In [64]:
#train test split
from pyspark.ml.regression import LinearRegression
train_data,test_data = finalized_data.randomSplit([0.75, 0.25])
regressor = LinearRegression(featuresCol = 'VectorAssembler_f7f01a017ef1__output', labelCol = 'danceability')
regressor = regressor.fit(train_data)

In [65]:
#coefficients
regressor.coefficients

DenseVector([-0.0, 1.0])

In [66]:
#intercepts
regressor.intercept

3.28857662421453e-16

In [67]:
pred_results = regressor.evaluate(test_data)

In [68]:
pred_results.predictions.show()

+------------------------------------+------------+-------------------+
|VectorAssembler_f7f01a017ef1__output|danceability|         prediction|
+------------------------------------+------------+-------------------+
|                      [87.064,0.443]|       0.443|0.44300000000000006|
|                       [90.03,0.677]|       0.677|              0.677|
|                      [91.007,0.701]|       0.701| 0.7009999999999998|
|                      [92.043,0.731]|       0.731| 0.7309999999999999|
|                      [93.023,0.631]|       0.631|              0.631|
|                      [94.009,0.889]|       0.889| 0.8889999999999999|
|                      [95.977,0.825]|       0.825| 0.8249999999999998|
|                      [102.04,0.717]|       0.717| 0.7169999999999999|
|                     [102.977,0.702]|       0.702| 0.7019999999999998|
|                     [110.011,0.567]|       0.567|              0.567|
|                      [110.97,0.764]|       0.764| 0.7639999999

In [69]:
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(7.327471962526034e-17, 8.0118685686509e-33)