In [1]:
!pip install pyspark
from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder.appName("MusicDataAnalysis").getOrCreate()

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=6c2e8d9b479f1e84a9d6df10f70e7de435198cb13ba5edc4022031c8fb0f03bb
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [5]:
# Import necessary library
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, when

# Create Spark session
spark = SparkSession.builder.appName("MusicDataAnalysis").getOrCreate()

# Load the dataset into a Spark DataFrame
file_path = "/content/dataset.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Print schema and verify data types
df.printSchema()

# Calculate average danceability, energy, and tempo of tracks by artist
artist_avg_metrics = df.groupBy("artists").agg(
    avg("danceability").alias("AvgDanceability"),
    avg("energy").alias("AvgEnergy"),
    avg("tempo").alias("AvgTempo")
)

# Identify top 5 artists with highest average track popularity
top_artists_popularity = df.groupBy("artists").agg(
    avg("popularity").alias("AvgPopularity")
).orderBy("AvgPopularity", ascending=False).limit(5)

# Create a new column 'energy_level' classifying tracks as 'High Energy' or 'Regular Energy'
df = df.withColumn("energy_level", when(df["energy"] > 0.8, "High Energy").otherwise("Regular Energy"))

# Group the data by this new energy classification and calculate average popularity and loudness
energy_level_metrics = df.groupBy("energy_level").agg(
    avg("popularity").alias("AvgPopularity"),
    avg("loudness").alias("AvgLoudness")
)

# Export the data that have been classified as 'High Energy' with overwrite mode
high_energy_tracks = df.filter(df["energy_level"] == "High Energy")
high_energy_tracks.write.csv("/content/high_energy_tracks.csv", mode="overwrite", header=True)



root
 |-- _c0: integer (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- explicit: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- acousticness: string (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: string (nullable = true)
 |-- valence: string (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- track_genre: string (nullable = true)

