In [3]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.5.4'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()


0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [61.9 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,590 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [3,663 kB]
Hit:11 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:12 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:13 http://security.ubun

In [4]:
# Import packages
from pyspark.sql import SparkSession
import time

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

 **Load Data from SQL or Spark**

In [5]:
from google.colab import files
import pandas as pd

# Upload the CSV file
uploaded = files.upload()

# The uploaded file will be stored in the current working directory
# Check the file names in the directory
import os
print(os.listdir())

# Load the CSV file into a pandas DataFrame
song_df = pd.read_csv('updated_file.csv')

# Display the first few rows of the dataset
song_df.head()


Saving updated_file.csv to updated_file.csv
['.config', 'updated_file.csv', 'spark-3.5.4-bin-hadoop3.tgz', 'spark-3.5.4-bin-hadoop3', 'sample_data']


Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,popularity,decade,url
0,Jealous Kind Of Fella,Garland Green,spotify:track:1dtKN6wwlolkM8XZy2y9C1,0.417,0.62,3,-7.727,1,0.0403,0.49,...,0.0779,0.845,185.655,173533,3,32.94975,9,1,1960.0,https://open.spotify.com/track/1dtKN6wwlolkM8X...
1,Initials B.B.,Serge Gainsbourg,spotify:track:5hjsmSnUefdUqzsDogisiX,0.498,0.505,3,-12.475,1,0.0337,0.018,...,0.176,0.797,101.801,213613,4,48.8251,10,0,1960.0,https://open.spotify.com/track/5hjsmSnUefdUqzs...
2,Melody Twist,Lord Melody,spotify:track:6uk8tI6pwxxdVTNlNOJeJh,0.657,0.649,5,-13.392,1,0.038,0.846,...,0.119,0.908,115.94,223960,4,37.22663,12,0,1960.0,https://open.spotify.com/track/6uk8tI6pwxxdVTN...
3,Mi Bomba Sonó,Celia Cruz,spotify:track:7aNjMJ05FvUXACPWZ7yJmv,0.59,0.545,7,-12.058,0,0.104,0.706,...,0.061,0.967,105.592,157907,4,24.75484,8,0,1960.0,https://open.spotify.com/track/7aNjMJ05FvUXACP...
4,Uravu Solla,P. Susheela,spotify:track:1rQ0clvgkzWr001POOPJWx,0.515,0.765,11,-3.515,0,0.124,0.857,...,0.213,0.906,114.617,245600,4,21.79874,14,0,1960.0,https://open.spotify.com/track/1rQ0clvgkzWr001...


In [6]:
# Convert the pandas DataFrame to a Spark DataFrame
songs_df = spark.createDataFrame(song_df)
songs_df.show()

+--------------------+--------------------+--------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------+----------+--------+----------+------+--------------------+
|               track|              artist|                 uri|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|time_signature|chorus_hit|sections|popularity|decade|                 url|
+--------------------+--------------------+--------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------+----------+--------+----------+------+--------------------+
|Jealous Kind Of F...|       Garland Green|spotify:track:1dt...|       0.417|  0.62|  3|  -7.727|   1|     0.0403|        0.49|             0.0|  0.0779|  0.845|185.655|     173533|             3|  32.94975|       9|         1|1960.

In [7]:
# Register songs_df as a temporary view
songs_df.createOrReplaceTempView("songs")


In [8]:
columns_names = songs_df.columns
columns_names

['track',
 'artist',
 'uri',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'duration_ms',
 'time_signature',
 'chorus_hit',
 'sections',
 'popularity',
 'decade',
 'url']

In [9]:
spark.sql("""
SELECT track, artist,
CASE
    WHEN decade = '60s' THEN 1960.0
    WHEN decade = '70s' THEN 1970.0
    WHEN decade = '80s' THEN 1980.0
    WHEN decade = '90s' THEN 1990.0
    WHEN decade = '00s' THEN 2000.0
    WHEN decade = '10s' THEN 2010.0
    ELSE NULL
  END AS decade
FROM songs""").show()

+--------------------+--------------------+------+
|               track|              artist|decade|
+--------------------+--------------------+------+
|Jealous Kind Of F...|       Garland Green|  NULL|
|       Initials B.B.|    Serge Gainsbourg|  NULL|
|        Melody Twist|         Lord Melody|  NULL|
|       Mi Bomba Sonó|          Celia Cruz|  NULL|
|         Uravu Solla|         P. Susheela|  NULL|
|           Beat n. 3|     Ennio Morricone|  NULL|
|Samba De Uma Nota...|Antônio Carlos Jobim|  NULL|
|          Happy Days|        Marv Johnson|  NULL|
|Carolina - Remast...|      Caetano Veloso|  NULL|
|    I Can Hear Music|      The Beach Boys|  NULL|
|The Aftermath (Fr...|     Jerry Goldsmith|  NULL|
|           Ride Away|         Roy Orbison|  NULL|
|  Caboclo Nordestino|        Luiz Gonzaga|  NULL|
|            P.T. 109|          Jimmy Dean|  NULL|
|        Bye Bye Baby|          Mary Wells|  NULL|
|A Festa dos Seus ...|                Leño|  NULL|
|       My Empty Arms|       Ja

In [10]:
# Fetch the data with an SQL query (for example, select relevant features)
data = spark.sql("""
    SELECT danceability, energy, acousticness, instrumentalness, tempo, popularity
    FROM songs
    WHERE popularity IS NOT NULL AND danceability IS NOT NULL
""")
data.show()

+------------+------+------------+----------------+-------+----------+
|danceability|energy|acousticness|instrumentalness|  tempo|popularity|
+------------+------+------------+----------------+-------+----------+
|       0.417|  0.62|        0.49|             0.0|185.655|         1|
|       0.498| 0.505|       0.018|           0.107|101.801|         0|
|       0.657| 0.649|       0.846|         4.42E-6| 115.94|         0|
|        0.59| 0.545|       0.706|          0.0246|105.592|         0|
|       0.515| 0.765|       0.857|         8.72E-4|114.617|         0|
|       0.697| 0.673|       0.714|           0.919|112.117|         0|
|       0.662| 0.272|        0.36|           0.228|143.507|         0|
|        0.72| 0.624|       0.795|             0.0|119.999|         1|
|       0.545|  0.22|       0.582|           0.239|118.223|         0|
|       0.511| 0.603|      0.0385|         1.67E-6|128.336|         1|
|       0.491|0.0675|       0.826|           0.519|124.986|         0|
|     

**Data Cleaning, Normalization, and Standardization**

***Clean the data***

In [11]:
# Drop rows with any missing values
data_clean = data.dropna()
pandas_df = data_clean.toPandas()
pandas_df

Unnamed: 0,danceability,energy,acousticness,instrumentalness,tempo,popularity
0,0.417,0.620,0.4900,0.000000,185.655,1
1,0.498,0.505,0.0180,0.107000,101.801,0
2,0.657,0.649,0.8460,0.000004,115.940,0
3,0.590,0.545,0.7060,0.024600,105.592,0
4,0.515,0.765,0.8570,0.000872,114.617,0
...,...,...,...,...,...,...
41094,0.172,0.358,0.8860,0.966000,72.272,0
41095,0.910,0.366,0.0996,0.000000,119.985,1
41096,0.719,0.804,0.0132,0.000003,119.999,1
41097,0.600,0.177,0.9890,0.868000,120.030,0


**Normalize/Standardize the data**






In [27]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#Separate features and target
X = pandas_df[['danceability', 'energy', 'acousticness','instrumentalness','tempo']] # Features
y = pandas_df['popularity'] # Target

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

**Train Test Split**

In [28]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [29]:
# Train a k-Nearest Neighbors model
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(n_neighbors=15, algorithm='ball_tree')
model.fit(X_scaled)

**Model Initialization and Training**

Random Forest Classifier (for classification). We'll use classification here to meet the requirement of at least 75% accuracy.

In [30]:
from sklearn.ensemble import RandomForestClassifier

#Initilize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

#Train the model
model.fit(X_train, y_train)

**Evaluate the Model**

Classification Accuracy (for a classification task)

In [31]:
from sklearn.metrics import accuracy_score, classification_report

#make predictions
y_pred = model.predict(X_test)

#calculate classification accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification  Accuracy: {accuracy * 100:2f}%")
print(classification_report(y_test, y_pred))

Classification  Accuracy: 74.622871%
              precision    recall  f1-score   support

           0       0.77      0.71      0.74      4124
           1       0.73      0.78      0.75      4096

    accuracy                           0.75      8220
   macro avg       0.75      0.75      0.75      8220
weighted avg       0.75      0.75      0.75      8220



**R-squared (for a regression task)**

In [32]:
from sklearn.metrics import r2_score

# Regression model
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2: .4f}")


R-squared: -0.0151


In [33]:
from google.colab import files
import pandas as pd

# Upload the CSV file
uploaded = files.upload()

# The uploaded file will be stored in the current working directory
# Check the file names in the directory
import os
print(os.listdir())

# Load the CSV file into a pandas DataFrame
genre_df = pd.read_csv('genre_music.csv')

# Display the first few rows of the dataset
genre_df.head()

Saving genre_music.csv to genre_music (1).csv
['.config', 'updated_file.csv', 'spark-3.5.4-bin-hadoop3.tgz', 'spark-3.5.4-bin-hadoop3', 'genre_music.csv', 'genre_music (1).csv', 'sample_data']


Unnamed: 0,track,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_s,time_signature,chorus_hit,sections,popularity,decade,genre
0,Jealous Kind Of Fella,Garland Green,0.417,0.62,3,-7.727,1,0.0403,0.49,0.0,0.0779,0.845,185.655,173.533,3,32.94975,9,1,60s,edm
1,Initials B.B.,Serge Gainsbourg,0.498,0.505,3,-12.475,1,0.0337,0.018,0.107,0.176,0.797,101.801,213.613,4,48.8251,10,0,60s,pop
2,Melody Twist,Lord Melody,0.657,0.649,5,-13.392,1,0.038,0.846,4e-06,0.119,0.908,115.94,223.96,4,37.22663,12,0,60s,pop
3,Mi Bomba Sonó,Celia Cruz,0.59,0.545,7,-12.058,0,0.104,0.706,0.0246,0.061,0.967,105.592,157.907,4,24.75484,8,0,60s,pop
4,Uravu Solla,P. Susheela,0.515,0.765,11,-3.515,0,0.124,0.857,0.000872,0.213,0.906,114.617,245.6,4,21.79874,14,0,60s,r&b


In [15]:
genre_df.genre.unique()

array(['edm', 'pop', 'r&b', 'rock', 'rap', 'latin'], dtype=object)

In [None]:
# Convert the pandas DataFrame to a Spark DataFrame
genre_df = spark.createDataFrame(song_df)
genre_df.show()

+--------------------+--------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+----------+--------------+----------+--------+----------+------+-----+
|               track|              artist|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_s|time_signature|chorus_hit|sections|popularity|decade|genre|
+--------------------+--------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+----------+--------------+----------+--------+----------+------+-----+
|Jealous Kind Of F...|       Garland Green|       0.417|  0.62|  3|  -7.727|   1|     0.0403|        0.49|             0.0|  0.0779|  0.845|185.655|   173.533|             3|  32.94975|       9|         1|   60s|  edm|
|       Initials B.B.|    Serge Gainsbourg|       0.498| 0.505|  3| -12.475|   1|     0.0337|       0.018|           0.107| 

In [None]:
# Register the DataFrame as a temporary SQL view
genre_df.createOrReplaceTempView("genre_data")

In [None]:
# SQL query to select columns and drop unnecessary ones
cleaned_data = spark.sql("""
    SELECT *
    FROM genre_data
    WHERE track IS NOT NULL AND artist IS NOT NULL AND genre IS NOT NULL
""")

# Drop the unnecessary columns (track, artist, genre)
cleaned_data = cleaned_data.drop('track', 'artist', 'genre')
cleaned_data.show()

+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+----------+--------------+----------+--------+----------+------+
|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_s|time_signature|chorus_hit|sections|popularity|decade|
+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+----------+--------------+----------+--------+----------+------+
|       0.417|  0.62|  3|  -7.727|   1|     0.0403|        0.49|             0.0|  0.0779|  0.845|185.655|   173.533|             3|  32.94975|       9|         1|   60s|
|       0.498| 0.505|  3| -12.475|   1|     0.0337|       0.018|           0.107|   0.176|  0.797|101.801|   213.613|             4|   48.8251|      10|         0|   60s|
|       0.657| 0.649|  5| -13.392|   1|      0.038|       0.846|         4.42E-6|   0.119|  0.908| 115.94|    223.96|             4|  37.22663|  

In [16]:
# Create copy
copy_df = genre_df.drop(columns = ['track', 'artist', 'genre'])
copy_df = pd.get_dummies(copy_df)
copy_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,time_signature,chorus_hit,sections,popularity,decade_00s,decade_10s,decade_60s,decade_70s,decade_80s,decade_90s
0,0.417,0.620,3,-7.727,1,0.0403,0.4900,0.000000,0.0779,0.8450,...,3,32.94975,9,1,False,False,True,False,False,False
1,0.498,0.505,3,-12.475,1,0.0337,0.0180,0.107000,0.1760,0.7970,...,4,48.82510,10,0,False,False,True,False,False,False
2,0.657,0.649,5,-13.392,1,0.0380,0.8460,0.000004,0.1190,0.9080,...,4,37.22663,12,0,False,False,True,False,False,False
3,0.590,0.545,7,-12.058,0,0.1040,0.7060,0.024600,0.0610,0.9670,...,4,24.75484,8,0,False,False,True,False,False,False
4,0.515,0.765,11,-3.515,0,0.1240,0.8570,0.000872,0.2130,0.9060,...,4,21.79874,14,0,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41094,0.172,0.358,9,-14.430,1,0.0342,0.8860,0.966000,0.3140,0.0361,...,4,24.30824,7,0,False,True,False,False,False,False
41095,0.910,0.366,1,-9.954,1,0.0941,0.0996,0.000000,0.2610,0.7400,...,4,32.53856,8,1,False,True,False,False,False,False
41096,0.719,0.804,10,-4.581,1,0.0355,0.0132,0.000003,0.1390,0.6050,...,4,20.73371,7,1,False,True,False,False,False,False
41097,0.600,0.177,7,-16.070,1,0.0561,0.9890,0.868000,0.1490,0.5600,...,4,21.65301,14,0,False,True,False,False,False,False


In [17]:
correlation_matrix = copy_df.corr()

print(correlation_matrix)

                  danceability    energy       key  loudness      mode  \
danceability          1.000000  0.206036  0.015433  0.273997 -0.032740   
energy                0.206036  1.000000  0.022598  0.772611 -0.033907   
key                   0.015433  0.022598  1.000000  0.008483 -0.140398   
loudness              0.273997  0.772611  0.008483  1.000000  0.000384   
mode                 -0.032740 -0.033907 -0.140398  0.000384  1.000000   
speechiness           0.156362  0.122360  0.026554  0.069115 -0.059758   
acousticness         -0.261122 -0.715084 -0.024240 -0.566503  0.050028   
instrumentalness     -0.301834 -0.208113 -0.013120 -0.374206 -0.075968   
liveness             -0.115275  0.157797  0.000639  0.086676  0.008781   
valence               0.553845  0.341398  0.007748  0.271706  0.035613   
tempo                -0.066588  0.224107  0.001116  0.169506  0.027088   
duration_s           -0.062915  0.011961  0.015480 -0.049733 -0.074744   
time_signature        0.191814  0.1962

In [18]:
# Create a clustering model
y = genre_df['genre']
X = copy_df.drop(columns = ['key', 'mode', 'liveness', 'duration_s', 'time_signature', 'chorus_hit', 'sections'])
# 'decade_60s', 'decade_70s', 'decade_80s', 'decade_90s', 'decade_00s', 'decade_10s'

In [19]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [20]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [21]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [22]:
# Declare a logistic regression model.
# Apply a random_state of 9 to the model
logistic_regression_model = LogisticRegression(random_state=9)

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
#Generate training predictions
training_predictions = lr_model.predict(X_train_scaled)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test_scaled)

In [24]:
# Import the model for sklearn confusion matrix
from sklearn.metrics import confusion_matrix

# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[1067   35   20    1    5   11]
 [  10 1470  127  513   12   11]
 [  56    0 9630 2579   21   76]
 [  35  780 3225 4139   40   57]
 [   2    0    6    5 1727   28]
 [  55   23   17    8    3 5030]]


In [25]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

         edm       0.87      0.94      0.90      1139
       latin       0.64      0.69      0.66      2143
         pop       0.74      0.78      0.76     12362
         r&b       0.57      0.50      0.53      8276
         rap       0.96      0.98      0.97      1768
        rock       0.96      0.98      0.97      5136

    accuracy                           0.75     30824
   macro avg       0.79      0.81      0.80     30824
weighted avg       0.74      0.75      0.74     30824



In [26]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

         edm       0.86      0.93      0.89       376
       latin       0.66      0.71      0.68       727
         pop       0.73      0.78      0.76      4098
         r&b       0.57      0.49      0.52      2764
         rap       0.94      0.97      0.96       596
        rock       0.96      0.97      0.97      1714

    accuracy                           0.74     10275
   macro avg       0.79      0.81      0.80     10275
weighted avg       0.74      0.74      0.74     10275



In [39]:
from time import time

algorithms = ['auto', 'ball_tree', 'kd_tree', 'brute']
for algo in algorithms:
    start_time = time()
    model = NearestNeighbors(n_neighbors=15, algorithm=algo)
    model.fit(X_scaled)
    print(f"Algorithm: {algo}, Time Taken: {time() - start_time:.4f} seconds")

Algorithm: auto, Time Taken: 0.0406 seconds
Algorithm: ball_tree, Time Taken: 0.0320 seconds
Algorithm: kd_tree, Time Taken: 0.0401 seconds
Algorithm: brute, Time Taken: 0.0027 seconds
