## Data cleaning 

**Import Modules**

In [480]:
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F

from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import DenseVector
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA, OneHotEncoder, StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

from pyspark.sql.types import IntegerType
from pyspark.sql.types import FloatType
from pyspark.sql.functions import isnan, when, count, col

In [481]:
# Data Manipulation
import numpy as np 
import pandas as pd

In [482]:
import os
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

**Read csv file**

In [483]:
filename = 'data.csv'
df = spark.read.csv(filename, inferSchema=True, header = True).limit(300)

In [484]:
type(df)

pyspark.sql.dataframe.DataFrame

In [485]:
df.toPandas()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991000,['Mamie Smith'],0.598,168333,0.22399999999999998,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643000,"""[""""Screamin' Jay Hawkins""""]""",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993000,['Mamie Smith'],0.647,163827,0.18600000000000005,0,11m7laMUgmOKqI3oYzuhne,1.76e-05,0,0.519,-12.097999999999999,1,Golfing Papa,4,1920,0.174,97.6,0.6890000000000001,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.7979999999999999,0,19Lc5SfJJ5O1oaxY0fpwfh,0.8009999999999999,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.99700000000001,0.0422,1920
4,0.295000,['Mixe'],0.7040000000000001,165224,0.7070000000000001,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.0360000000000005,0,Xuniverxe,2,1920-10-01,0.0768,122.07600000000001,0.299,1920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.317000,['SAVY..TLG'],0.8809999999999999,207962,0.4970000000000001,1,3nBY21TkPVYoTeczBCJBwK,0.0,1,0.534,-11.686,1,Ski Mask,0,1921-06-01,0.41200000000000003,140.984,0.42200000000000004,1921
296,0.983000,['Ignacio Corsini'],0.366,171213,0.231,0,3nX4gFJqhxKT61sYkB029F,0.905,4,0.11199999999999999,-24.963,1,Pobre Mi China - Remasterizado,0,1921-03-20,0.28300000000000003,63.24100000000001,0.429,1921
297,0.992000,['Morton Downey'],0.38,170240,0.11699999999999999,0,3tEwdlwf8ZrxpksJorI5Tc,1.14e-06,7,0.105,-13.298,1,That's How You Spell Ireland,0,1921,0.0415,86.493,0.17300000000000001,1921
298,0.672000,['Mehmet Kemiksiz'],0.40299999999999997,396356,0.385,0,47LO5zrjx6ShMymLqp80ef,0.000339,11,0.21100000000000002,-6.5070000000000014,0,Şühedâ Gövdesi Bir Baksana Dağlar Taşlar,0,1921,0.0315,123.839,0.21899999999999997,1921


In [486]:
df.describe().toPandas()

Unnamed: 0,summary,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,count,300.0,300,300.0,300.0,300.0,300.0,300,300.0,300.0,300.0,300.0,300.0,300,300.0,300,300.0,300.0,300.0,300.0
1,mean,0.7111488746666667,,0.5153026666666674,229638.21333333332,0.3720655666666668,0.13,,0.2987565099333333,5.183333333333334,0.2156246666666666,-13.648463333333334,0.6433333333333333,1920.0,0.5166666666666667,1920.5208333333333,0.0864346666666666,109.22763999999998,0.4813940000000003,1920.3333333333333
2,stddev,0.3855105063941189,,0.1827691441675938,118544.61314077835,0.2202075222911126,0.3368653453081294,,0.3809612043695937,3.504854095344684,0.1464257506716449,5.83822959620886,0.4798155535623629,,1.8658313547505097,0.5013094740885771,0.0972271821459876,29.66239057041041,0.2623225462891797,0.4721921646499687
3,min,1.16e-05,"""[""""Screamin' Jay Hawkins""""]""",0.0663,102621.0,0.00817,0.0,02FzJbHtqElixxCmrpSCUa,0.0,0.0,0.0402,-0.8,0.0,"""""""U"""" Need Some Lovin' Blues""",0.0,1920,0.0253,100.011,0.0224,1920.0
4,max,0.996,['محمد الإدريسي'],0.936,95933.0,0.998,1.0,7xPhfUan2yNtyFG0cUWkt8,7.53e-06,9.0,0.805,-9.888,1.0,لما بدا يتثنى,9.0,1921-12-18,0.845,99.989,0.965,1921.0


**Drop Variables**

In [487]:
# Variables id, name, release_date are not related to our question.
# Thus, droping all three.
df = df.drop("id", "name", "release_date","key","explicit","mode")

**Convert Data Type**

In [488]:
floats = ["acousticness", "danceability", "energy", "instrumentalness", "liveness", "loudness",
         "speechiness", "tempo", "valence"]
for i in floats:
        df = df.withColumn(i, df[i].cast(FloatType()))

ints = ["duration_ms", "popularity", "year"]
for i in ints:
    df = df.withColumn(i, df[i].cast(IntegerType()))

In [489]:
# With Correct Variable Type
df.createOrReplaceTempView("df")
spark.sql("select * from df").show(5)

+------------+--------------------+------------+-----------+------+----------------+--------+--------+----------+-----------+-------+-------+----+
|acousticness|             artists|danceability|duration_ms|energy|instrumentalness|liveness|loudness|popularity|speechiness|  tempo|valence|year|
+------------+--------------------+------------+-----------+------+----------------+--------+--------+----------+-----------+-------+-------+----+
|       0.991|     ['Mamie Smith']|       0.598|     168333| 0.224|         5.22E-4|   0.379| -12.628|        12|     0.0936|149.976|  0.634|1920|
|       0.643|"[""Screamin' Jay...|       0.852|     150200| 0.517|          0.0264|  0.0809|  -7.261|         7|     0.0534| 86.889|   0.95|1920|
|       0.993|     ['Mamie Smith']|       0.647|     163827| 0.186|         1.76E-5|   0.519| -12.098|         4|      0.174|   97.6|  0.689|1920|
|     1.73E-4| ['Oscar Velazquez']|        0.73|     422087| 0.798|           0.801|   0.128|  -7.311|        17|     

In [490]:
df.printSchema()

root
 |-- acousticness: float (nullable = true)
 |-- artists: string (nullable = true)
 |-- danceability: float (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- energy: float (nullable = true)
 |-- instrumentalness: float (nullable = true)
 |-- liveness: float (nullable = true)
 |-- loudness: float (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- speechiness: float (nullable = true)
 |-- tempo: float (nullable = true)
 |-- valence: float (nullable = true)
 |-- year: integer (nullable = true)



**Duplicates & Nulls**

In [491]:
# Count rows and unique rows
print('Rows = {}'.format(df.count()))
print('Distinct Rows = {}'.format(df.distinct().count()))

Rows = 300
Distinct Rows = 290


In [492]:
# Check
# Drop all duplicates 
df = df.dropDuplicates()
print('The number of rows with duplicate data removed = {}'.format(df.count()))

The number of rows with duplicate data removed = 290


In [493]:
# Check for NULL
df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).show()

+------------+-------+------------+-----------+------+----------------+--------+--------+----------+-----------+-----+-------+----+
|acousticness|artists|danceability|duration_ms|energy|instrumentalness|liveness|loudness|popularity|speechiness|tempo|valence|year|
+------------+-------+------------+-----------+------+----------------+--------+--------+----------+-----------+-----+-------+----+
|           0|      0|           0|          0|     0|               0|       0|       0|         0|          0|    0|      0|   0|
+------------+-------+------------+-----------+------+----------------+--------+--------+----------+-----------+-----+-------+----+



**Numeric Variables & Categorical Variables**

In [494]:
num_cols = []
cat_cols = []

for s in df.schema:
    data_type = str(s.dataType)
    if data_type == "StringType":
        cat_cols.append(s.name)
    
    #if data_type == "FloatType" or data_type == "IntType" or data_type =="String":
    else:
        num_cols.append(s.name)

In [495]:
print(num_cols)
print(cat_cols)

['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness', 'tempo', 'valence', 'year']
['artists']


In [496]:
# Correlation
import six
for i in df.columns:
    if not(isinstance(df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to popularity for ", i, df.stat.corr('popularity',i))

Correlation to popularity for  acousticness -0.12673057285882536
Correlation to popularity for  danceability 0.13655843406413337
Correlation to popularity for  duration_ms 0.11621782424211506
Correlation to popularity for  energy 0.040698802026386594
Correlation to popularity for  instrumentalness 0.048312739244376374
Correlation to popularity for  liveness -0.020267565496525756
Correlation to popularity for  loudness -0.00425633701320097
Correlation to popularity for  popularity 1.0
Correlation to popularity for  speechiness -0.03201265087177231
Correlation to popularity for  tempo 0.015804557016442095
Correlation to popularity for  valence -0.0824132398039687
Correlation to popularity for  year -0.020890563342533443


## Linear Regression Model

**SCALING**

In [497]:
import pyspark.sql.functions as func
# Scale popularity
df1 = df.withColumn('popularity_final',
                   func.round(df['popularity']/100, 2)) \
        .withColumn('duration_min',
                   func.round(df['duration_ms']/60000, 2))

**SELECT AND STANDARDIZE FEATURES**

In [498]:
vars_to_keep = ['popularity_final',
                'acousticness',
                'danceability',
                'duration_min', 
                'energy',
                'instrumentalness',
                'liveness',
                'loudness',
                'speechiness',
                'tempo',
                'valence',
                'year']

# subset the dataframe on these predictors
df1 = df1.select(vars_to_keep)
df1.show(1)

+----------------+------------+------------+------------+------+----------------+--------+--------+-----------+-------+-------+----+
|popularity_final|acousticness|danceability|duration_min|energy|instrumentalness|liveness|loudness|speechiness|  tempo|valence|year|
+----------------+------------+------------+------------+------+----------------+--------+--------+-----------+-------+-------+----+
|            0.12|       0.991|       0.598|        2.81| 0.224|         5.22E-4|   0.379| -12.628|     0.0936|149.976|  0.634|1920|
+----------------+------------+------------+------------+------+----------------+--------+--------+-----------+-------+-------+----+
only showing top 1 row



In [499]:
# Popularity_final V.S. Features
features= ['acousticness',
           'danceability',
           'duration_min',
           'energy', 
           'instrumentalness',
           'liveness',
           'loudness', 
           'speechiness',
           'tempo',
           'valence',
           'year']

assembler = VectorAssembler(
    inputCols = features, 
    outputCol = "features") 
df1 = assembler.transform(df1)

In [500]:
df1 = df1.select(["popularity_final",'features']) \
       .withColumnRenamed("popularity_final", 'label')
df1.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
| 0.12|[0.99099999666213...|
| 0.07|[0.64300000667572...|
| 0.04|[0.99299997091293...|
| 0.17|[1.72999993083067...|
| 0.02|[0.29499998688697...|
+-----+--------------------+
only showing top 5 rows



In [501]:
## Feature scaling
# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled", 
                                withStd=True, withMean=False)

# Fit the DataFrame to the scaler; this computes the mean, standard deviation of each feature
scaler = standardScaler.fit(df1)

# Transform the data in `df2` with the scaler
scaled_df1 = scaler.transform(df1)

In [502]:
scaled_df1.show(3)

+-----+--------------------+--------------------+
|label|            features|     features_scaled|
+-----+--------------------+--------------------+
| 0.12|[0.99099999666213...|[2.54430893912201...|
| 0.07|[0.64300000667572...|[1.65084830509672...|
| 0.04|[0.99299997091293...|[2.54944370439088...|
+-----+--------------------+--------------------+
only showing top 3 rows



In [503]:
# Split data into train set (80%), test set (20%) 
splits = scaled_df1.randomSplit([0.8, 0.2])
train_df1 = splits[0]
test_df1 = splits[1]

In [504]:
train_df1.show(3)

+-----+--------------------+--------------------+
|label|            features|     features_scaled|
+-----+--------------------+--------------------+
|  0.0|[5.22000009368639...|[1.34019101365466...|
|  0.0|[6.00999992457218...|[1.54301680965846...|
|  0.0|[1.02999998489394...|[2.64443812077495...|
+-----+--------------------+--------------------+
only showing top 3 rows



In [511]:
lr = LinearRegression(featuresCol = 'features', labelCol='label',
                      maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = lr.fit(train_df1)
prediction = model.transform(test_df1)

print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
Intercept: 0.004598214285714286


In [512]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 0.016361
r2: -0.000000


In [513]:
eval = RegressionEvaluator(labelCol='label', predictionCol='prediction', metricName = 'rmse')

mse = eval.evaluate(prediction, {eval.metricName: 'mse'}) 
print("MSE: %.3f" % mse)

MSE: 0.001


In [514]:
evaluator = RegressionEvaluator() \
                 .setPredictionCol("prediction") \
                 .setLabelCol("label") \
                 .setMetricName("rmse")

print("RMSE error value: ", evaluator.evaluate(prediction))

RMSE error value:  0.025823800704823742


## Naive Bayes

In [520]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel


In [516]:
# Split data into train set (80%), test set (20%) 
splits = scaled_df.randomSplit([0.8, 0.2])
train_df2 = splits[0]
test_df2 = splits[1]

In [517]:
# Train a naive Bayes model.
model = NaiveBayes.train(train_df2, 1.0)

AttributeError: 'NoneType' object has no attribute 'setCallSite'

In [None]:
# Make prediction and test accuracy.
labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))
accuracy = 1.0 * labelsAndPreds.filter(lambda pl: pl[0] == pl[1]).count() / test_df2.count()
print('model accuracy {}'.format(accuracy))

## Random Forest

## Gradient-boosted tree regression