## Data cleaning 

**Import Modules**

In [214]:
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F

from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import DenseVector
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA, OneHotEncoder, StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

from pyspark.sql.types import IntegerType
from pyspark.sql.types import FloatType
from pyspark.sql.functions import isnan, when, count, col

In [215]:
# Data Manipulation
import numpy as np 
import pandas as pd

In [216]:
import os
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

**Read csv file**

In [217]:
filename = 'data.csv'
df = spark.read.csv(filename, inferSchema=True, header = True).limit(500)

In [79]:
type(df)

pyspark.sql.dataframe.DataFrame

In [80]:
df.toPandas()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991000,['Mamie Smith'],0.598,168333,0.22399999999999998,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643000,"""[""""Screamin' Jay Hawkins""""]""",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993000,['Mamie Smith'],0.647,163827,0.18600000000000005,0,11m7laMUgmOKqI3oYzuhne,1.76e-05,0,0.519,-12.097999999999999,1,Golfing Papa,4,1920,0.174,97.6,0.6890000000000001,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.7979999999999999,0,19Lc5SfJJ5O1oaxY0fpwfh,0.8009999999999999,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.99700000000001,0.0422,1920
4,0.295000,['Mixe'],0.7040000000000001,165224,0.7070000000000001,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.0360000000000005,0,Xuniverxe,2,1920-10-01,0.0768,122.07600000000001,0.299,1920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.996000,"""[""""King Oliver's Creole Jazz Band""""]""",0.546,170827,0.18899999999999997,0,3rauXVLOOM5BlxWqUcDpkg,0.9079999999999999,0,0.33899999999999997,-15.984000000000002,1,Chimes Blues,13,1923,0.0581,80.318,0.5539999999999999,1923
496,0.960000,['Louis Armstrong'],0.5710000000000001,182707,0.26899999999999996,0,5WlMyDvyQedom0kQ3MCTow,0.0129,4,0.121,-11.728,0,Lazy River,13,1923,0.067,101.845,0.49,1923
497,0.989000,['Louis Armstrong & His Hot Five'],0.586,168533,0.113,0,08XPnovoMlLsFB3LDGq0rX,0.276,10,0.18100000000000002,-12.210999999999999,1,Muggles,12,1923,0.0607,85.235,0.484,1923
498,0.985000,['Louis Armstrong'],0.544,195000,0.18100000000000002,0,3U6L5KW48YOsksN9Qvlydl,0.0256,5,0.401,-11.729000000000001,1,I've Got The World On A String,11,1923,0.0387,129.803,0.41100000000000003,1923


In [81]:
df.describe().toPandas()

Unnamed: 0,summary,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,count,500.0,500,500.0,500.0,500.0,500.0,500,500.0,500.0,500.0,500.0,500.0,500,500.0,500.0,500.0,500.0,500.0,500.0
1,mean,0.7707071247999999,,0.5233922000000003,210010.694,0.3106379199999999,0.086,,0.2910775922,5.004,0.2228323999999999,-15.740400000000005,0.664,1920.0,1.358,1921.0128205128208,0.1763996,105.36336399999996,0.4910567999999997,1920.934
2,stddev,0.3330715886347827,,0.1761360270905372,114170.9388238364,0.2020866202405148,0.2806448343477788,,0.3851030225302941,3.4687242750314407,0.1544022400540464,6.317065108352638,0.4728119099646378,,5.016389571061773,0.8957734291186612,0.2761492587461571,30.15162617853181,0.2444855433262897,0.9073536141784608
3,min,1.16e-05,"""[""""King Oliver's Creole Jazz Band""""]""",0.0663,100600.0,0.00399,0.0,021ht4sdgPcrDgSk7JTbKY,0.0,0.0,0.0402,-0.8,0.0,"""""""U"""" Need Some Lovin' Blues""",0.0,1920.0,0.0253,100.011,0.0224,1920.0
4,max,0.996,['محمد الإدريسي'],0.936,99900.0,0.998,1.0,7zph0VMbYl7KoqReyowKiv,7.53e-06,9.0,0.966,-9.935,1.0,لما بدا يتثنى,9.0,1923.0,0.967,99.989,0.969,1923.0


**Drop Variables**

In [82]:
# Variables id, name, release_date are not related to our question.
# Thus, droping all three.
df = df.drop("id", "name", "release_date","key","explicit","mode")

**Convert Data Type**

In [83]:
floats = ["acousticness", "danceability", "energy", "instrumentalness", "liveness", "loudness",
         "speechiness", "tempo", "valence"]
for i in floats:
        df = df.withColumn(i, df[i].cast(FloatType()))

ints = ["duration_ms", "popularity", "year"]
for i in ints:
    df = df.withColumn(i, df[i].cast(IntegerType()))

In [84]:
# With Correct Variable Type
df.createOrReplaceTempView("df")
spark.sql("select * from df").show(5)

+------------+--------------------+------------+-----------+------+----------------+--------+--------+----------+-----------+-------+-------+----+
|acousticness|             artists|danceability|duration_ms|energy|instrumentalness|liveness|loudness|popularity|speechiness|  tempo|valence|year|
+------------+--------------------+------------+-----------+------+----------------+--------+--------+----------+-----------+-------+-------+----+
|       0.991|     ['Mamie Smith']|       0.598|     168333| 0.224|         5.22E-4|   0.379| -12.628|        12|     0.0936|149.976|  0.634|1920|
|       0.643|"[""Screamin' Jay...|       0.852|     150200| 0.517|          0.0264|  0.0809|  -7.261|         7|     0.0534| 86.889|   0.95|1920|
|       0.993|     ['Mamie Smith']|       0.647|     163827| 0.186|         1.76E-5|   0.519| -12.098|         4|      0.174|   97.6|  0.689|1920|
|     1.73E-4| ['Oscar Velazquez']|        0.73|     422087| 0.798|           0.801|   0.128|  -7.311|        17|     

In [85]:
df.printSchema()

root
 |-- acousticness: float (nullable = true)
 |-- artists: string (nullable = true)
 |-- danceability: float (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- energy: float (nullable = true)
 |-- instrumentalness: float (nullable = true)
 |-- liveness: float (nullable = true)
 |-- loudness: float (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- speechiness: float (nullable = true)
 |-- tempo: float (nullable = true)
 |-- valence: float (nullable = true)
 |-- year: integer (nullable = true)



**Duplicates & Nulls**

In [86]:
# Count rows and unique rows
print('Rows = {}'.format(df.count()))
print('Distinct Rows = {}'.format(df.distinct().count()))

Rows = 500
Distinct Rows = 490


In [87]:
# Check
# Drop all duplicates 
df = df.dropDuplicates()
print('The number of rows with duplicate data removed = {}'.format(df.count()))

The number of rows with duplicate data removed = 490


In [88]:
# Check for NULL
df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).show()

+------------+-------+------------+-----------+------+----------------+--------+--------+----------+-----------+-----+-------+----+
|acousticness|artists|danceability|duration_ms|energy|instrumentalness|liveness|loudness|popularity|speechiness|tempo|valence|year|
+------------+-------+------------+-----------+------+----------------+--------+--------+----------+-----------+-----+-------+----+
|           0|      0|           0|          0|     0|               0|       0|       0|         0|          0|    0|      0|   0|
+------------+-------+------------+-----------+------+----------------+--------+--------+----------+-----------+-----+-------+----+



**Numeric Variables & Categorical Variables**

In [89]:
num_cols = []
cat_cols = []

for s in df.schema:
    data_type = str(s.dataType)
    if data_type == "StringType":
        cat_cols.append(s.name)
    
    #if data_type == "FloatType" or data_type == "IntType" or data_type =="String":
    else:
        num_cols.append(s.name)

In [90]:
print(num_cols)
print(cat_cols)

['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness', 'tempo', 'valence', 'year']
['artists']


In [108]:
# Correlation
import six
for i in df.columns:
    if not(isinstance(df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to popularity for ", i, df.stat.corr('popularity',i))

Correlation to popularity for  acousticness 0.09202827599446742
Correlation to popularity for  danceability 0.12148826891543778
Correlation to popularity for  duration_ms 0.001784097771222781
Correlation to popularity for  energy -0.13604579444665205
Correlation to popularity for  instrumentalness -0.0519780175924492
Correlation to popularity for  liveness -0.029208975316850662
Correlation to popularity for  loudness 0.12919160748443684
Correlation to popularity for  popularity 1.0
Correlation to popularity for  speechiness -0.11196693672706326
Correlation to popularity for  tempo -0.06345659560810465
Correlation to popularity for  valence -0.04062527626266994
Correlation to popularity for  year 0.4168984637414494


## Linear Regression Model

**SCALING**

In [193]:
import pyspark.sql.functions as func
# Scale popularity
df1 = df.withColumn('popularity_final',
                   func.round(df['popularity']/100, 2)) \
        .withColumn('duration_min',
                   func.round(df['duration_ms']/60000, 2))\
        .withColumn('instrumentalness_final',
                   func.round(df['instrumentalness'] *1000, 2))

**SELECT AND STANDARDIZE FEATURES**

In [194]:
vars_to_keep = ['popularity_final',
                'acousticness',
                'danceability',
                'duration_min', 
                'energy',
                'instrumentalness_final',
                'liveness',
                'loudness',
                'speechiness',
                'tempo',
                'valence',
                'year']

# subset the dataframe on these predictors
df1 = df1.select(vars_to_keep)
df1.show(1)

+----------------+------------+------------+------------+------+----------------------+--------+--------+-----------+-------+-------+----+
|popularity_final|acousticness|danceability|duration_min|energy|instrumentalness_final|liveness|loudness|speechiness|  tempo|valence|year|
+----------------+------------+------------+------------+------+----------------------+--------+--------+-----------+-------+-------+----+
|            0.12|       0.991|       0.598|        2.81| 0.224|                  0.52|   0.379| -12.628|     0.0936|149.976|  0.634|1920|
+----------------+------------+------------+------------+------+----------------------+--------+--------+-----------+-------+-------+----+
only showing top 1 row



In [195]:
# Popularity_final V.S. Features
features= ['acousticness',
           'danceability',
           'duration_min',
           'energy', 
           'instrumentalness_final',
           'liveness',
           'loudness', 
           'speechiness',
           'tempo',
           'valence',
           'year']

assembler = VectorAssembler(
    inputCols = features, 
    outputCol = "features") 
df1 = assembler.transform(df1)

In [196]:
df1 = df1.select(["popularity_final",'features']) \
       .withColumnRenamed("popularity_final", 'label')
df1.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
| 0.12|[0.99099999666213...|
| 0.07|[0.64300000667572...|
| 0.04|[0.99299997091293...|
| 0.17|[1.72999993083067...|
| 0.02|[0.29499998688697...|
+-----+--------------------+
only showing top 5 rows



In [197]:
## Feature scaling
# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled", 
                                withStd=True, withMean=False)

# Fit the DataFrame to the scaler; this computes the mean, standard deviation of each feature
scaler = standardScaler.fit(df1)

# Transform the data in `df2` with the scaler
scaled_df = scaler.transform(df1)

In [198]:
# Split data into train set (80%), test set (20%) 
seed = 314
train_test = [0.8, 0.2]
train_data, test_data = scaled_df.randomSplit(train_test, seed)

In [199]:
# 可删除
train_data.show(3)

+-----+--------------------+--------------------+
|label|            features|     features_scaled|
+-----+--------------------+--------------------+
|  0.0|[1.75000004674075...|[5.21742603279626...|
|  0.0|[5.22000009368639...|[1.55628363729026...|
|  0.0|[6.00999992457218...|[1.79181309863197...|
+-----+--------------------+--------------------+
only showing top 3 rows



In [200]:
lr = LinearRegression(featuresCol = 'features', labelCol='label',
                      maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = lr.fit(train_df1)
prediction = model.transform(test_data)

print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

Coefficients: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
Intercept: 0.005132743362831858


In [201]:
trainingSummary = model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 0.018295
r2: 0.000000


In [202]:
train_data.describe().toPandas()

Unnamed: 0,summary,label
0,count,383.0
1,mean,0.0128198433420365
2,stddev,0.0488280069724011
3,min,0.0
4,max,0.41


In [203]:
prediction = model.transform(test_data)
prediction.select("prediction","label","features").show(5)

+--------------------+-----+--------------------+
|          prediction|label|            features|
+--------------------+-----+--------------------+
|0.005132743362831858|  0.0|[1.02999998489394...|
|0.005132743362831858|  0.0|[1.80999995791353...|
|0.005132743362831858|  0.0|[0.00294000003486...|
|0.005132743362831858|  0.0|[0.01559999957680...|
|0.005132743362831858|  0.0|[0.02099999971687...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [204]:
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(prediction))

R Squared (R2) on test data = -0.0485144


In [205]:
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()

numIterations: 1
objectiveHistory: [0.5000000000000001]
+--------------------+
|           residuals|
+--------------------+
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
|-0.00513274336283...|
+--------------------+
only showing top 20 rows



## Naive Bayes

## Random Forest

In [232]:
# IMPORT
import numpy
from numpy import allclose
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier

In [233]:
# PREPARE DATA
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(df1)
td = si_model.transform(df1)

In [234]:
# BUILD THE MODEL
rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=314)
model = rf.fit(td)

In [235]:
# FEATURE IMPORTANCES
print(model.featureImportances)

(11,[3,4,10],[0.41164746853287254,0.09090003628501331,0.49745249518211415])


///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [236]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.util import MLUtils

In [240]:
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(df1)

# Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = df1.randomSplit([0.8, 0.2])

# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print ("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print (rfModel) # summary only

+--------------------+-----+--------------------+
|          prediction|label|            features|
+--------------------+-----+--------------------+
|0.003206466323314954|  0.0|[0.00282000005245...|
| 0.01280181101164445|  0.0|[0.00389999989420...|
|0.005397849125806318|  0.0|[0.00908999983221...|
|0.010974922045614389|  0.0|[0.01130000036209...|
|0.008311732947836954|  0.0|[0.02099999971687...|
+--------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.0213167
RandomForestRegressionModel: uid=RandomForestRegressor_eedf398cae82, numTrees=20, numFeatures=11


## Tree Decision

In [206]:
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'label')
model = dt.fit(train_data)
prediction = model.transform(test_data)
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(prediction)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.0297779


In [207]:
model.featureImportances

SparseVector(11, {0: 0.0181, 1: 0.0272, 2: 0.008, 3: 0.0076, 4: 0.0207, 7: 0.0009, 8: 0.0, 9: 0.0535, 10: 0.864})

In [208]:
df1.take(1)

[Row(label=0.12, features=DenseVector([0.991, 0.598, 2.81, 0.224, 0.52, 0.379, -12.628, 0.0936, 149.976, 0.634, 1920.0]))]

## Gradient-boosted Tree Regression

In [209]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'label', maxIter=10)
gbt_model = gbt.fit(train_data)
gbt_predictions = gbt_model.transform(test_data)
gbt_predictions.select('prediction', 'label', 'features').show(5)

+--------------------+-----+--------------------+
|          prediction|label|            features|
+--------------------+-----+--------------------+
|0.001417647218086...|  0.0|[1.02999998489394...|
|0.001194653609392...|  0.0|[1.80999995791353...|
|0.002774863546040...|  0.0|[0.00294000003486...|
|0.002531947405644...|  0.0|[0.01559999957680...|
|0.003521292963933...|  0.0|[0.02099999971687...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [210]:
gbt_evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.0297901
