In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import time

In [3]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('tree').getOrCreate()
df = spark.read.csv('/content/drive/MyDrive/Datasets/Thesis/dr16.csv', inferSchema=True, header=True)
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- objid: double (nullable = true)
 |-- modelMag_u: double (nullable = true)
 |-- modelMag_g: double (nullable = true)
 |-- modelMag_r: double (nullable = true)
 |-- modelMag_i: double (nullable = true)
 |-- modelMag_z: double (nullable = true)
 |-- fiberMag_u: double (nullable = true)
 |-- fiberMag_g: double (nullable = true)
 |-- fiberMag_r: double (nullable = true)
 |-- fiberMag_i: double (nullable = true)
 |-- fiberMag_z: double (nullable = true)
 |-- petroR50_r: double (nullable = true)
 |-- petroR90_r: double (nullable = true)
 |-- petroR50_z: double (nullable = true)
 |-- petroR90_z: double (nullable = true)
 |-- r: double (nullable = true)
 |-- i: double (nullable = true)
 |-- z: double (nullable = true)
 |-- redshift: double (nullable = true)
 |-- zerr: double (nullable = true)
 |-- mmug: double (nullable = true)
 |-- mmgr: double (nullable = true)
 |-- mmri: double (nullable = true)
 |-- mmiz: double (nullable = true)
 |-- mfug: doub

In [5]:
import pyspark.sql.functions as func
df = df.withColumn("redshift", func.round(df["redshift"], 2).cast('integer'))

In [6]:
import pandas as pd
pd.DataFrame(df.take(5), columns = df.columns).transpose()

Unnamed: 0,0,1,2,3,4
_c0,0.0,1.0,2.0,3.0,4.0
objid,1.23768e+18,1.23768e+18,1.23768e+18,1.23768e+18,1.23768e+18
modelMag_u,21.63269,19.74829,23.74654,20.63075,21.38126
modelMag_g,21.27911,19.45819,23.19651,20.25426,21.13488
modelMag_r,21.2255,19.33118,22.01303,20.04478,21.09993
modelMag_i,20.9569,19.01371,20.80744,19.79378,20.92882
modelMag_z,20.82752,18.80369,19.9166,19.70326,20.80634
fiberMag_u,22.05036,20.1019,24.60136,20.99431,21.85225
fiberMag_g,21.62439,19.79539,23.64122,20.60174,21.4586
fiberMag_r,21.64663,19.6658,22.4804,20.40031,21.4534


In [7]:
df.columns

['_c0',
 'objid',
 'modelMag_u',
 'modelMag_g',
 'modelMag_r',
 'modelMag_i',
 'modelMag_z',
 'fiberMag_u',
 'fiberMag_g',
 'fiberMag_r',
 'fiberMag_i',
 'fiberMag_z',
 'petroR50_r',
 'petroR90_r',
 'petroR50_z',
 'petroR90_z',
 'r',
 'i',
 'z',
 'redshift',
 'zerr',
 'mmug',
 'mmgr',
 'mmri',
 'mmiz',
 'mfug',
 'mfgr',
 'mfri',
 'mfiz']

In [8]:
df.show(5)

+---+--------------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+------------------+----------+------------------+----------+--------+--------+--------+--------+-----------------+---------+------------------+---------+--------+------------------+---------+--------+------------------+
|_c0|               objid|modelMag_u|modelMag_g|modelMag_r|modelMag_i|modelMag_z|fiberMag_u|fiberMag_g|fiberMag_r|fiberMag_i|fiberMag_z|        petroR50_r|petroR90_r|        petroR50_z|petroR90_z|       r|       i|       z|redshift|             zerr|     mmug|              mmgr|     mmri|    mmiz|              mfug|     mfgr|    mfri|              mfiz|
+---+--------------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+------------------+----------+------------------+----------+--------+--------+--------+--------+-----------------+---------+------------------+---------

In [9]:
df = df.drop('objid', '_c0', 'zerr')
df.show(5)

+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+------------------+----------+------------------+----------+--------+--------+--------+--------+---------+------------------+---------+--------+------------------+---------+--------+------------------+
|modelMag_u|modelMag_g|modelMag_r|modelMag_i|modelMag_z|fiberMag_u|fiberMag_g|fiberMag_r|fiberMag_i|fiberMag_z|        petroR50_r|petroR90_r|        petroR50_z|petroR90_z|       r|       i|       z|redshift|     mmug|              mmgr|     mmri|    mmiz|              mfug|     mfgr|    mfri|              mfiz|
+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+------------------+----------+------------------+----------+--------+--------+--------+--------+---------+------------------+---------+--------+------------------+---------+--------+------------------+
|  21.63269|  21.27911|   21.2255|   20.9569|  20.82752|  22.

In [10]:
#from pyspark.sql.functions import monotonically_increasing_id
#df = df.withColumn("objid", monotonically_increasing_id())
#df.show(5)

In [11]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i', 'modelMag_z', 'fiberMag_u', 'fiberMag_g', 'fiberMag_r', 'fiberMag_i', 'fiberMag_z', 'petroR50_r', 'petroR90_r', 'petroR50_z', 'petroR90_z', 'r', 'i', 'z', 'mmug', 'mmgr', 'mmri', 'mmiz', 'mfug', 'mfgr', 'mfri', 'mfiz'], outputCol = 'features')

In [12]:
output = assembler.transform(df)

In [13]:
final_df = output.select('features', 'redshift')
final_df.show(3)

+--------------------+--------+
|            features|redshift|
+--------------------+--------+
|[21.63269,21.2791...|       1|
|[19.74829,19.4581...|       2|
|[23.74654,23.1965...|       0|
+--------------------+--------+
only showing top 3 rows



In [14]:
train, test = final_df.randomSplit([0.7, 0.3])

In [15]:
layers = [25, 24, 24, 8]

In [16]:
#trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
classifier = MultilayerPerceptronClassifier(labelCol='redshift', featuresCol='features', maxIter=100, layers=layers, blockSize=128, seed=1234)


In [17]:
start_time = time.time()
model = classifier.fit(train)
print("%s seconds" % (time.time() - start_time))

836.9976134300232 seconds


In [18]:
mp_train_prediction = model.transform(train)
mp_train_prediction.show(5)

+--------------------+--------+--------------------+--------------------+----------+
|            features|redshift|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|[11.72647,13.1948...|       0|[7.17898851610978...|[0.99349865332391...|       0.0|
|[12.48301,13.3513...|       0|[7.25037006644864...|[0.99457823775321...|       0.0|
|[13.42211,12.3572...|       0|[7.28057065498487...|[0.99426675112941...|       0.0|
|[13.71563,12.6128...|       0|[6.70776023740550...|[0.99105008510689...|       0.0|
|[13.7939700000000...|       1|[7.17758567114409...|[0.99414788051611...|       0.0|
+--------------------+--------+--------------------+--------------------+----------+
only showing top 5 rows



In [19]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
multi_evaluator = MulticlassClassificationEvaluator(labelCol = 'redshift', metricName = 'accuracy')

print('Random Forest Training Accuracy:', multi_evaluator.evaluate(mp_train_prediction))
print('Training data (weightedPrecision): ', multi_evaluator.setMetricName('weightedPrecision').evaluate(mp_train_prediction))

Random Forest Training Accuracy: 0.8576070947741472
Training data (weightedPrecision):  0.8189323488971444


In [20]:
result = model.transform(test)

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

multi_evaluator = MulticlassClassificationEvaluator(labelCol = 'redshift', metricName = 'accuracy')
print('Random Forest Accu:', multi_evaluator.evaluate(result))
print('test data (weightedPrecision): ', multi_evaluator.setMetricName('weightedPrecision').evaluate(result))

Random Forest Accu: 0.8572256466495235
test data (weightedPrecision):  0.8184686321001315


In [22]:
precision_evaluator = MulticlassClassificationEvaluator(labelCol = 'redshift', metricName = 'weightedPrecision')
print('Random Forest Accu:', precision_evaluator.evaluate(result))

Random Forest Accu: 0.8184686321001315


In [23]:
#print('test data (weightedRecall): ', multi_evaluator.setMetricName('weightedRecall').evaluate(result))

#Cascading

In [24]:
result2 = model.transform(final_df)

In [25]:
result2.show(3)

+--------------------+--------+--------------------+--------------------+----------+
|            features|redshift|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|[21.63269,21.2791...|       1|[1.65788954422034...|[0.14602154356786...|       1.0|
|[19.74829,19.4581...|       2|[2.18959286906841...|[0.27073863458706...|       1.0|
|[23.74654,23.1965...|       0|[6.40400408004294...|[0.98722776357599...|       0.0|
+--------------------+--------+--------------------+--------------------+----------+
only showing top 3 rows



In [26]:
from pyspark.sql.functions import monotonically_increasing_id
df = df.withColumn("objid", monotonically_increasing_id())
df.show(5)

+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+------------------+----------+------------------+----------+--------+--------+--------+--------+---------+------------------+---------+--------+------------------+---------+--------+------------------+-----+
|modelMag_u|modelMag_g|modelMag_r|modelMag_i|modelMag_z|fiberMag_u|fiberMag_g|fiberMag_r|fiberMag_i|fiberMag_z|        petroR50_r|petroR90_r|        petroR50_z|petroR90_z|       r|       i|       z|redshift|     mmug|              mmgr|     mmri|    mmiz|              mfug|     mfgr|    mfri|              mfiz|objid|
+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+------------------+----------+------------------+----------+--------+--------+--------+--------+---------+------------------+---------+--------+------------------+---------+--------+------------------+-----+
|  21.63269|  21.27911|   21.2255|   20.956

In [27]:
from pyspark.sql.functions import monotonically_increasing_id
result2 = result2.withColumn("objid", monotonically_increasing_id())
result2.show(3)

+--------------------+--------+--------------------+--------------------+----------+-----+
|            features|redshift|       rawPrediction|         probability|prediction|objid|
+--------------------+--------+--------------------+--------------------+----------+-----+
|[21.63269,21.2791...|       1|[1.65788954422034...|[0.14602154356786...|       1.0|    0|
|[19.74829,19.4581...|       2|[2.18959286906841...|[0.27073863458706...|       1.0|    1|
|[23.74654,23.1965...|       0|[6.40400408004294...|[0.98722776357599...|       0.0|    2|
+--------------------+--------+--------------------+--------------------+----------+-----+
only showing top 3 rows



In [28]:
print(df.count(), len(df.columns))
print(result2.count(), len(result2.columns))

new_df = df.join(result2, df.objid == result2.objid).select(df["*"],result2["prediction"])
#new_df = df.join(dt_predictions, df.objid == dt_predictions.objid)
new_df.show(5)

3524477 27
3524477 6
+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+------------------+------------------+----------+------------------+--------+--------+--------+--------+---------+------------------+------------------+---------+---------+---------+---------+---------+-----+----------+
|modelMag_u|modelMag_g|modelMag_r|modelMag_i|modelMag_z|fiberMag_u|fiberMag_g|fiberMag_r|fiberMag_i|fiberMag_z|        petroR50_r|        petroR90_r|petroR50_z|        petroR90_z|       r|       i|       z|redshift|     mmug|              mmgr|              mmri|     mmiz|     mfug|     mfgr|     mfri|     mfiz|objid|prediction|
+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+------------------+------------------+----------+------------------+--------+--------+--------+--------+---------+------------------+------------------+---------+---------+---------+---------+---------+----

In [29]:
assembler = VectorAssembler(inputCols = ['modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i', 'modelMag_z', 'fiberMag_u', 'fiberMag_g', 'fiberMag_r', 'fiberMag_i', 'fiberMag_z', 'petroR50_r', 'petroR90_r', 'petroR50_z', 'petroR90_z', 'r', 'i', 'z', 'mmug', 'mmgr', 'mmri', 'mmiz', 'mfug', 'mfgr', 'mfri', 'mfiz', 'prediction'], outputCol = 'features2')

In [30]:
output = assembler.transform(new_df)

In [31]:
final_df = output.select('features2', 'redshift')
final_df.show(3)

+--------------------+--------+
|           features2|redshift|
+--------------------+--------+
|[21.63269,21.2791...|       1|
|[20.90748,21.2373...|       1|
|[23.24831,21.2138...|       3|
+--------------------+--------+
only showing top 3 rows



In [32]:
train, test = final_df.randomSplit([0.7, 0.3])

In [33]:
layers = [26, 24, 24, 8]

In [34]:
classifier2 = MultilayerPerceptronClassifier(labelCol='redshift', featuresCol='features2', maxIter=100, layers=layers, blockSize=128, seed=1234)

In [35]:
model2 = classifier2.fit(train)

In [36]:
mp_train_prediction2 = model2.transform(train)
mp_train_prediction2.show(5)

+--------------------+--------+--------------------+--------------------+----------+
|           features2|redshift|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|[11.41754,9.89709...|       0|[7.12280292299381...|[0.98744282788471...|       0.0|
|[11.61913,12.3107...|       1|[4.87396317108015...|[0.89344246766128...|       0.0|
|[13.72725,12.1701...|       0|[5.12587608865284...|[0.92472983561867...|       0.0|
|[13.78523,12.4974...|       0|[5.41919387854846...|[0.94489143269632...|       0.0|
|[13.95265,12.231,...|       0|[5.88691491224270...|[0.96730270977188...|       0.0|
+--------------------+--------+--------------------+--------------------+----------+
only showing top 5 rows



In [37]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
multi_evaluator = MulticlassClassificationEvaluator(labelCol = 'redshift', metricName = 'accuracy')

print('Random Forest Training Accuracy:', multi_evaluator.evaluate(mp_train_prediction2))
print('Training data (weightedPrecision): ', multi_evaluator.setMetricName('weightedPrecision').evaluate(mp_train_prediction2))

Random Forest Training Accuracy: 0.8550673802967222
Training data (weightedPrecision):  0.8243290128155603


In [38]:
result2 = model2.transform(test)

In [39]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

multi_evaluator = MulticlassClassificationEvaluator(labelCol = 'redshift', metricName = 'accuracy')
print('Decision Tree Accu:', multi_evaluator.evaluate(result2))
print('test data (weightedPrecision): ', multi_evaluator.setMetricName('weightedPrecision').evaluate(result2))

Decision Tree Accu: 0.8552957294028722
test data (weightedPrecision):  0.8246287815655312


In [40]:

#print('test data (weightedRecall): ', multi_evaluator.setMetricName('weightedRecall').evaluate(result2))