In case you want to learn how ETL is done, please run the following notebook first and update the file name below accordingly

https://github.com/IBM/coursera/blob/master/coursera_ml/a2_w1_s3_ETL.ipynb


In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType

sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .getOrCreate()

spark

In [3]:
# delete files from previous runs
# !rm -f hmp.parquet*

# download the file containing the data in PARQUET format
# !wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
# create a dataframe out of it
df = spark.read.parquet('../w3/hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

In [4]:
df_energy = spark.sql("""
select sqrt(sum(x * x) + sum(y * y) + sum(z * z)) as label, class from df group by class
""")      
df_energy.createOrReplaceTempView('df_energy')          

In [5]:
df_join = spark.sql('select * from df inner join df_energy on df.class=df_energy.class')

In [6]:
df_join.show()

+---+---+---+--------------------+-----------+-----------------+-----------+
|  x|  y|  z|              source|      class|            label|      class|
+---+---+---+--------------------+-----------+-----------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|

In [8]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer


vectorAssembler = VectorAssembler(inputCols=["x", "y", "z"],
                                  outputCol="features")

# disablethe following, because it is counter-productive
# normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)

In [9]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) 

# regParam? == regularization?
# elasticNetParam

In [10]:
from pyspark.ml import Pipeline

# pipeline = Pipeline(stages=[vectorAssembler, normalizer,lr])
pipeline = Pipeline(stages=[vectorAssembler, lr])

In [11]:
model = pipeline.fit(df_join)


In [12]:
prediction = model.transform(df_join)


In [13]:
prediction.show()

+---+---+---+--------------------+-----------+-----------------+-----------+----------------+------------------+
|  x|  y|  z|              source|      class|            label|      class|        features|        prediction|
+---+---+---+--------------------+-----------+-----------------+-----------+----------------+------------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|[22.0,49.0,35.0]|12586.729735016828|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|[22.0,49.0,35.0]|12586.729735016828|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|[22.0,52.0,35.0]|12542.703337345756|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|[22.0,52.0,35.0]|12542.703337345756|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|[21.0,52.0,34.0]|12573.865911821156|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|11785.39634462923|Brush_teeth|[22.0,51.0,34.0]|125

In [15]:
model.stages[1].summary.r2 # not 2

0.03259100556263628