In [1]:
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession

    sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))
    
    spark = SparkSession \
        .builder \
        .getOrCreate()

In [2]:
# create a dataframe out of it
df = spark.read.parquet('/Users/orcun/PythonProjects/skillsnetwork/hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

In [3]:
df_energy = spark.sql("""
select sqrt(sum(x*x)+sum(y*y)+sum(z*z)) as label, class from df group by class
""")      
df_energy.createOrReplaceTempView('df_energy') 

In [4]:
df_join = spark.sql('select * from df inner join df_energy on df.class=df_energy.class')

In [5]:
splits = df_join.randomSplit([0.8, 0.2])
df_train = splits[0]
df_test = splits[1]

In [6]:
df_train.count()

356818

In [7]:
df_test.count()

89711

In [8]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler


vectorAssembler = VectorAssembler(inputCols=["x","y","z"],
                                  outputCol="features")
normalizer = MinMaxScaler(inputCol="features", outputCol="features_norm")



In [9]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)


In [10]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vectorAssembler, normalizer,lr])

model = pipeline.fit(df_train)

In [11]:
model.stages[2].summary.r2

0.03251505478711836

In [12]:
model = pipeline.fit(df_test)

In [13]:
model.stages[2].summary.r2

0.03290042128725357