In [24]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='pyspark-shell'
os.environ["PYSPARK_PYTHON"]='python3'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.4-src.zip'))
os.environ["PYSPARK_PYTHON"] = 'python3'
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0
      /_/

Using Python version 3.6.4 (default, Jan 28 2018 00:00:00)
SparkSession available as 'spark'.


In [25]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from numpy import array
import random

In [26]:
def gen_point(i):
    x1 = random.uniform(-100,100)
    x2 = random.uniform(-20,60)
    return LabeledPoint(x1 * 10.0 + x2 * 25 + random.uniform(-2,2), [x1, x2])

points = sc.parallelize(range(100000)).map(gen_point)

In [27]:
points.takeSample(False, 10)

[LabeledPoint(175.51729190788961, [-47.89441274498605,26.19022320556057]),
 LabeledPoint(1202.5338637000914, [38.32186975657004,32.73174290206193]),
 LabeledPoint(176.35293805848238, [7.889355822216842,3.8335761614964525]),
 LabeledPoint(-339.73204420145913, [-0.08577526028648208,-13.495311187455155]),
 LabeledPoint(1163.805071461759, [37.37378593881644,31.67169341418394]),
 LabeledPoint(-338.51944845865717, [4.813563173088369,-15.528337818036002]),
 LabeledPoint(-1227.4449384499608, [-86.69754304927646,-14.404020971364728]),
 LabeledPoint(-347.9437009996504, [-43.73030893477245,3.5124503486493097]),
 LabeledPoint(191.6928430144413, [-25.255261678247123,17.714751133395062]),
 LabeledPoint(-672.3404562685955, [-64.50998513105155,-1.1246520389106323])]

## Do it the easy way

In [28]:
model = LinearRegressionWithSGD.train(points, 
            iterations=100, step=1e-4, intercept=True)

In [29]:
model

(weights=[9.981181780754241,20.67325856457534], intercept=1.4412311324626597)

## Do it the hard way

In [30]:
weights = array([0, 0])
step = 1e-9

In [31]:
x = points.map(lambda p: p.features)

In [32]:
y = points.map(lambda p: p.label)

In [33]:
x.take(5)

[DenseVector([-99.4793, 24.1007]),
 DenseVector([38.9934, 46.4276]),
 DenseVector([-27.5514, -18.0038]),
 DenseVector([-26.9623, -10.4377]),
 DenseVector([-15.6088, 22.1985])]

In [34]:
y.take(5)

[692.9700084993823,
 1026.3064792147816,
 -607.8570212443774,
 545.4583814405335,
 -919.777148686593]

In [35]:
prediction = x.map(lambda point: point.dot(weights))

In [36]:
prediction.take(5)

[0.0, 0.0, 0.0, 0.0, 0.0]

In [37]:
gradient = x.zip(y).map(lambda xy: (xy[0].dot(weights) - xy[1]) * xy[0])

In [38]:
gradient.take(5)

[DenseVector([8383.6626, -22522.668]),
 DenseVector([-18836.7628, -34659.2079]),
 DenseVector([-38598.3363, 636.4838]),
 DenseVector([-14704.2465, -6012.3232]),
 DenseVector([-67343.074, -6879.6198])]

In [39]:
gradient_average = gradient.mean()

In [40]:
weights = weights - step * gradient_average

In [41]:
weights

array([3.33582680e-05, 2.33304722e-05])

In [42]:
iterations = 100
weights = array([0, 0])
step = 1e-2

In [43]:
for i in range(iterations):
    gradient = x.zip(y).map(lambda xy: (xy[0].dot(weights) - xy[1]) * xy[0]).mean()
    weights = weights - step * gradient
    print(i, weights, gradient)

0 [333.58268018 233.30472182] [-33358.26801833049,-23330.47218150352]
1 [-10475.30970575  -1704.76361446] [1080889.2385936526,193806.8336279605]
2 [339896.60336776  14198.3078449 ] [-35037191.30735179,-1590307.1459360325]
3 [-11018610.08979547   -109898.78243564] [1135850669.3163233,12409709.02805384]
4 [3.57215894e+08 6.47618960e+05] [-36823450371.88883,-75751774.22418861]
5 [-1.15807580e+10  3.33118647e+06] [1193797391225.3948,-268356751.00775594]
6 [ 3.75442905e+11 -3.10920189e+08] [-38702366275535.04,31425137591.55202]
7 [-1.21716931e+13  1.17726927e+10] [1254713596798469.2,-1208361284090.593]
8 [ 3.94600952e+14 -3.95786773e+11] [-4.067726448968771e+16,40755946552064.02]
9 [-1.27927902e+16  1.29490328e+13] [1.3187391123335616e+18,-1334481953788527.5]
10 [ 4.14736662e+17 -4.20784651e+14] [-4.2752945197644505e+19,4.337336836771403e+16]
11 [-1.34455812e+19  1.36498518e+16] [1.3860317856095428e+21,-1.4070636420058153e+18]
12 [ 4.35899862e+20 -4.42590607e+17] [-4.493454434071061e+22,4.5