# Linear Regression

In [None]:
import os

import findspark

In [5]:
# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook runs on other machines; fall back to the original local install
# path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/oussama/spark-2.4.0-bin-hadoop2.7'))

In [9]:
from pyspark.sql import SparkSession

In [11]:
# Obtain the SparkSession used by every cell below (application name 'lrex').
spark = (
    SparkSession.builder
    .appName('lrex')
    .getOrCreate()
)

In [13]:
from pyspark.ml.regression import LinearRegression

In [14]:
# Read the sample regression data, stored in libsvm format, into a DataFrame.
training = spark.read.load('sample_linear_regression_data.txt', format='libsvm')

In [15]:
# Preview the loaded rows; the DataFrame has a `label` and a `features` column.
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [16]:
# Configure the estimator, naming the feature, label and prediction columns
# explicitly.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

In [17]:
# Fit on the entire `training` DataFrame (no hold-out split at this point).
lrModel = lr.fit(training)

In [18]:
# Display the fitted coefficient vector.
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [19]:
# Display the fitted intercept term.
lrModel.intercept

0.14228558260358093

In [20]:
# Keep a handle on the model's summary object; its metrics are read below.
training_summary = lrModel.summary

In [21]:
# R^2 taken from the summary of `lrModel`, which was fit on the full `training`
# DataFrame, so this is not a held-out score.
training_summary.r2

0.027839179518600154

In [22]:
# Reload the same libsvm file that `training` was read from; this copy is
# split into train and test sets in the next cell.
all_data = spark.read.format('libsvm').load('sample_linear_regression_data.txt')

In [24]:
# 70/30 train/test split. The seed is fixed so that the split, and every count
# and metric derived from it, is reproducible across kernel restarts.
train_data, test_data = all_data.randomSplit([0.7, 0.3], seed=42)

In [25]:
# Summary statistics (count, mean, stddev, min, max) of the training split.
train_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                345|
|   mean| 0.6445710427517952|
| stddev| 10.530418852769566|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [26]:
# Summary statistics (count, mean, stddev, min, max) of the test split.
test_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                156|
|   mean|-0.6004853194210219|
| stddev|  9.810058890276773|
|    min|-26.736207182601724|
|    max| 22.647750304177556|
+-------+-------------------+



In [33]:
# Refit the same estimator on the training split only, so the test split stays
# unseen by the model.
correct_model = lr.fit(train_data)

In [34]:
# Evaluate the train-split model against the held-out test split.
test_results = correct_model.evaluate(test_data)

In [36]:
# Root-mean-squared error of the model on the test split.
test_results.rootMeanSquaredError

9.90004480421104

In [37]:
# Keep only the `features` column, dropping the label, to mimic unlabeled data.
unlabeled_data = test_data.select('features')

In [38]:
# Preview the features-only DataFrame.
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [39]:
# Apply the fitted model to the features-only rows; the result carries a
# `prediction` column alongside `features`.
predictions = correct_model.transform(unlabeled_data)

In [40]:
# Preview the predicted values.
predictions.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...| -3.3132093795987525|
|(10,[0,1,2,3,4,5,...|  1.4294104584777105|
|(10,[0,1,2,3,4,5,...|  0.7422053831343378|
|(10,[0,1,2,3,4,5,...|  1.7600975026109664|
|(10,[0,1,2,3,4,5,...|  1.3805758902472027|
|(10,[0,1,2,3,4,5,...| 0.45459996476295395|
|(10,[0,1,2,3,4,5,...|  1.7516902985566063|
|(10,[0,1,2,3,4,5,...| -1.4485274734007658|
|(10,[0,1,2,3,4,5,...|   3.103441854461547|
|(10,[0,1,2,3,4,5,...| -0.2314938940155557|
|(10,[0,1,2,3,4,5,...|  2.0489409394091114|
|(10,[0,1,2,3,4,5,...| -1.6435946672765456|
|(10,[0,1,2,3,4,5,...|  3.2954575270839848|
|(10,[0,1,2,3,4,5,...| -1.7295749264389613|
|(10,[0,1,2,3,4,5,...| 0.19779735352872413|
|(10,[0,1,2,3,4,5,...|  -3.206228053830371|
|(10,[0,1,2,3,4,5,...|  1.1258762711995092|
|(10,[0,1,2,3,4,5,...|   2.052349302668606|
|(10,[0,1,2,3,4,5,...|-0.26652739369108486|
|(10,[0,1,2,3,4,5,...|  -1.87353

### Example: regression on the e-commerce customers dataset

In [42]:
# Load the customers CSV; the first line is a header and column types are
# inferred from the data.
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [43]:
# Show the column names and inferred types.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [46]:
# Take the second of the first two rows and print each of its fields on its
# own line.
second_row = data.head(2)[1]
for field_value in second_row:
    print(field_value)

hduke@hotmail.com
4547 Archer CommonDiazchester, CA 06566-8576
DarkGreen
31.92627202636016
11.109460728682564
37.268958868297744
2.66403418213262
392.2049334443264


In [47]:
from pyspark.ml.linalg import Vectors

In [49]:
from pyspark.ml.feature import VectorAssembler

In [50]:
# List the column names, to pick the numeric inputs for the assembler below.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [51]:
# Combine the four numeric input columns into a single vector column named
# 'features'.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [52]:
# Run the assembler over the raw data; the result keeps the original columns
# and adds the assembled `features` column.
output = assembler.transform(data)

In [53]:
# Inspect the first assembled row to check the new `features` vector.
output.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [54]:
# Keep only the model inputs (`features`) and the regression target
# ('Yearly Amount Spent').
final_data = output.select('features', 'Yearly Amount Spent')

In [55]:
# Preview the two-column modelling DataFrame.
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [56]:
# 70/30 train/test split of the customer data (rebinds `train_data` and
# `test_data`). The seed is fixed so that the split, and every count and metric
# derived from it, is reproducible across kernel restarts.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [57]:
# Summary statistics of the target column in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                359|
|   mean|  495.4084592210148|
| stddev|  80.26614810765086|
|    min| 256.67058229005585|
|    max|  744.2218671047146|
+-------+-------------------+



In [58]:
# Summary statistics of the target column in the test split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                141|
|   mean|  509.2580302762494|
| stddev|  76.21574667207612|
|    min|  275.9184206503857|
|    max|  765.5184619388373|
+-------+-------------------+



In [59]:
# New estimator for the customer data; only the label column is overridden.
# NOTE: this rebinds `lr`, replacing the estimator configured earlier in the notebook.
lr = LinearRegression(labelCol='Yearly Amount Spent')

In [60]:
# Fit the customer-spend model on the training split.
lr_model = lr.fit(train_data)

In [61]:
# Evaluate on the held-out test split (rebinds `test_results`).
test_results = lr_model.evaluate(test_data)

In [62]:
# Show the per-row residuals on the test split.
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| 10.867783580917148|
| -2.802364434817889|
| 3.8530810390590204|
|-2.8788165872340414|
| 3.9306515687627552|
|-5.8710300741302035|
|-5.9591025339719295|
|  3.305332532482282|
|-13.092829180932313|
|-1.0285602855759635|
| 17.711381266780847|
|   6.82716839722201|
|-25.794117977514247|
|-5.0432131179133535|
|-10.044171446509381|
| 0.8056038722297671|
| -9.338389529926587|
| 12.634250985304902|
|-1.7320056067127894|
|  5.458039274008229|
+-------------------+
only showing top 20 rows



In [63]:
# Root-mean-squared error on the test split, in the units of the target column.
test_results.rootMeanSquaredError

10.438043945282788

In [64]:
# R^2 on the test split.
test_results.r2

0.9811096544846659

In [65]:
# Drop the target column to mimic unlabeled customer data (rebinds
# `unlabeled_data`).
unlabeled_data = test_data.select('features')

In [66]:
# Predict spend for the features-only rows (rebinds `predictions`).
predictions = lr_model.transform(unlabeled_data)

In [67]:
# Preview the predicted values.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...|397.77256749171033|
|[30.8794843441274...|493.00896441967257|
|[31.0472221394875...| 388.6443181499624|
|[31.2681042107507...|426.34934976105797|
|[31.3662121671876...|426.65823098772216|
|[31.5147378578019...| 495.6835180705916|
|[31.5171218025062...|281.87752318435764|
|[31.5316044825729...| 433.2102731968803|
|[31.5741380228732...| 557.5021013415192|
|[31.5761319713222...| 542.2551442749043|
|[31.6098395733896...| 426.8341683843273|
|[31.6548096756927...| 468.4362553303265|
|[31.6739155032749...|501.51918588739545|
|[31.7242025238451...|508.43110040587385|
|[31.8093003166791...| 546.8160708093505|
|[31.8293464559211...| 384.3467341157452|
|[31.8854062999117...| 399.4416625024021|
|[31.9096268275227...| 550.8117846879343|
|[31.9120759292006...| 389.2667219124205|
|[31.9480174211613...| 456.4628376188896|
+--------------------+------------