In [2]:
!pip install pyspark



In [3]:
from pyspark import SparkContext

In [4]:
from pyspark.sql import SparkSession

In [5]:
 # Start (or reuse) a Spark session registered under the application name 'reg'.
 spark = (
     SparkSession.builder
     .appName('reg')
     .getOrCreate()
 )

In [6]:
from pyspark.ml.regression import LinearRegression

In [8]:
# Read the LIBSVM-formatted sample file into a DataFrame; the recorded output
# below shows a `label` column and a `features` column.
train = spark.read.load("sample_linear_regression_data.txt", format="libsvm")

In [9]:
# Print the first rows of the loaded DataFrame as a text table.
train.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [11]:
# Configure the estimator, naming the feature, label and prediction columns
# explicitly so the column wiring is visible to the reader.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

In [12]:
# Fit the regression on `train`, which at this point holds every row of the
# file (no hold-out set has been carved off yet).
modelo = lr.fit(dataset=train)

In [13]:
# Fitted coefficient vector of the model (ten entries in the recorded output).
modelo.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [14]:
 # Fitted intercept term of the model.
 modelo.intercept

0.14228558260358093

In [15]:
 # Keep the model's summary object; the following cells read its predictions,
 # RMSE and R² attributes.
 summary = modelo.summary

In [17]:
# Display the summary's predictions DataFrame; the recorded output shows the
# original label and features next to the model's prediction.
summary.predictions.show()

+-------------------+--------------------+--------------------+
|              label|            features|          prediction|
+-------------------+--------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|  1.5211201432720063|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...| -0.6658770747591632|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|  0.1568703823211514|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|  0.6374146679690593|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|   2.372566473232916|
| -7.896274316726144|(10,[0,1,2,3,4,5,...| -1.9410651727650883|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|  2.2621027950886363|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|-0.00134792656609...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...| -3.0051104606414007|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|  3.5437265095387804|
| -5.082010756207233|(10,[0,1,2,3,4,5,...| -0.4889664122481736|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|  1.5073098457843013|
| 14.323146365332388|(10,[0,1,2,3,4,5,..

In [18]:
# Root-mean-squared error reported by the summary of the fit above.
summary.rootMeanSquaredError

10.16309157133015

In [19]:
# R² reported by the summary. The recorded value (~0.028) indicates that the
# linear fit explains very little of the variance in the label.
summary.r2

0.027839179518600154

In [20]:
## Split the data set into training and test subsets

In [25]:
 # Load the LIBSVM sample file again, this time under the name `data`, as the
 # input to the train/test split.
 data = spark.read.load("sample_linear_regression_data.txt", format="libsvm")

In [27]:
# Randomly partition the rows with weights 0.7 / 0.3. A fixed seed makes the
# partition identical on every run, so downstream results can be reproduced.
split = data.randomSplit([0.7, 0.3], seed=42)

In [23]:
 # Inspect the result: the recorded output shows a list of two DataFrames.
 split

[DataFrame[label: double, features: vector],
 DataFrame[label: double, features: vector]]

In [28]:
# Unpack the two DataFrames returned by randomSplit directly into names.
# The fixed seed keeps the train/test partition, and every metric derived from
# it, stable across runs.
# NOTE: this rebinds `train`, which earlier held the full, unsplit data set.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [29]:
# Show the schema of the training subset.
train

DataFrame[label: double, features: vector]

In [30]:
 # Show the schema of the test subset.
 test

DataFrame[label: double, features: vector]

In [31]:
# Summary statistics of the training subset (count, mean, stddev, min and max
# of `label` in the recorded output).
train.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                341|
|   mean| 0.9163951185176094|
| stddev|  9.922366653183223|
|    min|-28.046018037776633|
|    max|  27.78383192005107|
+-------+-------------------+



In [32]:
 # Summary statistics of the test subset, for comparison with the training
 # subset's statistics.
 test.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                160|
|   mean|-1.1486839718425947|
| stddev| 11.014799648427012|
|    min|-28.571478869743427|
|    max| 24.290551295953957|
+-------+-------------------+



In [33]:
 # Refit the same estimator, this time on the training subset only, so the
 # test subset stays unseen by the model.
 correct_model = lr.fit(dataset=train)

In [34]:
# Score the model fitted on the training subset against the held-out test rows.
results_test = correct_model.evaluate(dataset=test)

In [35]:
# Root-mean-squared error of the model on the held-out test subset.
results_test.rootMeanSquaredError

11.196608776431459

In [36]:
# Keep only the `features` column of the test subset, leaving the label out,
# to stand in for unlabeled data.
data_without_label = test.select(test['features'])

In [37]:
# Confirm that only the `features` column remains.
data_without_label.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [38]:
 ## Predict on the test features (labels removed)

In [41]:
# Apply the fitted model to the label-free features; the recorded output shows
# a `prediction` column added next to `features`.
predictions = correct_model.transform(dataset=data_without_label)

In [42]:
 # Display the features alongside the model's predicted values.
 predictions.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|  0.3693721358488725|
|(10,[0,1,2,3,4,5,...| -1.0589964369749683|
|(10,[0,1,2,3,4,5,...|   -1.94736371021694|
|(10,[0,1,2,3,4,5,...|  2.5508898573213576|
|(10,[0,1,2,3,4,5,...| -1.7173145671204573|
|(10,[0,1,2,3,4,5,...|  1.7940216103779472|
|(10,[0,1,2,3,4,5,...| 0.32648078995131913|
|(10,[0,1,2,3,4,5,...|   2.141771755834233|
|(10,[0,1,2,3,4,5,...|  0.3499887245155041|
|(10,[0,1,2,3,4,5,...| 0.15225105251858395|
|(10,[0,1,2,3,4,5,...| 0.35638424910970967|
|(10,[0,1,2,3,4,5,...|   3.371359979145224|
|(10,[0,1,2,3,4,5,...|  0.2601099031737425|
|(10,[0,1,2,3,4,5,...|  0.9628719587216745|
|(10,[0,1,2,3,4,5,...|  1.3497565849104531|
|(10,[0,1,2,3,4,5,...|   2.066720014647361|
|(10,[0,1,2,3,4,5,...|0.007716562915414782|
|(10,[0,1,2,3,4,5,...|  1.9070317536891794|
|(10,[0,1,2,3,4,5,...|-0.26181112693516007|
|(10,[0,1,2,3,4,5,...|-0.0833238