In [23]:
import os

import findspark

# Point findspark at the Spark installation before pyspark is imported.
# SPARK_HOME takes precedence when it is set, so the notebook is not tied to one
# machine's directory layout; otherwise the original install path is used.
findspark.init(os.environ.get('SPARK_HOME', '/home/ubuntu/spark-2.1.1-bin-hadoop2.7'))

import pyspark
from pyspark.sql import SparkSession

In [24]:
from pyspark.ml.regression import LinearRegression

In [25]:
# Obtain the Spark session for this notebook (an existing session is reused).
spark = (
    SparkSession.builder
    .appName("linearRegression")
    .getOrCreate()
)

In [26]:
# Load the customer data: the first row supplies the column names and the
# column types are inferred from the values.
df = spark.read.csv(
    "Ecommerce_Customers.csv",
    header=True,
    inferSchema=True,
)

# A VectorAssembler is needed to combine the feature columns into the single vector column the model expects

In [28]:
from pyspark.ml.feature import VectorAssembler

In [29]:
# Input columns that are assembled into the model's feature vector.
feature_list = [
    "Avg Session Length",
    "Time on App",
    "Time on Website",
    "Length of Membership",
]

# initialize the vector assembler

In [30]:
# Packs the columns named in feature_list into one "featureVector" column.
assembler = VectorAssembler(
    inputCols=feature_list,
    outputCol="featureVector",
)

# transform the input into the format expected by the model

In [31]:
# Raw frame plus the assembled "featureVector" column.
df_vectorized = assembler.transform(df)

In [33]:
# Keep only what the regression needs: the feature vector and the label.
input_df = df_vectorized.select("featureVector", "Yearly Amount Spent")

In [35]:
# 70/30 train/test split. The fixed seed keeps the partition, and therefore the
# metrics computed from it, repeatable for the same input data.
train_data, test_data = input_df.randomSplit([0.7, 0.3], seed=42)

# instantiate the model

In [59]:
# Regress "Yearly Amount Spent" on the assembled feature vector.
lr = LinearRegression(
    featuresCol="featureVector",
    labelCol="Yearly Amount Spent",
)

# fit the model

In [60]:
# Train the regression on the training split.
model = lr.fit(train_data)

# evaluate the model on the test data

In [61]:
# Score the fitted model on the held-out test split; the returned summary
# exposes the metrics read in the following cells.
test_result = model.evaluate(test_data)

# check the quality of the model

In [42]:
# Root mean squared error of the model on the test split.
test_result.rootMeanSquaredError

9.940036340415583

In [43]:
# R^2 (coefficient of determination) of the model on the test split.
test_result.r2

0.9842541026018095

In [52]:
# Hold out 20% of the rows to stand in for new, unlabeled customers. Seeded so
# the same rows are selected on every run for the same input data.
# NOTE: this split is drawn from df_vectorized independently of the train/test
# split, so some of these rows were likely also part of train_data.
labeled_data, unlabeled_data = df_vectorized.randomSplit([0.8, 0.2], seed=42)

In [55]:
# Drop everything except the feature vector so the frame carries no label.
unlabeled_data = unlabeled_data.select("featureVector")

In [56]:
# Confirm that only the featureVector column remains.
unlabeled_data.printSchema()

root
 |-- featureVector: vector (nullable = true)



# predict for new data for which there is no label

In [57]:
# Score the unlabeled rows; the result pairs each feature vector with a
# "prediction" column.
prediction = model.transform(unlabeled_data.select("featureVector"))

In [58]:
# Print the leading rows of the predictions (show() displays 20 rows by default).
prediction.show()

+--------------------+------------------+
|       featureVector|        prediction|
+--------------------+------------------+
|[34.1881840610182...| 586.4334399756528|
|[32.9597643110742...|461.14739004568764|
|[34.5014178527839...| 581.7076749504354|
|[32.1338624098483...|447.66808595847397|
|[33.7801567632950...|  519.061342802712|
|[33.7008855390197...|500.82250610394635|
|[31.2606468698795...|422.86800204307815|
|[32.7704921585493...|495.68344484120917|
|[32.2129238300579...| 513.9221925698389|
|[32.7391429383803...| 558.6464270560175|
|[33.5404790601237...|  635.214257217332|
|[33.7081534080930...| 611.5132638489581|
|[33.5873733902455...| 411.2005633942349|
|[31.9365486184489...|440.87347295400446|
|[31.5171218025062...| 279.2339516680197|
|[32.4017318273546...| 505.3318050018795|
|[32.1223647957977...| 532.1463662427859|
|[32.7897726183107...| 470.3578968709119|
|[32.0444861274404...| 448.1515661594676|
|[31.6253601348306...| 381.6037483289706|
+--------------------+------------