<a href="https://colab.research.google.com/github/mttbanizi/PowerPlant-ML/blob/main/PowerPlant_multi_regression_pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pyspark


Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=527c872d57347666cc18b484be62b1a0e5403a1e27e3423103fd37b26e6631ef
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [10]:
import pyspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('dataframe').getOrCreate()

In [12]:
df= spark.read.csv("new_data", header=True, inferSchema=True)

In [13]:
df.show()

+---+--------+--------+--------+----------+------------------+-----------------+
|_c0|    time|pressure|Steem MW|AmientTemp|              G_MW|             Flow|
+---+--------+--------+--------+----------+------------------+-----------------+
|  0|12 17:58| 364.315| 134.021|    35.901|           256.289|          626.694|
|  1|12 18:58| 362.293| 134.215|      34.7|256.93399999999997|          557.287|
|  2|12 16:58| 350.456| 134.935|    37.516|           255.058|          626.297|
|  3|12 15:58| 354.372| 135.036|    37.297|           255.034|          635.316|
|  4|12 19:58| 340.949| 135.883|    33.254|           258.033|626.0329999999999|
|  5|12 13:58| 353.162| 136.982|    36.349|           252.471|          776.088|
|  6|12 12:58| 350.716| 137.837|    35.653|           252.708|          778.697|
|  7|12 14:58| 351.172|  139.17|    36.887|           254.875|          774.057|
|  8|13 06:58| 214.926| 139.477|    28.271|           224.637|          860.056|
|  9|12 20:58| 317.867|  140

In [14]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- pressure: double (nullable = true)
 |-- Steem MW: double (nullable = true)
 |-- AmientTemp: double (nullable = true)
 |-- G_MW: double (nullable = true)
 |-- Flow: double (nullable = true)



In [15]:
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=['pressure','G_MW',"AmientTemp","Flow"],
                                 outputCol="Independent Feature")
output=featureassembler.transform(df)

In [16]:
output.select("Independent Feature").show()

+--------------------+
| Independent Feature|
+--------------------+
|[364.315,256.289,...|
|[362.293,256.9339...|
|[350.456,255.058,...|
|[354.372,255.034,...|
|[340.949,258.033,...|
|[353.162,252.471,...|
|[350.716,252.708,...|
|[351.172,254.875,...|
|[214.926,224.637,...|
|[317.867,259.627,...|
|[368.036,258.806,...|
|[355.787,258.8189...|
|[318.158,253.24,3...|
|[225.313,228.727,...|
|[347.637,259.599,...|
|[310.272,256.968,...|
|[344.738,261.3209...|
|[340.589,258.971,...|
|[328.704,260.6910...|
|[309.53,261.20799...|
+--------------------+
only showing top 20 rows



In [18]:
finalized_data=output.select("Independent Feature", "Steem MW")

In [19]:
finalized_data.show()

+--------------------+--------+
| Independent Feature|Steem MW|
+--------------------+--------+
|[364.315,256.289,...| 134.021|
|[362.293,256.9339...| 134.215|
|[350.456,255.058,...| 134.935|
|[354.372,255.034,...| 135.036|
|[340.949,258.033,...| 135.883|
|[353.162,252.471,...| 136.982|
|[350.716,252.708,...| 137.837|
|[351.172,254.875,...|  139.17|
|[214.926,224.637,...| 139.477|
|[317.867,259.627,...|  140.26|
|[368.036,258.806,...| 140.814|
|[355.787,258.8189...| 141.678|
|[318.158,253.24,3...| 141.781|
|[225.313,228.727,...| 142.369|
|[347.637,259.599,...| 142.401|
|[310.272,256.968,...| 142.474|
|[344.738,261.3209...| 142.723|
|[340.589,258.971,...| 142.739|
|[328.704,260.6910...| 143.244|
|[309.53,261.20799...| 143.259|
+--------------------+--------+
only showing top 20 rows



In [20]:
from pyspark.ml.regression import LinearRegression
train_data, test_data=finalized_data.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol="Independent Feature",labelCol="Steem MW")
regressor=regressor.fit(train_data)

In [21]:
regressor.coefficients

DenseVector([-0.0881, 0.2752, 0.1041, 0.0252])

In [22]:
pred_resaults=regressor.evaluate(test_data)

In [23]:
pred_resaults.predictions.show()

+--------------------+--------+------------------+
| Independent Feature|Steem MW|        prediction|
+--------------------+--------+------------------+
|[196.75,247.96499...| 150.464|151.00898555870657|
|[205.62,247.32099...| 152.138|150.74681366402876|
|[231.144,238.49,3...| 146.174|145.90764732102684|
|[236.23,241.016,3...| 149.469|147.67252730460183|
|[238.099,242.732,...| 149.276|147.68757070738394|
|[241.234,243.177,...| 148.391|146.63957361623045|
|[317.867,259.627,...|  140.26| 141.0679089297588|
|[321.996,259.469,...| 143.917| 144.3517181221962|
|[339.278,262.632,...| 143.261| 143.4313940833252|
|[340.589,258.971,...| 142.739|142.72598702995862|
|[340.949,258.033,...| 135.883|136.39740754553551|
|[354.372,255.034,...| 135.036| 135.0440527187956|
+--------------------+--------+------------------+



In [28]:
test_data


AttributeError: ignored