<a href="https://colab.research.google.com/github/mintesin/Projects/blob/main/ML_spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

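# Machine learning using Spark

This is a simple machine learning project using PySpark: it joins a feature table with a table of measured outputs and fits a linear regression model that predicts `Power`.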

In [None]:
# Create the Spark session (or reuse one that is already running) for this notebook.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Dataframe")
    .getOrCreate()
)

In [None]:
# Load both CSV files: the first row is the header and column types are inferred.
# `data_train` holds the input features, `data_2` the measured outputs.
data_train = spark.read.options(header=True, inferSchema=True).csv("feature_matrix.csv")
data_2 = spark.read.options(header=True, inferSchema=True).csv("output_matrix.csv")

In [None]:
# Preview the first five rows of the feature table.
data_train.show(n=5)

+---+---------+-----------------+-----+------------------+--------------------+--------------------+---+------+------+-------+
|_c0|Longitude|         Latitude|Speed|          Distance|          Distance_x|          Distance_y|PCI|PCI_64|PCI_65|PCI_302|
+---+---------+-----------------+-----+------------------+--------------------+--------------------+---+------+------+-------+
|  0|12.520112|        55.780073|21.51|0.5530644296850808|-0.00459719999999...|-0.00336900000000...| 64|     1|     0|      0|
|  1|12.520119|        55.780071|20.93| 0.553103502794036|-0.00459920000000...|-0.00336200000000...| 64|     1|     0|      0|
|  2|12.520127|         55.78007|20.36|0.5530163642392476|-0.00460019999999...|-0.00335399999999...| 64|     1|     0|      0|
|  3|12.520135|55.78006800000001|19.78| 0.553032643909539|-0.00460219999999...|-0.00334600000000...| 64|     1|     0|      0|
|  4|12.520142|        55.780067|19.21|0.5529700878366755|-0.00460319999999...|-0.00333900000000...| 64|     1|

In [None]:
# Report the size of the feature table (column count first, then row count).
num_rows = data_train.count()
num_cols = len(data_train.columns)
print(
    "There are {} columns and {} rows in our dataset".format(num_cols, num_rows)
)

There are 11 columns and 57586 rows in our dataset


In [None]:
# Preview the first five rows of the output table.
data_2.show(n=5)

+---+-----+------+------+------+
|_c0| SINR|  RSRP|  RSRQ| Power|
+---+-----+------+------+------+
|  0|16.18|-75.44|-13.36|-55.42|
|  1|16.18|-75.44|-13.36|-55.42|
|  2|16.18|-75.44|-13.36|-55.42|
|  3|16.18|-75.44|-13.36|-55.42|
|  4|16.18|-75.44|-13.36|-55.42|
+---+-----+------+------+------+
only showing top 5 rows



In [None]:
# Keep only the row id (`_c0`, used later as the join key) and the `Power` column.
target = data_2.select(["_c0", "Power"])
target.show(n=10)

+---+------+
|_c0| Power|
+---+------+
|  0|-55.42|
|  1|-55.42|
|  2|-55.42|
|  3|-55.42|
|  4|-55.42|
|  5|-55.42|
|  6|-55.42|
|  7|-55.42|
|  8|-49.67|
|  9|-49.67|
+---+------+
only showing top 10 rows



In [None]:
# Print the column names and inferred types of the feature table.
data_train.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Speed: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Distance_x: double (nullable = true)
 |-- Distance_y: double (nullable = true)
 |-- PCI: integer (nullable = true)
 |-- PCI_64: integer (nullable = true)
 |-- PCI_65: integer (nullable = true)
 |-- PCI_302: integer (nullable = true)



In [None]:
# Print the column names and inferred types of the label table.
# NOTE(review): `target` is selected as `_c0` + `Power`, but the saved output for
# this cell lists only `Power` — it looks stale; re-run the notebook top to bottom
# to refresh it.
target.printSchema()

root
 |-- Power: double (nullable = true)



In [None]:
# Attach the `Power` label to each feature row by matching on the `_c0` row id.
# Only ids present in both tables are kept (inner join).
total_data = data_train.join(target, on="_c0", how="inner")

In [None]:
# Preview the first five rows of the joined table.
total_data.show(n=5)

+---+---------+-----------------+-----+------------------+--------------------+--------------------+---+------+------+-------+------+
|_c0|Longitude|         Latitude|Speed|          Distance|          Distance_x|          Distance_y|PCI|PCI_64|PCI_65|PCI_302| Power|
+---+---------+-----------------+-----+------------------+--------------------+--------------------+---+------+------+-------+------+
|  0|12.520112|        55.780073|21.51|0.5530644296850808|-0.00459719999999...|-0.00336900000000...| 64|     1|     0|      0|-55.42|
|  1|12.520119|        55.780071|20.93| 0.553103502794036|-0.00459920000000...|-0.00336200000000...| 64|     1|     0|      0|-55.42|
|  2|12.520127|         55.78007|20.36|0.5530163642392476|-0.00460019999999...|-0.00335399999999...| 64|     1|     0|      0|-55.42|
|  3|12.520135|55.78006800000001|19.78| 0.553032643909539|-0.00460219999999...|-0.00334600000000...| 64|     1|     0|      0|-55.42|
|  4|12.520142|        55.780067|19.21|0.5529700878366755|-0.0

In [None]:
# List the column names of the joined table (features plus the `Power` label).
total_data.columns

['_c0',
 'Longitude',
 'Latitude',
 'Speed',
 'Distance',
 'Distance_x',
 'Distance_y',
 'PCI',
 'PCI_64',
 'PCI_65',
 'PCI_302',
 'Power']

In [None]:
# Build the assembler that packs the input columns into one vector column, 'features'.
from pyspark.ml.feature import VectorAssembler

# Input columns for the model; the `_c0` row id and the `Power` label are left out.
# NOTE(review): raw `PCI` is included next to `PCI_64`/`PCI_65`/`PCI_302`, which
# look like one-hot encodings of the same value — confirm this redundancy is intended.
col1 = [
    'Longitude', 'Latitude', 'Speed',
    'Distance', 'Distance_x', 'Distance_y',
    'PCI', 'PCI_64', 'PCI_65', 'PCI_302',
]
# Name of the label column; it is not passed to the assembler.
col2 = ['Power']

feature_assembler = VectorAssembler(outputCol='features', inputCols=col1)

In [None]:
# Apply the assembler: `result` is the joined table plus the new 'features' vector column.
result = feature_assembler.transform(dataset=total_data)

In [None]:
# Preview the first five rows, now including the assembled 'features' column.
result.show(n=5)

+---+---------+-----------------+-----+------------------+--------------------+--------------------+---+------+------+-------+------+--------------------+
|_c0|Longitude|         Latitude|Speed|          Distance|          Distance_x|          Distance_y|PCI|PCI_64|PCI_65|PCI_302| Power|            features|
+---+---------+-----------------+-----+------------------+--------------------+--------------------+---+------+------+-------+------+--------------------+
|  0|12.520112|        55.780073|21.51|0.5530644296850808|-0.00459719999999...|-0.00336900000000...| 64|     1|     0|      0|-55.42|[12.520112,55.780...|
|  1|12.520119|        55.780071|20.93| 0.553103502794036|-0.00459920000000...|-0.00336200000000...| 64|     1|     0|      0|-55.42|[12.520119,55.780...|
|  2|12.520127|         55.78007|20.36|0.5530163642392476|-0.00460019999999...|-0.00335399999999...| 64|     1|     0|      0|-55.42|[12.520127,55.780...|
|  3|12.520135|55.78006800000001|19.78| 0.553032643909539|-0.004602199

In [None]:

# Look at the assembled feature vector of the first row.
result.select('features').head()

Row(features=DenseVector([12.5201, 55.7801, 21.51, 0.5531, -0.0046, -0.0034, 64.0, 1.0, 0.0, 0.0]))

In [None]:
# Keep only what the model needs: the assembled feature vector and the `Power` label.
final_data = result.select(['features', 'Power'])
final_data.show(n=10)

+--------------------+------+
|            features| Power|
+--------------------+------+
|[12.520112,55.780...|-55.42|
|[12.520119,55.780...|-55.42|
|[12.520127,55.780...|-55.42|
|[12.520135,55.780...|-55.42|
|[12.520142,55.780...|-55.42|
|[12.52015,55.7800...|-55.42|
|[12.520158,55.780...|-55.42|
|[12.520165,55.780...|-55.42|
|[12.520173,55.780...|-49.67|
|[12.520181,55.780...|-49.67|
+--------------------+------+
only showing top 10 rows



In [None]:
# Train a linear regression model that predicts `Power` from the assembled features.
from pyspark.ml.regression import LinearRegression

# Fix the split seed so the 75/25 train/test partition, and therefore the metrics
# reported afterwards, no longer changes on every run.
# NOTE(review): a seeded split is only fully repeatable when the input rows arrive
# in a stable order/partitioning — confirm this for the joined frame, or cache/sort
# it first if exact reproducibility matters.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name; `regressor` is the fitted model
# that the evaluation cells use.
lr_estimator = LinearRegression(featuresCol='features', labelCol='Power')
regressor = lr_estimator.fit(train_data)

In [None]:
# Evaluate the fitted model on the held-out test split.
# `pred` is the evaluation summary (predictions plus error metrics), not raw predictions.
pred = regressor.evaluate(dataset=test_data)

In [None]:
# Show ten test rows with the actual `Power` next to the model's prediction.
pred.predictions.show(n=10)

+--------------------+-------+-------------------+
|            features|  Power|         prediction|
+--------------------+-------+-------------------+
|[12.510841,55.781...|-110.11|-103.07553040655876|
|[12.51087,55.7817...|-110.11|-102.84591504108721|
|[12.510946,55.781...| -97.69| -102.2659175474837|
|[12.510952,55.781...| -97.69|-102.21991730967056|
|[12.510968,55.781...| -95.21|-102.10558092880092|
|[12.511007,55.782...| -95.23|-101.82439606888329|
|[12.511043,55.782...| -95.72| -101.5687620247736|
|[12.511049,55.782...| -95.72|-101.53080346232673|
|[12.511054,55.782...| -95.72|-101.49301341247156|
|[12.511068,55.782...| -95.72|-101.39322558782442|
+--------------------+-------+-------------------+
only showing top 10 rows



In [None]:
# Read the test-set error metrics off the evaluation summary.
mean_absolute_error, mean_squared_error = (
    pred.meanAbsoluteError,
    pred.meanSquaredError,
)

In [None]:
# Report both test-set error metrics, one per line.
print(
    "The mean absolute error is {}".format(mean_absolute_error),
    "The mean squared error is {}".format(mean_squared_error),
    sep="\n",
)

The mean absolute error is 6.847487566058613
The mean squared error is 77.77952090821448
