In [1]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [2]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Education Dataset Analysis") \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/02 06:53:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/07/02 06:53:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Load national data CSV into Spark DataFrame
national_data = spark.read.csv("EDUNONCORE_DATA_NATIONAL.csv", header=True, inferSchema=True)



                                                                                

In [5]:
# Show schema and first few rows of the DataFrame
national_data.printSchema()
national_data.show(5)

root
 |-- INDICATOR_ID: string (nullable = true)
 |-- COUNTRY_ID: string (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- VALUE: double (nullable = true)
 |-- MAGNITUDE: string (nullable = true)
 |-- QUALIFIER: string (nullable = true)

+------------+----------+----+--------+---------+---------+
|INDICATOR_ID|COUNTRY_ID|YEAR|   VALUE|MAGNITUDE|QUALIFIER|
+------------+----------+----+--------+---------+---------+
|       20000|       ABW|2003|   597.0|     null|     null|
|       20000|       ABW|2004|   590.0|     null|     null|
|       20000|       ABW|2005|   654.0|     null|     null|
|       20000|       ABW|2013|   757.0|     null|     null|
|       20000|       AGO|1999|157782.0|     null|     null|
+------------+----------+----+--------+---------+---------+
only showing top 5 rows



In [6]:
# Example: Perform a simple linear regression using MLlib
# Assuming we want to predict 'VALUE' based on other features

# Selecting relevant columns and dropping rows with null values
selected_data = national_data.select("YEAR", "VALUE").na.drop()

# Assemble features into a vector
assembler = VectorAssembler(inputCols=["YEAR"], outputCol="features")
assembled_data = assembler.transform(selected_data)



In [7]:
# Split data into training and test sets
train_data, test_data = assembled_data.randomSplit([0.7, 0.3], seed=123)



In [8]:
# Initialize Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="VALUE")

# Fit the model to the training data
lr_model = lr.fit(train_data)



24/07/02 06:54:54 WARN Instrumentation: [3758c7af] regParam is zero, which might cause numerical instability and overfitting.
24/07/02 06:54:55 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/07/02 06:54:55 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
24/07/02 06:54:58 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [9]:
# Print coefficients and intercept
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))



Coefficients: [74535617.98572202]
Intercept: -150387967565.79614


In [10]:
# Predict on the test data
predictions = lr_model.transform(test_data)

# Show predictions alongside actual values
predictions.select("YEAR", "VALUE", "prediction").show()



[Stage 7:>                                                          (0 + 1) / 1]

+----+-----+--------------------+
|YEAR|VALUE|          prediction|
+----+-----+--------------------+
|1970|  0.0|-3.552800133923767E9|
|1970|  0.0|-3.552800133923767E9|
|1970|  0.0|-3.552800133923767E9|
|1970|  0.0|-3.552800133923767E9|
|1970|  0.0|-3.552800133923767E9|
|1970|  0.0|-3.552800133923767E9|
|1970| 25.0|-3.552800133923767E9|
|1970| 53.0|-3.552800133923767E9|
|1970| 53.0|-3.552800133923767E9|
|1970| 80.0|-3.552800133923767E9|
|1970| 84.0|-3.552800133923767E9|
|1970|102.0|-3.552800133923767E9|
|1970|108.0|-3.552800133923767E9|
|1970|119.0|-3.552800133923767E9|
|1970|183.0|-3.552800133923767E9|
|1970|185.0|-3.552800133923767E9|
|1970|243.0|-3.552800133923767E9|
|1970|279.0|-3.552800133923767E9|
|1970|393.0|-3.552800133923767E9|
|1970|428.0|-3.552800133923767E9|
+----+-----+--------------------+
only showing top 20 rows



                                                                                

In [11]:
# Evaluate the model using RMSE
evaluator = RegressionEvaluator(labelCol="VALUE", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)





Root Mean Squared Error (RMSE) on test data = 1.33833e+13


                                                                                

In [12]:
# Stop the Spark session
spark.stop()
