## Create a Spark session

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start a Spark session for the application named "Grad"; getOrCreate()
# hands back an already-running session instead of building a second one.
spark = (
    SparkSession.builder
    .appName("Grad")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/27 19:55:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/27 19:55:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Echo the session object as the cell's output.
spark

In [5]:
# Load the admissions CSV into a DataFrame: the first row supplies the
# column names and Spark infers each column's type from the data.
grad_df = spark.read.csv(
    'Admission_Predict_Ver1.1.csv',
    header=True,
    inferSchema=True,
)

                                                                                

In [7]:
# Preview the data (first 20 rows, long cell values truncated — the show() defaults)
grad_df.show(n=20, truncate=True)

+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
|Serial No.|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
|         1|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|         2|      324|        107|                4|4.0| 4.5|8.87|       1|            0.76|
|         3|      316|        104|                3|3.0| 3.5| 8.0|       1|            0.72|
|         4|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|         5|      314|        103|                2|2.0| 3.0|8.21|       0|            0.65|
|         6|      330|        115|                5|4.5| 3.0|9.34|       1|             0.9|
|         7|      321|        109|                3|3.0| 4.0| 8.2|       1|            0.75|
|         8|      308|        101|                2|3.0| 4.0| 7.9|    

In [9]:
# Shape of the dataset, printed as (rows, columns)
n_rows = grad_df.count()
n_cols = len(grad_df.columns)
print((n_rows, n_cols))

(500, 9)


In [10]:
# Print the schema: one line per column with its inferred type and nullability.
grad_df.printSchema()

root
 |-- Serial No.: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR : double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit : double (nullable = true)



In [14]:
# Summary statistics for every column of the grad data
summary_stats = grad_df.describe()
summary_stats.show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|       Serial No.|         GRE Score|      TOEFL Score|University Rating|               SOP|              LOR |              CGPA|          Research|   Chance of Admit |
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

In [21]:
# Remove the 'Serial No.' column, which this analysis treats as unnecessary.
unused_column = 'Serial No.'
grad_df = grad_df.drop(unused_column)

In [22]:
# Show the DataFrame after the drop (first 20 rows, default truncation)
grad_df.show(n=20, truncate=True)

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|      324|        107|                4|4.0| 4.5|8.87|       1|            0.76|
|      316|        104|                3|3.0| 3.5| 8.0|       1|            0.72|
|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|      314|        103|                2|2.0| 3.0|8.21|       0|            0.65|
|      330|        115|                5|4.5| 3.0|9.34|       1|             0.9|
|      321|        109|                3|3.0| 4.0| 8.2|       1|            0.75|
|      308|        101|                2|3.0| 4.0| 7.9|       0|            0.68|
|      302|        102|                1|2.0| 1.5| 8.0|       0|             0.5|
|      323|     

In [23]:
# Check for null values: for each column, count the rows where it is null.
for column_name in grad_df.columns:
    null_rows = grad_df.filter(grad_df[column_name].isNull())
    print(column_name + ":", null_rows.count())

GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR : 0
CGPA: 0
Research: 0
Chance of Admit : 0


In [26]:
# Correlation analysis: correlate every column with the target column.
# The loop also visits the target itself, which yields a correlation of 1.0.
for column_name in grad_df.columns:
    corr_value = grad_df.stat.corr("Chance of Admit ", column_name)
    print("Correlation to chance of admit for {} is {} ".format(column_name, corr_value))

Correlation to chance of admit for GRE Score is 0.8103506354632598 
Correlation to chance of admit for TOEFL Score is 0.7922276143050823 
Correlation to chance of admit for University Rating is 0.6901323687886892 
Correlation to chance of admit for SOP is 0.6841365241316723 
Correlation to chance of admit for LOR  is 0.6453645135280112 
Correlation to chance of admit for CGPA is 0.882412574904574 
Correlation to chance of admit for Research is 0.5458710294711379 
Correlation to chance of admit for Chance of Admit  is 1.0 


In [28]:
# Feature selection: pack the three chosen predictor columns into a single
# vector column named 'feature'.
from pyspark.ml.feature import VectorAssembler

feature_columns = ['GRE Score', 'TOEFL Score', 'CGPA']
featureassembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='feature',
)

In [29]:
# Append the assembled 'feature' vector column to the DataFrame and preview it.
output_data = featureassembler.transform(grad_df)
output_data.show(n=20, truncate=True)

+---------+-----------+-----------------+---+----+----+--------+----------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |           feature|
+---------+-----------+-----------------+---+----+----+--------+----------------+------------------+
|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0| 4.5|8.87|       1|            0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0| 3.5| 8.0|       1|            0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0| 3.0|8.21|       0|            0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5| 3.0|9.34|       1|             0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0| 4.0| 8.2|       1|            0.75| [321.0,10

In [31]:
# Import the regressor, then keep only the two columns the model needs:
# the assembled feature vector and the target.
from pyspark.ml.regression import LinearRegression

final_data = output_data.select(['feature', 'Chance of Admit '])

In [32]:
# Print the schema of the final data to confirm it holds the feature vector
# and the target column.
final_data.printSchema()

root
 |-- feature: vector (nullable = true)
 |-- Chance of Admit : double (nullable = true)



In [33]:
# Split the dataset into train (~75%) and test (~25%).
# randomSplit assigns rows randomly, so without a seed the membership of each
# split — and every metric computed from it below — changes on every run.
# A fixed seed keeps the notebook reproducible under Restart & Run All.
train, test = final_data.randomSplit([0.75, 0.25], seed=42)

In [40]:
# Configure the linear-regression estimator (which columns hold the features
# and the label), then fit it on the training split.
models = LinearRegression(
    featuresCol='feature',
    labelCol='Chance of Admit ',
)
model = models.fit(train)

23/01/27 22:12:17 WARN Instrumentation: [17c734d6] regParam is zero, which might cause numerical instability and overfitting.
23/01/27 22:12:18 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/01/27 22:12:18 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/01/27 22:12:18 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [43]:
# Fitted coefficients, one per entry of the 'feature' vector
coefficients = model.coefficients
print("coefficients: ", coefficients)

coefficients:  [0.0026377217079742864,0.0025727859947345024,0.14288173079622432]


In [44]:
# Fitted intercept term of the regression
intercept = model.intercept
print("Intercept: ", intercept)

Intercept:  -1.6122002303911394


In [48]:
# Get the model's training summary. Per the PySpark docs this summarises the
# fit on the training set, so metrics read from it are training metrics,
# not held-out test metrics.
summary = model.summary


In [49]:
# Root-mean-squared error taken from the model summary
train_rmse = summary.rootMeanSquaredError
print("The RMSE : ", train_rmse)

The RMSE :  0.05907235939505537


In [51]:
# R-squared taken from the model summary
train_r2 = summary.r2
print("The r2 score is : ", train_r2)

The r2 score is :  0.8195228518067821


In [52]:
# Apply the fitted model to the test split; the result carries the original
# columns plus a 'prediction' column.
predictions = model.transform(test)

In [53]:
# Preview the predictions next to the true labels (first 20 rows, default truncation)
predictions.show(n=20, truncate=True)

+------------------+----------------+-------------------+
|           feature|Chance of Admit |         prediction|
+------------------+----------------+-------------------+
| [294.0,95.0,7.64]|            0.49| 0.4993210445362324|
|[295.0,101.0,7.86]|            0.69|  0.548829462987783|
|[296.0,101.0,7.68]|             0.6| 0.5257484731524367|
| [297.0,96.0,7.43]|            0.34| 0.4798018321876827|
| [297.0,96.0,7.89]|            0.43| 0.5455274283539457|
| [298.0,97.0,7.21]|            0.45|0.45357835911522204|
| [299.0,96.0,7.86]|            0.54| 0.5465164198460075|
|[299.0,100.0,8.02]|            0.63| 0.5796686407523415|
|[300.0,100.0,8.26]|            0.62| 0.6165979778514095|
|[300.0,102.0,8.17]|            0.63| 0.6088841940692185|
|[300.0,104.0,8.16]|            0.71| 0.6126009487507249|
|[301.0,106.0,8.47]|            0.57| 0.6646775789949981|
|[304.0,101.0,7.66]|            0.38| 0.5439926122003069|
|[304.0,103.0,7.92]|            0.71| 0.5862874341967939|
| [304.0,105.0

In [55]:
# Build an evaluator that scores the 'prediction' column against the true
# label using the r2 metric.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='Chance of Admit ',
    metricName="r2",
)

In [59]:
# R-squared of the model on the held-out test predictions
test_r2 = evaluator.evaluate(predictions)
print("r2 score on test data : ", test_r2)

r2 score on test data :  0.7599635940123712
