## Import packages

In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
import pyspark.mllib.linalg # the 'mllib' is for RDD which is absoluted.

In [5]:
spark = SparkSession.builder.appName('Pyspark ML Algo').getOrCreate()

In [54]:
dataframe = spark.read.csv('Admission_Predict.csv',header=True)

In [55]:
type(dataframe)

pyspark.sql.dataframe.DataFrame

In [56]:
dataframe.show()

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|      324|        107|                4|  4| 4.5|8.87|       1|            0.76|
|      316|        104|                3|  3| 3.5|   8|       1|            0.72|
|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|      314|        103|                2|  2|   3|8.21|       0|            0.65|
|      330|        115|                5|4.5|   3|9.34|       1|             0.9|
|      321|        109|                3|  3|   4| 8.2|       1|            0.75|
|      308|        101|                2|  3|   4| 7.9|       0|            0.68|
|      302|        102|                1|  2| 1.5|   8|       0|             0.5|
|      323|     

In [57]:
dataframe.printSchema()

root
 |-- GRE Score: string (nullable = true)
 |-- TOEFL Score: string (nullable = true)
 |-- University Rating: string (nullable = true)
 |-- SOP: string (nullable = true)
 |-- LOR : string (nullable = true)
 |-- CGPA: string (nullable = true)
 |-- Research: string (nullable = true)
 |-- Chance of Admit : string (nullable = true)



In [58]:
dataframe.columns

['GRE Score',
 'TOEFL Score',
 'University Rating',
 'SOP',
 'LOR ',
 'CGPA',
 'Research',
 'Chance of Admit ']

In [59]:
from pyspark.sql.functions import col

for c in dataframe.columns:
    print(c)
    print(col(c))

GRE Score
Column<'GRE Score'>
TOEFL Score
Column<'TOEFL Score'>
University Rating
Column<'University Rating'>
SOP
Column<'SOP'>
LOR 
Column<'LOR '>
CGPA
Column<'CGPA'>
Research
Column<'Research'>
Chance of Admit 
Column<'Chance of Admit '>


In [60]:
dataframe.select(*(col(c) for c in dataframe.columns)).show()

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|      324|        107|                4|  4| 4.5|8.87|       1|            0.76|
|      316|        104|                3|  3| 3.5|   8|       1|            0.72|
|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|      314|        103|                2|  2|   3|8.21|       0|            0.65|
|      330|        115|                5|4.5|   3|9.34|       1|             0.9|
|      321|        109|                3|  3|   4| 8.2|       1|            0.75|
|      308|        101|                2|  3|   4| 7.9|       0|            0.68|
|      302|        102|                1|  2| 1.5|   8|       0|             0.5|
|      323|     

## EDA
### Convert String to Float type

In [66]:
dataframe.select(*(col(c).cast('float') for c in dataframe.columns)).show()

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|    337.0|      118.0|              4.0|4.5| 4.5|9.65|     1.0|            0.92|
|    324.0|      107.0|              4.0|4.0| 4.5|8.87|     1.0|            0.76|
|    316.0|      104.0|              3.0|3.0| 3.5| 8.0|     1.0|            0.72|
|    322.0|      110.0|              3.0|3.5| 2.5|8.67|     1.0|             0.8|
|    314.0|      103.0|              2.0|2.0| 3.0|8.21|     0.0|            0.65|
|    330.0|      115.0|              5.0|4.5| 3.0|9.34|     1.0|             0.9|
|    321.0|      109.0|              3.0|3.0| 4.0| 8.2|     1.0|            0.75|
|    308.0|      101.0|              2.0|3.0| 4.0| 7.9|     0.0|            0.68|
|    302.0|      102.0|              1.0|2.0| 1.5| 8.0|     0.0|             0.5|
|    323.0|     

In [67]:
new_df = dataframe.select(*(col(c).cast('float') for c in dataframe.columns))


In [68]:
new_df.printSchema()

root
 |-- GRE Score: float (nullable = true)
 |-- TOEFL Score: float (nullable = true)
 |-- University Rating: float (nullable = true)
 |-- SOP: float (nullable = true)
 |-- LOR : float (nullable = true)
 |-- CGPA: float (nullable = true)
 |-- Research: float (nullable = true)
 |-- Chance of Admit : float (nullable = true)



### Null Check

In [69]:
from pyspark.sql.functions import col,count,isnan,isnull,when

In [70]:
new_df.select([count(when(col(c).isNull(),c)) for c in new_df.columns]).show()

+-------------------------------------------------------+-----------------------------------------------------------+-----------------------------------------------------------------------+-------------------------------------------+---------------------------------------------+---------------------------------------------+-----------------------------------------------------+---------------------------------------------------------------------+
|count(CASE WHEN (GRE Score IS NULL) THEN GRE Score END)|count(CASE WHEN (TOEFL Score IS NULL) THEN TOEFL Score END)|count(CASE WHEN (University Rating IS NULL) THEN University Rating END)|count(CASE WHEN (SOP IS NULL) THEN SOP END)|count(CASE WHEN (LOR  IS NULL) THEN LOR  END)|count(CASE WHEN (CGPA IS NULL) THEN CGPA END)|count(CASE WHEN (Research IS NULL) THEN Research END)|count(CASE WHEN (Chance of Admit  IS NULL) THEN Chance of Admit  END)|
+-------------------------------------------------------+-------------------------------------------

In [71]:
new_df.select([count(when(col(c).isNull(),c)).alias(c) for c in new_df.columns]).show()

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|        0|          0|                0|  0|   0|   0|       0|               0|
+---------+-----------+-----------------+---+----+----+--------+----------------+



## Feature Engineering

In [72]:
from pyspark.ml.feature import Imputer

In [73]:
imputer = Imputer(inputCols=["GRE Score","TOEFL Score","University Rating"],
                 outputCols=["GRE Score","TOEFL Score","University Rating"])
model = imputer.fit(new_df)
imputed_data=model.transform(new_df)

In [74]:
imputed_data.show()

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|    337.0|      118.0|              4.0|4.5| 4.5|9.65|     1.0|            0.92|
|    324.0|      107.0|              4.0|4.0| 4.5|8.87|     1.0|            0.76|
|    316.0|      104.0|              3.0|3.0| 3.5| 8.0|     1.0|            0.72|
|    322.0|      110.0|              3.0|3.5| 2.5|8.67|     1.0|             0.8|
|    314.0|      103.0|              2.0|2.0| 3.0|8.21|     0.0|            0.65|
|    330.0|      115.0|              5.0|4.5| 3.0|9.34|     1.0|             0.9|
|    321.0|      109.0|              3.0|3.0| 4.0| 8.2|     1.0|            0.75|
|    308.0|      101.0|              2.0|3.0| 4.0| 7.9|     0.0|            0.68|
|    302.0|      102.0|              1.0|2.0| 1.5| 8.0|     0.0|             0.5|
|    323.0|     

In [75]:
new_df.select([count(when(col(c).isNull(),c)).alias(c) for c in new_df.columns]).show()

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|        0|          0|                0|  0|   0|   0|       0|               0|
+---------+-----------+-----------------+---+----+----+--------+----------------+



In [76]:
features= imputed_data.drop('Chance of Admit')

In [77]:
features.columns

['GRE Score',
 'TOEFL Score',
 'University Rating',
 'SOP',
 'LOR ',
 'CGPA',
 'Research',
 'Chance of Admit ']

In [78]:
#Assemble our features together using VectorAssembler
assembler = VectorAssembler(inputCols=features.columns,outputCol='features')

In [79]:
output = assembler.transform(imputed_data)

In [80]:
output.columns

['GRE Score',
 'TOEFL Score',
 'University Rating',
 'SOP',
 'LOR ',
 'CGPA',
 'Research',
 'Chance of Admit ',
 'features']

In [81]:
output.show()

+---------+-----------+-----------------+---+----+----+--------+----------------+--------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |            features|
+---------+-----------+-----------------+---+----+----+--------+----------------+--------------------+
|    337.0|      118.0|              4.0|4.5| 4.5|9.65|     1.0|            0.92|[337.0,118.0,4.0,...|
|    324.0|      107.0|              4.0|4.0| 4.5|8.87|     1.0|            0.76|[324.0,107.0,4.0,...|
|    316.0|      104.0|              3.0|3.0| 3.5| 8.0|     1.0|            0.72|[316.0,104.0,3.0,...|
|    322.0|      110.0|              3.0|3.5| 2.5|8.67|     1.0|             0.8|[322.0,110.0,3.0,...|
|    314.0|      103.0|              2.0|2.0| 3.0|8.21|     0.0|            0.65|[314.0,103.0,2.0,...|
|    330.0|      115.0|              5.0|4.5| 3.0|9.34|     1.0|             0.9|[330.0,115.0,5.0,...|
|    321.0|      109.0|              3.0|3.0| 4.0| 8.2|     1.0|         

In [82]:
output.select('features').toPandas().values

array([[DenseVector([337.0, 118.0, 4.0, 4.5, 4.5, 9.65, 1.0, 0.92])],
       [DenseVector([324.0, 107.0, 4.0, 4.0, 4.5, 8.87, 1.0, 0.76])],
       [DenseVector([316.0, 104.0, 3.0, 3.0, 3.5, 8.0, 1.0, 0.72])],
       [DenseVector([322.0, 110.0, 3.0, 3.5, 2.5, 8.67, 1.0, 0.8])],
       [DenseVector([314.0, 103.0, 2.0, 2.0, 3.0, 8.21, 0.0, 0.65])],
       [DenseVector([330.0, 115.0, 5.0, 4.5, 3.0, 9.34, 1.0, 0.9])],
       [DenseVector([321.0, 109.0, 3.0, 3.0, 4.0, 8.2, 1.0, 0.75])],
       [DenseVector([308.0, 101.0, 2.0, 3.0, 4.0, 7.9, 0.0, 0.68])],
       [DenseVector([302.0, 102.0, 1.0, 2.0, 1.5, 8.0, 0.0, 0.5])],
       [DenseVector([323.0, 108.0, 3.0, 3.5, 3.0, 8.6, 0.0, 0.45])],
       [DenseVector([325.0, 106.0, 3.0, 3.5, 4.0, 8.4, 1.0, 0.52])],
       [DenseVector([327.0, 111.0, 4.0, 4.0, 4.5, 9.0, 1.0, 0.84])],
       [DenseVector([328.0, 112.0, 4.0, 4.0, 4.5, 9.1, 1.0, 0.78])],
       [DenseVector([307.0, 109.0, 3.0, 4.0, 3.0, 8.0, 1.0, 0.62])],
       [DenseVector([311.0, 104.

In [83]:
data = output.select('features','Chance of Admit ')
data.show()

+--------------------+----------------+
|            features|Chance of Admit |
+--------------------+----------------+
|[337.0,118.0,4.0,...|            0.92|
|[324.0,107.0,4.0,...|            0.76|
|[316.0,104.0,3.0,...|            0.72|
|[322.0,110.0,3.0,...|             0.8|
|[314.0,103.0,2.0,...|            0.65|
|[330.0,115.0,5.0,...|             0.9|
|[321.0,109.0,3.0,...|            0.75|
|[308.0,101.0,2.0,...|            0.68|
|[302.0,102.0,1.0,...|             0.5|
|[323.0,108.0,3.0,...|            0.45|
|[325.0,106.0,3.0,...|            0.52|
|[327.0,111.0,4.0,...|            0.84|
|[328.0,112.0,4.0,...|            0.78|
|[307.0,109.0,3.0,...|            0.62|
|[311.0,104.0,3.0,...|            0.61|
|[314.0,105.0,3.0,...|            0.54|
|[317.0,107.0,3.0,...|            0.66|
|[319.0,106.0,3.0,...|            0.65|
|[318.0,110.0,3.0,...|            0.63|
|[303.0,102.0,3.0,...|            0.62|
+--------------------+----------------+
only showing top 20 rows



### Train Test Split

In [84]:
train_df , test_df = data.randomSplit([0.7,0.3])

In [85]:
train_df.show()

+--------------------+----------------+
|            features|Chance of Admit |
+--------------------+----------------+
|[290.0,100.0,1.0,...|            0.47|
|[290.0,104.0,4.0,...|            0.45|
|[293.0,97.0,2.0,2...|            0.64|
|[294.0,95.0,1.0,1...|            0.49|
|[295.0,96.0,2.0,1...|            0.47|
|[296.0,95.0,2.0,3...|            0.44|
|[296.0,97.0,2.0,1...|            0.49|
|[296.0,99.0,2.0,2...|            0.61|
|[296.0,101.0,1.0,...|             0.6|
|[297.0,96.0,2.0,2...|            0.34|
|[297.0,98.0,2.0,2...|            0.59|
|[297.0,100.0,1.0,...|            0.52|
|[298.0,92.0,1.0,2...|            0.51|
|[298.0,98.0,2.0,1...|            0.44|
|[298.0,98.0,2.0,4...|            0.34|
|[298.0,99.0,2.0,4...|            0.46|
|[298.0,105.0,3.0,...|            0.69|
|[299.0,96.0,2.0,1...|            0.54|
|[299.0,97.0,3.0,5...|            0.38|
|[299.0,100.0,1.0,...|            0.59|
+--------------------+----------------+
only showing top 20 rows



In [86]:
test_df.show()

+--------------------+----------------+
|            features|Chance of Admit |
+--------------------+----------------+
|[294.0,93.0,1.0,1...|            0.46|
|[295.0,93.0,1.0,2...|            0.46|
|[295.0,99.0,2.0,2...|            0.57|
|[295.0,101.0,2.0,...|            0.69|
|[296.0,99.0,2.0,3...|            0.47|
|[297.0,96.0,2.0,2...|            0.43|
|[298.0,99.0,1.0,1...|            0.53|
|[298.0,101.0,2.0,...|            0.54|
|[299.0,94.0,1.0,1...|            0.42|
|[300.0,99.0,1.0,1...|            0.58|
|[300.0,99.0,1.0,3...|            0.36|
|[300.0,102.0,3.0,...|            0.63|
|[300.0,104.0,3.0,...|            0.71|
|[301.0,98.0,1.0,2...|            0.67|
|[301.0,106.0,4.0,...|            0.57|
|[302.0,99.0,2.0,1...|            0.56|
|[304.0,97.0,2.0,1...|            0.47|
|[305.0,105.0,2.0,...|            0.67|
|[305.0,105.0,2.0,...|            0.66|
|[306.0,103.0,2.0,...|            0.69|
+--------------------+----------------+
only showing top 20 rows



### Modeling
### Linear Regressor 

In [87]:
lin_reg = LinearRegression(featuresCol = 'features', labelCol='Chance of Admit ')

In [88]:
linerRegressor_model = lin_reg.fit(train_df)

In [89]:
print("Cofficients :- ", linerRegressor_model.coefficients)
print("Intercept :- ", linerRegressor_model.intercept)

Cofficients :-  [-7.776922089107526e-16,2.765044435524325e-17,1.868833427231193e-15,8.693437197977878e-16,-9.60871037836799e-16,-4.110962096564268e-15,4.058922006152679e-15,1.000000000000039]
Intercept :-  2.429096021212757e-13


In [90]:
trainSummary = linerRegressor_model.summary
print("RMSE :- ", trainSummary.rootMeanSquaredError)
print("R2 Score :- ", trainSummary.r2)

RMSE :-  5.499543554245529e-15
R2 Score :-  1.0


In [91]:
# Prediction
pred = linerRegressor_model.transform(test_df)
pred.select("prediction","Chance of Admit ",'features').show()

+-------------------+----------------+--------------------+
|         prediction|Chance of Admit |            features|
+-------------------+----------------+--------------------+
|0.46000000834465604|            0.46|[294.0,93.0,1.0,1...|
| 0.4600000083446564|            0.46|[295.0,93.0,1.0,2...|
| 0.5699999928474527|            0.57|[295.0,99.0,2.0,2...|
| 0.6899999976158291|            0.69|[295.0,101.0,2.0,...|
|  0.469999998807914|            0.47|[296.0,99.0,2.0,3...|
| 0.4300000071525608|            0.43|[297.0,96.0,2.0,2...|
| 0.5299999713897747|            0.53|[298.0,99.0,1.0,1...|
|  0.540000021457678|            0.54|[298.0,101.0,2.0,...|
|0.41999998688697915|            0.42|[299.0,94.0,1.0,1...|
| 0.5799999833107019|            0.58|[300.0,99.0,1.0,1...|
| 0.3600000143051198|            0.36|[300.0,99.0,1.0,3...|
| 0.6299999952316381|            0.63|[300.0,102.0,3.0,...|
| 0.7099999785423403|            0.71|[300.0,104.0,3.0,...|
| 0.6700000166893101|            0.67|[3

In [93]:
# Model Evaluation
pred_evaluator = RegressionEvaluator(predictionCol='prediction',labelCol='Chance of Admit ',metricName='r2')
print("R Squared (R2) on test data :- ", pred_evaluator.evaluate(pred))

R Squared (R2) on test data :-  1.0


### Random Forest Regressor

In [94]:
rf_reg =RandomForestRegressor(featuresCol = 'features', labelCol='Chance of Admit ')

In [96]:
# Train Model , This also runs the indexer
rf_model = rf_reg.fit(train_df)

In [97]:
# make Predictions
pred =  rf_model.transform(test_df)

In [98]:
pred.show()

+--------------------+----------------+-------------------+
|            features|Chance of Admit |         prediction|
+--------------------+----------------+-------------------+
|[294.0,93.0,1.0,1...|            0.46|  0.456819496537139|
|[295.0,93.0,1.0,2...|            0.46|  0.455023066358088|
|[295.0,99.0,2.0,2...|            0.57| 0.5659494850326985|
|[295.0,101.0,2.0,...|            0.69| 0.6427943304756355|
|[296.0,99.0,2.0,3...|            0.47|0.47057268513910955|
|[297.0,96.0,2.0,2...|            0.43|0.46534966862055105|
|[298.0,99.0,1.0,1...|            0.53| 0.5162438799401958|
|[298.0,101.0,2.0,...|            0.54| 0.5106368755157774|
|[299.0,94.0,1.0,1...|            0.42|0.43403484710689744|
|[300.0,99.0,1.0,1...|            0.58| 0.5824139718545196|
|[300.0,99.0,1.0,3...|            0.36| 0.4407534351551553|
|[300.0,102.0,3.0,...|            0.63| 0.6250995454695794|
|[300.0,104.0,3.0,...|            0.71| 0.6878256871606461|
|[301.0,98.0,1.0,2...|            0.67| 

In [99]:
pred_evaluator = RegressionEvaluator(labelCol='Chance of Admit ',predictionCol='prediction',metricName='r2')
print("R Squared (R2) on test data :- ", pred_evaluator.evaluate(pred))

R Squared (R2) on test data :-  0.9771942969881777


In [100]:
pred_evaluator = RegressionEvaluator(labelCol='Chance of Admit ',predictionCol='prediction',metricName='rmse')
print("RMSE on test data :- ", pred_evaluator.evaluate(pred))

RMSE on test data :-  0.021773607899974193
