In [2]:
#Predicting UPSC rank using linear regression of Pyspark
#!pip install Bio
#!pip install biopython

In [3]:
import pyspark
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("Predicting_upsc_rank").getOrCreate()

In [4]:
upsc_data=spark.read.csv("/home/moglix/Desktop/Amit/PGitHub/Python/UPSC_Result/CSM18_FQ_WEB_CELL_NEW.csv",inferSchema=True,header=True)
#upsc_data=upsc_data.drop(['ROLL_NO', 'NAME'])
#Dropping roll_num and name because it's not a important feature
columns_to_drop = ['ROLL_NO', 'NAME']
upsc_data = upsc_data.drop(*columns_to_drop)


In [5]:
#upsc_data=upsc_data.na.fill("UR", "CATEGORY")
# Filling null value for Category to best fit in feature model
colname=["CATEGORY"]
upsc_data=upsc_data.na.fill("UR",colname);
upsc_data.show()

+--------+--------+----------+-------+----+
|CATEGORY|PT_MARKS|MAIN_MARKS|F_TOTAL|RANK|
+--------+--------+----------+-------+----+
|      SC|     179|       942|   1121|   1|
|      UR|     198|       882|   1080|   2|
|      UR|     184|       893|   1077|   3|
|      UR|     184|       887|   1071|   4|
|      UR|     173|       895|   1068|   5|
|      UR|     184|       883|   1067|   6|
|      UR|     182|       885|   1067|   7|
|      UR|     195|       871|   1066|   8|
|      UR|     193|       871|   1064|   9|
|      UR|     187|       877|   1064|  10|
|      UR|     193|       870|   1063|  11|
|      UR|     171|       891|   1062|  12|
|      UR|     165|       897|   1062|  13|
|      UR|     182|       879|   1061|  14|
|      UR|     206|       854|   1060|  15|
|     OBC|     176|       884|   1060|  16|
|      UR|     193|       866|   1059|  17|
|      UR|     180|       879|   1059|  18|
|      UR|     171|       887|   1058|  19|
|      UR|     193|       864|  

In [6]:
# Because Category is a String so need to reindex it as a double to fit in feature model
from pyspark.ml.feature import StringIndexer
indexer=StringIndexer(inputCol='CATEGORY', outputCol='CATEGORY_cat')
indexed=indexer.fit(upsc_data).transform(upsc_data) 

In [7]:
indexed.show()

+--------+--------+----------+-------+----+------------+
|CATEGORY|PT_MARKS|MAIN_MARKS|F_TOTAL|RANK|CATEGORY_cat|
+--------+--------+----------+-------+----+------------+
|      SC|     179|       942|   1121|   1|         2.0|
|      UR|     198|       882|   1080|   2|         0.0|
|      UR|     184|       893|   1077|   3|         0.0|
|      UR|     184|       887|   1071|   4|         0.0|
|      UR|     173|       895|   1068|   5|         0.0|
|      UR|     184|       883|   1067|   6|         0.0|
|      UR|     182|       885|   1067|   7|         0.0|
|      UR|     195|       871|   1066|   8|         0.0|
|      UR|     193|       871|   1064|   9|         0.0|
|      UR|     187|       877|   1064|  10|         0.0|
|      UR|     193|       870|   1063|  11|         0.0|
|      UR|     171|       891|   1062|  12|         0.0|
|      UR|     165|       897|   1062|  13|         0.0|
|      UR|     182|       879|   1061|  14|         0.0|
|      UR|     206|       854| 

In [8]:
# Training model based on Main_marks, PT_marks, and Category, and total marks considering these a a feature
# Rank is the output 
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler=VectorAssembler(inputCols=['PT_MARKS', 
 'MAIN_MARKS', 
 'F_TOTAL', 
 'CATEGORY_cat'], outputCol='feature')
output=assembler.transform(indexed)
output.select('feature','RANK').show()

+--------------------+----+
|             feature|RANK|
+--------------------+----+
|[179.0,942.0,1121...|   1|
|[198.0,882.0,1080...|   2|
|[184.0,893.0,1077...|   3|
|[184.0,887.0,1071...|   4|
|[173.0,895.0,1068...|   5|
|[184.0,883.0,1067...|   6|
|[182.0,885.0,1067...|   7|
|[195.0,871.0,1066...|   8|
|[193.0,871.0,1064...|   9|
|[187.0,877.0,1064...|  10|
|[193.0,870.0,1063...|  11|
|[171.0,891.0,1062...|  12|
|[165.0,897.0,1062...|  13|
|[182.0,879.0,1061...|  14|
|[206.0,854.0,1060...|  15|
|[176.0,884.0,1060...|  16|
|[193.0,866.0,1059...|  17|
|[180.0,879.0,1059...|  18|
|[171.0,887.0,1058...|  19|
|[193.0,864.0,1057...|  20|
+--------------------+----+
only showing top 20 rows



In [9]:
# Training data model in 80% and 20% ratio
final_data=output.select('feature','RANK')
train_data, test_data=final_data.randomSplit([0.8,0.2])

In [10]:
train_data.describe().show()

+-------+------------------+
|summary|              RANK|
+-------+------------------+
|  count|               607|
|   mean| 380.7199341021417|
| stddev|221.24004103482565|
|    min|                 1|
|    max|               759|
+-------+------------------+



In [11]:
test_data.describe().show()

+-------+------------------+
|summary|              RANK|
+-------+------------------+
|  count|               152|
|   mean|           377.125|
| stddev|211.79636329104244|
|    min|                 2|
|    max|               751|
+-------+------------------+



In [12]:
# Using Linear Regression from Spark ML Library
from pyspark.ml.regression import LinearRegression

ship_lr=LinearRegression(featuresCol='feature', labelCol='RANK')
trainded_ship_model=ship_lr.fit(train_data)
ship_result=trainded_ship_model.evaluate(train_data)

# Getting accuracy score for trained model
print('Score :',ship_result.r2) 

Rsquared Error : 0.8740538013781964


In [16]:
unlabeled_data=test_data.select('feature')
unlabeled_data.head(10)

[Row(feature=DenseVector([135.0, 792.0, 927.0, 3.0])),
 Row(feature=DenseVector([138.0, 799.0, 937.0, 2.0])),
 Row(feature=DenseVector([138.0, 799.0, 937.0, 2.0])),
 Row(feature=DenseVector([140.0, 829.0, 969.0, 1.0])),
 Row(feature=DenseVector([143.0, 776.0, 919.0, 2.0])),
 Row(feature=DenseVector([143.0, 800.0, 943.0, 0.0])),
 Row(feature=DenseVector([143.0, 800.0, 943.0, 1.0])),
 Row(feature=DenseVector([143.0, 847.0, 990.0, 0.0])),
 Row(feature=DenseVector([143.0, 856.0, 999.0, 2.0])),
 Row(feature=DenseVector([143.0, 861.0, 1004.0, 0.0]))]

In [17]:
# Transforming the test data and pre
predictions=trainded_ship_model.transform(unlabeled_data)

In [18]:
predictions.show()

+--------------------+------------------+
|             feature|        prediction|
+--------------------+------------------+
|[135.0,792.0,927....| 680.1649350009698|
|[138.0,799.0,937....| 587.1013679671078|
|[138.0,799.0,937....| 587.1013679671078|
|[140.0,829.0,969....|411.12647843968534|
|[143.0,776.0,919....| 655.0848417246862|
|[143.0,800.0,943....|453.67848742286515|
|[143.0,800.0,943....| 509.1764725680946|
|[143.0,847.0,990....| 276.6248187339488|
|[143.0,856.0,999....|353.71689502014715|
|[143.0,861.0,1004...| 223.8854280606547|
|[146.0,774.0,920....| 651.4231538403137|
|[146.0,844.0,990....| 332.2282153286128|
|[146.0,850.0,996....|254.12763418054283|
|[149.0,767.0,916....| 666.5969626249753|
|[149.0,787.0,936....| 646.7529610940696|
|[149.0,834.0,983....| 358.7033221146944|
|[149.0,841.0,990....| 276.8356416328179|
|[151.0,773.0,924....| 692.0284273993734|
|[151.0,775.0,926....| 684.4942287317599|
|[151.0,777.0,928....|  621.462044918917|
+--------------------+------------