In [47]:
#Predicting UPSC rank using linear regression of Pyspark
#!pip install Bio
#!pip install biopython

In [48]:
import pyspark
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("Predicting_upsc_rank").getOrCreate()

In [49]:
upsc_data=spark.read.csv("/home/moglix/Desktop/Amit/PGitHub/Python/UPSC_Result/CSM18_FQ_WEB_CELL_NEW.csv",inferSchema=True,header=True)
#upsc_data=upsc_data.drop(['ROLL_NO', 'NAME'])
#Dropping roll_num and name because it's not a important feature
columns_to_drop = ['ROLL_NO', 'NAME']
upsc_data = upsc_data.drop(*columns_to_drop)


In [50]:
#upsc_data=upsc_data.na.fill("UR", "CATEGORY")
# Filling null value for Category to best fit in feature model
colname=["CATEGORY"]
upsc_data=upsc_data.na.fill("UR",colname);
upsc_data.show()

+--------+--------+----------+-------+----+
|CATEGORY|PT_MARKS|MAIN_MARKS|F_TOTAL|RANK|
+--------+--------+----------+-------+----+
|      SC|     179|       942|   1121|   1|
|      UR|     198|       882|   1080|   2|
|      UR|     184|       893|   1077|   3|
|      UR|     184|       887|   1071|   4|
|      UR|     173|       895|   1068|   5|
|      UR|     184|       883|   1067|   6|
|      UR|     182|       885|   1067|   7|
|      UR|     195|       871|   1066|   8|
|      UR|     193|       871|   1064|   9|
|      UR|     187|       877|   1064|  10|
|      UR|     193|       870|   1063|  11|
|      UR|     171|       891|   1062|  12|
|      UR|     165|       897|   1062|  13|
|      UR|     182|       879|   1061|  14|
|      UR|     206|       854|   1060|  15|
|     OBC|     176|       884|   1060|  16|
|      UR|     193|       866|   1059|  17|
|      UR|     180|       879|   1059|  18|
|      UR|     171|       887|   1058|  19|
|      UR|     193|       864|  

In [51]:
# Because Category is a String so need to reindex it as a double to fit in feature model
from pyspark.ml.feature import StringIndexer
indexer=StringIndexer(inputCol='CATEGORY', outputCol='CATEGORY_cat')
indexed=indexer.fit(upsc_data).transform(upsc_data) 

In [52]:
indexed.show()

+--------+--------+----------+-------+----+------------+
|CATEGORY|PT_MARKS|MAIN_MARKS|F_TOTAL|RANK|CATEGORY_cat|
+--------+--------+----------+-------+----+------------+
|      SC|     179|       942|   1121|   1|         2.0|
|      UR|     198|       882|   1080|   2|         0.0|
|      UR|     184|       893|   1077|   3|         0.0|
|      UR|     184|       887|   1071|   4|         0.0|
|      UR|     173|       895|   1068|   5|         0.0|
|      UR|     184|       883|   1067|   6|         0.0|
|      UR|     182|       885|   1067|   7|         0.0|
|      UR|     195|       871|   1066|   8|         0.0|
|      UR|     193|       871|   1064|   9|         0.0|
|      UR|     187|       877|   1064|  10|         0.0|
|      UR|     193|       870|   1063|  11|         0.0|
|      UR|     171|       891|   1062|  12|         0.0|
|      UR|     165|       897|   1062|  13|         0.0|
|      UR|     182|       879|   1061|  14|         0.0|
|      UR|     206|       854| 

In [53]:
# Training model based on Main_marks, PT_marks, and Category, and total marks considering these a a feature
# Rank is the output 
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import  VectorAssembler

assembler=VectorAssembler(inputCols=['PT_MARKS', 
 'MAIN_MARKS', 
 'F_TOTAL', 
 'CATEGORY_cat'], outputCol='feature')
output=assembler.transform(indexed)
output.select('feature','RANK').show()

+--------------------+----+
|             feature|RANK|
+--------------------+----+
|[179.0,942.0,1121...|   1|
|[198.0,882.0,1080...|   2|
|[184.0,893.0,1077...|   3|
|[184.0,887.0,1071...|   4|
|[173.0,895.0,1068...|   5|
|[184.0,883.0,1067...|   6|
|[182.0,885.0,1067...|   7|
|[195.0,871.0,1066...|   8|
|[193.0,871.0,1064...|   9|
|[187.0,877.0,1064...|  10|
|[193.0,870.0,1063...|  11|
|[171.0,891.0,1062...|  12|
|[165.0,897.0,1062...|  13|
|[182.0,879.0,1061...|  14|
|[206.0,854.0,1060...|  15|
|[176.0,884.0,1060...|  16|
|[193.0,866.0,1059...|  17|
|[180.0,879.0,1059...|  18|
|[171.0,887.0,1058...|  19|
|[193.0,864.0,1057...|  20|
+--------------------+----+
only showing top 20 rows



In [54]:
# Training data model in 80% and 20% ratio
final_data=output.select('feature','RANK')
train_data, test_data=final_data.randomSplit([0.9,0.1])

In [55]:
train_data.describe().show()

+-------+------------------+
|summary|              RANK|
+-------+------------------+
|  count|               683|
|   mean| 383.6281112737921|
| stddev|220.36375399922917|
|    min|                 1|
|    max|               759|
+-------+------------------+



In [56]:
test_data.describe().show()

+-------+------------------+
|summary|              RANK|
+-------+------------------+
|  count|                76|
|   mean|347.39473684210526|
| stddev|207.50030226145813|
|    min|                19|
|    max|               753|
+-------+------------------+



In [57]:
# Using Linear Regression from Spark ML Library
from pyspark.ml.regression import LinearRegression

ship_lr=LinearRegression(featuresCol='feature', labelCol='RANK')
trainded_ship_model=ship_lr.fit(train_data)
ship_result=trainded_ship_model.evaluate(train_data)

# Getting accuracy score for trained model
print('Score :',ship_result.r2) 

Score : 0.8844123851990178


In [58]:
unlabeled_data=test_data.select('feature')
unlabeled_data.head(10)

[Row(feature=DenseVector([135.0, 818.0, 953.0, 1.0])),
 Row(feature=DenseVector([140.0, 851.0, 991.0, 0.0])),
 Row(feature=DenseVector([143.0, 638.0, 781.0, 1.0])),
 Row(feature=DenseVector([143.0, 806.0, 949.0, 1.0])),
 Row(feature=DenseVector([143.0, 850.0, 993.0, 0.0])),
 Row(feature=DenseVector([143.0, 868.0, 1011.0, 0.0])),
 Row(feature=DenseVector([145.0, 793.0, 938.0, 1.0])),
 Row(feature=DenseVector([146.0, 844.0, 990.0, 1.0])),
 Row(feature=DenseVector([151.0, 792.0, 943.0, 1.0])),
 Row(feature=DenseVector([151.0, 815.0, 966.0, 2.0]))]

In [59]:
# Transforming the test data and pre
predictions=trainded_ship_model.transform(unlabeled_data)

In [60]:
predictions.show()

+--------------------+------------------+
|             feature|        prediction|
+--------------------+------------------+
|[135.0,818.0,953....| 472.1268552907768|
|[140.0,851.0,991....| 271.6768740369712|
|[143.0,638.0,781....|1137.4608072326082|
|[143.0,806.0,949....|488.55883722811905|
|[143.0,850.0,993....|264.32007622171477|
|[143.0,868.0,1011...|194.79486514980454|
|[145.0,793.0,938....| 531.2919499746267|
|[146.0,844.0,990....|330.56408209039137|
|[151.0,792.0,943....| 512.7158426178953|
|[151.0,815.0,966....| 478.1663179677471|
|[154.0,773.0,927....|  629.172500927396|
|[154.0,850.0,1004...| 223.1826079029761|
|[155.0,849.0,1004...|223.30534978203514|
|[157.0,843.0,1000...|239.00088044502172|
|[160.0,755.0,915....| 676.2590929163571|
|[160.0,779.0,939....| 529.2705664342966|
|[160.0,783.0,943....| 513.8205195294277|
|[160.0,788.0,948....|440.21971584549374|
|[160.0,819.0,979....|374.77009738560855|
|[160.0,861.0,1021...| 158.2563598316383|
+--------------------+------------