In [28]:
#Predicting UPSC rank using linear regression
#!pip install Bio
!pip install biopython

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/36/3a/35d78250cb04ced183db3c01344737a7cf95a163f3267383a21e86784a3d/biopython-1.74-cp35-cp35m-manylinux1_x86_64.whl (2.2MB)
[K     |████████████████████████████████| 2.2MB 2.1MB/s eta 0:00:01
Installing collected packages: biopython
Successfully installed biopython-1.74


In [20]:
import pyspark
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("Predicting_upsc_rank").getOrCreate()

In [45]:
upsc_data=spark.read.csv("/home/moglix/Desktop/Amit/PGitHub/Python/UPSC_Result/CSM18_FQ_WEB_CELL_NEW.csv",inferSchema=True,header=True)
#upsc_data=upsc_data.drop(['ROLL_NO', 'NAME'])
columns_to_drop = ['ROLL_NO', 'NAME']
upsc_data = upsc_data.drop(*columns_to_drop)


In [46]:
#upsc_data=upsc_data.na.fill("UR", "CATEGORY")
colname=["CATEGORY"]
upsc_data=upsc_data.na.fill("UR",colname);
upsc_data.show()

+--------+--------+----------+-------+----+
|CATEGORY|PT_MARKS|MAIN_MARKS|F_TOTAL|RANK|
+--------+--------+----------+-------+----+
|      SC|     179|       942|   1121|   1|
|      UR|     198|       882|   1080|   2|
|      UR|     184|       893|   1077|   3|
|      UR|     184|       887|   1071|   4|
|      UR|     173|       895|   1068|   5|
|      UR|     184|       883|   1067|   6|
|      UR|     182|       885|   1067|   7|
|      UR|     195|       871|   1066|   8|
|      UR|     193|       871|   1064|   9|
|      UR|     187|       877|   1064|  10|
|      UR|     193|       870|   1063|  11|
|      UR|     171|       891|   1062|  12|
|      UR|     165|       897|   1062|  13|
|      UR|     182|       879|   1061|  14|
|      UR|     206|       854|   1060|  15|
|     OBC|     176|       884|   1060|  16|
|      UR|     193|       866|   1059|  17|
|      UR|     180|       879|   1059|  18|
|      UR|     171|       887|   1058|  19|
|      UR|     193|       864|  

In [47]:
from pyspark.ml.feature import StringIndexer
indexer=StringIndexer(inputCol='CATEGORY', outputCol='CATEGORY_cat')
indexed=indexer.fit(upsc_data).transform(upsc_data) 

In [48]:
indexed.show()

+--------+--------+----------+-------+----+------------+
|CATEGORY|PT_MARKS|MAIN_MARKS|F_TOTAL|RANK|CATEGORY_cat|
+--------+--------+----------+-------+----+------------+
|      SC|     179|       942|   1121|   1|         2.0|
|      UR|     198|       882|   1080|   2|         0.0|
|      UR|     184|       893|   1077|   3|         0.0|
|      UR|     184|       887|   1071|   4|         0.0|
|      UR|     173|       895|   1068|   5|         0.0|
|      UR|     184|       883|   1067|   6|         0.0|
|      UR|     182|       885|   1067|   7|         0.0|
|      UR|     195|       871|   1066|   8|         0.0|
|      UR|     193|       871|   1064|   9|         0.0|
|      UR|     187|       877|   1064|  10|         0.0|
|      UR|     193|       870|   1063|  11|         0.0|
|      UR|     171|       891|   1062|  12|         0.0|
|      UR|     165|       897|   1062|  13|         0.0|
|      UR|     182|       879|   1061|  14|         0.0|
|      UR|     206|       854| 

In [50]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler=VectorAssembler(inputCols=['PT_MARKS', 
 'MAIN_MARKS', 
 'F_TOTAL', 
 'CATEGORY_cat'], outputCol='feature')
output=assembler.transform(indexed)
output.select('feature','RANK').show()

+--------------------+----+
|             feature|RANK|
+--------------------+----+
|[179.0,942.0,1121...|   1|
|[198.0,882.0,1080...|   2|
|[184.0,893.0,1077...|   3|
|[184.0,887.0,1071...|   4|
|[173.0,895.0,1068...|   5|
|[184.0,883.0,1067...|   6|
|[182.0,885.0,1067...|   7|
|[195.0,871.0,1066...|   8|
|[193.0,871.0,1064...|   9|
|[187.0,877.0,1064...|  10|
|[193.0,870.0,1063...|  11|
|[171.0,891.0,1062...|  12|
|[165.0,897.0,1062...|  13|
|[182.0,879.0,1061...|  14|
|[206.0,854.0,1060...|  15|
|[176.0,884.0,1060...|  16|
|[193.0,866.0,1059...|  17|
|[180.0,879.0,1059...|  18|
|[171.0,887.0,1058...|  19|
|[193.0,864.0,1057...|  20|
+--------------------+----+
only showing top 20 rows



In [51]:
final_data=output.select('feature','RANK')
train_data, test_data=final_data.randomSplit([0.7,0.3])

In [52]:
train_data.describe().show()

+-------+------------------+
|summary|              RANK|
+-------+------------------+
|  count|               525|
|   mean| 376.1485714285714|
| stddev|222.14116966121216|
|    min|                 1|
|    max|               759|
+-------+------------------+



In [53]:
test_data.describe().show()

+-------+------------------+
|summary|              RANK|
+-------+------------------+
|  count|               234|
|   mean|388.64102564102564|
| stddev|212.83020137820208|
|    min|                 2|
|    max|               754|
+-------+------------------+



In [54]:
from pyspark.ml.regression import LinearRegression

ship_lr=LinearRegression(featuresCol='feature', labelCol='RANK')
trainded_ship_model=ship_lr.fit(train_data)
ship_result=trainded_ship_model.evaluate(train_data)

print('Rsquared Error :',ship_result.r2) 

Rsquared Error : 0.8663209821645651


In [55]:
unlabeled_data=test_data.select('feature')
unlabeled_data.head(4)

[Row(feature=DenseVector([124.0, 833.0, 957.0, 2.0])),
 Row(feature=DenseVector([132.0, 823.0, 955.0, 1.0])),
 Row(feature=DenseVector([135.0, 645.0, 780.0, 0.0])),
 Row(feature=DenseVector([135.0, 792.0, 927.0, 3.0]))]

In [56]:
predictions=trainded_ship_model.transform(unlabeled_data)

In [57]:
predictions.show()

+--------------------+------------------+
|             feature|        prediction|
+--------------------+------------------+
|[124.0,833.0,957....| 513.5990173544446|
|[132.0,823.0,955....|465.61288784108274|
|[135.0,645.0,780....|1062.0614627817781|
|[135.0,792.0,927....| 679.1420145522939|
|[135.0,818.0,953....|472.81055832172024|
|[138.0,786.0,924....| 635.2913378641761|
|[138.0,794.0,932....| 660.2809945596659|
|[138.0,799.0,937....|  586.896626985254|
|[138.0,806.0,944....| 506.0669192756236|
|[138.0,849.0,987....| 291.2210891320551|
|[140.0,851.0,991....| 276.1652957585634|
|[143.0,777.0,920....| 649.7692353769785|
|[143.0,804.0,947....| 549.2571435515251|
|[143.0,811.0,954....| 523.1984530782597|
|[143.0,838.0,981....|422.68636125280636|
|[143.0,847.0,990....| 279.6402961715917|
|[143.0,866.0,1009...|208.90956488701386|
|[144.0,771.0,915....| 668.3000291634985|
|[146.0,676.0,822....| 904.8011978753871|
|[146.0,766.0,912....| 734.0739434996372|
+--------------------+------------