# Installing pyspark


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!java -version
!pip install pyspark

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
0% [Connecting to archive.ubuntu.com] [Waiting for headers] [1 InRelease 3,622 0% [Connecting to archive.ubuntu.com] [Waiting for headers] [Connecting to ppa.                                                                               Get:2 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease
Hit:5 http://archive.ubuntu.com/ubuntu focal InRelease
Hit:6 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Get:7 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Get:8 http://security.ubuntu.com/ubuntu focal-security/universe amd64 Packages [1,056 kB]
Hit:9 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Get:10 http://ppa.launchpad.net/graphics-drivers/ppa/

#Importing required libs 

In [3]:
from pyspark.sql.types import StringType
from pyspark.ml.feature import VectorAssembler,StringIndexer,OneHotEncoder,PCA
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark.sql.functions as F

#Reading csv file 

In [4]:
spark = SparkSession.builder.getOrCreate()
data = spark.read.option("header","true").csv("/content/drive/MyDrive/IE525/project/experiment-NUMBA/TestData/3-players.csv")
data.show(10)


+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+---

# Transform to VectorAssembler as features
### Convert string to float
### k-fold cross validation

In [5]:

#Drop null values(row)
df = data.na.drop()

#shuffle data
df = df.select("*").orderBy(F.rand())

#convert to double
df = df.select(*(col(c).cast("float").alias(c) for c in df.columns))

# Split data into training and test sets
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=123)

# Prepare features and label columns for training data
assembler = VectorAssembler(inputCols=trainingData.columns[2:399], outputCol="features")
trainingData = assembler.transform(trainingData)

# Define the Random Forest Classifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

# Define the parameter grid
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [20, 50]).build()

# Define the evaluation metric
evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="f1")

# Define the k-fold cross validator
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds = 5)


In [6]:

# Fit the cross validator on the training data
cvModel = cv.fit(trainingData)

# Get the best model
bestModel = cvModel.bestModel

# Prepare features and label columns for test data
assembler = VectorAssembler(inputCols=testData.columns[2:399], outputCol="features")
testData = assembler.transform(testData)

# Make predictions on the test data
predictions = bestModel.transform(testData)

# Evaluate the predictions
auc = evaluator.evaluate(predictions)
print("AUC:", auc)

AUC: 0.9575375647576917


# Save model

In [7]:
bestModel.write().overwrite().save("/content/drive/MyDrive/IE525/project/experiment-NUMBA/TestData/players-trained")

# Load model and evaluation

In [8]:
from pyspark.ml.classification import RandomForestClassificationModel


# Load the saved model from Google Drive
loadedModel = RandomForestClassificationModel.load("/content/drive/MyDrive/IE525/project/experiment-NUMBA/TestData/players-trained")

# Prepare features and label columns for test data
assembler1 = VectorAssembler(inputCols = testData.columns[2:399])
testData = assembler1.transform(testData)

# Use the loaded model to make predictions on new data
predict = loadedModel.transform(testData)

eval = MulticlassClassificationEvaluator(metricName='f1')
result = eval.evaluate(predict)
print("F1 score :",result)

F1 score : 0.9575375647576917


# Other Evaluators

In [9]:

eva = MulticlassClassificationEvaluator(metricName='precisionByLabel')
result = eva.evaluate(predict)
print("precision By Label :",result)


precision By Label : 0.9666666666666667


In [10]:

eva = MulticlassClassificationEvaluator(metricName='f1')
result = eva.evaluate(predict)
print("F1 score :",result)

F1 score : 0.9575375647576917


In [11]:
eva = MulticlassClassificationEvaluator(metricName='fMeasureByLabel')
result = eva.evaluate(predictions)
print("precision By Label :",result)


precision By Label : 0.9354838709677419


In [12]:
eva = MulticlassClassificationEvaluator(metricName='recallByLabel')
result = eva.evaluate(predictions)
print("precision By Label :",result)


precision By Label : 0.90625


In [13]:
eva = MulticlassClassificationEvaluator(metricName='truePositiveRateByLabel')
result = eva.evaluate(predictions)
print("precision By Label :",result)

precision By Label : 0.90625
