In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.0-preview2/spark-3.0.0-preview2-bin-hadoop2.7.tgz
!tar -xvf spark-3.0.0-preview2-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-preview2-bin-hadoop2.7"
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler,StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.getOrCreate()

df_train = spark.read.option("inferschema","true").csv("Planet_Training.csv", header=True)
df_train = df_train.select("Temperature", "Atmosphere Color", "Water", "Habitable")
df_train = df_train.na.drop()
df_train = df_train.withColumn("Atmosphere Color", when(df_train["Atmosphere Color"] == "Red", 0).when(df_train["Atmosphere Color"]=="Blue",1).otherwise(2))
df_train = df_train.withColumn("Water", when(df_train["Water"] == "Low", 0).when(df_train["Water"]=="Medium",1).otherwise(2))

col_train = df_train.columns
col_train.remove("Habitable")
df_train = VectorAssembler(inputCols = col_train, outputCol = "Features").transform(df_train)

scaler_train = StandardScaler(inputCol = "Features", outputCol="Scaled_Features")
df_train = scaler_train.fit(df_train).transform(df_train)

df_test = spark.read.option("inferSchema","true").csv("Planet_Testing.csv", header=True)
df_test = df_test.select("Temperature", "Atmosphere Color", "Water", "Habitable")
df_test = df_test.na.drop()
df_test = df_test.withColumn("Atmosphere Color", when(df_test["Atmosphere Color"] == "Red", 0).when(df_test["Atmosphere Color"]=="Blue",1).otherwise(2))
df_test = df_test.withColumn("Water", when(df_test["Water"] == "Low", 0).when(df_test["Water"]=="Medium",1).otherwise(2))

col_test = df_test.columns
col_test.remove("Habitable")
df_test = VectorAssembler(inputCols = col_test, outputCol = "Features").transform(df_test)

scaler_test = StandardScaler(inputCol = "Features", outputCol="Scaled_Features")
df_test = scaler_test.fit(df_test).transform(df_test)

model = LogisticRegression(featuresCol = "Scaled_Features", labelCol = "Habitable", maxIter = 10).fit(df_train)
prediction = model.transform(df_test)

evaluator = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", labelCol = "Habitable")
print("Accuracy: {}%".format(evaluator.evaluate(prediction)*100))


Accuracy: 91.71043337232418%
