In [85]:
# ! pip install pyspark

In [86]:
# Importing the libraries
import pyspark


# Use PCA

## Using PySpark to read the data and process it

In [2]:
# A SparkSession is the entry point for Spark's DataFrame API
# (Java needs to be installed for Spark to run).
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('petfinder')
    .getOrCreate()
)

# Load the dataset. header=True takes the column names from the first row;
# inferSchema=True lets Spark infer the column types (without it, every
# column would be read as a string).
df_spark = spark.read.csv(
    './train_balanced_corr_pca.csv',
    header=True,
    inferSchema=True,
)

# Store the target column "AdoptionSpeed" as an integer.
adoption_speed_int = df_spark["AdoptionSpeed"].cast("integer")
df_spark = df_spark.withColumn("AdoptionSpeed", adoption_speed_int)

# Using PySpark MLlib to build the model

In [3]:
from pyspark.ml.feature import VectorAssembler

# Pack the input columns '0' and '1' (presumably the two PCA components --
# confirm against the CSV) into a single vector column named 'features'.
featureassemble = VectorAssembler(
    inputCols=['0', '1'],
    outputCol='features',
)

# transform() keeps the existing columns and adds the assembled 'features' column.
output = featureassemble.transform(df_spark)
output.show()

+-------------------+--------------------+-------------+--------------------+
|                  0|                   1|AdoptionSpeed|            features|
+-------------------+--------------------+-------------+--------------------+
| -37.16037461817183| -0.4456896700259529|            2|[-37.160374618171...|
| -4.160398248205791|-0.47001473885723066|            3|[-4.1603982482057...|
|-1.1604003963906866|-0.47222610875098475|            2|[-1.1604003963906...|
| -45.15867983673255|  0.4996916193946927|            1|[-45.158679836732...|
| -45.15867983673255|  0.4996916193946927|            2|[-45.158679836732...|
| -45.15867983673255|  0.4996916193946927|            0|[-45.158679836732...|
| -45.15947110531252| -0.4985787143538656|            1|[-45.159471105312...|
|-2.1588171431691077|  1.5250516820440492|            3|[-2.1588171431691...|
|-3.1596076956874497|  0.5275184715934086|            2|[-3.1596076956874...|
| -44.15957833716055|   0.557740526808022|            0|[-44.159

In [4]:
# Keep only what the models need: the assembled feature vector and the target.
model_columns = ['features', 'AdoptionSpeed']
finalized_data = output.select(model_columns)
finalized_data.show()

+--------------------+-------------+
|            features|AdoptionSpeed|
+--------------------+-------------+
|[-37.160374618171...|            2|
|[-4.1603982482057...|            3|
|[-1.1604003963906...|            2|
|[-45.158679836732...|            1|
|[-45.158679836732...|            2|
|[-45.158679836732...|            0|
|[-45.159471105312...|            1|
|[-2.1588171431691...|            3|
|[-3.1596076956874...|            2|
|[-44.159578337160...|            0|
|[-45.159471105312...|            2|
|[-4.1596069796258...|            2|
|[-45.159471105312...|            2|
|[-4.1603982482057...|            2|
|[54.8412485571042...|            1|
|[-45.158679836732...|            4|
|[-45.158679836732...|            1|
|[-45.159471105312...|            2|
|[-4.1603982482057...|            1|
|[-4.1603982482057...|            1|
+--------------------+-------------+
only showing top 20 rows



### 1. Logistic Regression

In [5]:
from pyspark.ml.classification import LogisticRegression

# Split the data into training and validation data.
# A fixed seed keeps the split -- and therefore the accuracy reported below --
# reproducible across kernel restarts.
# NOTE(review): only ~10% of the rows go to training and ~90% to validation;
# this looks inverted (e.g. [0.9, 0.1] or [0.8, 0.2] is more usual) -- confirm
# the ratio is intended before relying on the score.
train_data, validation_data = finalized_data.randomSplit([0.1, 0.9], seed=42)

# Fit the model. Only the label column is set explicitly; the features column
# is left at the estimator's default.
classifier = LogisticRegression(labelCol='AdoptionSpeed').fit(train_data)

# Evaluate the fitted model on the held-out validation data.
results = classifier.evaluate(validation_data)
results.predictions.show()  # Full prediction table
results.predictions.select('AdoptionSpeed', 'prediction').show()  # Target vs. prediction
results.accuracy  # Accuracy of the model on the validation data


+--------------------+-------------+--------------------+--------------------+----------+
|            features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+--------------------+-------------+--------------------+--------------------+----------+
|[-45.159577621098...|            4|[0.20595184275855...|[0.24352631590512...|       0.0|
|[-45.159471105312...|            0|[-0.2940410804854...|[0.14659358017979...|       4.0|
|[-45.159471105312...|            0|[-0.2940410804854...|[0.14659358017979...|       4.0|
|[-45.159471105312...|            0|[-0.2940410804854...|[0.14659358017979...|       4.0|
|[-45.159471105312...|            0|[-0.2940410804854...|[0.14659358017979...|       4.0|
|[-45.159471105312...|            0|[-0.2940410804854...|[0.14659358017979...|       4.0|
|[-45.159471105312...|            0|[-0.2940410804854...|[0.14659358017979...|       4.0|
|[-45.159471105312...|            0|[-0.2940410804854...|[0.14659358017979...|       4.0|
|[-45.1594

0.23835161183117315

### 2. Decision Tree

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Split the data into training and validation data.
# A fixed seed keeps the split -- and therefore the accuracy reported below --
# reproducible across kernel restarts.
# NOTE(review): only ~10% of the rows go to training and ~90% to validation;
# this looks inverted -- confirm the ratio is intended.
train_data, validation_data = finalized_data.randomSplit([0.1, 0.9], seed=42)

# Fit the decision tree on the training split.
classifier = DecisionTreeClassifier(labelCol='AdoptionSpeed', featuresCol='features')
classifier = classifier.fit(train_data)

# Score the validation split; transform() adds the prediction columns.
results = classifier.transform(validation_data)
results.show()  # Full prediction table
results.select('AdoptionSpeed', 'prediction').show()  # Target vs. prediction

# Compute accuracy with MLlib's evaluator rather than attaching an ad-hoc
# attribute to the DataFrame and running two separate count() jobs.
accuracy = MulticlassClassificationEvaluator(
    labelCol='AdoptionSpeed',
    predictionCol='prediction',
    metricName='accuracy',
).evaluate(results)
accuracy

+--------------------+-------------+--------------------+--------------------+----------+
|            features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+--------------------+-------------+--------------------+--------------------+----------+
|[-45.159577621098...|            4|[25.0,54.0,81.0,8...|[0.07062146892655...|       4.0|
|[-45.159471105312...|            0|[25.0,54.0,81.0,8...|[0.07062146892655...|       4.0|
|[-45.159471105312...|            0|[25.0,54.0,81.0,8...|[0.07062146892655...|       4.0|
|[-45.159471105312...|            0|[25.0,54.0,81.0,8...|[0.07062146892655...|       4.0|
|[-45.159471105312...|            0|[25.0,54.0,81.0,8...|[0.07062146892655...|       4.0|
|[-45.159471105312...|            0|[25.0,54.0,81.0,8...|[0.07062146892655...|       4.0|
|[-45.159471105312...|            0|[25.0,54.0,81.0,8...|[0.07062146892655...|       4.0|
|[-45.159471105312...|            0|[25.0,54.0,81.0,8...|[0.07062146892655...|       4.0|
|[-45.1594

0.24862208645992429

### 3. Random Forest

In [7]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Split the data into training and validation data.
# A fixed seed keeps the split -- and therefore the accuracy reported below --
# reproducible across kernel restarts.
# NOTE(review): only ~10% of the rows go to training and ~90% to validation;
# this looks inverted -- confirm the ratio is intended.
train_data, validation_data = finalized_data.randomSplit([0.1, 0.9], seed=42)

# Fit the random forest on the training split. The forest is itself
# randomised (bootstrap / feature sampling), so its seed is pinned as well.
classifier = RandomForestClassifier(
    labelCol='AdoptionSpeed',
    featuresCol='features',
    seed=42,
)
classifier = classifier.fit(train_data)

# Score the validation split; transform() adds the prediction columns.
results = classifier.transform(validation_data)
results.show()  # Full prediction table
results.select('AdoptionSpeed', 'prediction').show()  # Target vs. prediction

# Compute accuracy with MLlib's evaluator rather than attaching an ad-hoc
# attribute to the DataFrame and running two separate count() jobs.
accuracy = MulticlassClassificationEvaluator(
    labelCol='AdoptionSpeed',
    predictionCol='prediction',
    metricName='accuracy',
).evaluate(results)
accuracy


+--------------------+-------------+--------------------+--------------------+----------+
|            features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+--------------------+-------------+--------------------+--------------------+----------+
|[-45.159577621098...|            4|[1.41873724001744...|[0.07093686200087...|       4.0|
|[-45.159471105312...|            0|[1.28397977367512...|[0.06419898868375...|       4.0|
|[-45.159471105312...|            0|[1.28397977367512...|[0.06419898868375...|       4.0|
|[-45.159471105312...|            0|[1.28397977367512...|[0.06419898868375...|       4.0|
|[-45.159471105312...|            0|[1.28397977367512...|[0.06419898868375...|       4.0|
|[-45.159471105312...|            0|[1.28397977367512...|[0.06419898868375...|       4.0|
|[-45.159471105312...|            0|[1.28397977367512...|[0.06419898868375...|       4.0|
|[-45.159471105312...|            0|[1.28397977367512...|[0.06419898868375...|       4.0|
|[-45.1594

0.24893814706663128