In [None]:
# ! pip install pyspark

In [None]:
# Importing the libraries
import pyspark


# Use 3 features:  'Type', 'Age', 'Breed1'

## Using pyspark to read the data and process it

In [26]:

# A SparkSession is the entry point to the Spark DataFrame API.
# NOTE: PySpark needs a Java runtime installed on the machine.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('petfinder').getOrCreate()

# Fixed seed so that the 70/30 split -- and therefore every metric computed
# further down -- is the same after "Restart Kernel -> Run All".
SPLIT_SEED = 42

# Columns kept for modelling: the three features plus the label.
selected_columns = ['Type', 'Age', 'Breed1', 'AdoptionSpeed']

# Read the CSV (first row is the header, column types are inferred) and
# split it into training and test data.
df_spark, df_spark_test = spark.read.csv('./train.csv', header=True, inferSchema=True).randomSplit([0.7, 0.3], seed=SPLIT_SEED)

######### For the training data #########
# Cast the label column "AdoptionSpeed" to integer, then keep only the needed columns
df_spark = df_spark.withColumn("AdoptionSpeed", df_spark["AdoptionSpeed"].cast("integer"))
df_spark = df_spark.select(selected_columns)

######### For the test data #########
# Same preparation as for the training data
df_spark_test = df_spark_test.withColumn("AdoptionSpeed", df_spark_test["AdoptionSpeed"].cast("integer"))
df_spark_test = df_spark_test.select(selected_columns)


## Print size of the data
print("Size of the training data: ", df_spark.count())
print("Size of the test data: ", df_spark_test.count())

Size of the training data:  10509
Size of the test data:  4484


## Data cleaning

In [27]:

from pyspark.sql.functions import mean

# Notes on DataFrame.na.drop:
#   how='any'  -> drop a row if any of the checked columns is null
#   how='all'  -> drop a row only if all of the checked columns are null
#   thresh=k   -> drop rows that have fewer than k non-null values (overrides `how`)
#   subset=[...] -> only the listed columns are checked

########### Drop rows without a label (train and test)
df_spark = df_spark.na.drop(how='any', subset=['AdoptionSpeed'])
df_spark_test = df_spark_test.na.drop(how='any', subset=['AdoptionSpeed'])

########### Fill missing 'Age' values with the mean
# The mean is computed on the TRAINING data only and applied to both splits,
# so no statistic derived from the test set influences the pipeline.
mean_val = df_spark.select(mean(df_spark['Age'])).collect()
mean_age = mean_val[0][0]

# mean() yields None when every 'Age' value is null; na.fill rejects None,
# so skip the imputation in that case.
if mean_age is not None:
    # na.fill returns a NEW DataFrame -- the result has to be assigned,
    # otherwise the imputation is silently discarded.
    df_spark = df_spark.na.fill(mean_age, subset=['Age'])
    df_spark_test = df_spark_test.na.fill(mean_age, subset=['Age'])

df_spark.show()
df_spark_test.show()




+----+---+------+-------------+
|Type|Age|Breed1|AdoptionSpeed|
+----+---+------+-------------+
|   1|  0|   307|            1|
|   1|  0|   307|            4|
|   1|  0|   307|            2|
|   1|  0|   307|            2|
|   1|  0|   307|            4|
|   1|  0|   307|            3|
|   1|  0|   307|            4|
|   1|  1|   128|            3|
|   1|  1|   189|            1|
|   1|  1|   189|            0|
|   1|  1|   307|            2|
|   1|  1|   307|            2|
|   1|  1|   307|            4|
|   1|  1|   307|            2|
|   1|  1|   307|            3|
|   1|  1|   307|            3|
|   1|  1|   307|            1|
|   1|  1|   307|            4|
|   1|  1|   307|            3|
|   1|  1|   307|            3|
+----+---+------+-------------+
only showing top 20 rows

+----+---+------+-------------+
|Type|Age|Breed1|AdoptionSpeed|
+----+---+------+-------------+
|   1|  0|   307|            1|
|   1|  0|   307|            0|
|   1|  1|   141|            4|
|   1|  1|   1

# Using PySpark MLlib to build the model

In [28]:
# Spark ML estimators read their inputs from a single vector column, so the
# individual feature columns are packed into one column named 'features'.

from pyspark.ml.feature import VectorAssembler

# Order matters: it is the order of the entries inside each feature vector.
feature_columns = ['Breed1', 'Age', 'Type']

#### Training data: append a 'features' vector built from (Breed1, Age, Type)
featureassemble = VectorAssembler(inputCols=feature_columns, outputCol='features')
output = featureassemble.transform(df_spark)
output.show()

#### Test data: same assembly as for the training data
testfeatureassemble = VectorAssembler(inputCols=feature_columns, outputCol='features')
testoutput = testfeatureassemble.transform(df_spark_test)
testoutput.show()


+----+---+------+-------------+---------------+
|Type|Age|Breed1|AdoptionSpeed|       features|
+----+---+------+-------------+---------------+
|   1|  0|   307|            1|[307.0,0.0,1.0]|
|   1|  0|   307|            4|[307.0,0.0,1.0]|
|   1|  0|   307|            2|[307.0,0.0,1.0]|
|   1|  0|   307|            2|[307.0,0.0,1.0]|
|   1|  0|   307|            4|[307.0,0.0,1.0]|
|   1|  0|   307|            3|[307.0,0.0,1.0]|
|   1|  0|   307|            4|[307.0,0.0,1.0]|
|   1|  1|   128|            3|[128.0,1.0,1.0]|
|   1|  1|   189|            1|[189.0,1.0,1.0]|
|   1|  1|   189|            0|[189.0,1.0,1.0]|
|   1|  1|   307|            2|[307.0,1.0,1.0]|
|   1|  1|   307|            2|[307.0,1.0,1.0]|
|   1|  1|   307|            4|[307.0,1.0,1.0]|
|   1|  1|   307|            2|[307.0,1.0,1.0]|
|   1|  1|   307|            3|[307.0,1.0,1.0]|
|   1|  1|   307|            3|[307.0,1.0,1.0]|
|   1|  1|   307|            1|[307.0,1.0,1.0]|
|   1|  1|   307|            4|[307.0,1.

In [29]:
# Keep only the two columns used for modelling: the assembled feature
# vector and the target column.
model_columns = ['features', 'AdoptionSpeed']

#### Training data
finalized_data = output.select(*model_columns)
finalized_data.show()

#### Test data
testfinalized_data = testoutput.select(*model_columns)
testfinalized_data.show()


+---------------+-------------+
|       features|AdoptionSpeed|
+---------------+-------------+
|[307.0,0.0,1.0]|            1|
|[307.0,0.0,1.0]|            4|
|[307.0,0.0,1.0]|            2|
|[307.0,0.0,1.0]|            2|
|[307.0,0.0,1.0]|            4|
|[307.0,0.0,1.0]|            3|
|[307.0,0.0,1.0]|            4|
|[128.0,1.0,1.0]|            3|
|[189.0,1.0,1.0]|            1|
|[189.0,1.0,1.0]|            0|
|[307.0,1.0,1.0]|            2|
|[307.0,1.0,1.0]|            2|
|[307.0,1.0,1.0]|            4|
|[307.0,1.0,1.0]|            2|
|[307.0,1.0,1.0]|            3|
|[307.0,1.0,1.0]|            3|
|[307.0,1.0,1.0]|            1|
|[307.0,1.0,1.0]|            4|
|[307.0,1.0,1.0]|            3|
|[307.0,1.0,1.0]|            3|
+---------------+-------------+
only showing top 20 rows

+---------------+-------------+
|       features|AdoptionSpeed|
+---------------+-------------+
|[307.0,0.0,1.0]|            1|
|[307.0,0.0,1.0]|            0|
|[141.0,1.0,1.0]|            4|
|[173.0,1.0,1.

### 1. Logistic Regression

In [30]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Fit the model on the training split prepared in the cells above.
train_data = finalized_data
classifier = LogisticRegression(labelCol='AdoptionSpeed').fit(train_data)

# Score the held-out test split. evaluate() returns a summary object whose
# `.predictions` attribute is the test DataFrame with prediction columns added.
test_data = testfinalized_data
results = classifier.evaluate(test_data)
results.predictions.show() # Show the predictions
results.predictions.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction

# F1 score on the test split
evaluator = MulticlassClassificationEvaluator(labelCol="AdoptionSpeed", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(results.predictions)
print("F1 score: %.3f" % f1_score)

# Confusion matrix: one row per true label, one column per predicted label
results.predictions.crosstab('AdoptionSpeed', 'prediction').show()



+---------------+-------------+--------------------+--------------------+----------+
|       features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+---------------+-------------+--------------------+--------------------+----------+
|[307.0,0.0,1.0]|            1|[3.85593289186591...|[0.01547177856361...|       4.0|
|[307.0,0.0,1.0]|            0|[3.85593289186591...|[0.01547177856361...|       4.0|
|[141.0,1.0,1.0]|            4|[4.30374095429116...|[0.03338936633175...|       1.0|
|[173.0,1.0,1.0]|            0|[4.21990095186108...|[0.02930485903556...|       2.0|
|[307.0,1.0,1.0]|            2|[3.86882094168511...|[0.01542103828638...|       4.0|
|[307.0,1.0,1.0]|            4|[3.86882094168511...|[0.01542103828638...|       4.0|
|[307.0,1.0,1.0]|            1|[3.86882094168511...|[0.01542103828638...|       4.0|
|[307.0,1.0,1.0]|            4|[3.86882094168511...|[0.01542103828638...|       4.0|
|[307.0,1.0,1.0]|            1|[3.86882094168511...|[0.0154210382

### 2. Decision Tree

In [31]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Inputs: the training and test splits prepared in the cells above.
train_data = finalized_data
test_data = testfinalized_data

# Build the estimator and fit it; `classifier` ends up holding the fitted model.
classifier = DecisionTreeClassifier(
    labelCol='AdoptionSpeed',
    featuresCol='features',
).fit(train_data)

# Apply the fitted model to the test split (adds the prediction columns).
results = classifier.transform(test_data)
results.show()
results.select('AdoptionSpeed', 'prediction').show()

# F1 score on the test split
evaluator = MulticlassClassificationEvaluator(
    labelCol="AdoptionSpeed",
    predictionCol="prediction",
    metricName="f1",
)
f1_score = evaluator.evaluate(results)
print("F1 score: %.3f" % f1_score)

# Confusion matrix: one row per true label, one column per predicted label
results.crosstab('AdoptionSpeed', 'prediction').show()



+---------------+-------------+--------------------+--------------------+----------+
|       features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+---------------+-------------+--------------------+--------------------+----------+
|[307.0,0.0,1.0]|            1|[12.0,177.0,281.0...|[0.01459854014598...|       2.0|
|[307.0,0.0,1.0]|            0|[12.0,177.0,281.0...|[0.01459854014598...|       2.0|
|[141.0,1.0,1.0]|            4|[11.0,64.0,66.0,7...|[0.04641350210970...|       3.0|
|[173.0,1.0,1.0]|            0|[11.0,64.0,66.0,7...|[0.04641350210970...|       3.0|
|[307.0,1.0,1.0]|            2|[12.0,177.0,281.0...|[0.01459854014598...|       2.0|
|[307.0,1.0,1.0]|            4|[12.0,177.0,281.0...|[0.01459854014598...|       2.0|
|[307.0,1.0,1.0]|            1|[12.0,177.0,281.0...|[0.01459854014598...|       2.0|
|[307.0,1.0,1.0]|            4|[12.0,177.0,281.0...|[0.01459854014598...|       2.0|
|[307.0,1.0,1.0]|            1|[12.0,177.0,281.0...|[0.0145985401

### 3. Random Forest

In [32]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Fit the model on the training split prepared in the cells above.
train_data = finalized_data
# A random forest is a stochastic estimator; the seed is fixed so that the
# fitted model and the reported metrics are reproducible between runs.
classifier = RandomForestClassifier(labelCol='AdoptionSpeed', featuresCol='features', seed=42)
classifier = classifier.fit(train_data) # `classifier` now holds the fitted model

# Apply the fitted model to the held-out test split (adds the prediction columns).
test_data = testfinalized_data
results = classifier.transform(test_data)
results.show() # Show the predictions
results.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction

# F1 score on the test split
evaluator = MulticlassClassificationEvaluator(labelCol="AdoptionSpeed", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(results)
print("F1 score: %.3f" % f1_score)

# Confusion matrix: one row per true label, one column per predicted label
results.crosstab('AdoptionSpeed', 'prediction').show()



+---------------+-------------+--------------------+--------------------+----------+
|       features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+---------------+-------------+--------------------+--------------------+----------+
|[307.0,0.0,1.0]|            1|[0.32194941714301...|[0.01609747085715...|       2.0|
|[307.0,0.0,1.0]|            0|[0.32194941714301...|[0.01609747085715...|       2.0|
|[141.0,1.0,1.0]|            4|[0.76637273249964...|[0.03831863662498...|       1.0|
|[173.0,1.0,1.0]|            0|[0.76637273249964...|[0.03831863662498...|       1.0|
|[307.0,1.0,1.0]|            2|[0.34223752902365...|[0.01711187645118...|       2.0|
|[307.0,1.0,1.0]|            4|[0.34223752902365...|[0.01711187645118...|       2.0|
|[307.0,1.0,1.0]|            1|[0.34223752902365...|[0.01711187645118...|       2.0|
|[307.0,1.0,1.0]|            4|[0.34223752902365...|[0.01711187645118...|       2.0|
|[307.0,1.0,1.0]|            1|[0.34223752902365...|[0.0171118764

### 4. Naive Bayes

In [33]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Training and test splits prepared in the cells above.
train_data, test_data = finalized_data, testfinalized_data

# Fit the estimator; `classifier` ends up holding the fitted model.
classifier = NaiveBayes(labelCol='AdoptionSpeed', featuresCol='features').fit(train_data)

# Apply the fitted model to the test split and inspect the predictions.
results = classifier.transform(test_data)
results.show()
results.select('AdoptionSpeed', 'prediction').show()

# F1 score on the test split
evaluator = MulticlassClassificationEvaluator(
    labelCol="AdoptionSpeed", predictionCol="prediction", metricName="f1"
)
f1_score = evaluator.evaluate(results)
print("F1 score: %.3f" % f1_score)

# Confusion matrix: one row per true label, one column per predicted label
results.crosstab('AdoptionSpeed', 'prediction').show()


+---------------+-------------+--------------------+--------------------+----------+
|       features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+---------------+-------------+--------------------+--------------------+----------+
|[307.0,0.0,1.0]|            1|[-22.619651984454...|[0.00524808483129...|       2.0|
|[307.0,0.0,1.0]|            0|[-22.619651984454...|[0.00524808483129...|       2.0|
|[141.0,1.0,1.0]|            4|[-18.365367911113...|[0.01977398639212...|       2.0|
|[173.0,1.0,1.0]|            0|[-19.814951597034...|[0.01649168010014...|       2.0|
|[307.0,1.0,1.0]|            2|[-25.885083281829...|[0.00663321920203...|       2.0|
|[307.0,1.0,1.0]|            4|[-25.885083281829...|[0.00663321920203...|       2.0|
|[307.0,1.0,1.0]|            1|[-25.885083281829...|[0.00663321920203...|       2.0|
|[307.0,1.0,1.0]|            4|[-25.885083281829...|[0.00663321920203...|       2.0|
|[307.0,1.0,1.0]|            1|[-25.885083281829...|[0.0066332192