In [6]:
# ! pip install pyspark

In [7]:
# Importing the libraries
import pyspark


# Use 3 features:  'Type', 'Age', 'Breed1'

## Using pyspark to read the data and process it

In [8]:

# A SparkSession is the entry point to the DataFrame API.
# Spark runs on the JVM, so Java needs to be installed on this machine.
from pyspark.sql import SparkSession

# "local[*]" runs Spark locally with as many worker threads as there are cores.
# Note that the master URL contains no "=": "local=[*]" cannot be parsed by Spark.
spark = SparkSession.builder.master("local[*]").appName('petfinder').getOrCreate()

# Seed for the train/test split so that the split (and every metric computed
# from it further down) is reproducible across runs.
SPLIT_SEED = 42

# Columns kept for modelling: the three features plus the target column.
MODEL_COLUMNS = ['Type', 'Age', 'Breed1', 'AdoptionSpeed']

# Import the dataset, cast the target column "AdoptionSpeed" to integer and
# keep only the columns we need. This is done once, before splitting, so the
# training and the test data receive exactly the same preparation.
df_raw = spark.read.csv('./train.csv', header=True, inferSchema=True)
df_prepared = (
    df_raw
    .withColumn("AdoptionSpeed", df_raw["AdoptionSpeed"].cast("integer"))
    .select(MODEL_COLUMNS)
)

# Split into training (weight 0.7) and test (weight 0.3) data
df_spark, df_spark_test = df_prepared.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

## Print size of the data
print("Size of the training data: ", df_spark.count())
print("Size of the test data: ", df_spark_test.count())

Size of the training data:  10540
Size of the test data:  4453


## Data cleaning

In [9]:

from pyspark.sql.functions import mean

# Quick reference for DataFrame.na.drop():
#   how='any'       -> drop a row if any of its values is null
#   how='all'       -> drop a row only if all of its values are null
#   thresh=10       -> drop rows that have fewer than 10 non-null values (overrides `how`)
#   subset=['Age']  -> only the listed columns are checked for nulls

########### For the train dataset

## Drop rows whose target value is missing
df_spark = df_spark.na.drop(how='any', subset=['AdoptionSpeed'])

## Compute the mean age on the training data only. The same value is reused
## for the test data below so that no test-set statistic leaks into the
## preprocessing.
mean_val = df_spark.select(mean(df_spark['Age'])).collect()
mean_age = mean_val[0][0]  # None when there is no non-null Age to average

## Fill missing ages with the mean. The result must be assigned back:
## na.fill() returns a new DataFrame and leaves the original untouched.
# NOTE(review): if 'Age' was inferred as an integer column, the fill value is
# presumably cast to that type (i.e. truncated) - confirm this is acceptable.
if mean_age is not None:
    df_spark = df_spark.na.fill(mean_age, subset=['Age'])
df_spark.show()

########### For the test dataset

## Drop rows whose target value is missing
df_spark_test = df_spark_test.na.drop(how='any', subset=['AdoptionSpeed'])

## Fill missing ages with the mean age of the training data
if mean_age is not None:
    df_spark_test = df_spark_test.na.fill(mean_age, subset=['Age'])
df_spark_test.show()




+----+---+------+-------------+
|Type|Age|Breed1|AdoptionSpeed|
+----+---+------+-------------+
|   1|  0|   307|            1|
|   1|  0|   307|            4|
|   1|  0|   307|            2|
|   1|  0|   307|            4|
|   1|  0|   307|            1|
|   1|  0|   307|            0|
|   1|  0|   307|            3|
|   1|  0|   307|            4|
|   1|  1|   141|            4|
|   1|  1|   173|            0|
|   1|  1|   189|            1|
|   1|  1|   189|            0|
|   1|  1|   307|            2|
|   1|  1|   307|            4|
|   1|  1|   307|            3|
|   1|  1|   307|            1|
|   1|  1|   307|            4|
|   1|  1|   307|            4|
|   1|  1|   307|            3|
|   1|  1|   307|            3|
+----+---+------+-------------+
only showing top 20 rows

+----+---+------+-------------+
|Type|Age|Breed1|AdoptionSpeed|
+----+---+------+-------------+
|   1|  0|   307|            2|
|   1|  1|   128|            3|
|   1|  1|   307|            2|
|   1|  1|   3

# Using PySpark MLlib to build the model

In [10]:
# Step 1: pack the individual feature columns into a single vector column.

from pyspark.ml.feature import VectorAssembler

# Columns that go into the vector, in this order: [Breed1, Age, Type]
assembler_inputs = ['Breed1', 'Age', 'Type']

#### For the train dataset
featureassemble = VectorAssembler(outputCol='features', inputCols=assembler_inputs)
# transform() returns the input frame plus a new 'features' vector column
output = featureassemble.transform(df_spark)
output.show()

#### For the test dataset
testfeatureassemble = VectorAssembler(outputCol='features', inputCols=assembler_inputs)
# Same assembly applied to the test frame
testoutput = testfeatureassemble.transform(df_spark_test)
testoutput.show()


+----+---+------+-------------+---------------+
|Type|Age|Breed1|AdoptionSpeed|       features|
+----+---+------+-------------+---------------+
|   1|  0|   307|            1|[307.0,0.0,1.0]|
|   1|  0|   307|            4|[307.0,0.0,1.0]|
|   1|  0|   307|            2|[307.0,0.0,1.0]|
|   1|  0|   307|            4|[307.0,0.0,1.0]|
|   1|  0|   307|            1|[307.0,0.0,1.0]|
|   1|  0|   307|            0|[307.0,0.0,1.0]|
|   1|  0|   307|            3|[307.0,0.0,1.0]|
|   1|  0|   307|            4|[307.0,0.0,1.0]|
|   1|  1|   141|            4|[141.0,1.0,1.0]|
|   1|  1|   173|            0|[173.0,1.0,1.0]|
|   1|  1|   189|            1|[189.0,1.0,1.0]|
|   1|  1|   189|            0|[189.0,1.0,1.0]|
|   1|  1|   307|            2|[307.0,1.0,1.0]|
|   1|  1|   307|            4|[307.0,1.0,1.0]|
|   1|  1|   307|            3|[307.0,1.0,1.0]|
|   1|  1|   307|            1|[307.0,1.0,1.0]|
|   1|  1|   307|            4|[307.0,1.0,1.0]|
|   1|  1|   307|            4|[307.0,1.

In [11]:
# Keep only the assembled feature vector and the target column.
model_columns = ['features', 'AdoptionSpeed']

#### For the train dataset
finalized_data = output.select(*model_columns)
finalized_data.show()

#### For the test dataset
testfinalized_data = testoutput.select(*model_columns)
testfinalized_data.show()


+---------------+-------------+
|       features|AdoptionSpeed|
+---------------+-------------+
|[307.0,0.0,1.0]|            1|
|[307.0,0.0,1.0]|            4|
|[307.0,0.0,1.0]|            2|
|[307.0,0.0,1.0]|            4|
|[307.0,0.0,1.0]|            1|
|[307.0,0.0,1.0]|            0|
|[307.0,0.0,1.0]|            3|
|[307.0,0.0,1.0]|            4|
|[141.0,1.0,1.0]|            4|
|[173.0,1.0,1.0]|            0|
|[189.0,1.0,1.0]|            1|
|[189.0,1.0,1.0]|            0|
|[307.0,1.0,1.0]|            2|
|[307.0,1.0,1.0]|            4|
|[307.0,1.0,1.0]|            3|
|[307.0,1.0,1.0]|            1|
|[307.0,1.0,1.0]|            4|
|[307.0,1.0,1.0]|            4|
|[307.0,1.0,1.0]|            3|
|[307.0,1.0,1.0]|            3|
+---------------+-------------+
only showing top 20 rows

+---------------+-------------+
|       features|AdoptionSpeed|
+---------------+-------------+
|[307.0,0.0,1.0]|            2|
|[128.0,1.0,1.0]|            3|
|[307.0,1.0,1.0]|            2|
|[307.0,1.0,1.

### 1. Logistic Regression

In [12]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Fit the model on the training split (the train/test split was done when the
# data was loaded; no further splitting happens here).
train_data = finalized_data
classifier = LogisticRegression(labelCol='AdoptionSpeed', featuresCol='features').fit(train_data)

# Evaluate on the held-out test split. evaluate() returns a summary object;
# the scored rows are available as `results.predictions`.
test_data = testfinalized_data
results = classifier.evaluate(test_data)
results.predictions.show() # Show the predictions
results.predictions.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction

# F1 score of the predictions against the true label
evaluator = MulticlassClassificationEvaluator(labelCol="AdoptionSpeed", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(results.predictions)
print("F1 score: %.3f" % f1_score)

# Confusion matrix: one row per true AdoptionSpeed, one column per predicted value
results.predictions.crosstab('AdoptionSpeed', 'prediction').show()



+---------------+-------------+--------------------+--------------------+----------+
|       features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+---------------+-------------+--------------------+--------------------+----------+
|[307.0,0.0,1.0]|            2|[3.16169972978996...|[0.01709492317757...|       2.0|
|[128.0,1.0,1.0]|            3|[4.01073097781256...|[0.03253082360487...|       2.0|
|[307.0,1.0,1.0]|            2|[3.18020601443288...|[0.01707240028563...|       4.0|
|[307.0,1.0,1.0]|            2|[3.18020601443288...|[0.01707240028563...|       4.0|
|[307.0,1.0,1.0]|            4|[3.18020601443288...|[0.01707240028563...|       4.0|
|[307.0,1.0,1.0]|            2|[3.18020601443288...|[0.01707240028563...|       4.0|
|[307.0,1.0,1.0]|            3|[3.18020601443288...|[0.01707240028563...|       4.0|
|[307.0,1.0,1.0]|            3|[3.18020601443288...|[0.01707240028563...|       4.0|
|[307.0,1.0,1.0]|            3|[3.18020601443288...|[0.0170724002

### 2. Decision Tree

In [13]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Fit a decision tree on the training data
train_data = finalized_data
dt_estimator = DecisionTreeClassifier(featuresCol='features', labelCol='AdoptionSpeed')
classifier = dt_estimator.fit(train_data)

# Score the test data; transform() appends the rawPrediction, probability
# and prediction columns to the input frame.
test_data = testfinalized_data
results = classifier.transform(test_data)
results.show()
results.select('AdoptionSpeed', 'prediction').show() # target next to the prediction

# F1 score of the predictions against the true label
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="AdoptionSpeed",
    predictionCol="prediction",
)
f1_score = evaluator.evaluate(results)
print("F1 score: %.3f" % f1_score)

# Confusion matrix
results.crosstab('AdoptionSpeed', 'prediction').show()



+---------------+-------------+--------------------+--------------------+----------+
|       features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+---------------+-------------+--------------------+--------------------+----------+
|[307.0,0.0,1.0]|            2|[41.0,427.0,693.0...|[0.01959847036328...|       2.0|
|[128.0,1.0,1.0]|            3|[2.0,30.0,26.0,19...|[0.02352941176470...|       1.0|
|[307.0,1.0,1.0]|            2|[41.0,427.0,693.0...|[0.01959847036328...|       2.0|
|[307.0,1.0,1.0]|            2|[41.0,427.0,693.0...|[0.01959847036328...|       2.0|
|[307.0,1.0,1.0]|            4|[41.0,427.0,693.0...|[0.01959847036328...|       2.0|
|[307.0,1.0,1.0]|            2|[41.0,427.0,693.0...|[0.01959847036328...|       2.0|
|[307.0,1.0,1.0]|            3|[41.0,427.0,693.0...|[0.01959847036328...|       2.0|
|[307.0,1.0,1.0]|            3|[41.0,427.0,693.0...|[0.01959847036328...|       2.0|
|[307.0,1.0,1.0]|            3|[41.0,427.0,693.0...|[0.0195984703

### 3. Random Forest

In [14]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train a random forest on the training data
train_data = finalized_data
rf_estimator = RandomForestClassifier(featuresCol='features', labelCol='AdoptionSpeed')
classifier = rf_estimator.fit(train_data)

# Apply the fitted model to the test data
test_data = testfinalized_data
results = classifier.transform(test_data)
results.show() # all columns, including the model outputs
target_vs_prediction = results.select('AdoptionSpeed', 'prediction')
target_vs_prediction.show()

# F1 score of the predictions against the true label
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="AdoptionSpeed",
    metricName="f1",
)
f1_score = evaluator.evaluate(results)
print("F1 score: %.3f" % f1_score)

# Confusion matrix
confusion = results.crosstab('AdoptionSpeed', 'prediction')
confusion.show()



+---------------+-------------+--------------------+--------------------+----------+
|       features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+---------------+-------------+--------------------+--------------------+----------+
|[307.0,0.0,1.0]|            2|[0.43416348295369...|[0.02170817414768...|       2.0|
|[128.0,1.0,1.0]|            3|[0.72443176663505...|[0.03622158833175...|       2.0|
|[307.0,1.0,1.0]|            2|[0.35548404163702...|[0.01777420208185...|       2.0|
|[307.0,1.0,1.0]|            2|[0.35548404163702...|[0.01777420208185...|       2.0|
|[307.0,1.0,1.0]|            4|[0.35548404163702...|[0.01777420208185...|       2.0|
|[307.0,1.0,1.0]|            2|[0.35548404163702...|[0.01777420208185...|       2.0|
|[307.0,1.0,1.0]|            3|[0.35548404163702...|[0.01777420208185...|       2.0|
|[307.0,1.0,1.0]|            3|[0.35548404163702...|[0.01777420208185...|       2.0|
|[307.0,1.0,1.0]|            3|[0.35548404163702...|[0.0177742020

### 4. Naive Bayes

In [15]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train a Naive Bayes classifier on the training data
train_data = finalized_data
nb_estimator = NaiveBayes(featuresCol='features', labelCol='AdoptionSpeed')
classifier = nb_estimator.fit(train_data)

# Predict on the test data
# NOTE(review): the saved output of this cell contains a prediction of 13.0,
# while the AdoptionSpeed values visible in the tables only range from 0 to 4.
# The training labels may contain unexpected values (possibly from rows that
# were not parsed correctly when reading the CSV) - TODO confirm.
test_data = testfinalized_data
results = classifier.transform(test_data)
results.show() # Show the predictions
results.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction

# F1 score of the predictions against the true label
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    predictionCol="prediction",
    labelCol="AdoptionSpeed",
)
f1_score = evaluator.evaluate(results)
print("F1 score: %.3f" % f1_score)

# Confusion matrix
results.crosstab('AdoptionSpeed', 'prediction').show()


+---------------+-------------+--------------------+--------------------+----------+
|       features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+---------------+-------------+--------------------+--------------------+----------+
|[307.0,0.0,1.0]|            2|[-22.437679169332...|[0.00282065235772...|      13.0|
|[128.0,1.0,1.0]|            3|[-17.697206668521...|[0.02210479811797...|       2.0|
|[307.0,1.0,1.0]|            2|[-25.713613854880...|[0.00581037364618...|       2.0|
|[307.0,1.0,1.0]|            2|[-25.713613854880...|[0.00581037364618...|       2.0|
|[307.0,1.0,1.0]|            4|[-25.713613854880...|[0.00581037364618...|       2.0|
|[307.0,1.0,1.0]|            2|[-25.713613854880...|[0.00581037364618...|       2.0|
|[307.0,1.0,1.0]|            3|[-25.713613854880...|[0.00581037364618...|       2.0|
|[307.0,1.0,1.0]|            3|[-25.713613854880...|[0.00581037364618...|       2.0|
|[307.0,1.0,1.0]|            3|[-25.713613854880...|[0.0058103736