In [85]:
# ! pip install pyspark

In [86]:
# IMporting the libraries
import pyspark


# Use 3 features:  'Type', 'Age', 'Breed1'

## Using pyspark to read the data and process it

In [87]:
# To work with spark we need to create a spark session
# Need to instal java
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('petfinder').getOrCreate()
# Read a dataset with spark
df_spark = spark.read.csv('./train.csv', header=True, inferSchema=True)
# Header = True, inferSchema = True, means that the first row is the header and the schema is inferred (if schema is not inferred, all columns will be read as string)
# Convert the column "AdoptionSpeed" to integer
df_spark = df_spark.withColumn("AdoptionSpeed", df_spark["AdoptionSpeed"].cast("integer"))

## Data cleaning

In [88]:
## Drop rows with missing values
# df_spark.na.drop(how='all', thresh=10).show() 
    ### how='any' means drop rows with any missing value, how='all' means drop rows whose all values are missing
    ### thresh=10 means drop rows whose number of missing values is greater than 10
    ### subset=['Age'] means drop rows whose 'Age' value is missing
df_spark = df_spark.na.drop(how= 'any' , subset=['AdoptionSpeed'])
## Fill missing values with mean
from pyspark.sql.functions import mean
mean_val = df_spark.select(mean(df_spark['Age'])).collect()
mean_age = mean_val[0][0]
df_spark.na.fill(mean_age, subset=['Age']).show()


+----+--------------------+---+------+------+------+------+------+------+------------+---------+----------+--------+----------+------+--------+---+-----+--------------------+--------+--------------------+---------+--------+-------------+
|Type|                Name|Age|Breed1|Breed2|Gender|Color1|Color2|Color3|MaturitySize|FurLength|Vaccinated|Dewormed|Sterilized|Health|Quantity|Fee|State|           RescuerID|VideoAmt|         Description|    PetID|PhotoAmt|AdoptionSpeed|
+----+--------------------+---+------+------+------+------+------+------+------------+---------+----------+--------+----------+------+--------+---+-----+--------------------+--------+--------------------+---------+--------+-------------+
|   2|              Nibble|  3|   299|     0|     1|     1|     7|     0|           1|        1|         2|       2|         2|     1|       1|100|41326|8480853f516546f6c...|       0|Nibble is a 3+ mo...|86e1089a3|     1.0|            2|
|   2|         No Name Yet|  1|   265|     0|   

# Using PySpark MLlib to build the model

In [89]:
from pyspark.ml.feature import VectorAssembler

featureassemble = VectorAssembler(inputCols=['Breed1','Age','Type'], outputCol='features')
output = featureassemble.transform(df_spark) # This will create a new column called 'features' which is a vector of the selected columns (Type, Age2, Breed1) by the VectorAssembler
output.show()

+----+--------------------+---+------+------+------+------+------+------+------------+---------+----------+--------+----------+------+--------+---+-----+--------------------+--------+--------------------+---------+--------+-------------+----------------+
|Type|                Name|Age|Breed1|Breed2|Gender|Color1|Color2|Color3|MaturitySize|FurLength|Vaccinated|Dewormed|Sterilized|Health|Quantity|Fee|State|           RescuerID|VideoAmt|         Description|    PetID|PhotoAmt|AdoptionSpeed|        features|
+----+--------------------+---+------+------+------+------+------+------+------------+---------+----------+--------+----------+------+--------+---+-----+--------------------+--------+--------------------+---------+--------+-------------+----------------+
|   2|              Nibble|  3|   299|     0|     1|     1|     7|     0|           1|        1|         2|       2|         2|     1|       1|100|41326|8480853f516546f6c...|       0|Nibble is a 3+ mo...|86e1089a3|     1.0|            

In [90]:
finalized_data = output.select('features', 'AdoptionSpeed') # Select the features and the target column
finalized_data.show()

+----------------+-------------+
|        features|AdoptionSpeed|
+----------------+-------------+
| [299.0,3.0,2.0]|            2|
| [265.0,1.0,2.0]|            0|
| [307.0,1.0,1.0]|            3|
| [307.0,4.0,1.0]|            2|
| [307.0,1.0,1.0]|            2|
| [266.0,3.0,2.0]|            2|
|[264.0,12.0,2.0]|            1|
| [307.0,0.0,1.0]|            3|
| [265.0,2.0,2.0]|            1|
|[265.0,12.0,2.0]|            4|
| [307.0,2.0,1.0]|            1|
| [264.0,3.0,2.0]|            1|
| [307.0,2.0,1.0]|            2|
| [265.0,2.0,2.0]|            1|
| [307.0,3.0,1.0]|            2|
|[218.0,78.0,1.0]|            4|
| [266.0,6.0,2.0]|            3|
| [307.0,8.0,1.0]|            4|
| [307.0,2.0,1.0]|            2|
| [266.0,1.0,2.0]|            4|
+----------------+-------------+
only showing top 20 rows



### 1. Logistic Regression

In [91]:
from pyspark.ml.classification import LogisticRegression
# Split the data into training and validation data
train_data, validation_data = finalized_data.randomSplit([0.1, 0.9])
classifier = LogisticRegression
classifier = LogisticRegression(labelCol='AdoptionSpeed').fit(train_data) # Fit the model
results = classifier.evaluate(validation_data) # Evaluate the model on the validation data
results.predictions.show() # Show the predictions
results.predictions.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction
results.accuracy # Show the accuracy of the model


+---------------+-------------+--------------------+--------------------+----------+
|       features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+---------------+-------------+--------------------+--------------------+----------+
|  [0.0,2.0,1.0]|            3|[4.50654175975770...|[0.04978538642926...|       1.0|
|  [0.0,2.0,1.0]|            3|[4.50654175975770...|[0.04978538642926...|       1.0|
|  [0.0,3.0,2.0]|            2|[6.63322771602572...|[0.08340635650503...|       1.0|
|  [0.0,4.0,1.0]|            4|[4.56675914155921...|[0.05038444511846...|       1.0|
| [0.0,72.0,1.0]|            4|[6.61415012281041...|[0.06729387890644...|       1.0|
|  [1.0,5.0,1.0]|            4|[4.59372545418808...|[0.05059119305873...|       1.0|
|  [5.0,0.0,1.0]|            1|[4.43061248659681...|[0.04874942113830...|       1.0|
| [5.0,24.0,1.0]|            4|[5.15322106821489...|[0.05569110923999...|       1.0|
| [7.0,14.0,1.0]|            4|[4.84584940266360...|[0.0526346183

0.3114915536701765

### 2. Decision Tree

In [92]:
from pyspark.ml.classification import DecisionTreeClassifier
# Split the data into training and validation data
train_data, validation_data = finalized_data.randomSplit([0.1, 0.9])
classifier = DecisionTreeClassifier( labelCol='AdoptionSpeed', featuresCol='features')
classifier = classifier.fit(train_data) # Fit the model
results = classifier.transform(validation_data) # Evaluate the model on the validation data
results.show() # Show the predictions
results.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction
# results.accuracy # Show the accuracy of the model
results.accuracy = results.filter(results.AdoptionSpeed == results.prediction).count() / float(results.count())
results.accuracy

+---------------+-------------+--------------------+--------------------+----------+
|       features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+---------------+-------------+--------------------+--------------------+----------+
|  [0.0,2.0,1.0]|            3|[13.0,77.0,113.0,...|[0.03746397694524...|       2.0|
|  [0.0,2.0,1.0]|            3|[13.0,77.0,113.0,...|[0.03746397694524...|       2.0|
|  [0.0,3.0,2.0]|            2|[13.0,77.0,113.0,...|[0.03746397694524...|       2.0|
|  [0.0,4.0,1.0]|            4|[0.0,3.0,3.0,8.0,...|[0.0,0.1764705882...|       3.0|
| [0.0,72.0,1.0]|            4|[8.0,39.0,50.0,29...|[0.04878048780487...|       2.0|
|  [1.0,5.0,1.0]|            4|[0.0,3.0,3.0,8.0,...|[0.0,0.1764705882...|       3.0|
|  [5.0,0.0,1.0]|            1|[1.0,1.0,0.0,0.0,...|[0.5,0.5,0.0,0.0,...|       0.0|
| [5.0,24.0,1.0]|            4|[8.0,39.0,50.0,29...|[0.04878048780487...|       2.0|
| [7.0,14.0,1.0]|            4|[8.0,39.0,50.0,29...|[0.0487804878

0.3457478890229192

### 3. Random Forest

In [94]:
from pyspark.ml.classification import RandomForestClassifier
# Split the data into training and validation data
train_data, validation_data = finalized_data.randomSplit([0.1, 0.9])
classifier = RandomForestClassifier( labelCol='AdoptionSpeed', featuresCol='features')
classifier = classifier.fit(train_data) # Fit the model
results = classifier.transform(validation_data) # Evaluate the model on the validation data
results.show() # Show the predictions
results.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction
# results.accuracy # Show the accuracy of the model
results.accuracy = results.filter(results.AdoptionSpeed == results.prediction).count() / float(results.count())
results.accuracy


+---------------+-------------+--------------------+--------------------+----------+
|       features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+---------------+-------------+--------------------+--------------------+----------+
|  [0.0,2.0,1.0]|            3|[0.57093352581894...|[0.02854667629094...|       3.0|
|  [0.0,3.0,2.0]|            2|[3.02215758377512...|[0.15110787918875...|       4.0|
|  [0.0,4.0,1.0]|            4|[1.20749549953330...|[0.06037477497666...|       3.0|
| [0.0,72.0,1.0]|            4|[0.91732063664329...|[0.04586603183216...|       4.0|
|  [5.0,0.0,1.0]|            1|[0.62544942282832...|[0.03127247114141...|       3.0|
| [7.0,14.0,1.0]|            4|[1.34838616332124...|[0.06741930816606...|       2.0|
|[10.0,60.0,1.0]|            3|[0.91732063664329...|[0.04586603183216...|       4.0|
|[10.0,84.0,1.0]|            3|[0.91732063664329...|[0.04586603183216...|       4.0|
| [11.0,1.0,1.0]|            4|[0.30044942282832...|[0.0150224711

0.34957818620066283

### 4. Naive Bayes

In [96]:
from pyspark.ml.classification import NaiveBayes
# Split the data into training and validation data
train_data, validation_data = finalized_data.randomSplit([0.1, 0.9])
classifier = NaiveBayes( labelCol='AdoptionSpeed', featuresCol='features')
classifier = classifier.fit(train_data) # Fit the model
results = classifier.transform(validation_data) # Evaluate the model on the validation data
results.show() # Show the predictions
results.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction
# results.accuracy # Show the accuracy of the model
results.accuracy = results.filter(results.AdoptionSpeed == results.prediction).count() / float(results.count())
results.accuracy

+---------------+-------------+--------------------+--------------------+----------+
|       features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+---------------+-------------+--------------------+--------------------+----------+
|  [0.0,2.0,1.0]|            3|[-14.872410311282...|[0.04281955876827...|       4.0|
|  [0.0,2.0,1.0]|            3|[-14.872410311282...|[0.04281955876827...|       4.0|
|  [0.0,3.0,2.0]|            2|[-23.187144547845...|[0.05007950646776...|       4.0|
|  [0.0,4.0,1.0]|            4|[-21.278048371913...|[0.04236192109208...|       4.0|
| [0.0,72.0,1.0]|            4|[-239.06974243336...|[6.22502274660779...|       4.0|
|  [1.0,5.0,1.0]|            4|[-24.528663649522...|[0.04115440869555...|       4.0|
|  [5.0,0.0,1.0]|            1|[-8.7057534871198...|[0.03938349811024...|       2.0|
| [7.0,14.0,1.0]|            4|[-53.640812406122...|[0.02204981536654...|       4.0|
|[10.0,60.0,1.0]|            3|[-201.11387654251...|[2.2594422687

0.2663183401484174