In [25]:
# ! pip install pyspark

In [26]:
# Importing the libraries
import pyspark


# Use PCA

## Using pyspark to read the data and process it

In [1]:
# Spark work starts from a SparkSession (Java needs to be installed for this to run)
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('petfinder')
    .getOrCreate()
)


def _read_labeled_csv(path):
    """Read a CSV file with Spark and return it with "AdoptionSpeed" cast to integer.

    header=True uses the first row as the column names; inferSchema=True lets
    Spark infer the column types (otherwise every column is read as a string).
    """
    frame = spark.read.csv(path, header=True, inferSchema=True)
    return frame.withColumn("AdoptionSpeed", frame["AdoptionSpeed"].cast("integer"))


# The train and test datasets go through exactly the same loading steps
df_spark = _read_labeled_csv('./train_balanced_corr_pca.csv')
df_spark_test = _read_labeled_csv('./test_split_corr_pca.csv')


## Print size of the data
print("Size of the training data: ", df_spark.count())
print("Size of the test data: ", df_spark_test.count())

23/05/06 11:16:50 WARN Utils: Your hostname, ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.17.130 instead (on interface ens33)
23/05/06 11:16:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/06 11:16:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/06 11:16:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Size of the training data:  16870
Size of the test data:  2999


## Data cleaning

In [2]:

# Drop every row whose "AdoptionSpeed" value is missing, in both datasets
label_subset = ['AdoptionSpeed']
df_spark = df_spark.dropna(how='any', subset=label_subset)
df_spark_test = df_spark_test.dropna(how='any', subset=label_subset)

# Preview the train dataset first, then the test dataset
df_spark.show()
df_spark_test.show()

+-------------------+-------------------+-------------+
|               PC 1|               PC 2|AdoptionSpeed|
+-------------------+-------------------+-------------+
| -45.79889769398241|-3.7878222930805943|            1|
|-42.343623530864804| 30.035857366316367|            4|
|  81.53834585803112| -16.79632262357971|            1|
|-1.4767092940335453| 16.810059608539753|            4|
| -45.79889769398213|-3.7878222930861187|            1|
|-2.2682800879238054|  18.90131673263072|            4|
|  180.4965526851584|  7.270981787952771|            1|
| -43.56313205902385| 18.098088074762675|            3|
|-43.868009191063614|   15.1136457518742|            4|
|-43.055003505624235| 23.072158612910123|            3|
| -45.69727198330221| -2.793008185456628|            2|
| 211.91402333302426|  88.49790192569439|            2|
|  116.9883125474134| -4.334679801795222|            2|
|  9.161457941677828|  12.70769928849949|            3|
|  59.12824242464578|  19.66970866545448|       

# Using PySpark MLlib to build the model

In [3]:
# First, collect the features in a single vector column

from pyspark.ml.feature import VectorAssembler

# Declared once so the train and test datasets are always assembled from the same columns
feature_columns = ['PC 1', 'PC 2']

#### For the train dataset
featureassemble = VectorAssembler(inputCols=feature_columns, outputCol='features')
# Adds a new column called 'features': a vector built from the 'PC 1' and 'PC 2' columns
output = featureassemble.transform(df_spark)
output.show()

#### For the test dataset
# The assembler is only used through .transform(), so the same instance serves both datasets;
# the original name is kept so anything referring to it still works
testfeatureassemble = featureassemble
testoutput = testfeatureassemble.transform(df_spark_test)
testoutput.show()


+-------------------+-------------------+-------------+--------------------+
|               PC 1|               PC 2|AdoptionSpeed|            features|
+-------------------+-------------------+-------------+--------------------+
| -45.79889769398241|-3.7878222930805943|            1|[-45.798897693982...|
|-42.343623530864804| 30.035857366316367|            4|[-42.343623530864...|
|  81.53834585803112| -16.79632262357971|            1|[81.5383458580311...|
|-1.4767092940335453| 16.810059608539753|            4|[-1.4767092940335...|
| -45.79889769398213|-3.7878222930861187|            1|[-45.798897693982...|
|-2.2682800879238054|  18.90131673263072|            4|[-2.2682800879238...|
|  180.4965526851584|  7.270981787952771|            1|[180.496552685158...|
| -43.56313205902385| 18.098088074762675|            3|[-43.563132059023...|
|-43.868009191063614|   15.1136457518742|            4|[-43.868009191063...|
|-43.055003505624235| 23.072158612910123|            3|[-43.055003505624...|

In [4]:
# Keep only the assembled feature vector and the target column
model_columns = ['features', 'AdoptionSpeed']

#### For the train dataset
finalized_data = output.select(*model_columns)
finalized_data.show()

#### For the test dataset
testfinalized_data = testoutput.select(*model_columns)
testfinalized_data.show()


+--------------------+-------------+
|            features|AdoptionSpeed|
+--------------------+-------------+
|[-45.798897693982...|            1|
|[-42.343623530864...|            4|
|[81.5383458580311...|            1|
|[-1.4767092940335...|            4|
|[-45.798897693982...|            1|
|[-2.2682800879238...|            4|
|[180.496552685158...|            1|
|[-43.563132059023...|            3|
|[-43.868009191063...|            4|
|[-43.055003505624...|            3|
|[-45.697271983302...|            2|
|[211.914023333024...|            2|
|[116.988312547413...|            2|
|[9.16145794167782...|            3|
|[59.1282424246457...|            2|
|[-45.798897693982...|            3|
|[-3.9959171694824...|            3|
|[-45.697271983302...|            2|
|[-17.249369244257...|            4|
|[-3.9157263503516...|            2|
+--------------------+-------------+
only showing top 20 rows

+--------------------+-------------+
|            features|AdoptionSpeed|
+-----------

### 1. Logistic Regression

In [5]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# The train and test sets were loaded from separate files, so no split is performed here
train_data = finalized_data
test_data = testfinalized_data

# Fit the model on the training set (featuresCol spelled out to match the other model cells)
classifier = LogisticRegression(labelCol='AdoptionSpeed', featuresCol='features').fit(train_data)

# Evaluate the fitted model on the test set; the scored rows are available as results.predictions
results = classifier.evaluate(test_data)
results.predictions.show() # Show the predictions
results.predictions.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction

# F1 score of the predictions against the true AdoptionSpeed labels
evaluator = MulticlassClassificationEvaluator(labelCol="AdoptionSpeed", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(results.predictions)
print("F1 score: %.3f" % f1_score)

# Confusion matrix: one row per actual AdoptionSpeed value, one column per predicted value
results.predictions.crosstab('AdoptionSpeed', 'prediction').show()



23/05/06 11:17:06 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


+--------------------+-------------+--------------------+--------------------+----------+
|            features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+--------------------+-------------+--------------------+--------------------+----------+
|[-0.4646831918769...|            4|[0.01081458698630...|[0.20194167201004...|       1.0|
|[0.18781876972525...|            4|[-0.0338958779557...|[0.19234253063304...|       4.0|
|[-0.9753292896775...|            2|[-0.0049339692493...|[0.19899426650005...|       3.0|
|[74.3435237231082...|            1|[0.14056653631653...|[0.22533761111126...|       4.0|
|[-1.8476903342297...|            4|[0.01678746228048...|[0.20278560206824...|       1.0|
|[-41.781674240331...|            2|[-0.1558683876163...|[0.16933260130768...|       4.0|
|[47.1869516866008...|            4|[0.16209503325697...|[0.23299498891322...|       0.0|
|[-18.722712558763...|            3|[-0.1038366691737...|[0.17780388978986...|       4.0|
|[1.350966

### 2. Decision Tree

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Training set and test set prepared by the earlier cells
train_data = finalized_data
test_data = testfinalized_data

# Configure the decision tree and fit it on the training set in one step
classifier = DecisionTreeClassifier(labelCol='AdoptionSpeed', featuresCol='features').fit(train_data)

# Score the test set with the fitted model
results = classifier.transform(test_data)
results.show() # Show the predictions
results.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction

# F1 score of the predictions against the true AdoptionSpeed labels
evaluator = MulticlassClassificationEvaluator(
    labelCol="AdoptionSpeed",
    predictionCol="prediction",
    metricName="f1",
)
f1_score = evaluator.evaluate(results)
print("F1 score: %.3f" % f1_score)

# Confusion matrix: one row per actual AdoptionSpeed value, one column per predicted value
results.crosstab('AdoptionSpeed', 'prediction').show()



+--------------------+-------------+--------------------+--------------------+----------+
|            features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+--------------------+-------------+--------------------+--------------------+----------+
|[-0.4646831918769...|            4|[118.0,52.0,47.0,...|[0.37942122186495...|       0.0|
|[0.18781876972525...|            4|[285.0,225.0,235....|[0.18566775244299...|       4.0|
|[-0.9753292896775...|            2|[285.0,225.0,235....|[0.18566775244299...|       4.0|
|[74.3435237231082...|            1|[315.0,177.0,136....|[0.35512965050732...|       0.0|
|[-1.8476903342297...|            4|[806.0,738.0,535....|[0.28380281690140...|       0.0|
|[-41.781674240331...|            2|[36.0,2.0,9.0,8.0...|[0.52173913043478...|       0.0|
|[47.1869516866008...|            4|[285.0,225.0,235....|[0.18566775244299...|       4.0|
|[-18.722712558763...|            3|[285.0,225.0,235....|[0.18566775244299...|       4.0|
|[1.350966

### 3. Random Forest

In [7]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Explicit seed so the forest's random sampling is repeatable from one run to the next
RF_SEED = 42

# Training set and test set prepared by the earlier cells
train_data = finalized_data
test_data = testfinalized_data

# Configure the random forest and fit it on the training set
classifier = RandomForestClassifier(labelCol='AdoptionSpeed', featuresCol='features', seed=RF_SEED)
classifier = classifier.fit(train_data) # Fit the model

# Score the test set with the fitted model
results = classifier.transform(test_data)
results.show() # Show the predictions
results.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction

# F1 score of the predictions against the true AdoptionSpeed labels
evaluator = MulticlassClassificationEvaluator(labelCol="AdoptionSpeed", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(results)
print("F1 score: %.3f" % f1_score)

# Confusion matrix: one row per actual AdoptionSpeed value, one column per predicted value
results.crosstab('AdoptionSpeed', 'prediction').show()



+--------------------+-------------+--------------------+--------------------+----------+
|            features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+--------------------+-------------+--------------------+--------------------+----------+
|[-0.4646831918769...|            4|[4.47205142084630...|[0.22360257104231...|       4.0|
|[0.18781876972525...|            4|[3.83380533287242...|[0.19169026664362...|       4.0|
|[-0.9753292896775...|            2|[3.83380533287242...|[0.19169026664362...|       4.0|
|[74.3435237231082...|            1|[7.28481869753369...|[0.36424093487668...|       0.0|
|[-1.8476903342297...|            4|[5.85447670750173...|[0.29272383537508...|       0.0|
|[-41.781674240331...|            2|[4.82163629683130...|[0.24108181484156...|       4.0|
|[47.1869516866008...|            4|[7.28481869753369...|[0.36424093487668...|       0.0|
|[-18.722712558763...|            3|[4.09998258913485...|[0.20499912945674...|       4.0|
|[1.350966