In [1]:
! pip install pyspark
! pip install numpy
! pip install pandas



In [2]:
# IMporting the libraries
import pyspark


# Use 3 features:  'Type', 'Age', 'Breed1'

## Using pyspark to read the data and process it

In [3]:
# To work with spark we need to create a spark session
# Need to instal java
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName('petfinder').getOrCreate()

########### For the train dataset
# Read a dataset with spark
df_spark = spark.read.csv('./train_balanced_corr.csv', header=True, inferSchema=True)
# Header = True, inferSchema = True, means that the first row is the header and the schema is inferred (if schema is not inferred, all columns will be read as string)
# Convert the column "AdoptionSpeed" to integer
df_spark = df_spark.withColumn("AdoptionSpeed", df_spark["AdoptionSpeed"].cast("integer"))


############ For the test dataset
# Read a dataset with spark
df_spark_test = spark.read.csv('./test_split_corr.csv', header=True, inferSchema=True)
# Header = True, inferSchema = True, means that the first row is the header and the schema is inferred (if schema is not inferred, all columns will be read as string)
# Convert the column "AdoptionSpeed" to integer
df_spark_test = df_spark_test.withColumn("AdoptionSpeed", df_spark_test["AdoptionSpeed"].cast("integer"))


## Print size of the data
print("Size of the training data: ", df_spark.count())
print("Size of the test data: ", df_spark_test.count())

23/05/06 14:15:57 WARN Utils: Your hostname, ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.17.130 instead (on interface ens33)
23/05/06 14:15:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/06 14:15:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Size of the training data:  16870
Size of the test data:  2999


## Data cleaning

In [4]:

########### For the train dataset

## Drop rows with missing values
# df_spark.na.drop(how='all', thresh=10).show() 
    ### how='any' means drop rows with any missing value, how='all' means drop rows whose all values are missing
    ### thresh=10 means drop rows whose number of missing values is greater than 10
    ### subset=['Age'] means drop rows whose 'Age' value is missing
df_spark = df_spark.na.drop(how= 'any' , subset=['AdoptionSpeed'])
## Fill missing values with mean
from pyspark.sql.functions import mean
mean_val = df_spark.select(mean(df_spark['Age'])).collect()
mean_age = mean_val[0][0]
df_spark.na.fill(mean_age, subset=['Age']).show()

########### For the test dataset

df_spark_test = df_spark_test.na.drop(how= 'any' , subset=['AdoptionSpeed'])
## Fill missing values with mean
from pyspark.sql.functions import mean
mean_val = df_spark_test.select(mean(df_spark_test['Age'])).collect()
mean_age = mean_val[0][0]
df_spark_test.na.fill(mean_age, subset=['Age']).show()




+------+---+----+-------------+
|Breed1|Age|Type|AdoptionSpeed|
+------+---+----+-------------+
|   307|  2|   1|            1|
|   307| 36|   1|            4|
|   179|  2|   1|            1|
|   265| 27|   2|            4|
|   307|  2|   1|            1|
|   266| 29|   2|            4|
|    83| 36|   1|            1|
|   307| 24|   1|            3|
|   307| 21|   1|            4|
|   307| 29|   1|            3|
|   307|  3|   1|            2|
|    60|120|   1|            2|
|   145| 18|   1|            2|
|   254| 24|   2|            3|
|   205| 36|   1|            2|
|   307|  2|   1|            3|
|   266| 12|   2|            3|
|   307|  3|   1|            2|
|   283| 48|   2|            4|
|   265|  3|   2|            2|
+------+---+----+-------------+
only showing top 20 rows

+------+---+----+-------------+
|Breed1|Age|Type|AdoptionSpeed|
+------+---+----+-------------+
|   265|  7|   2|            4|
|   266| 24|   2|            4|
|   266| 12|   2|            2|
|   195| 60|  

# Using PySpark MLlib to build the model

In [5]:
# First, collect the features in a single column

from pyspark.ml.feature import VectorAssembler

#### For the train dataset
featureassemble = VectorAssembler(inputCols=['Breed1','Age','Type'], outputCol='features')
output = featureassemble.transform(df_spark) # This will create a new column called 'features' which is a vector of the selected columns (Type, Age2, Breed1) by the VectorAssembler
output.show()

#### For the test dataset
testfeatureassemble = VectorAssembler(inputCols=['Breed1','Age','Type'], outputCol='features')
testoutput = testfeatureassemble.transform(df_spark_test) # This will create a new column called 'features' which is a vector of the selected columns (Type, Age2, Breed1) by the VectorAssembler
testoutput.show()


+------+---+----+-------------+----------------+
|Breed1|Age|Type|AdoptionSpeed|        features|
+------+---+----+-------------+----------------+
|   307|  2|   1|            1| [307.0,2.0,1.0]|
|   307| 36|   1|            4|[307.0,36.0,1.0]|
|   179|  2|   1|            1| [179.0,2.0,1.0]|
|   265| 27|   2|            4|[265.0,27.0,2.0]|
|   307|  2|   1|            1| [307.0,2.0,1.0]|
|   266| 29|   2|            4|[266.0,29.0,2.0]|
|    83| 36|   1|            1| [83.0,36.0,1.0]|
|   307| 24|   1|            3|[307.0,24.0,1.0]|
|   307| 21|   1|            4|[307.0,21.0,1.0]|
|   307| 29|   1|            3|[307.0,29.0,1.0]|
|   307|  3|   1|            2| [307.0,3.0,1.0]|
|    60|120|   1|            2|[60.0,120.0,1.0]|
|   145| 18|   1|            2|[145.0,18.0,1.0]|
|   254| 24|   2|            3|[254.0,24.0,2.0]|
|   205| 36|   1|            2|[205.0,36.0,1.0]|
|   307|  2|   1|            3| [307.0,2.0,1.0]|
|   266| 12|   2|            3|[266.0,12.0,2.0]|
|   307|  3|   1|   

In [6]:
# Select the features and the target column

#### For the train dataset
finalized_data = output.select('features', 'AdoptionSpeed') # Select the features and the target column
finalized_data.show()

#### For the test dataset
testfinalized_data = testoutput.select('features', 'AdoptionSpeed') # Select the features and the target column
testfinalized_data.show()


+----------------+-------------+
|        features|AdoptionSpeed|
+----------------+-------------+
| [307.0,2.0,1.0]|            1|
|[307.0,36.0,1.0]|            4|
| [179.0,2.0,1.0]|            1|
|[265.0,27.0,2.0]|            4|
| [307.0,2.0,1.0]|            1|
|[266.0,29.0,2.0]|            4|
| [83.0,36.0,1.0]|            1|
|[307.0,24.0,1.0]|            3|
|[307.0,21.0,1.0]|            4|
|[307.0,29.0,1.0]|            3|
| [307.0,3.0,1.0]|            2|
|[60.0,120.0,1.0]|            2|
|[145.0,18.0,1.0]|            2|
|[254.0,24.0,2.0]|            3|
|[205.0,36.0,1.0]|            2|
| [307.0,2.0,1.0]|            3|
|[266.0,12.0,2.0]|            3|
| [307.0,3.0,1.0]|            2|
|[283.0,48.0,2.0]|            4|
| [265.0,3.0,2.0]|            2|
+----------------+-------------+
only showing top 20 rows

+----------------+-------------+
|        features|AdoptionSpeed|
+----------------+-------------+
| [265.0,7.0,2.0]|            4|
|[266.0,24.0,2.0]|            4|
|[266.0,12.0,2.0]

### 1. Logistic Regression

In [7]:
from pyspark.ml.classification import LogisticRegression
# Split the data into training and validation data
train_data = finalized_data
classifier = LogisticRegression
classifier = LogisticRegression(labelCol='AdoptionSpeed').fit(train_data) # Fit the model


test_data = testfinalized_data
results = classifier.evaluate(test_data) # Evaluate the model on the validation data
results.predictions.show() # Show the predictions
results.predictions.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction

# Want to show the f1 score and confusion matrix
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="AdoptionSpeed", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(results.predictions)
print("F1 score: %.3f" % f1_score)

# Confusion matrix
results.predictions.crosstab('AdoptionSpeed', 'prediction').show()



23/05/06 14:16:10 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


+----------------+-------------+--------------------+--------------------+----------+
|        features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+----------------+-------------+--------------------+--------------------+----------+
| [265.0,7.0,2.0]|            4|[0.21852222769437...|[0.24594501729175...|       0.0|
|[266.0,24.0,2.0]|            4|[0.21091546173332...|[0.24452052550620...|       0.0|
|[266.0,12.0,2.0]|            2|[0.21328101316233...|[0.24531022067589...|       0.0|
|[195.0,60.0,1.0]|            1|[0.02040640887447...|[0.19880089651180...|       4.0|
| [266.0,3.0,2.0]|            4|[0.21505517673408...|[0.24453461523033...|       0.0|
|[307.0,12.0,1.0]|            2|[-0.4467550129775...|[0.12267623478828...|       4.0|
|[218.0,16.0,1.0]|            4|[-0.0687979689280...|[0.18565850870323...|       3.0|
|[285.0,24.0,2.0]|            3|[0.13005966777088...|[0.22530280619896...|       4.0|
|[266.0,36.0,2.0]|            4|[0.20854991030431...|[

### 2. Decision Tree

In [8]:
from pyspark.ml.classification import DecisionTreeClassifier
# Split the data into training and validation data
train_data = finalized_data
classifier = DecisionTreeClassifier( labelCol='AdoptionSpeed', featuresCol='features')
classifier = classifier.fit(train_data) # Fit the model


test_data = testfinalized_data
results = classifier.transform(test_data) # Evaluate the model on the validation data
results.show() # Show the predictions
results.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction

# Want to show the f1 score and confusion matrix
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="AdoptionSpeed", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(results)
print("F1 score: %.3f" % f1_score)

# Confusion matrix
results.crosstab('AdoptionSpeed', 'prediction').show()



+----------------+-------------+--------------------+--------------------+----------+
|        features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+----------------+-------------+--------------------+--------------------+----------+
| [265.0,7.0,2.0]|            4|[388.0,258.0,317....|[0.19705434230573...|       4.0|
|[266.0,24.0,2.0]|            4|[388.0,258.0,317....|[0.19705434230573...|       4.0|
|[266.0,12.0,2.0]|            2|[388.0,258.0,317....|[0.19705434230573...|       4.0|
|[195.0,60.0,1.0]|            1|[380.0,195.0,147....|[0.39957939011566...|       0.0|
| [266.0,3.0,2.0]|            4|[475.0,657.0,581....|[0.19467213114754...|       1.0|
|[307.0,12.0,1.0]|            2|[163.0,156.0,291....|[0.08101391650099...|       4.0|
|[218.0,16.0,1.0]|            4|[388.0,258.0,317....|[0.19705434230573...|       4.0|
|[285.0,24.0,2.0]|            3|[214.0,128.0,98.0...|[0.31845238095238...|       0.0|
|[266.0,36.0,2.0]|            4|[388.0,258.0,317....|[

### 3. Random Forest

In [9]:
from pyspark.ml.classification import RandomForestClassifier

train_data = finalized_data
classifier = RandomForestClassifier(labelCol='AdoptionSpeed', featuresCol='features')
classifier = classifier.fit(train_data) # Fit the model


test_data = testfinalized_data
results = classifier.transform(test_data) # Evaluate the model on the validation data
results.show() # Show the predictions
results.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction

# Want to show the f1 score and confusion matrix
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="AdoptionSpeed", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(results)
print("F1 score: %.3f" % f1_score)

# Confusion matrix
results.crosstab('AdoptionSpeed', 'prediction').show()



+----------------+-------------+--------------------+--------------------+----------+
|        features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+----------------+-------------+--------------------+--------------------+----------+
| [265.0,7.0,2.0]|            4|[4.37728456927346...|[0.21886422846367...|       4.0|
|[266.0,24.0,2.0]|            4|[3.88723613109653...|[0.19436180655482...|       4.0|
|[266.0,12.0,2.0]|            2|[3.88723613109653...|[0.19436180655482...|       4.0|
|[195.0,60.0,1.0]|            1|[7.11707880764521...|[0.35585394038226...|       0.0|
| [266.0,3.0,2.0]|            4|[3.93815317797074...|[0.19690765889853...|       1.0|
|[307.0,12.0,1.0]|            2|[1.74903516102445...|[0.08745175805122...|       4.0|
|[218.0,16.0,1.0]|            4|[5.0201601847985,...|[0.25100800923992...|       0.0|
|[285.0,24.0,2.0]|            3|[5.83607010678058...|[0.29180350533902...|       0.0|
|[266.0,36.0,2.0]|            4|[3.58845452181547...|[

### 4. Naive Bayes

In [10]:
from pyspark.ml.classification import NaiveBayes

train_data = finalized_data
classifier = NaiveBayes(labelCol='AdoptionSpeed', featuresCol='features')
classifier = classifier.fit(train_data) # Fit the model


test_data = testfinalized_data
results = classifier.transform(test_data) # Evaluate the model on the validation data
results.show() # Show the predictions
results.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction

# Want to show the f1 score and confusion matrix
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="AdoptionSpeed", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(results)
print("F1 score: %.3f" % f1_score)

# Confusion matrix
results.crosstab('AdoptionSpeed', 'prediction').show()


+----------------+-------------+--------------------+--------------------+----------+
|        features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+----------------+-------------+--------------------+--------------------+----------+
| [265.0,7.0,2.0]|            4|[-46.974540850384...|[0.14402667045828...|       2.0|
|[266.0,24.0,2.0]|            4|[-101.24812790298...|[0.17873348172516...|       4.0|
|[266.0,12.0,2.0]|            2|[-62.971521419971...|[0.25457037136800...|       0.0|
|[195.0,60.0,1.0]|            1|[-207.53567000969...|[0.00110149909586...|       4.0|
| [266.0,3.0,2.0]|            4|[-34.264066557712...|[0.06978615073014...|       2.0|
|[307.0,12.0,1.0]|            2|[-59.849431997550...|[0.19599104897496...|       3.0|
|[218.0,16.0,1.0]|            4|[-68.301187210301...|[0.24240190122342...|       4.0|
|[285.0,24.0,2.0]|            3|[-102.16762406796...|[0.19066671534541...|       4.0|
|[266.0,36.0,2.0]|            4|[-139.52473438599...|[

## Using Map Reduce to build Naive Bayes model

In [11]:
# To work with spark we need to create a spark session
# Need to instal java
from pyspark.sql import SparkSession
rdd_spark_session = SparkSession.builder.master("local[*]").config("spark.driver.memory", "15g").appName('petfinderMapReduce').getOrCreate()

# Read the data
df_spark = rdd_spark_session.read.csv('./train_split.csv', header=True, inferSchema=True)

# Cast the column "AdoptionSpeed" to integer
df_spark = df_spark.withColumn("AdoptionSpeed", df_spark["AdoptionSpeed"].cast("integer"))

df_spark = df_spark.na.drop(how= 'any' , subset=['AdoptionSpeed'])

# Drope any row with AdoptionSpeed not in [0,1,2,3,4]
df_spark = df_spark.filter(df_spark.AdoptionSpeed.isin([0,1,2,3,4])) 

# Select the features and the target column
df_spark = df_spark.select('Breed1', 'Age', 'Type', 'AdoptionSpeed')

# Convert the data to RDD
rdd_spark = df_spark.rdd


23/05/06 14:16:19 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [12]:

# Count the number of occurence of each class and each value of the features
breed_map = rdd_spark.map(lambda x: ((x[0]), 1))
age_map = rdd_spark.map(lambda x: ((x[1]), 1))
type_map = rdd_spark.map(lambda x: ((x[2]), 1))
adoption_speed_map = rdd_spark.map(lambda x: (x[3], 1))

# Reduce the data to count the number of each class
breed_reduce = breed_map.reduceByKey(lambda x, y: x + y)
age_reduce = age_map.reduceByKey(lambda x, y: x + y)
type_reduce = type_map.reduceByKey(lambda x, y: x + y)
adoption_speed_reduce = adoption_speed_map.reduceByKey(lambda x, y: x + y)


# Now we want to count the propability of each class for each value of the features
# Map the data to count the occurence of each class with the different values of the features
breed_speed_map = rdd_spark.map(lambda x: ((x[0], x[3]), 1))
age_speed_map = rdd_spark.map(lambda x: ((x[1], x[3]), 1))
type_speed_map = rdd_spark.map(lambda x: ((x[2], x[3]), 1))

# Reduce the data to count the number of each class
breed_speed_reduce = breed_speed_map.reduceByKey(lambda x, y: x + y)
age_speed_reduce = age_speed_map.reduceByKey(lambda x, y: x + y)
type_speed_reduce = type_speed_map.reduceByKey(lambda x, y: x + y)


In [13]:

# Print the results

print("Breed")
print(breed_reduce.collect())

print("Age")
print(age_reduce.collect())

print("Type")
print(type_reduce.collect())

print("Adoption Speed")
print(adoption_speed_reduce.collect())

print("Breed and Adoption Speed")
print(breed_speed_reduce.collect())

print("Age and Adoption Speed")
print(age_speed_reduce.collect())

print("Type and Adoption Speed")
print(type_speed_reduce.collect())



Breed


                                                                                

[(307, 4737), (179, 131), (265, 1038), (266, 2888), (83, 9), (60, 28), (145, 1), (254, 87), (205, 163), (283, 60), (299, 265), (264, 235), (285, 171), (20, 71), (109, 128), (300, 17), (213, 76), (305, 6), (141, 163), (103, 78), (292, 215), (218, 122), (182, 17), (75, 20), (178, 13), (10, 2), (207, 19), (243, 83), (304, 6), (195, 54), (69, 31), (289, 23), (189, 70), (190, 5), (76, 33), (173, 16), (276, 28), (148, 2), (128, 49), (303, 34), (26, 19), (269, 2), (206, 22), (242, 13), (251, 24), (119, 33), (252, 18), (306, 42), (39, 20), (78, 49), (185, 3), (247, 63), (125, 1), (147, 14), (117, 9), (155, 3), (50, 10), (167, 2), (244, 2), (72, 11), (152, 51), (286, 1), (274, 4), (97, 3), (288, 12), (202, 6), (246, 2), (268, 6), (169, 15), (249, 15), (49, 14), (284, 4), (100, 4), (253, 5), (21, 1), (293, 3), (295, 12), (257, 1), (212, 1), (294, 4), (281, 2), (199, 1), (19, 6), (297, 4), (102, 6), (250, 13), (301, 4), (241, 23), (98, 2), (7, 1), (277, 4), (282, 12), (280, 5), (129, 4), (17, 4),

In [14]:
# Need to calculate the propability of each class for each value of the features

## For each feature we have two RDDs : one with the count of each class for each value of the feature and one with the total number of occurence of each value of the feature
## To get the propability we need to divide the count of each class by the total number of occurence of each value of the feature
## The schema of the RDDs is ((feature, class), count)

# Breed
prop_breed_speed_reduce = breed_speed_reduce.map(lambda x: (x[0][0], (x[0][1], x[1]))) #Convert the schema ((breed, class), count) -> (breed, (class, count))
prop_breed_speed_reduce = prop_breed_speed_reduce.join(breed_reduce) # Join the RDDs to get the total number of occurence of each breed (breed, ((class, count), total)), that's why we needed the above step
prop_breed_speed_reduce = prop_breed_speed_reduce.map(lambda x: (x[0], (x[1][0][0], x[1][0][1]), x[1][1])) # Convert the schema (breed, ((class, count), total)) -> (breed, (class, count), total)
prop_breed_speed_reduce = prop_breed_speed_reduce.map(lambda x: (x[0], (x[1][0], x[1][1]/x[2])))
prop_breed_speed_reduce = prop_breed_speed_reduce.groupByKey().mapValues(list)

print("Breed and Adoption Speed")
print(prop_breed_speed_reduce.collect())

# Age
prop_age_speed_reduce = age_speed_reduce.map(lambda x: (x[0][0], (x[0][1], x[1]))) #Convert the schema ((age, class), count) -> (age, (class, count))
prop_age_speed_reduce = prop_age_speed_reduce.join(age_reduce) # Join the RDDs to get the total number of occurence of each age (age, ((class, count), total)), that's why we needed the above step
prop_age_speed_reduce = prop_age_speed_reduce.map(lambda x: (x[0], (x[1][0][0], x[1][0][1]), x[1][1])) # Convert the schema (age, ((class, count), total)) -> (age, (class, count), total)
prop_age_speed_reduce = prop_age_speed_reduce.map(lambda x: (x[0], (x[1][0], x[1][1]/x[2])))
prop_age_speed_reduce = prop_age_speed_reduce.groupByKey().mapValues(list)

print("Age and Adoption Speed")
print(prop_age_speed_reduce.collect())

# Type
prop_type_speed_reduce = type_speed_reduce.map(lambda x: (x[0][0], (x[0][1], x[1]))) #Convert the schema ((type, class), count) -> (type, (class, count))
prop_type_speed_reduce = prop_type_speed_reduce.join(type_reduce) # Join the RDDs to get the total number of occurence of each type (type, ((class, count), total)), that's why we needed the above step
prop_type_speed_reduce = prop_type_speed_reduce.map(lambda x: (x[0], (x[1][0][0], x[1][0][1]), x[1][1])) # Convert the schema (type, ((class, count), total)) -> (type, (class, count), total)
prop_type_speed_reduce = prop_type_speed_reduce.map(lambda x: (x[0], (x[1][0], x[1][1]/x[2])))
prop_type_speed_reduce = prop_type_speed_reduce.groupByKey().mapValues(list)

print("Type and Adoption Speed")
print(prop_type_speed_reduce.collect())


# Number of records
N = rdd_spark.count()
# Adoption Speed
prop_adoption_speed_reduce = adoption_speed_reduce.map(lambda x: (x[0], x[1]/N))

print("Adoption Speed")
print(prop_adoption_speed_reduce.collect())



Breed and Adoption Speed
[(266, [(4, 0.2738919667590028), (3, 0.20948753462603878), (1, 0.21364265927977838), (2, 0.2787396121883656), (0, 0.024238227146814405)]), (60, [(2, 0.35714285714285715), (3, 0.32142857142857145), (1, 0.17857142857142858), (4, 0.14285714285714285)]), (254, [(3, 0.19540229885057472), (4, 0.367816091954023), (2, 0.1724137931034483), (1, 0.22988505747126436), (0, 0.034482758620689655)]), (264, [(4, 0.17872340425531916), (2, 0.25957446808510637), (1, 0.3191489361702128), (0, 0.06382978723404255), (3, 0.17872340425531916)]), (20, [(1, 0.28169014084507044), (2, 0.15492957746478872), (3, 0.28169014084507044), (4, 0.19718309859154928), (0, 0.08450704225352113)]), (300, [(4, 0.35294117647058826), (1, 0.23529411764705882), (0, 0.058823529411764705), (2, 0.29411764705882354), (3, 0.058823529411764705)]), (292, [(3, 0.14883720930232558), (4, 0.19069767441860466), (2, 0.2651162790697674), (1, 0.31627906976744186), (0, 0.07906976744186046)]), (218, [(3, 0.3360655737704918), 

In [15]:
# We want to convert the RDDs to dictionaries to be able to use them in the prediction function

prop_breed_speed_reduce_dict = prop_breed_speed_reduce.collectAsMap()
prop_age_speed_reduce_dict = prop_age_speed_reduce.collectAsMap()
prop_type_speed_reduce_dict = prop_type_speed_reduce.collectAsMap()
prop_adoption_speed_reduce_dict = prop_adoption_speed_reduce.collectAsMap()

# Need to sort the values of the dictionaries by the class to be able to use them in the prediction function

for key in prop_breed_speed_reduce_dict:
    prop_breed_speed_reduce_dict[key].sort(key=lambda x: x[0])
    
for key in prop_age_speed_reduce_dict:
    prop_age_speed_reduce_dict[key].sort(key=lambda x: x[0])
    
for key in prop_type_speed_reduce_dict:
    prop_type_speed_reduce_dict[key].sort(key=lambda x: x[0])
    
prop_adoption_speed_reduce_dict = sorted(prop_adoption_speed_reduce_dict.items(), key=lambda x: x[0])



# Print the dictionaries
print("Breed and Adoption Speed")
print(prop_breed_speed_reduce_dict)

print("Age and Adoption Speed")
print(prop_age_speed_reduce_dict)

print("Type and Adoption Speed")
print(prop_type_speed_reduce_dict)

print("Adoption Speed")
print(prop_adoption_speed_reduce_dict)




Breed and Adoption Speed
{266: [(0, 0.024238227146814405), (1, 0.21364265927977838), (2, 0.2787396121883656), (3, 0.20948753462603878), (4, 0.2738919667590028)], 60: [(1, 0.17857142857142858), (2, 0.35714285714285715), (3, 0.32142857142857145), (4, 0.14285714285714285)], 254: [(0, 0.034482758620689655), (1, 0.22988505747126436), (2, 0.1724137931034483), (3, 0.19540229885057472), (4, 0.367816091954023)], 264: [(0, 0.06382978723404255), (1, 0.3191489361702128), (2, 0.25957446808510637), (3, 0.17872340425531916), (4, 0.17872340425531916)], 20: [(0, 0.08450704225352113), (1, 0.28169014084507044), (2, 0.15492957746478872), (3, 0.28169014084507044), (4, 0.19718309859154928)], 300: [(0, 0.058823529411764705), (1, 0.23529411764705882), (2, 0.29411764705882354), (3, 0.058823529411764705), (4, 0.35294117647058826)], 292: [(0, 0.07906976744186046), (1, 0.31627906976744186), (2, 0.2651162790697674), (3, 0.14883720930232558), (4, 0.19069767441860466)], 218: [(0, 0.00819672131147541), (1, 0.18032786

In [16]:
# Function to predict the adoption speed of a pet given its features
def predict(breed, age, type_):
    # Check if the features are in the dictionaries
    if breed not in prop_breed_speed_reduce_dict or age not in prop_age_speed_reduce_dict or type_ not in prop_type_speed_reduce_dict:
        return -1
    # Breed
    breed_speed = prop_breed_speed_reduce_dict[breed] # This will return a list of tuples (class, probability)
    breed_speed = [x[1] for x in breed_speed] # We only want the probabilities (the class is the index of the list)
    # Age
    age_speed = prop_age_speed_reduce_dict[age]
    age_speed = [x[1] for x in age_speed]
    
    # Type
    type_speed = prop_type_speed_reduce_dict[type_]
    type_speed = [x[1] for x in type_speed]
    
    # We compute the product of the probabilities of each class given the features
    prob = [a*b*c for a,b,c in zip(breed_speed, age_speed, type_speed)]
    
    # We compute the argmax of the probabilities
    prediction = prob.index(max(prob))
    
    return prediction


In [17]:
## Now we can predict the adoption speed of a pet given its features
import pandas as pd
test_data = pd.read_csv("./test_split.csv")



# For each pet in the test set we predict the adoption speed
breeds = test_data["Breed1"].tolist()
ages = test_data["Age"].tolist()
types = test_data["Type"].tolist()
correct_adoption_speed = test_data["AdoptionSpeed"].tolist()

predictions = []

for i in range(len(breeds)):
    breed = breeds[i]
    age = ages[i]
    type_ = types[i]
    prediction = predict(breed, age, type_)
    predictions.append(prediction)

# We compute the accuracy of our predictions
correct = 0
for i in range(len(predictions)):
    if predictions[i] == correct_adoption_speed[i]:
        correct += 1
        
accuracy = correct/len(predictions)
print("Accuracy: ", accuracy)

print("PredictionsSize: ", len(predictions))
print("CorrectAdoptionSpeedSize: ", len(correct_adoption_speed))

Accuracy:  0.3451150383461154
PredictionsSize:  2999
CorrectAdoptionSpeedSize:  2999
