In [1]:
# ! pip install pyspark

In [2]:
# IMporting the libraries
import pyspark


# Using pyspark to read the data and process it

In [3]:
# To work with spark we need to create a spark session
# Need to instal java
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('petfinder').getOrCreate()
# Read a dataset with spark
df_spark = spark.read.csv('train.csv', header=True, inferSchema=True)
# Header = True, inferSchema = True, means that the first row is the header and the schema is inferred (if schema is not inferred, all columns will be read as string)
# Convert the column "AdoptionSpeed" to integer
df_spark = df_spark.withColumn("AdoptionSpeed", df_spark["AdoptionSpeed"].cast("integer"))

## Pyspark dataframes analysis

In [4]:
# Some basic operations on the pyspark dataframe
## Show the whole dataframe (only the first 20 rows if the dataframe is too big)
df_spark.show()
## Print the schema of the dataframe
df_spark.printSchema()
## Print the first 5 rows of the dataframe
df_spark.head(5)
## Describe the dataframe by showing some statistics (count, mean, standard deviation, min, max)
df_spark.describe().show()
## Select/Index a column of the dataframe
df_spark.select('Age', 'Breed1', 'Type').show() # Do not use df_spark['Age'] to select a column
## Add a new column
df_spark = df_spark.withColumn('Age2', df_spark['Age'] + 1)
df_spark.show()
## Drop a column
df_spark = df_spark.drop('Age2')
df_spark.show()
## Rename a column
df_spark = df_spark.withColumnRenamed('Age', 'Age2')
df_spark.show()
## Drop rows with missing values
# df_spark.na.drop(how='all', thresh=10).show() 
    ### how='any' means drop rows with any missing value, how='all' means drop rows whose all values are missing
    ### thresh=10 means drop rows whose number of missing values is greater than 10
    ### subset=['Age'] means drop rows whose 'Age' value is missing
df_spark = df_spark.na.drop(how= 'any' , subset=['AdoptionSpeed'])
## Fill missing values
df_spark.na.fill('Missing', subset='Name').show()
    ### Fill all missing values with 'Missing'
    ### Subset=['Name'] means fill missing values in the 'Name' column only
## Fill missing values with mean
from pyspark.sql.functions import mean
mean_val = df_spark.select(mean(df_spark['Age2'])).collect()
mean_age = mean_val[0][0]
df_spark.na.fill(mean_age, subset=['Age2']).show()


# Filter operations
## Filter rows by a condition
df_spark.filter(df_spark['Age2'] > 10).show()
## Filter rows by a condition and not it
df_spark.filter(~(df_spark['Age2'] > 10)).show()
## Filter rows by a condition and select some columns
df_spark.filter(df_spark['Age2'] > 10).select('Name', 'Age2').show()
## Filter rows by multiple conditions
df_spark.filter((df_spark['Age2'] > 10) & (df_spark['Type'] == 1)).show()
## Filter rows by a string condition
df_spark.filter(df_spark['Name'].contains('a')).show()
## Filter rows by a list of values
df_spark.filter(df_spark['Age2'].isin([1, 2, 3])).show()


# Groupby operations and aggregate functions
## Groupby a column and count the number of rows in each group
df_spark.groupBy('Type').count().show()
## Groupby a column and compute the mean of another column in each group
df_spark.groupBy('Type').mean('Age2').show()
## Groupby a column and compute the mean, standard deviation, min, max of another column in each group
df_spark.groupBy('Type').agg({'Age2': 'mean'}).show()
df_spark.groupBy('Type').agg({'Age2': 'mean', 'Breed1': 'max'}).show()
## Groupby a column and compute the mean, standard deviation, min, max of another column in each group and rename the new columns
df_spark.groupBy('Type').agg({'Age2': 'mean', 'Breed1': 'max'}).withColumnRenamed('avg(Age2)', 'Average Age').withColumnRenamed('max(Breed1)', 'Max Breed').show()


+----+--------------------+---+------+------+------+------+------+------+------------+---------+----------+--------+----------+------+--------+---+-----+--------------------+--------+--------------------+---------+--------+-------------+
|Type|                Name|Age|Breed1|Breed2|Gender|Color1|Color2|Color3|MaturitySize|FurLength|Vaccinated|Dewormed|Sterilized|Health|Quantity|Fee|State|           RescuerID|VideoAmt|         Description|    PetID|PhotoAmt|AdoptionSpeed|
+----+--------------------+---+------+------+------+------+------+------+------------+---------+----------+--------+----------+------+--------+---+-----+--------------------+--------+--------------------+---------+--------+-------------+
|   2|              Nibble|  3|   299|     0|     1|     1|     7|     0|           1|        1|         2|       2|         2|     1|       1|100|41326|8480853f516546f6c...|       0|Nibble is a 3+ mo...|86e1089a3|     1.0|            2|
|   2|         No Name Yet|  1|   265|     0|   

# Using Pyspark to build Naive Bayes model in mapreduce way

In [6]:
# ## Convert the data into RDD
# https://pythonexamples.org/pyspark-word-count-example/

# Using PySpark MLlib to build the model

In [7]:
from pyspark.ml.feature import VectorAssembler

featureassemble = VectorAssembler(inputCols=['Type', 'Age2', 'Breed1','FurLength'], outputCol='features')
output = featureassemble.transform(df_spark) # This will create a new column called 'features' which is a vector of the selected columns (Type, Age2, Breed1) by the VectorAssembler
output.show()

+----+--------------------+----+------+------+------+------+------+------+------------+---------+----------+--------+----------+------+--------+---+-----+--------------------+--------+--------------------+---------+--------+-------------+--------------------+
|Type|                Name|Age2|Breed1|Breed2|Gender|Color1|Color2|Color3|MaturitySize|FurLength|Vaccinated|Dewormed|Sterilized|Health|Quantity|Fee|State|           RescuerID|VideoAmt|         Description|    PetID|PhotoAmt|AdoptionSpeed|            features|
+----+--------------------+----+------+------+------+------+------+------+------------+---------+----------+--------+----------+------+--------+---+-----+--------------------+--------+--------------------+---------+--------+-------------+--------------------+
|   2|              Nibble|   3|   299|     0|     1|     1|     7|     0|           1|        1|         2|       2|         2|     1|       1|100|41326|8480853f516546f6c...|       0|Nibble is a 3+ mo...|86e1089a3|     

In [8]:
finalized_data = output.select('features', 'AdoptionSpeed') # Select the features and the target column
finalized_data.show()

+--------------------+-------------+
|            features|AdoptionSpeed|
+--------------------+-------------+
| [2.0,3.0,299.0,1.0]|            2|
| [2.0,1.0,265.0,2.0]|            0|
| [1.0,1.0,307.0,2.0]|            3|
| [1.0,4.0,307.0,1.0]|            2|
| [1.0,1.0,307.0,1.0]|            2|
| [2.0,3.0,266.0,1.0]|            2|
|[2.0,12.0,264.0,3.0]|            1|
| [1.0,0.0,307.0,1.0]|            3|
| [2.0,2.0,265.0,2.0]|            1|
|[2.0,12.0,265.0,2.0]|            4|
| [1.0,2.0,307.0,1.0]|            1|
| [2.0,3.0,264.0,3.0]|            1|
| [1.0,2.0,307.0,3.0]|            2|
| [2.0,2.0,265.0,2.0]|            1|
| [1.0,3.0,307.0,2.0]|            2|
|[1.0,78.0,218.0,2.0]|            4|
| [2.0,6.0,266.0,1.0]|            3|
| [1.0,8.0,307.0,1.0]|            4|
| [1.0,2.0,307.0,1.0]|            2|
| [2.0,1.0,266.0,1.0]|            4|
+--------------------+-------------+
only showing top 20 rows



In [9]:
from pyspark.ml.classification import LogisticRegression
# Split the data into training and validation data
train_data, validation_data = finalized_data.randomSplit([0.1, 0.9])
classifier = LogisticRegression
classifier = LogisticRegression(labelCol='AdoptionSpeed').fit(train_data) # Fit the model
results = classifier.evaluate(validation_data) # Evaluate the model on the validation data
results.predictions.show() # Show the predictions
results.predictions.select('AdoptionSpeed', 'prediction').show() # Show the target and the prediction
results.accuracy # Show the accuracy of the model


+-------------------+-------------+--------------------+--------------------+----------+
|           features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+-------------------+-------------+--------------------+--------------------+----------+
| [1.0,0.0,50.0,1.0]|            3|[3.86264987336587...|[0.05048278696184...|       1.0|
| [1.0,0.0,50.0,1.0]|            3|[3.86264987336587...|[0.05048278696184...|       1.0|
|[1.0,0.0,103.0,3.0]|            1|[7.46739465208599...|[0.11691573203068...|       1.0|
|[1.0,0.0,128.0,1.0]|            1|[3.59684386917038...|[0.03278307284926...|       1.0|
|[1.0,0.0,179.0,2.0]|            1|[5.31572591105878...|[0.04286103375557...|       2.0|
|[1.0,0.0,218.0,2.0]|            3|[5.18282290896103...|[0.03302135616552...|       2.0|
|[1.0,0.0,218.0,2.0]|            3|[5.18282290896103...|[0.03302135616552...|       2.0|
|[1.0,0.0,218.0,3.0]|            1|[7.07550118436186...|[0.05804111976074...|       2.0|
|[1.0,0.0,307.0,1.0]|

0.3202251407129456

In [10]:
results.predictions.show()

+-------------------+-------------+--------------------+--------------------+----------+
|           features|AdoptionSpeed|       rawPrediction|         probability|prediction|
+-------------------+-------------+--------------------+--------------------+----------+
| [1.0,0.0,50.0,1.0]|            3|[3.86264987336587...|[0.05048278696184...|       1.0|
| [1.0,0.0,50.0,1.0]|            3|[3.86264987336587...|[0.05048278696184...|       1.0|
|[1.0,0.0,103.0,3.0]|            1|[7.46739465208599...|[0.11691573203068...|       1.0|
|[1.0,0.0,128.0,1.0]|            1|[3.59684386917038...|[0.03278307284926...|       1.0|
|[1.0,0.0,179.0,2.0]|            1|[5.31572591105878...|[0.04286103375557...|       2.0|
|[1.0,0.0,218.0,2.0]|            3|[5.18282290896103...|[0.03302135616552...|       2.0|
|[1.0,0.0,218.0,2.0]|            3|[5.18282290896103...|[0.03302135616552...|       2.0|
|[1.0,0.0,218.0,3.0]|            1|[7.07550118436186...|[0.05804111976074...|       2.0|
|[1.0,0.0,307.0,1.0]|