In [5]:
# ! pip install pyspark
# ! pip install spark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=815f7e4767efbe1cec425b38ace45ccd0b19043c92e722379441f4ec119b2836
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


### Frequent Itemsets and Association Rules with plain Python (pandas + mlxtend)

In [18]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Load the PetFinder dataset into a pandas DataFrame.
# The raw frame gets its own name so it is not clobbered by the one-hot
# frame built further down.
raw_df = pd.read_csv("train.csv")

# Columns that take part in the analysis.
columns = ["Type", "Age", "Breed1", "Gender", "Color1", "Color2", "MaturitySize", "FurLength", "Vaccinated", "Dewormed", "Sterilized", "Health", "Quantity", "Fee", "State"]

# Restrict the data to those columns.
data = raw_df[columns]

# Build one transaction per row.
# Every value is tagged with its column name ("Type=1", "Gender=1", ...).
# TransactionEncoder treats each distinct value as one item regardless of the
# column it came from, so the bare codes would merge e.g. Type 1 and Gender 1
# into a single item "1" and make the supports and rules meaningless.
transactions = [
    [f"{col}={val}" for col, val in zip(columns, row)]
    for row in data.values.tolist()
]

# One-hot encode the transactions and mine the frequent itemsets with Apriori.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
freqItemsets = apriori(df, min_support=0.2, use_colnames=True)

# Derive association rules from the frequent itemsets, keeping those with a
# confidence of at least 0.5.
rules = association_rules(freqItemsets, metric="confidence", min_threshold=0.5)

# Save the results to CSV files.
freqItemsets.to_csv("freqItems.csv")
rules.to_csv("rules.csv")

### Frequent Itemsets and Association Rules with Spark

#### Map the values from numeric to the meaningful non-numeric values

In [68]:
# Map the numeric codes in the df to their human-readable (non-numeric) labels
def map_values(df):
    """Replace coded values with human-readable labels, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns hold the coded values. Columns listed in the
        internal label table are translated; any other column is passed
        through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``. Within a
        translated column, a value that has no label becomes NaN (the
        behaviour of ``Series.map`` with a dict). ``df`` is not modified.
    """
    # AdoptionSpeed labels are keyed by integer code, like every other column.
    # A numeric column never matches string keys such as "0", so string-only
    # keys would turn the whole column into NaN.
    adoption_labels = {
        0: "Adoption on the same day",
        1: "Adoption in the first week",
        2: "Adoption in the first month",
        3: "Adoption within the first 3 months",
        4: "No adoption after 100 days"
    }
    # Code -> label table, one entry per supported column.
    map_dict = {
        "Type": {
            1: "Dog",
            2: "Cat"
        },
        "Color1": {
            1: "Black",
            2: "Brown",
            3: "Golden",
            4: "Yellow",
            5: "Cream",
            6: "Gray",
            7: "White"
        },
        "MaturitySize": {
            1: "Small MaturitySize",
            2: "Medium MaturitySize",
            3: "Large MaturitySize",
            4: "Extra Large MaturitySize",
            0: "Not Specified MaturitySize"
        },
        "FurLength": {
            1: "Short Fur",
            2: "Medium Fur",
            3: "Long Fur",
            0: "Not Specified Fur"
        },
        "Vaccinated": {
            1: "Yes Vaccinated",
            2: "No Vaccinated",
            3: "Not Sure Vaccinated"
        },
        "Dewormed": {
            1: "Yes Dewormed",
            2: "No Dewormed",
            3: "Not Sure Dewormed"
        },
        "Sterilized": {
            1: "Yes Sterilized",
            2: "No Sterilized",
            3: "Not Sure Sterilized"
        },
        "Health": {
            1: "Healthy",
            2: "Minor Injury",
            3: "Serious Injury",
            0: "Not Specified Health"
        },
        # The string aliases ("0".."4") are kept so a column that was read as
        # text is still translated.
        "AdoptionSpeed": {
            **adoption_labels,
            **{str(code): label for code, label in adoption_labels.items()}
        }
    }

    mapped = {}
    for col in df.columns:
        labels = map_dict.get(col)
        # Columns without a label table are copied as-is instead of raising KeyError.
        mapped[col] = df[col] if labels is None else df[col].map(labels)
    return pd.DataFrame(mapped, index=df.index, columns=df.columns)


#### Prepare the dataset

In [69]:
from pyspark.sql import SparkSession

# Reuse (or start) the Spark session for this notebook.
spark = SparkSession.builder.appName("Insights").getOrCreate()

# Read the raw table; inferSchema=True makes the coded columns numeric.
raw_sdf = spark.read.csv("train.csv", header=True, inferSchema=True)

# Coded categorical columns for which map_values has a label table.
columns = ["Type", "Color1", "MaturitySize", "FurLength", "Vaccinated", "Dewormed", "Sterilized", "Health","AdoptionSpeed"]

# Bring only those columns to pandas and replace the codes with labels.
labelled_pdf = map_values(raw_sdf.select(columns).toPandas())

# One [id, items] pair per row, where id is the row position.
# Only string labels are kept: a code without a label comes back from
# map_values as NaN, which is not a real item and must not be mined.
id_item_rows = [
    [row_id, [item for item in items if isinstance(item, str)]]
    for row_id, items in enumerate(labelled_pdf.values.tolist())
]

# Spark DataFrame with the "id" and "items" columns used by the FPGrowth cell.
# The schema is spelled out so it does not depend on type inference from the
# data (e.g. a row whose item list ended up empty).
df = spark.createDataFrame(id_item_rows, "id long, items array<string>")

# To inspect the result: df.show(5, truncate=False)


#### Extract frequent itemsets and association rules

In [73]:
from pyspark.ml.fpm import FPGrowth

# Fit FP-Growth on the "items" column with a minimum support of 0.5 and a
# minimum rule confidence of 0.6.
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(df)

# Display, in this order:
#   1. the frequent itemsets,
#   2. the generated association rules,
#   3. the predictions: transform() checks each row's items against all the
#      association rules and summarizes the consequents as the prediction.
# To persist any of these, iterate over its .collect() and write each row
# to a text file.
for result in (model.freqItemsets, model.associationRules, model.transform(df)):
    result.show()

+--------------------+-----+
|               items| freq|
+--------------------+-----+
|     [No Sterilized]|10077|
|[No Sterilized, H...| 9782|
|      [Yes Dewormed]| 8397|
|[Yes Dewormed, He...| 8161|
|           [Healthy]|14478|
|               [Dog]| 8132|
|      [Dog, Healthy]| 7845|
|[Medium MaturityS...|10305|
|[Medium MaturityS...|10030|
|         [Short Fur]| 8808|
|[Short Fur, Healthy]| 8536|
+--------------------+-----+

+--------------------+--------------------+------------------+------------------+------------------+
|          antecedent|          consequent|        confidence|              lift|           support|
+--------------------+--------------------+------------------+------------------+------------------+
|     [No Sterilized]|           [Healthy]|0.9707254143098144|1.0052552933241503|0.6524378043086774|
|         [Short Fur]|           [Healthy]|0.9691189827429609|1.0035917190402828|0.5693323550990462|
|      [Yes Dewormed]|           [Healthy]|0.97189472430629