In [None]:
# Optional environment setup (left disabled). The cells below import pandas,
# mlxtend and pyspark, so those must be available in the kernel's environment.
# ! pip install pyspark
# NOTE(review): nothing below imports a module named `spark` — confirm this
# package is actually needed before enabling the line.
# ! pip install spark

Frequent Items and Association Rules with plain Python

In [None]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Load the PetFinder dataset into a Pandas DataFrame
df = pd.read_csv("train.csv")

# Define the columns for analysis
columns = ["Type", "Age", "Breed1", "Gender", "Color1", "Color2", "MaturitySize", "FurLength", "Vaccinated", "Dewormed", "Sterilized", "Health", "Quantity", "Fee", "State"]

# Select the columns for analysis
data = df[columns]

# Build one transaction per row. Every item is labelled "<column>=<value>":
# TransactionEncoder treats equal items as the same item, so passing the bare
# cell values would merge e.g. a 1 in "Type" with a 1 in "Gender" and produce
# itemsets that mix unrelated attributes.
transactions = [
    [f"{col}={val}" for col, val in zip(columns, row)]
    for row in data.itertuples(index=False, name=None)
]

# One-hot encode the transactions and mine the frequent itemsets.
# The encoded frame gets its own name so the raw `df` is not overwritten.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
encoded = pd.DataFrame(te_ary, columns=te.columns_)
freqItemsets = apriori(encoded, min_support=0.01, use_colnames=True)

# Derive association rules, keeping those with confidence >= 0.5
rules = association_rules(freqItemsets, metric="confidence", min_threshold=0.5)

# Save the results to CSV files
freqItemsets.to_csv("freqItemsets.csv")
rules.to_csv("rules.csv")

Frequent Items with Spark

In [None]:
from pyspark.mllib.fpm import FPGrowth
# from pyspark.mllib.fpm import AssociationRules
from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start one, so this
# cell does not depend on a `spark` variable injected by the environment.
spark = SparkSession.builder.getOrCreate()
sc = SparkContext.getOrCreate(SparkConf())

# Load the PetFinder dataset into a Spark DataFrame
df = spark.read.csv("train.csv", header=True, inferSchema=True)

# Define the columns for analysis, all columns except the non-numerical columns
columns = ["Age", "Breed1", "Color1", "Color2", "MaturitySize", "FurLength", "Vaccinated", "Dewormed", "Sterilized", "Health", "Quantity", "Fee", "State"]

# Select the columns for analysis
data = df.select(columns)

# Turn each row into a transaction of "<column>=<value>" items, directly on the
# cluster (no collect-to-driver / re-parallelize round trip).
# Labelling keeps equal raw values from different columns apart, and because
# column names are unique every transaction has unique items, as FPGrowth
# requires.
rdd = data.rdd.map(
    lambda row: [f"{col}={val}" for col, val in zip(columns, row)]
)

# Train the FPGrowth model to get the frequent itemsets
model = FPGrowth.train(rdd, minSupport=0.2, numPartitions=10)

# Get the frequent itemsets and save them to a file.
# The file is opened in write mode so re-running the cell replaces the
# previous results instead of appending duplicates.
freqItems = model.freqItemsets().collect()
with open("freqItemsets.txt", "w") as f:
    for fi in freqItems:
        f.write(str(fi) + "\n")


Simple example

In [None]:
# NOTE: this rebinds the name FPGrowth to the DataFrame-based pyspark.ml
# estimator; the earlier cell imported FPGrowth from pyspark.mllib.fpm.
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start one, so this
# cell also runs on a fresh kernel.
spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([
    (0, [1, 2, 5]),
    (1, [1, 2, 3, 5]),
    (2, [1, 2])
], ["id", "items"])

# Save the rows of df to a text file, one Row per line.
# A Spark DataFrame must be collected first: iterating over the DataFrame
# object itself yields its Column objects, not its rows.
with open("df.txt", "w") as f:
    for row in df.collect():
        f.write(str(row) + "\n")


fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(df)

# Display frequent itemsets.
model.freqItemsets.show()

# Display generated association rules.
model.associationRules.show()

# transform examines the input items against all the association rules and summarize the
# consequents as prediction
model.transform(df).show()

+---------+----+
|    items|freq|
+---------+----+
|      [5]|   2|
|   [5, 1]|   2|
|[5, 1, 2]|   2|
|   [5, 2]|   2|
|      [2]|   3|
|      [1]|   3|
|   [1, 2]|   3|
+---------+----+

+----------+----------+------------------+----+------------------+
|antecedent|consequent|        confidence|lift|           support|
+----------+----------+------------------+----+------------------+
|       [5]|       [1]|               1.0| 1.0|0.6666666666666666|
|       [5]|       [2]|               1.0| 1.0|0.6666666666666666|
|    [1, 2]|       [5]|0.6666666666666666| 1.0|0.6666666666666666|
|    [5, 2]|       [1]|               1.0| 1.0|0.6666666666666666|
|    [5, 1]|       [2]|               1.0| 1.0|0.6666666666666666|
|       [2]|       [5]|0.6666666666666666| 1.0|0.6666666666666666|
|       [2]|       [1]|               1.0| 1.0|               1.0|
|       [1]|       [5]|0.6666666666666666| 1.0|0.6666666666666666|
|       [1]|       [2]|               1.0| 1.0|               1.0|
+-------

Extract useful insights from Association Rules