# Frequent pattern mining with PySpark

In [28]:
from utilities.std_imports import *
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml import fpm
from pyspark.ml.fpm import PrefixSpan
from pyspark.sql import Row
from pyspark.sql.functions import split

## FP-Growth algorithm

In [9]:
# Build the context through getOrCreate so this cell can be re-run in a live
# kernel: constructing SparkContext(...) a second time raises an error because
# only one SparkContext may be active at a time.
spark_conf = SparkConf().setMaster("local").setAppName("sqlContext")

# NOTE: despite its name, `spark` is a SparkContext (not a SparkSession);
# later cells rely on `spark.parallelize(...)` and `spark.stop()`.
spark = SparkContext.getOrCreate(spark_conf)

# SQL entry point used below to create and read DataFrames.
sql = SQLContext(spark)

In [21]:
# Toy transaction table: one (id, items) pair per row.
baskets = [
    (0, [1, 2, 5]),
    (1, [1, 2, 3, 5]),
    (2, [1, 2]),
]
df = sql.createDataFrame(baskets, schema=["id", "items"])

# Render the small frame through pandas for a readable notebook display.
df.toPandas()

Unnamed: 0,id,items
0,0,"[1, 2, 5]"
1,1,"[1, 2, 3, 5]"
2,2,"[1, 2]"


In [22]:
# Estimator settings for the toy data, collected in one place.
fpg_params = {
    "itemsCol": "items",
    "minSupport": 0.5,
    "minConfidence": 0.6,
}
fpg = fpm.FPGrowth(**fpg_params)

# Fit on the toy transactions; the fitted model is queried in the next cell.
fpg_fit = fpg.fit(df)

In [24]:
# Each entry pairs a section title with the DataFrame shown under it.
report_sections = [
    ('1. Frequent itemsets', fpg_fit.freqItemsets),
    ('2. Association rules', fpg_fit.associationRules),
    ('3. Exam the input items against all the association rules and summarize the consequents as prediction',
     fpg_fit.transform(df)),
]

# Print the title, then the corresponding table, in the order listed above.
for section_title, section_frame in report_sections:
    print(section_title)
    section_frame.show()

1. Frequent itemsets
+---------+----+
|    items|freq|
+---------+----+
|      [5]|   2|
|   [5, 2]|   2|
|[5, 2, 1]|   2|
|   [5, 1]|   2|
|      [2]|   3|
|   [2, 1]|   3|
|      [1]|   3|
+---------+----+

2. Association rules
+----------+----------+------------------+----+
|antecedent|consequent|        confidence|lift|
+----------+----------+------------------+----+
|    [5, 2]|       [1]|               1.0| 1.0|
|    [5, 1]|       [2]|               1.0| 1.0|
|       [5]|       [2]|               1.0| 1.0|
|       [5]|       [1]|               1.0| 1.0|
|       [2]|       [5]|0.6666666666666666| 1.0|
|       [2]|       [1]|               1.0| 1.0|
|       [1]|       [5]|0.6666666666666666| 1.0|
|       [1]|       [2]|               1.0| 1.0|
|    [2, 1]|       [5]|0.6666666666666666| 1.0|
+----------+----------+------------------+----+

3. Exam the input items against all the association rules and summarize the consequents as prediction
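+---+------------+----------+
| id|       items|prediction|
+---+------------+----------+
|  0|   [1, 2, 5]|        []|
|  1|[1, 2, 3, 5]|        []|
|  2|      [1, 2]|       [5]|
+---+------------+----------+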

### Another example: Spark's bundled sample data

In [30]:
import os

# Locate Spark's bundled MLlib sample data. Prefer the installation named by
# the SPARK_HOME environment variable so the notebook is not tied to a single
# machine; fall back to the original hard-coded location when it is unset.
spark_home = os.environ.get("SPARK_HOME")
if spark_home:
    # Normalise to forward slashes and keep the trailing separator, matching
    # the form of the fallback path below.
    path = spark_home.replace("\\", "/").rstrip("/") + "/data/mllib/"
else:
    path = 'D:/ProgramFiles/Spark/spark-3.0.0-bin-hadoop2.7/data/mllib/'

# Split every text line on runs of whitespace into an "items" array column.
# The pattern is a raw string so "\s" is not treated as a string escape
# (the non-raw form is an invalid escape sequence that newer Python versions
# warn about).
# NOTE: despite its name, `rdd` is a DataFrame; the name is kept because the
# following cells refer to it.
rdd = sql.read.text(path + "sample_fpgrowth.txt").select(split("value", r"\s+").alias("items"))
rdd.show(truncate=False)

+------------------------+
|items                   |
+------------------------+
|[r, z, h, k, p]         |
|[z, y, x, w, v, u, t, s]|
|[s, x, o, n, r]         |
|[x, z, y, m, t, s, q, e]|
|[z]                     |
|[x, z, y, r, q, t, p]   |
+------------------------+



In [32]:
# Thresholds for the sample-file run; the items column is left at its default.
fp_params = {"minSupport": 0.2, "minConfidence": 0.7}
fp = fpm.FPGrowth(**fp_params)

# Fit on the whitespace-split transactions loaded in the previous cell.
fp_fit = fp.fit(rdd)

In [33]:
# Rename the output column used by the fitted model's transform step.
fp_fit.setPredictionCol("newPrediction")

# Preview only the first five frequent itemsets.
sample_itemsets = fp_fit.freqItemsets
sample_itemsets.show(5)

+---------+----+
|    items|freq|
+---------+----+
|      [s]|   3|
|   [s, x]|   3|
|[s, x, z]|   2|
|   [s, z]|   2|
|      [r]|   3|
+---------+----+
only showing top 5 rows



## PrefixSpan algorithm

In [19]:
# Each sequence is an ordered list of itemsets.
sequences = [
    [[1, 2], [3]],
    [[1], [3, 2], [1, 2]],
    [[1, 2], [5]],
    [[6]],
]

# Wrap every sequence in a Row so toDF() produces a single "sequence" column.
# NOTE: this rebinds `df`, replacing the FP-Growth frame defined earlier.
df = spark.parallelize([Row(sequence=seq) for seq in sequences]).toDF()
df.toPandas()

Unnamed: 0,sequence
0,"[[1, 2], [3]]"
1,"[[1], [3, 2], [1, 2]]"
2,"[[1, 2], [5]]"
3,[[6]]


In [18]:
# PrefixSpan configuration for the sequence data built above.
prefixSpan = PrefixSpan(
    minSupport=0.5,
    maxPatternLength=5,
    maxLocalProjDBSize=32000000,
)

# Mine the frequent sequential patterns and display them with their counts.
frequent_sequences = prefixSpan.findFrequentSequentialPatterns(df)
frequent_sequences.show()

+----------+----+
|  sequence|freq|
+----------+----+
|     [[1]]|   3|
|     [[3]]|   2|
|     [[2]]|   3|
|  [[1, 2]]|   3|
|[[1], [3]]|   2|
+----------+----+



In [7]:
# Stop the SparkContext created at the top of the notebook. Cells that use
# `spark` or `sql` will need the setup cell re-run after this.
spark.stop()

## Credits & Links

https://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html