In [None]:
# Environment setup: install PySpark and PyDrive with pip and an OpenJDK 8
# headless package with apt.
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
# Point JAVA_HOME at the OpenJDK 8 directory (presumably where the apt package
# above installs it -- confirm on the target image).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [3]:
# Mount Google Drive at /content/drive so the input CSV can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from pyspark import SparkContext, SparkConf 
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,TimestampType
from pyspark.sql.functions import col,current_timestamp,to_date,hour,dayofweek\


from itertools import combinations

In [5]:
def create_new_spark_context(appName):
    """Return a SparkSession called ``appName`` bound to the ``local[*]`` master.

    Despite the function's name, the value returned is a ``SparkSession``
    (obtained through ``getOrCreate``), not a ``SparkContext``.
    """
    builder = SparkSession.builder
    builder = builder.appName(appName)
    builder = builder.master("local[*]")
    return builder.getOrCreate()

# Create the session for this exercise (application name "Q4") and keep a
# handle on its underlying SparkContext.
spark_session = create_new_spark_context("Q4")
sc = spark_session.sparkContext

In [6]:
# Column layout applied to the traffic CSV: (column name, Spark type) in order.
# Every column is declared nullable.
_TRAFFIC_COLUMNS = [
    ("DEVICE_CODE", IntegerType),
    ("SYSTEM_ID", IntegerType),
    ("ORIGINE_CAR_KEY", StringType),
    ("FINAL_CAR_KEY", StringType),
    ("CHECK_STATUS_KEY", IntegerType),
    ("COMPANY_ID", StringType),
    ("PASS_DAY_TIME", TimestampType),
]

schema = StructType(
    [StructField(column_name, spark_type(), True) for column_name, spark_type in _TRAFFIC_COLUMNS]
)

In [7]:
# Location of the sample traffic records on the mounted Drive.
TRAFFIC_CSV_PATH = '/content/drive/MyDrive/MDA/EX2/Sample_Traffic.csv'

# Read the CSV with the explicit schema (the file has a header row) and
# preview the first record.
df = spark_session.read.csv(TRAFFIC_CSV_PATH, header=True, schema=schema)
df.show(1)

+-----------+---------+---------------+-------------+----------------+----------+-------------------+
|DEVICE_CODE|SYSTEM_ID|ORIGINE_CAR_KEY|FINAL_CAR_KEY|CHECK_STATUS_KEY|COMPANY_ID|      PASS_DAY_TIME|
+-----------+---------+---------------+-------------+----------------+----------+-------------------+
|     200501|       81|       10477885|     10477885|               5|       161|2021-06-01 03:54:39|
+-----------+---------+---------------+-------------+----------------+----------+-------------------+
only showing top 1 row



In [8]:
# Build one "basket" per (car, calendar day): the sorted, de-duplicated list of
# device codes that recorded that car on that day.
car_day_device_pairs = df.rdd.map(
    lambda row: ((row["FINAL_CAR_KEY"], row["PASS_DAY_TIME"].date()), row["DEVICE_CODE"])
)
car_day_baskets = car_day_device_pairs.groupByKey().map(
    lambda keyed: (keyed[0], sorted(set(keyed[1])))
)
# Keep only baskets with fewer than 20 distinct devices, then drop the
# (car, day) key so each element is just the list of device codes.
traffic_rdd = car_day_baskets.filter(lambda keyed: len(keyed[1]) < 20).map(
    lambda keyed: keyed[1]
)

In [9]:
# Module-level registry filled in by apriori(): maps itemset size k to a dict
# of {itemset tuple: support count}.
frequent_paths = {}

In [10]:
def prune_candidates(x, Ck, n):
    """Tell whether every ``n``-element combination of ``x`` is a member of ``Ck``.

    Args:
        x: candidate itemset (any iterable accepted by ``itertools.combinations``).
        Ck: container of known itemsets, tested with ``in`` (tuples as members).
        n: size of the sub-combinations to check.

    Returns:
        True when no ``n``-combination of ``x`` is missing from ``Ck``
        (including the case where there are no combinations at all),
        False otherwise.
    """
    for subset in combinations(x, n):
        if subset not in Ck:
            return False
    return True


def apriori(traffic_rdd, support_threshold, n):
    """Mine the frequent itemsets of size ``n`` from an RDD of baskets (Apriori).

    The function recurses down to size 1, so every level from 1 to ``n`` is
    computed along the way.

    Args:
        traffic_rdd: RDD whose elements are baskets (iterables of hashable
            items). Candidates are generated with ``itertools.combinations``,
            so item order inside each basket determines the tuple order.
        support_threshold: minimum count (inclusive) for an itemset to be kept.
        n: itemset size to mine; must be >= 1.

    Returns:
        dict mapping each frequent ``n``-itemset (a tuple) to its count.

    Raises:
        ValueError: if ``n`` is smaller than 1 (the recursion would otherwise
            never reach its base case).

    Side effects:
        Stores the result of every computed level ``k`` in the module-level
        ``frequent_paths[k]``, overwriting any previous entry.
    """
    if n < 1:
        raise ValueError("n must be >= 1, got {!r}".format(n))

    if n == 1:
        singles = traffic_rdd.flatMap(lambda basket: basket)\
                .map(lambda item: (item, 1))\
                .reduceByKey(lambda a, b: a + b)\
                .filter(lambda pair: pair[1] >= support_threshold)\
                .collect()
        # Keys are 1-tuples so that every level uses tuples as itemsets.
        f_1 = {(item,): count for item, count in singles}
        frequent_paths[1] = f_1
        return f_1

    Cn_1 = apriori(traffic_rdd, support_threshold, n - 1)

    if not Cn_1:
        # No frequent (n-1)-itemsets means no candidate can pass the pruning
        # step, so the result is necessarily empty: skip the Spark job.
        f_n = {}
        frequent_paths[n] = f_n
        return f_n

    # Generate all n-combinations per basket, keep those whose (n-1)-subsets
    # are all frequent, then count them and apply the support threshold.
    counted = traffic_rdd.flatMap(lambda basket: combinations(basket, n))\
                .filter(lambda candidate: prune_candidates(candidate, Cn_1, n - 1))\
                .map(lambda candidate: (candidate, 1))\
                .reduceByKey(lambda a, b: a + b)\
                .filter(lambda pair: pair[1] >= support_threshold)\
                .collect()
    f_n = {tuple(itemset): count for itemset, count in counted}
    frequent_paths[n] = f_n
    return f_n

In [11]:
# Minimum count an itemset needs to be considered frequent (apriori compares
# counts against it with >=).
support_threshold = 1000

In [12]:
# Run Apriori up to 5-itemsets; as a side effect, levels 1 through 5 are
# stored in frequent_paths.
f_5 = apriori(traffic_rdd, support_threshold, 5)

In [17]:
# Frequent 1-itemsets (single device codes) with their counts.
frequent_paths[1]

{(128,): 6572,
 (900240,): 12644,
 (100700880,): 2790,
 (100701100,): 12384,
 (232,): 7527,
 (230204,): 12492,
 (900244,): 59902,
 (900164,): 36751,
 (900256,): 21541,
 (22010072,): 3548,
 (100700820,): 23405,
 (100700824,): 33632,
 (22010048,): 18303,
 (22010052,): 3824,
 (900108,): 21308,
 (900212,): 78004,
 (100700804,): 32797,
 (900124,): 18032,
 (900216,): 21523,
 (900276,): 17825,
 (22010112,): 3348,
 (100701156,): 2874,
 (100700864,): 24313,
 (100,): 7823,
 (168,): 2331,
 (22009972,): 11291,
 (22009912,): 1394,
 (22010040,): 19585,
 (631832,): 10942,
 (900236,): 40000,
 (631356,): 6342,
 (900224,): 12832,
 (900268,): 41045,
 (144,): 4854,
 (900176,): 3885,
 (900160,): 5758,
 (22010088,): 14257,
 (631352,): 3316,
 (100700812,): 8518,
 (100701132,): 4612,
 (631368,): 12201,
 (631640,): 11404,
 (100700836,): 6419,
 (900228,): 9068,
 (22010060,): 23752,
 (100701096,): 7511,
 (156,): 2833,
 (900184,): 3671,
 (900272,): 10742,
 (631776,): 3589,
 (900152,): 18128,
 (100700868,): 39866,

In [18]:
# Frequent 2-itemsets (pairs of device codes) with their counts.
frequent_paths[2]

{(631367, 631763): 1701,
 (900234, 900240): 4606,
 (900240, 900246): 1421,
 (900240, 900266): 1740,
 (119, 209103): 1237,
 (119, 900235): 3039,
 (209103, 900235): 1399,
 (900182, 100701100): 3385,
 (900233, 900265): 1038,
 (114, 232): 3303,
 (212501, 100700841): 1504,
 (631833, 900269): 1359,
 (900244, 900258): 1112,
 (900244, 100700866): 2968,
 (900269, 100700841): 2923,
 (900269, 100700853): 2132,
 (100700841, 100700853): 2960,
 (900107, 900247): 1302,
 (900102, 900212): 7227,
 (900135, 900235): 1049,
 (900155, 900227): 1680,
 (900212, 900226): 1188,
 (900212, 900246): 2325,
 (900212, 100701130): 2197,
 (900226, 100700804): 1015,
 (900235, 100700871): 4841,
 (900246, 100700804): 2003,
 (900246, 100700820): 1612,
 (900266, 100700804): 3167,
 (100700804, 100700834): 10536,
 (205202, 900124): 1041,
 (631829, 900225): 1721,
 (631829, 900269): 2855,
 (900225, 900269): 8335,
 (900185, 900265): 1243,
 (900241, 22010117): 2547,
 (900234, 900276): 2269,
 (209103, 900167): 1273,
 (900222, 1007

In [19]:
# Frequent 3-itemsets with their counts.
frequent_paths[3]

{(900259, 900269, 100700841): 1361,
 (900235, 100700804, 100700834): 1216,
 (900207, 900225, 900269): 1181,
 (230204, 900107, 900276): 2535,
 (205802, 900215, 900234): 1823,
 (205802, 900234, 900265): 1229,
 (212802, 900215, 900234): 1341,
 (900139, 900268, 100700826): 2691,
 (631765, 900164, 900276): 1159,
 (631765, 900164, 100700820): 2598,
 (631765, 900276, 100700820): 1719,
 (206602, 900234, 100700845): 1859,
 (900101, 22010119, 100700841): 1229,
 (900102, 900142, 100700853): 1293,
 (900142, 900212, 900259): 1143,
 (631357, 900102, 100701130): 1682,
 (631357, 900212, 900244): 1381,
 (631633, 900212, 900244): 3383,
 (631829, 900226, 900246): 1037,
 (205802, 212802, 900233): 1795,
 (900142, 900212, 22010119): 1198,
 (900215, 900234, 900256): 2223,
 (900101, 900212, 900244): 5775,
 (900142, 900212, 100700839): 2682,
 (900142, 900244, 100700839): 2494,
 (900155, 900222, 100700868): 4457,
 (900236, 900276, 100700841): 1009,
 (142, 900215, 900234): 1575,
 (142, 900215, 22010118): 1084,
 

In [20]:
# Frequent 4-itemsets with their counts.
frequent_paths[4]

{(900102, 900142, 900212, 900244): 1799,
 (900101, 900212, 900244, 100700839): 1105,
 (22010087, 22010088, 22010094, 22010095): 2083,
 (900142, 900202, 900212, 900244): 1017,
 (900142, 900212, 900249, 100700853): 1295,
 (900193, 900212, 900244, 100700839): 1463,
 (900142, 900212, 900273, 100700853): 1044,
 (900212, 900244, 100700839, 100700853): 1299,
 (142, 900215, 900234, 900256): 1055,
 (900142, 900152, 900212, 100700853): 1299,
 (900142, 900212, 900244, 100700853): 4784,
 (900102, 900212, 900244, 100700853): 1134,
 (900139, 900212, 900244, 100700826): 1315,
 (900142, 900193, 900212, 900244): 1126,
 (900142, 900212, 900244, 900249): 1309,
 (900142, 900152, 900212, 900244): 1549,
 (900193, 900212, 900244, 100700853): 1071,
 (209103, 900265, 100700804, 100700834): 1276,
 (900139, 900212, 900244, 100700839): 1093,
 (900101, 900212, 900244, 100700841): 1467,
 (900102, 900142, 900212, 100700853): 1088,
 (631633, 900142, 900212, 900244): 1281,
 (900142, 900212, 900244, 100700839): 2251,
 

In [21]:
# Frequent 5-itemsets returned by the apriori call (empty in the recorded run).
f_5

{}

In [33]:
# Flatten the per-size dictionaries into one list of itemset tuples
# (iterating a dict yields its keys, so the counts are dropped).
all_frequent_paths = []
for level_itemsets in frequent_paths.values():
    all_frequent_paths.extend(level_itemsets)
all_frequent_paths

[(128,),
 (900240,),
 (100700880,),
 (100701100,),
 (232,),
 (230204,),
 (900244,),
 (900164,),
 (900256,),
 (22010072,),
 (100700820,),
 (100700824,),
 (22010048,),
 (22010052,),
 (900108,),
 (900212,),
 (100700804,),
 (900124,),
 (900216,),
 (900276,),
 (22010112,),
 (100701156,),
 (100700864,),
 (100,),
 (168,),
 (22009972,),
 (22009912,),
 (22010040,),
 (631832,),
 (900236,),
 (631356,),
 (900224,),
 (900268,),
 (144,),
 (900176,),
 (900160,),
 (22010088,),
 (631352,),
 (100700812,),
 (100701132,),
 (631368,),
 (631640,),
 (100700836,),
 (900228,),
 (22010060,),
 (100701096,),
 (156,),
 (900184,),
 (900272,),
 (631776,),
 (900152,),
 (100700868,),
 (900156,),
 (100701252,),
 (900208,),
 (900264,),
 (100700832,),
 (900104,),
 (900144,),
 (900232,),
 (631748,),
 (900120,),
 (100700932,),
 (112,),
 (100700816,),
 (22010120,),
 (22010100,),
 (900180,),
 (100701264,),
 (100701068,),
 (100701144,),
 (900132,),
 (631360,),
 (100701092,),
 (22010084,),
 (104,),
 (22010116,),
 (22010080,),


In [34]:
# Pair every basket with a running index, turning each element into (basket, index).
# NOTE(review): this rebinds traffic_rdd, so re-running the cell would wrap the
# pairs a second time; the cells below expect exactly one level of (basket, index).
traffic_rdd = traffic_rdd.zipWithIndex()

In [35]:
# SON pass 1: deal the indexed baskets into three chunks (index modulo 3) and
# mine each chunk on its own with a proportionally scaled-down threshold.
NUM_CHUNKS = 3


def select_chunk(indexed_rdd, remainder, num_chunks=NUM_CHUNKS):
    """Return the baskets of ``indexed_rdd`` whose index equals ``remainder`` modulo ``num_chunks``.

    ``indexed_rdd`` holds (basket, index) pairs; the index is dropped from the result.
    """
    return indexed_rdd.filter(lambda pair: pair[1] % num_chunks == remainder)\
                      .map(lambda pair: pair[0])


traffic_rdd_1 = select_chunk(traffic_rdd, 0)
traffic_rdd_2 = select_chunk(traffic_rdd, 1)
traffic_rdd_3 = select_chunk(traffic_rdd, 2)

# Use a separate name for the per-chunk threshold: overwriting the global
# support_threshold here would shrink it again on every re-run of this cell.
chunk_support_threshold = support_threshold / NUM_CHUNKS

# Each call also overwrites frequent_paths[1..3] with that chunk's local results.
f_n_1 = apriori(traffic_rdd_1, chunk_support_threshold, 3)
f_n_2 = apriori(traffic_rdd_2, chunk_support_threshold, 3)
f_n_3 = apriori(traffic_rdd_3, chunk_support_threshold, 3)

In [38]:
# Union of the locally frequent 3-itemsets of the three chunks. The stored
# value is the sum of the counts from the chunks that reported the itemset.
f_n = {}
for chunk_result in (f_n_1, f_n_2, f_n_3):
    for itemset, local_count in chunk_result.items():
        f_n[itemset] = f_n.get(itemset, 0) + local_count

f_n

{(205802, 900215, 900234): 1823,
 (205802, 900234, 900265): 1229,
 (212802, 900215, 900234): 1341,
 (206602, 900234, 100700845): 1859,
 (631765, 900164, 900276): 1159,
 (631765, 900164, 100700820): 2598,
 (631765, 900276, 100700820): 1719,
 (205802, 212802, 900233): 1795,
 (900215, 900234, 900256): 2223,
 (631829, 900226, 900246): 717,
 (900171, 900233, 100700845): 1604,
 (230204, 900107, 900276): 2535,
 (230204, 900107, 100700824): 2004,
 (631357, 900212, 900244): 1381,
 (900101, 900212, 900244): 5775,
 (900142, 900212, 22010119): 1198,
 (900207, 900225, 900269): 1181,
 (900235, 100700804, 100700834): 1216,
 (900212, 900236, 100700841): 1216,
 (22010087, 22010088, 22010094): 2394,
 (22010088, 22010094, 22010095): 2777,
 (900108, 900259, 900268): 1140,
 (900225, 900227, 900269): 1845,
 (209103, 100700804, 100700834): 2201,
 (209103, 900217, 900265): 683,
 (900101, 900139, 100700841): 1672,
 (900212, 900244, 900245): 1119,
 (900212, 900244, 900249): 4907,
 (900212, 900244, 22009977): 21

In [47]:
def son_filter(x, f_n):
    """List the candidates from ``f_n`` that are fully contained in a basket.

    Args:
        x: indexable whose first element, ``x[0]``, is the basket to test
            (membership is checked with ``in``).
        f_n: iterable of candidate itemsets (e.g. a dict keyed by tuples).

    Returns:
        A list with one ``(candidate, 1)`` pair per candidate whose items all
        occur in ``x[0]``, in the iteration order of ``f_n``.
    """
    matches = []
    for candidate in f_n:
        contained = True
        for item in candidate:
            if item not in x[0]:
                contained = False
                break
        if contained:
            matches.append((candidate, 1))
    return matches


In [40]:
# Full-data support threshold used by the SON second pass below.
support_threshold = 1000

In [48]:
# SON pass 2: count every candidate itemset of f_n over the full set of
# (basket, index) pairs and keep the globally frequent ones.
# The comparison is inclusive (>=) to match the threshold test used in apriori;
# a strict ">" would drop itemsets whose count equals the threshold exactly.
f_3_son = traffic_rdd.flatMap(lambda x: son_filter(x, f_n))\
        .reduceByKey(lambda x, y: x + y)\
        .filter(lambda x: x[1] >= support_threshold)\
        .collect()

In [49]:
# Globally frequent 3-itemsets found by the SON second pass, as (itemset, count) pairs.
f_3_son

[((900259, 900269, 100700841), 1361),
 ((900235, 100700804, 100700834), 1216),
 ((900207, 900225, 900269), 1181),
 ((230204, 900107, 900276), 2535),
 ((205802, 900215, 900234), 1823),
 ((205802, 900234, 900265), 1229),
 ((212802, 900215, 900234), 1341),
 ((900139, 900268, 100700826), 2691),
 ((631765, 900164, 900276), 1159),
 ((631765, 900164, 100700820), 2598),
 ((631765, 900276, 100700820), 1719),
 ((206602, 900234, 100700845), 1859),
 ((900101, 22010119, 100700841), 1229),
 ((900102, 900142, 100700853), 1293),
 ((900142, 900212, 900259), 1143),
 ((631357, 900212, 900244), 1381),
 ((631357, 900102, 100701130), 1682),
 ((631633, 900212, 900244), 3383),
 ((631829, 900226, 900246), 1037),
 ((205802, 212802, 900233), 1795),
 ((900142, 900212, 22010119), 1198),
 ((900215, 900234, 900256), 2223),
 ((900101, 900212, 900244), 5775),
 ((900142, 900212, 100700839), 2682),
 ((900142, 900244, 100700839), 2494),
 ((900155, 900222, 100700868), 4457),
 ((900236, 900276, 100700841), 1009),
 ((205802