# Data Cleaning

In [None]:
from pyspark.sql import SparkSession
import json
import pandas as pd

# Locations of the raw Yelp dumps; each file is read line by line and every
# line is parsed as one JSON object.
review_path = "yelp_dataset/review.json"
business_path = 'yelp_dataset/business.json'

ss = SparkSession \
    .builder \
    .appName('Son') \
    .master('local[*]') \
    .getOrCreate()

sc = ss.sparkContext

# Keep only the two fields of each record that the join below needs.
reviewRDD = sc.textFile(review_path)

review = reviewRDD.map(json.loads) \
    .map(lambda record: (record['user_id'], record['business_id'])) \
    .collect()

businessRDD = sc.textFile(business_path)

business = businessRDD.map(json.loads) \
    .map(lambda record: (record['business_id'], record['state'])) \
    .collect()

# Build the frames straight from the collected (col_a, col_b) tuples.  Unlike
# transposing with zip(*rows) and indexing the result, this also works when an
# input file yields no rows.
review_df = pd.DataFrame(review, columns=["user_id", "business_id"])
business_df = pd.DataFrame(business, columns=["business_id", "state"])

print(review_df.head())
print(business_df.head())

# Left join: every review row is kept, with the state of its business attached.
all_df = pd.merge(left=review_df, right=business_df, how='left', on='business_id')

print(all_df.head())

# Restrict to rows whose business is in state 'NV'.
all_df_NV = all_df[all_df['state'] == 'NV']

print(all_df_NV.head())

# drop() returns a new frame, so the filtered slice is never mutated in place
# (in-place deletion on a slice triggers pandas' SettingWithCopyWarning).
all_df_NV = all_df_NV.drop(columns=['state'])

all_df_NV.to_csv('task2_data.csv', index=False)


# Helper Functions

In [None]:
def create_candidates(candidates_list, length):
    '''
    Build candidate itemsets of a given size by uniting pairs of itemsets.

    :param candidates_list: list[set] -- itemsets to combine
    :param length: int -- number of items every generated candidate must have
    :return: list[frozenset] -- the distinct unions of two input itemsets that
        contain exactly ``length`` items, in order of first appearance
    '''
    pool = list(candidates_list)
    seen = set()  # O(1) duplicate check; ``res`` keeps the output order
    res = []
    for idx, item_1 in enumerate(pool):
        # Union is symmetric, so pairing each itemset with itself and the ones
        # after it yields every distinct union exactly as the full cross
        # product would, in the same first-appearance order.
        for item_2 in pool[idx:]:
            temp = frozenset(item_1 | item_2)
            if len(temp) == length and temp not in seen:
                seen.add(temp)
                res.append(temp)
    return res


def frequent_items(partition, candidates, support):
    '''
    Count the records containing each candidate and keep the frequent ones.

    :param partition: iterable of records; element ``[1]`` of each record is
        the basket (a collection of items)
    :param candidates: list[set] -- candidate itemsets to test
    :param support: int or float -- minimum number of containing baskets
    :return: list[frozenset] -- candidates contained in at least ``support``
        baskets, in the order they were first matched
    '''
    itemsets = [frozenset(candidate) for candidate in candidates]
    count = {}
    for line in partition:
        # Convert the basket once per record so the subset test below does not
        # have to re-scan a list-valued basket for every candidate.
        basket = set(line[1])
        for itemset in itemsets:
            if itemset.issubset(basket):
                count[itemset] = count.get(itemset, 0) + 1
    return [itemset for itemset, hits in count.items() if hits >= support]


def Apriori(partition, support, threshold=0):
    '''
    Find the frequent itemsets of every size within one partition.

    :param partition: iterator of records; element ``[1]`` of each record is
        the basket (a collection of items)
    :param support: int or float -- minimum number of baskets an itemset must
        occur in to be kept
    :param threshold: int or float -- minimum basket count a single item needs
        before it is considered at all (the default 0 keeps every item)
    :return: list[tuple(int, list[frozenset])] -- one ``(size, itemsets)``
        pair per itemset size; size 1 is always present, possibly with an
        empty list
    '''
    # The baskets are scanned once per itemset size, so the iterator has to be
    # materialized first.
    baskets = list(partition)

    cand_1 = {}
    for line in baskets:
        # dict.fromkeys de-duplicates while keeping order: an item repeated
        # inside one basket counts once, matching the per-basket subset test
        # used for the larger itemsets.
        for item in dict.fromkeys(line[1]):
            cand_1[item] = cand_1.get(item, 0) + 1

    frequent = {
        1: [frozenset([item]) for item, hits in cand_1.items()
            if hits >= threshold and hits >= support]
    }

    k = 2
    while True:
        temp_candidate = create_candidates(frequent[k - 1], k)
        temp_frequent = frequent_items(baskets, temp_candidate, support)
        if not temp_frequent:
            # No frequent itemset of size k, so none of any larger size either.
            break
        frequent[k] = temp_frequent
        k += 1
    return list(frequent.items())


def global_frequent(line, candidates):
    '''
    Report which candidate itemsets occur in a single basket.

    :param line: list -- the items of one basket
    :param candidates: iterable of ``(size, itemsets)`` pairs, where
        ``itemsets`` is a collection of sets
    :return: list[tuple(int, dict)] -- one ``(size, counts)`` pair per entry of
        ``candidates``; ``counts`` maps each contained itemset, as a sorted
        tuple of its items, to the number of times it was matched.  Entries
        with no match still appear, with an empty dict.
    '''
    # Convert once so each subset test does not re-scan the basket list.
    basket = set(line)
    res = []
    for candidate in candidates:
        length_set = candidate[0]
        counter = {}
        for item in candidate[1]:
            if item.issubset(basket):
                # Sets have no defined element order, so sort the items to get
                # one canonical key per itemset; otherwise equal itemsets could
                # end up under different tuple keys when the per-basket results
                # are merged.  Assumes the items are mutually orderable.
                key = tuple(sorted(item))
                counter[key] = counter.get(key, 0) + 1
        res.append((length_set, counter))
    return res


In [None]:
from pyspark.sql import SparkSession
import Apriori as A

ss = SparkSession \
    .builder \
    .master("local[*]") \
    .appName('Try') \
    .getOrCreate()

sc = ss.sparkContext

# NOTE(review): confirm this file name is the intended input.
path = "tesk_data.csv"

smallRDD = sc.textFile(path)
header = smallRDD.first()

# combineByKey helpers that gather all values of a key into one list.
createCombiner = (lambda line: [line])
mergeValue = (lambda exist, new: exist + [new])
mergeCombiner = (lambda exist1, exist2: exist1 + exist2)

# Drop the header row, split every line once, and group the second column by
# the first: the result is (key, [values...]) -- one basket per key.
smallRDD = smallRDD.filter(lambda line: line != header) \
    .map(lambda line: line.split(',')) \
    .map(lambda fields: (fields[0], fields[1])) \
    .combineByKey(createCombiner, mergeValue, mergeCombiner)

# smallRDD.foreach(print)

support = 7
threshold = 10
numOfPar = smallRDD.getNumPartitions()

# Pass 1: run Apriori inside each partition with the limits scaled down by the
# partition count, then union the per-size results of all partitions.
candidates = smallRDD.mapPartitions(lambda partition: A.Apriori(partition, support / numOfPar, threshold / numOfPar)) \
    .reduceByKey(lambda a, b: a + b) \
    .map(lambda line: (line[0], set(line[1]))) \
    .collect()

# Keep the broadcast handle and read it inside the task; calling
# sc.broadcast() and discarding the result has no effect.
candidates_bc = sc.broadcast(candidates)

# Pass 2: count every candidate over all baskets.  global_frequent takes
# (basket, candidates); the support filter is applied after the reduce.
frequent = smallRDD.flatMap(lambda line: A.global_frequent(line[1], candidates_bc.value)) \
    .reduceByKey(
    lambda exist1, exist2: {key: exist1.get(key, 0) + exist2.get(key, 0) for key in set(exist1) | set(exist2)}) \
    .collect()

# Stored under its own name so the imported module alias ``A`` is not
# overwritten by the result.
frequent_itemsets = {item[0]: [key for key, value in item[1].items() if value >= support] for item in frequent}
print(frequent_itemsets)