# Data Cleaning

In [None]:
from pyspark.sql import SparkSession
import json
import pandas as pd

review_path = "yelp_dataset/review.json"
business_path = 'yelp_dataset/business.json'

# Local Spark session that uses every available core.
ss = SparkSession \
    .builder \
    .appName('Son') \
    .master('local[*]') \
    .getOrCreate()

sc = ss.sparkContext

# Every input line is parsed as one JSON object; only the two fields needed
# downstream are kept before the rows are pulled back to the driver.
reviewRDD = sc.textFile(review_path)

review = reviewRDD.map(lambda line: json.loads(line)) \
    .map(lambda record: (record['user_id'], record['business_id'])) \
    .collect()

businessRDD = sc.textFile(business_path)

business = businessRDD.map(lambda line: json.loads(line)) \
    .map(lambda record: (record['business_id'], record['state'])) \
    .collect()

# Build the frames directly from the collected row tuples. Transposing with
# zip(*rows) and indexing [0]/[1] raises IndexError when a file yields no
# records; naming the columns here produces an empty, correctly-shaped frame.
review_df = pd.DataFrame(review, columns=["user_id", "business_id"])
business_df = pd.DataFrame(business, columns=["business_id", "state"])

print(review_df.head())
print(business_df.head())

# Left join: every review is kept and gains the state of its business.
all_df = pd.merge(left=review_df, right=business_df, how='left', left_on=["business_id"], right_on=['business_id'])

print(all_df.head())

# Keep only the reviews of businesses whose state is 'NV'.
all_df_NV = all_df[all_df['state'] == 'NV']

print(all_df_NV.head())

# drop() returns a new frame, so the filtered slice of all_df is never mutated
# in place (in-place deletion on a slice triggers SettingWithCopyWarning).
all_df_NV = all_df_NV.drop(columns='state')

all_df_NV.to_csv('task2_data.csv', index=False)


# Helper Functions

In [None]:
def create_candidates(item_set, length):
    '''
    Build candidate itemsets of a given size by unioning pairs of itemsets.

    Every ordered pair drawn from ``item_set`` (an itemset paired with itself
    included) is unioned. A union is kept when it has exactly ``length``
    elements; each distinct union is returned once, in the order in which it
    is first produced.

    :param item_set: list[set()] itemsets to combine
    :param length: int required size of every returned itemset
    :return: list[set()] distinct unions containing exactly ``length`` elements
    '''
    return_list = []
    # Hashable mirror of return_list: membership is O(1) here, whereas
    # scanning the list of sets made the whole function cubic.
    seen = set()
    for item_1 in item_set:
        for item_2 in item_set:
            temp = item_1.union(item_2)
            # Cheap size check first so most unions are never hashed.
            if len(temp) != length:
                continue
            key = frozenset(temp)
            if key not in seen:
                seen.add(key)
                return_list.append(temp)
    return return_list

def frequent_items(items, data, support):
    '''
    Return the candidate itemsets contained in at least ``support`` lines.

    Each element of ``data`` is indexed as ``line[1]`` to obtain its basket;
    a candidate is counted once for every line whose basket contains all of
    the candidate's elements. The result lists the qualifying candidates in
    the order in which each was first matched.

    :param items: list[set()] candidate itemsets
    :param data: list[list] lines whose second field is the basket of items;
                 consumed in a single pass, so an iterator is accepted
    :param support: int minimum number of containing lines (a float threshold
                    also works, it is only compared with ``>=``)
    :return: list[set()] candidates that reach the threshold
    '''
    # frozenset keys identify an itemset by content. tuple(set) depends on the
    # set's internal iteration order, which can differ between equal sets.
    # dict.fromkeys drops repeated candidates (keeping first-seen order) so a
    # duplicated candidate cannot be counted twice for the same line.
    unique_items = list(dict.fromkeys(frozenset(item) for item in items))
    count = {}
    for line in data:
        # Build the basket set once per line instead of re-scanning the
        # basket sequence for every candidate.
        basket = set(line[1])
        for item in unique_items:
            if item <= basket:
                count[item] = count.get(item, 0) + 1
    return [set(item) for item, value in count.items() if value >= support]

def makedic(data):
    '''
    Count how many times each item occurs across the baskets in ``data``.

    Each element of ``data`` is indexed as ``line[1]`` to obtain its basket.
    Every occurrence is counted, so an item repeated inside one basket adds
    to its total each time. Pairs are returned in first-seen order.

    :param data: iterator of lines whose second field is an iterable of items
    :return: list[tuple] (item, occurrence count) pairs
    '''
    # A dict keeps insertion order and gives O(1) updates; the parallel
    # key/value lists needed an O(n) scan plus an O(n) index() per item.
    counts = {}
    for line in data:
        for item in line[1]:
            counts[item] = counts.get(item, 0) + 1
    return list(counts.items())



In [None]:
from pyspark.sql import SparkSession
import Apriori as A
import json
import time

def _count_itemsets(baskets, itemsets):
    '''
    Count, within one partition, how many baskets contain each itemset.

    :param baskets: iterator of (key, basket) pairs
    :param itemsets: list[tuple] itemsets to count
    :return: list[tuple] (itemset, count) pairs for itemsets seen at least once
    '''
    counts = {}
    for _, basket in baskets:
        present = set(basket)
        for itemset in itemsets:
            if present.issuperset(itemset):
                counts[itemset] = counts.get(itemset, 0) + 1
    return list(counts.items())


sample_path = "task2_data.csv"

# Local Spark session that uses every available core.
ss = SparkSession \
    .builder \
    .appName('Son') \
    .master('local[*]') \
    .getOrCreate()

sc = ss.sparkContext

start1 = time.time()
smallRDD = sc.textFile(sample_path)
header = smallRDD.first()

# Baskets keyed by the first CSV column: (first field, [second field, ...]).
# distinct() removes repeated rows so an item is counted at most once per
# basket, consistent with the subset-based counting used for larger itemsets.
# The RDD is persisted because every pass below re-reads it.
small1RDD = smallRDD.filter(lambda row: row != header) \
    .map(lambda row: row.split(',')) \
    .map(lambda fields: (fields[0], fields[1])) \
    .distinct() \
    .combineByKey(lambda item: [item],
                  lambda basket, item: basket + [item],
                  lambda left, right: left + right) \
    .persist()

candidates = {}
frequent = {}

num_partitions = small1RDD.getNumPartitions()
print(num_partitions)
support = 4
# Per-partition threshold used when looking for locally frequent itemsets.
local_support = support / num_partitions

# Size-1 itemsets: global occurrence count of every item.
temp = small1RDD.mapPartitions(lambda data: A.makedic(data)).reduceByKey(lambda a, b: a + b).collect()
temp_1 = {tup[0]: tup[1] for tup in temp}
# The candidates are the items themselves (the dict keys), not their counts.
candidates[1] = [{item} for item in temp_1]
frequent[1] = [{key} for key, value in temp_1.items() if value >= support]
fr = frequent[1]

k = 2

while 1:
    print(k)
    candidate_temp = A.create_candidates(fr, k)
    # Pass 1: itemsets that are frequent inside at least one partition.
    # Sorting gives every itemset one canonical tuple, so equal sets coming
    # from different partitions collapse to the same key.
    local_frequent = small1RDD \
        .mapPartitions(lambda data: A.frequent_items(candidate_temp, data, local_support)) \
        .map(lambda itemset: tuple(sorted(itemset))) \
        .distinct() \
        .collect()
    # Pass 2: count those itemsets over all baskets and keep the ones that
    # reach the global threshold. Requiring an itemset to be locally frequent
    # in every partition would drop itemsets that are globally frequent but
    # unevenly spread across partitions.
    fr = small1RDD \
        .mapPartitions(lambda data: _count_itemsets(data, local_frequent)) \
        .reduceByKey(lambda a, b: a + b) \
        .filter(lambda pair: pair[1] >= support) \
        .map(lambda pair: set(pair[0])) \
        .collect()
    if not fr:
        break
    candidates[k] = candidate_temp
    frequent[k] = fr
    k += 1

small1RDD.unpersist()

# Itemsets are written as sorted tuples so the output is deterministic;
# json.dump stores the integer size keys as strings.
candidates = {size: sorted(tuple(sorted(item)) for item in itemsets)
              for size, itemsets in candidates.items()}
with open("candidates.json", "w") as file:
    json.dump(candidates, file, indent=1)
frequent = {size: sorted(tuple(sorted(item)) for item in itemsets)
            for size, itemsets in frequent.items()}
with open('frequent.json', 'w') as file:
    json.dump(frequent, file, indent=1)

end1 = time.time()
print(end1 - start1)
