# In [None]:
#import findspark
#findspark.init()
import os
import sys
import pyspark as ps
import warnings
from pyspark.sql import SQLContext

# Make the Spark driver and workers use the interpreter running this script.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Initialize SparkContext, handling if it already exists
try:
    sc = ps.SparkContext('local[*]')  # Create SparkContext on all available CPUs
    print("Just created a SparkContext")
except ValueError:
    # Spark refuses to start a second context. Reuse the running one so that
    # `sc` is always bound for the code below, even on a fresh namespace.
    warnings.warn("SparkContext already exists in this scope")
    sc = ps.SparkContext.getOrCreate()

import unittest
import sys

# Define a unit test for the RDD
class TestRdd(unittest.TestCase):
    def test_take(self):
        input = sc.parallelize([1, 2, 3, 4])
        self.assertEqual([1, 2, 3, 4], input.take(4))

# Function to run the unit tests
def run_tests():
    """Load the tests of ``TestRdd`` and run them, reporting on stderr."""
    loader = unittest.TestLoader()
    runner = unittest.TextTestRunner(stream=sys.stderr, verbosity=1)
    runner.run(loader.loadTestsFromTestCase(TestRdd))

# Execute the RDD unit test immediately.
run_tests()
# Print the built-in help text for the SparkContext object.
help(sc)

import json

# Lists of review-record key names. `fields2` is the list validate() checks.
fields = [
    'product_id',
    'user_id',
    'score',
    'time',
]
fields2 = [
    'product_id',
    'user_id',
    'review',
    'profile_name',
    'helpfulness',
    'score',
    'time',
]
fields3 = [
    'product_id',
    'user_id',
    'time',
]
fields4 = [
    'user_id',
    'score',
    'time',
]

# Validate the JSON line to ensure it has necessary fields
def validate(line):
    """Return True only if every name in ``fields2`` is found in ``line``.

    Presence is tested with the ``in`` operator, and the check stops at the
    first missing name.
    """
    return all(field in line for field in fields2)

# Read the input file line by line, parse every line as JSON and keep only
# the records accepted by validate().
reviews_raw = sc.textFile('data/movies.json')
reviews = (
    reviews_raw
    .map(json.loads)
    .filter(validate)
)
reviews.cache()

# Fetch the first record (the value is only shown in an interactive session).
reviews.take(1)

# Calculate statistics for movies, users, and entries.
# Only the number of distinct ids is needed, so project each record to its id
# and de-duplicate; grouping whole review records just to count the groups
# would move every record through the shuffle.
num_movies = reviews.map(lambda entry: entry['product_id']).distinct().count()
num_users = reviews.map(lambda entry: entry['user_id']).distinct().count()
num_entries = reviews.count()
print(f"{num_entries} reviews of {num_movies} movies by {num_users} different people.")

# Calculate most-watched movies: number of reviews per product, restricted to
# products with more than 20 reviews, largest count first.
# Keys and counts are wrapped in 1-tuples so each record has the shape
# ((count,), (product_id,)) that the code below indexes into.
r1 = reviews.map(lambda r: ((r['product_id'],), 1))
# Summing the 1s directly gives the review count; it must not be added to
# itself a second time, which would report twice the real number.
avg3 = r1.reduceByKey(lambda a, b: a + b) \
         .filter(lambda kv: kv[1] > 20) \
         .map(lambda kv: ((kv[1],), kv[0])) \
         .sortByKey(ascending=False)

# Display top 10 most-watched movies
for movie in avg3.take(10):
    print(f"http://www.amazon.com/dp/{movie[1][0]} WATCHED BY: {movie[0][0]} PEOPLE")

# Calculate users with most reviews: number of reviews per user, restricted to
# users with more than 20 reviews, largest count first.
# Records have the shape ((count,), (user_id,)).
r2 = reviews.map(lambda ru: ((ru['user_id'],), 1))
# Summing the 1s directly gives the review count; it must not be added to
# itself a second time, which would report twice the real number.
avg2 = r2.reduceByKey(lambda a, b: a + b) \
         .filter(lambda kv: kv[1] > 20) \
         .map(lambda kv: ((kv[1],), kv[0])) \
         .sortByKey(ascending=False)

# Display top 10 users by review count
for movie in avg2.take(10):
    print(f"http://www.amazon.com/dp/{movie[1][0]} WATCHED: {movie[0][0]} MOVIES")

# Find reviews whose profile name contains "George" and print them.
filtered = reviews.filter(lambda entry: "George" in entry['profile_name'])
# Collect once and reuse the list: calling count() and then collect() would
# evaluate the filter twice.
matches = filtered.collect()
print(f"Found {len(matches)} entries.\n")
for review in matches:
    print(f"Rating: {review['score']} and helpfulness: {review['helpfulness']}")
    print(f"http://www.amazon.com/dp/{review['product_id']}")
    # 'summary' is not one of the fields checked by validate(), so a record
    # may lack it; fall back to an empty line instead of raising KeyError.
    print(review.get('summary', ''))
    print(review['review'])
    print("\n")

# Average score per movie, keeping only movies with more than 20 reviews.
# Each record ends up as ((average_score,), (product_id,)).
reviews_by_movie = reviews.map(lambda r: ((r['product_id'],), r['score']))
avg = (
    reviews_by_movie
    .mapValues(lambda score: (score, 1))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    .filter(lambda item: item[1][1] > 20)
    .map(lambda item: ((item[1][0] / item[1][1],), item[0]))
    .sortByKey(ascending=True)
)

# The sort is ascending, so these are the 10 movies with the lowest averages.
for movie in avg.take(10):
    print(f"http://www.amazon.com/dp/{movie[1][0]} Rating: {movie[0][0]}")

from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reduce every review to its score and its timestamp converted to a datetime,
# then bring a random sample (expected size about 20000 records) to the driver.
timeseries_rdd = reviews.map(
    lambda entry: {
        'score': entry['score'],
        'time': datetime.fromtimestamp(entry['time']),
    }
)
sample = timeseries_rdd.sample(False, 20000.0 / num_entries, 1134)
timeseries = pd.DataFrame(sample.collect(), columns=['score', 'time'])

# Index by time, then count sampled reviews per year, per month and per quarter.
timeseries = timeseries.set_index('time')
Rsample = timeseries['score'].resample('Y').count()
Rsample.plot()
Rsample2 = timeseries['score'].resample('M').count()
Rsample2.plot()
Rsample3 = timeseries['score'].resample('Q').count()
Rsample3.plot()

# Plot average rating of movies (the first 4 records of `avg`).
plt.figure()  # fresh figure so the bars are not drawn over an earlier chart
for movie in avg.take(4):
    plt.bar(movie[1][0], movie[0][0])
# Title and axis labels do not depend on the loop, so set them once.
plt.title('Histogram of "AVERAGE RATING OF MOVIE"')
plt.xlabel('MOVIE')
plt.ylabel('AVGRATING')

# Plot number of movies reviewed by users (the first 3 records of `avg2`).
plt.figure()  # fresh figure so the bars are not drawn over an earlier chart
for movie in avg2.take(3):
    plt.bar(movie[1][0], movie[0][0])
# Title and axis labels do not depend on the loop, so set them once.
plt.title('Histogram of "NUMBER OF MOVIES REVIEWED BY USER"')
plt.xlabel('USER')
plt.ylabel('MOVIE COUNT')

# Plot movies reviewed by number of users (the first 4 records of `avg3`).
plt.figure()  # fresh figure so the bars are not drawn over an earlier chart
for movie in avg3.take(4):
    plt.bar(movie[1][0], movie[0][0])
# Title and axis labels do not depend on the loop, so set them once.
plt.title('Histogram of "MOVIES REVIEWED BY NUMBER OF USERS"')
plt.xlabel('MOVIE')
plt.ylabel('USER COUNT')

from pyspark.mllib.recommendation import ALS
from numpy import array
import hashlib

# Helper function to hash user and product IDs
def get_hash(s):
    """Map an identifier to a stable integer in the range ``[0, 10**8)``.

    The value is the SHA-1 digest of ``s`` read as an integer and reduced
    modulo ``10**8``. Because the range is much smaller than the digest
    space, two different identifiers can map to the same number.

    Args:
        s: The identifier, either as ``bytes`` or as ``str``. A ``str`` is
            encoded as UTF-8 first, so ``get_hash("x")`` equals
            ``get_hash("x".encode("utf-8"))``.

    Returns:
        An ``int`` between 0 and 99,999,999 inclusive.
    """
    if isinstance(s, str):
        # hashlib only accepts bytes-like input.
        s = s.encode('utf-8')
    return int(hashlib.sha1(s).hexdigest(), 16) % (10 ** 8)

# Convert each review into a (hashed user id, hashed product id, integer score)
# triple for the recommendation model.
ratings = reviews.map(
    lambda entry: (
        get_hash(entry['user_id'].encode('utf-8')),
        get_hash(entry['product_id'].encode('utf-8')),
        int(entry['score']),
    )
)

# Deterministic split on (user hash + product hash) modulo 10:
# remainders 2-9 form the training set, remainders 0-1 the test set.
train_data = ratings.filter(lambda triple: (triple[0] + triple[1]) % 10 >= 2)
test_data = ratings.filter(lambda triple: (triple[0] + triple[1]) % 10 < 2)
train_data.cache()
print(f"Number of train samples: {train_data.count()}")
print(f"Number of test samples: {test_data.count()}")

# Train recommendation model using ALS
rank = 20
numIterations = 20
model = ALS.train(train_data, rank, numIterations)

# Evaluate model on test data: predict a rating for every (user, product)
# pair of the test set and join the predictions with the true ratings.
unknown = test_data.map(lambda entry: (int(entry[0]), int(entry[1])))
predictions = model.predictAll(unknown).map(lambda r: ((int(r[0]), int(r[1])), r[2]))
true_and_predictions = test_data.map(lambda r: ((int(r[0]), int(r[1])), r[2])).join(predictions)
# Mean squared error: square the difference (true - predicted) and average.
# The difference is parenthesized before squaring, and the prediction is kept
# as a float rather than truncated to an int.
MSE = true_and_predictions.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
true_and_predictions.take(10)

# Sentiment analysis on reviews: compare word usage in 5.0-score reviews with
# word usage in 1.0-score reviews.
min_occurrences = 10
good_reviews = reviews.filter(lambda line: line['score'] == 5.0)
bad_reviews = reviews.filter(lambda line: line['score'] == 1.0)

# Split review text on single spaces, count each stripped word and keep the
# words seen more than `min_occurrences` times.
good_words = (
    good_reviews
    .flatMap(lambda line: line['review'].split(' '))
    .map(lambda word: (word.strip(), 1))
    .reduceByKey(lambda a, b: a + b)
    .filter(lambda word_count: word_count[1] > min_occurrences)
)
bad_words = (
    bad_reviews
    .flatMap(lambda line: line['review'].split(' '))
    .map(lambda word: (word.strip(), 1))
    .reduceByKey(lambda a, b: a + b)
    .filter(lambda word_count: word_count[1] > min_occurrences)
)

# Scale each word count by the number of (word, count) records that were kept.
# NOTE(review): the denominator is the number of distinct retained words, not
# the total number of word occurrences - confirm that this is intended.
num_good_words = good_words.count()
num_bad_words = bad_words.count()
frequency_good = good_words.map(
    lambda word: ((word[0],), float(word[1]) / num_good_words)
)
frequency_bad = bad_words.map(
    lambda word: ((word[0],), float(word[1]) / num_bad_words)
)

# For words present in both sets, compute |good - bad| / good and sort with
# the largest value first; records have the shape ((value,), word).
joined_frequencies = frequency_good.join(frequency_bad)
result = (
    joined_frequencies
    .map(lambda f: ((abs(f[1][0] - f[1][1]) / f[1][0],), f[0][0]))
    .sortByKey(ascending=False)
)
result.take(50)

# Plot sentiment analysis chart for the first 7 records of `result`.
plt.figure()  # fresh figure so the bars are not drawn over an earlier chart
for movie in result.take(7):
    # movie[1] is the word, movie[0][0] its relative frequency difference.
    plt.bar(movie[1], movie[0][0])
# Title and axis labels do not depend on the loop, so set them once.
plt.title('Histogram of "SENTIMENT ANALYSIS"')
plt.xlabel('WORD')
# The plotted value is |good - bad| / good, not a raw occurrence count.
plt.ylabel('RELATIVE FREQUENCY DIFFERENCE')
