# Lab 6 - Amazon Product Co-Review Analysis

This notebook analyzes Amazon food reviews to find products that are frequently reviewed together by the same users.

## Objectives:

1. **Task 1**: 
   - Transform dataset to get (user_id, list of products) pairs
   - Count frequency of product pairs reviewed together
   - Save pairs that appear more than once, sorted by frequency

2. **Task 2** (Bonus): 
   - Display top 10 most frequent product pairs

## Input Format:
CSV file with schema: `Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text`

Key columns (zero-based index):
- Index 1: ProductId
- Index 2: UserId

## Import libraries and configuration

In [None]:
from itertools import combinations
from typing import List, Optional, Tuple

from pyspark import SparkConf, SparkContext

## Parameters configuration

In [None]:
# Locations read and written by the rest of the notebook
outputPath = "product_pairs_output/"
inputPath = "ReviewsSample.csv"  # local sample file used for testing
# HDFS alternative:
# inputPath = "/data/students/bigdata-01QYD/Lab4/Reviews.csv"

## Reading and preprocessing input data

In [None]:
# Load the input file as an RDD with one element per text line
reviewsRDD = sc.textFile(inputPath)

# The first element is expected to be the CSV header line ("Id,...")
header = reviewsRDD.first()

# Keep every line that differs from the header, and cache the result for reuse
dataRDD = reviewsRDD.filter(lambda row: row != header).cache()

## Step 1: Extract (UserId, ProductId) pairs

In [None]:
def parse_csv_line(line: str) -> Optional[Tuple[str, str]]:
    """
    Parse a CSV line to extract ProductId (column 1) and UserId (column 2).

    Only the first three comma-separated fields are examined, so commas
    that appear in later columns do not affect the result.

    Args:
        line: One raw line of the reviews CSV.

    Returns:
        (UserId, ProductId) with surrounding whitespace stripped, or None
        when the line has fewer than three fields or is not a string.
    """
    try:
        # Stop splitting after the third comma: the remaining columns are unused
        fields = line.split(',', 3)
    except (AttributeError, TypeError):
        # Not a text line (e.g. None or bytes): treat it as an unparsable record
        return None

    if len(fields) < 3:
        return None

    product_id = fields[1].strip()  # ProductId (column 1)
    user_id = fields[2].strip()     # UserId (column 2)
    return (user_id, product_id)

# Parse every data line and drop the records the parser rejected (None)
userProductPairsRDD = (
    dataRDD
    .map(parse_csv_line)
    .filter(lambda record: record is not None)
)


## Step 2: Group products by user (create user-product lists)

In [None]:
# For each UserId, gather the reviewed ProductIds and drop repeats via a set;
# the resulting RDD is cached because it is used more than once
userProductListsRDD = (
    userProductPairsRDD
    .groupByKey()
    .mapValues(lambda reviewed: list(set(reviewed)))
    .cache()
)

## Step 3: Generate product pairs for each user

In [None]:
def generate_product_pairs(user_products: Tuple[str, List[str]]) -> List[Tuple[Tuple[str, str], int]]:
    """
    Generate all product pairs for a user.

    Args:
        user_products: (user_id, products), where products holds the
            ProductIds reviewed by that user. Repeated ProductIds are
            counted once.

    Returns:
        A list of ((product1, product2), 1) records where
        product1 < product2 lexicographically, listed in ascending pair
        order. The list is empty when the user has fewer than two
        distinct products.
    """
    _, products = user_products

    # Deduplicate so a repeated product never forms a self-pair (A, A), and
    # sort so that every combination already has product1 < product2 and the
    # output order does not depend on the order of the input.
    distinct_sorted = sorted(set(products))

    # combinations() yields nothing for fewer than two items
    return [(pair, 1) for pair in combinations(distinct_sorted, 2)]

# Generate all product pairs from all users: flatMap merges each user's list
# of ((product1, product2), 1) records into a single RDD of such records
productPairsRDD = userProductListsRDD.flatMap(generate_product_pairs)

## Step 4: Count frequency of each product pair

In [None]:
# Add up the 1s emitted for each (product1, product2) key to obtain its
# frequency, and cache the result because several later operations read it
productPairFreqRDD = productPairsRDD.reduceByKey(lambda total, inc: total + inc).cache()

## Step 5: Filter pairs that appear more than once and sort by frequency

In [None]:
# Keep only the pairs whose frequency is greater than 1
frequentPairsRDD = productPairFreqRDD.filter(lambda entry: entry[1] > 1)

# Number of pairs that survived the filter
numFrequentPairs = frequentPairsRDD.count()

if numFrequentPairs <= 0:
    print("No product pairs appear more than once")
    sortedPairsRDD = sc.emptyRDD()
else:
    # Order the ((product1, product2), frequency) records by frequency,
    # highest first
    sortedPairsRDD = frequentPairsRDD.sortBy(lambda entry: entry[1], ascending=False)

## Step 6: Save results to output folder

In [None]:
if numFrequentPairs <= 0:
    print("\nNo data to save (no frequent pairs found)")
else:
    # One text line per pair: "(product1,product2)<TAB>frequency"
    outputRDD = sortedPairsRDD.map(
        lambda x: f"({x[0][0]},{x[0][1]})\t{x[1]}"
    )

    # Write the lines to the output folder; a failure is reported, not raised
    try:
        outputRDD.saveAsTextFile(outputPath)
        print(f"\nResults saved successfully to: {outputPath}")
    except Exception as e:
        print(f"\nError saving to {outputPath}: {e}")

## Task 2 (Bonus): Top 10 Most Frequent Product Pairs

In [None]:
if numFrequentPairs <= 0:
    print("No frequent product pairs found to display top 10")
else:
    # sortedPairsRDD is ordered by descending frequency, so its first
    # 10 records are the 10 most frequent pairs
    top10Pairs = sortedPairsRDD.take(10)

    for i, record in enumerate(top10Pairs, start=1):
        (product1, product2), frequency = record
        print(f"{i:2d}.\t({product1}, {product2})\t\t{frequency}")

## Data Analysis and Insights

In [None]:
print("\n=== DATA ANALYSIS AND INSIGHTS ===")

# Totals reported below. They are computed here because no other cell
# defines them; without these lines this cell fails with NameError.
totalLines = dataRDD.count()                   # input lines after header removal
totalUsers = userProductListsRDD.count()       # one record per distinct UserId
totalUniquePairs = productPairFreqRDD.count()  # one record per distinct product pair

# Analyze user behavior
usersWithMultipleProducts = userProductListsRDD.filter(lambda x: len(x[1]) > 1).count()
# Guard against division by zero when the input holds no users
percentageMultipleProducts = (usersWithMultipleProducts / totalUsers) * 100 if totalUsers > 0 else 0

print(f"\nUser Behavior Analysis:")
print(f"  Total users: {totalUsers}")
print(f"  Users who reviewed multiple products: {usersWithMultipleProducts}")
print(f"  Percentage of users with multiple products: {percentageMultipleProducts:.1f}%")

# Product analysis
if numFrequentPairs > 0:
    # Extract all unique products from frequent pairs
    productsInPairs = sortedPairsRDD.flatMap(lambda x: [x[0][0], x[0][1]]).distinct()
    numProductsInPairs = productsInPairs.count()
    
    print(f"\nProduct Pair Analysis:")
    print(f"  Total unique product pairs with frequency > 1: {numFrequentPairs}")
    print(f"  Unique products involved in frequent pairs: {numProductsInPairs}")
    
    # Most popular products: count, for each product, how many frequent pairs
    # it belongs to, then order by that count, highest first
    productPopularity = sortedPairsRDD.flatMap(lambda x: [(x[0][0], 1), (x[0][1], 1)]) \
                                      .reduceByKey(lambda a, b: a + b) \
                                      .sortBy(lambda x: x[1], ascending=False)
    
    print(f"\nMost popular products (appear in most frequent pairs):")
    topProducts = productPopularity.take(5)
    for product, pairCount in topProducts:
        print(f"  {product}: appears in {pairCount} frequent pairs")

print(f"\nOverall Statistics:")
print(f"  Total reviews processed: {totalLines}")
print(f"  Total unique users: {totalUsers}")
print(f"  Total product pair instances: {productPairsRDD.count()}")
print(f"  Unique product pairs: {totalUniquePairs}")
print(f"  Frequent pairs (freq > 1): {numFrequentPairs}")