In [1]:
#set up PySpark on your machine or cluster. Ensure you have Spark installed and configured properly.
from pyspark import SparkContext, SparkConf

# Create a SparkContext (make sure to configure properly for your cluster)
conf = SparkConf().setAppName("MyMapReduceJob")
sc = SparkContext(conf=conf)


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the data into an RDD
lines = sc.textFile("allBirthData.csv")

# Split each line into fields and filter out header if necessary
header = lines.first()
data = lines.filter(lambda line: line != header).map(lambda line: line.split(","))

# Convert data to (index, countyBiths) tuples
transactions = data.map(lambda fields: (int(fields[0]), float(fields[1])))

In [5]:
#perform the MapReduce operations to compute average transaction countyBiths per index.
# Map step: Transform (index, countyBiths) to (index, (countyBiths, 1))
mapped = transactions.map(lambda x: (x[0], (x[1], 1)))

# Reduce step: Aggregate by index and calculate sum and count
reduced = mapped.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

# Calculate average per index
average_per_birth = reduced.mapValues(lambda x: x[0] / x[1])

# Collect the results
results = average_per_birth.collect()

# Print results
for index, average_birth in results:
    print(f"Customer ID: {index}, Average Amount: {average_birth}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Customer ID: 311475, Average Amount: 124.0
Customer ID: 311477, Average Amount: 173.0
Customer ID: 311479, Average Amount: 987.0
Customer ID: 311481, Average Amount: 160.0
Customer ID: 311483, Average Amount: 271.0
Customer ID: 311485, Average Amount: 119.0
Customer ID: 311487, Average Amount: 119.0
Customer ID: 311489, Average Amount: 286.0
Customer ID: 311491, Average Amount: 1717.0
Customer ID: 311493, Average Amount: 1090.0
Customer ID: 311495, Average Amount: 1392.0
Customer ID: 311497, Average Amount: 347.0
Customer ID: 311499, Average Amount: 227.0
Customer ID: 311501, Average Amount: 206.0
Customer ID: 311503, Average Amount: 398.0
Customer ID: 311505, Average Amount: 121.0
Customer ID: 311507, Average Amount: 328.0
Customer ID: 311509, Average Amount: 1745.0
Customer ID: 311511, Average Amount: 3974.0
Customer ID: 311513, Average Amount: 919.0
Customer ID: 311515, Average Amount: 815.0
Customer ID: 311517, Averag

In [None]:
#wrap up by stopping the Spark context and handling any necessary cleanup.
# Stop the SparkContext
sc.stop()
