In [38]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()


0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
0% [Connecting to archive.ubuntu.com] [Connected to cloud.r-project.org (3.171.85.66)] [Connecting t                                                                                                    Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com] [Connected to r2u.stat.illinois.edu (192.17.190.167)] [Connect                                                                                                    Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,108 kB]
Hit:8 https://ppa.launchpadcontent.ne

In [39]:
income_rdd = spark.sparkContext.textFile("/content/trial_incomes.csv", 32)


In [40]:
reduced_rdd = (
    income_rdd
    .filter(lambda x: len(x.split(',')) >= 2)
    .map(lambda x: (x.split(',')[0], float(x.split(',')[1])))
    .reduceByKey(lambda x, y: x + y)
)
reduced_rdd.collect()

income_rdd.take(5)


['2', '10', '34', '1', '4']

Tasks:
With respect to the training dataset

1.count of distinct incomes – The number of distinct incomes in the dataset

In [41]:
distinct_incomes_rdd = (
    income_rdd
    .map(lambda x: (x, 1))
    .reduceByKey(lambda x, y: x)
    .map(lambda x: x[0])
)

distinct_income_count_test = distinct_incomes_rdd.count()
distinct_income_count_test

79

2.median – The median of all incomes in the dataset: the income at which
there is an equal number of values greater than the income as there are
values less than the income.

In [54]:
income_val_rdd= income_rdd.map(lambda x: float(x.split(',')[0]))
sorted_incomes = income_val_rdd.sortBy(lambda x: x).collect()

count = len(sorted_incomes)
if count % 2 == 1:
    median_income = sorted_incomes[count // 2]
else:
    median_income = (sorted_incomes[count // 2 - 1] + sorted_incomes[count // 2]) / 2

median_income

4.0

3.mode – The mode of all incomes in the dataset: the most frequently seen
income.

In [43]:
income_counts_rdd = (
    income_rdd
    .filter(lambda x: len(x.split(',')) >= 1)
    .map(lambda x: (float(x.split(',')[0]), 1))
    .reduceByKey(lambda x, y: x + y)
)



In [36]:
mode_of_income = income_counts_rdd.max(key=lambda x: x[1])
mode_of_income


(4.0, 257)

4.count per 10power – counting the incomes by powers of 10. That is, for
each integer round it down to its nearest power of 10 (for example 3 map to
1 = 100; 30 would map to 10 = 101. 87 would map to 10 = 101; 870 would
map to 100 = 102, 100 would map to 100 = 102 etc….). Your goal is to
count the number of integers between each power of 10

In [44]:
import math

def nearest_power_of_10(n):
    return 10 ** int(math.log10(n)) if n > 0 else 1

power_of_10_counts_rdd = (
    income_rdd
    .filter(lambda x: len(x.split(',')) >= 1)
    .map(lambda x: nearest_power_of_10(float(x.split(',')[0])))
    .map(lambda x: (x, 1))
    .reduceByKey(lambda x, y: x + y)
)

power_of_10_incomecounts = power_of_10_counts_rdd.collect()
power_of_10_incomecounts


[(1, 853), (100, 19), (10000, 2), (10, 118), (1000, 8)]

Tasks:
With respect to test dataset

In [45]:
income_rdd_test = spark.sparkContext.textFile("/content/test_incomes.csv", 32)


In [47]:
reduced_rdd_test = (
    income_rdd_test
    .filter(lambda x: len(x.split(',')) >= 2)
    .map(lambda x: (x.split(',')[0], float(x.split(',')[1])))
    .reduceByKey(lambda x, y: x + y)
)
reduced_rdd_test.collect()
income_rdd_test.take(5)


['3', '2', '4', '24', '1133']

1.count of distinct incomes – The number of distinct incomes in the dataset

In [48]:
distinct_incomes_rdd_test = (
    income_rdd_test
    .map(lambda x: (x, 1))
    .reduceByKey(lambda x, y: x)
    .map(lambda x: x[0])
)

distinct_income_count_test = distinct_incomes_rdd_test.count()
distinct_income_count_test


5572

2.median – The median of all incomes in the dataset: the income at which there is an equal number of values greater than the income as there are values less than the income.

In [49]:
income_values_rdd_test= income_rdd_test.map(lambda x: float(x.split(',')[0]))
sorted_incomes = income_values_rdd_test.sortBy(lambda x: x).collect()

count = len(sorted_incomes)
if count % 2 == 1:
    median_income = sorted_incomes[count // 2]
else:
    median_income = (sorted_incomes[count // 2 - 1] + sorted_incomes[count // 2]) / 2

median_income


3458.0

3.mode – The mode of all incomes in the dataset: the most frequently seen
income.

In [50]:
income_counts_rdd_test = (
    income_rdd_test
    .filter(lambda x: len(x.split(',')) >= 1)
    .map(lambda x: (float(x.split(',')[0]), 1))
    .reduceByKey(lambda x, y: x + y)
)

mode_income_test = income_counts_rdd_test.max(key=lambda x: x[1])
mode_income_test


(32.0, 1)

4.count per 10power – counting the incomes by powers of 10. That is, for
each integer round it down to its nearest power of 10 (for example 3 map to
1 = 100; 30 would map to 10 = 101. 87 would map to 10 = 101; 870 would
map to 100 = 102, 100 would map to 100 = 102 etc….). Your goal is to
count the number of integers between each power of 10.

In [52]:
import math

def nearest_power_of_10(n):
    return 10 ** int(math.log10(n)) if n > 0 else 1

power_of_10_counts_test = (
    income_rdd_test
    .filter(lambda x: len(x.split(',')) >= 1)
    .map(lambda x: nearest_power_of_10(float(x.split(',')[0])))
    .map(lambda x: (x, 1))
    .reduceByKey(lambda x, y: x + y)
)

power_of_10_incomecounts_test = power_of_10_counts_test.collect()
power_of_10_incomecounts_test


[(100000, 218),
 (1000000, 51),
 (10000000, 3),
 (1, 9),
 (100, 898),
 (1000, 3175),
 (10, 90),
 (10000, 1128)]