# Introduction


In this Jupyter Notebook you will find materials about 
1. <span style="color:red">RDD</span>   
- Reading a text file (from local or HDFS)
- map() and flatMap
- reduceByKey(), groupByKey(), sortByKey(), keys(), and values()
- join(), rightOuterJoin(), leftOuterJoin(), cogroup(), subtractByKey()
- With key/value data, use mapValues() and flatMapValues() if your transformation doesn't affect the keys. This is more efficient because it allows Spark to keep the same partitioning as the original RDD instead of shuffling data.
- filter()
* Rule of thumb: ask "am I modifying the keys?" If yes, use map() and flatMap(); if no, use mapValues() and flatMapValues().



In [2]:
import os
# os.environ["HADOOP_HOME"] = "C:\\hadoop"

from pyspark.sql import SparkSession
from pyspark import SparkFiles
# from nltk.corpus import stopwords

# Some hints

Maximum value for each key: `reduceByKey(lambda x, y: max(x, y))`

# RDD

In [4]:
# The driver runs inside the spark-master container (see the project README),
# so the standalone master is addressed by its container hostname.
spark = SparkSession.builder.appName("spppppppp").master("spark://spark-master:7077").getOrCreate()

# When the driver runs on the local machine instead, build the session like
# this (optional alternatives and settings are left commented out):
# spark = (
#     SparkSession.builder.appName("spppppppp")
#     .master("spark://localhost:7077")  # Spark standalone
#     # .master("local[*]")  # if running local
#     # .master("yarn")
#     # .master("mesos://<mesos-master-url>")
#     # .config("spark.jars", "c:/java/postgresql-42.7.5.jar")
#     # .config("spark.driver.extraClassPath", "c:/java/postgresql-42.7.5.jar")
#     # .config("spark.executor.extraClassPath", "c:/java/postgresql-42.7.5.jar")
#     # .config("spark.driver.host", "10.0.0.177")
#     # .config("spark.executor.memory", "2g")
#     # .config("spark.driver.memory", "2g")
#     .getOrCreate()
# )
sc = spark.sparkContext


In [5]:
# Ask the JVM-side SparkContext for its memory status map. The result is a
# Scala Map whose keys are "host:port" strings.
executor_status = sc._jsc.sc().getExecutorMemoryStatus()

# Wrap the Scala Map as a java.util.Map so py4j exposes it with a dict-like API.
# JavaConverters is used instead of JavaConversions because JavaConversions was
# removed in Scala 2.13 and is therefore missing on Spark builds for Scala 2.13.
executor_status_dict = (
    sc._gateway.jvm.scala.collection.JavaConverters
    .mapAsJavaMapConverter(executor_status)
    .asJava()
)

# Get the keys (nodes)
nodes = list(executor_status_dict.keys())
nodes

['172.19.0.5:36089', '172.19.0.4:33291', 'b797753f41f5:46101']

In [3]:
# Status-tracker handle obtained from the SparkContext; it is not used by the
# other cells visible in this notebook.
status_tracker = sc.statusTracker()


In [12]:
from pyspark.sql.functions import col

# Read data from the shared Docker volume when the driver is in spark-master

In [None]:
# Load the sales CSV, treating the first line as the column names.
sales = spark.read.csv("/data/practice/sales.csv", header=True)
sales.show(2)

+--------+----------+-------+--------+-----+----------+
|order_id|product_id|user_id|quantity|price| timestamp|
+--------+----------+-------+--------+-----+----------+
|       0|       553|   4397|       8|490.6|2023-08-18|
|       1|       441|   6066|       2|23.87|2023-10-09|
+--------+----------+-------+--------+-----+----------+
only showing top 2 rows



In [15]:
# Preview a derived column "a0" that holds twice the price.
doubled_price = col("price") * 2
sales.withColumn("a0", doubled_price).show(2)

+--------+----------+-------+--------+-----+----------+-----+
|order_id|product_id|user_id|quantity|price| timestamp|   a0|
+--------+----------+-------+--------+-----+----------+-----+
|       0|       553|   4397|       8|490.6|2023-08-18|981.2|
|       1|       441|   6066|       2|23.87|2023-10-09|47.74|
+--------+----------+-------+--------+-----+----------+-----+
only showing top 2 rows



In [11]:
# (number of distinct user ids, total number of rows)
(sales.select("user_id").dropDuplicates().count(), sales.count())

(10000, 100000)

In [16]:
import requests

# Replace with your Spark driver's REST API URL
spark_driver_url = "http://localhost:4040/api/v1/applications"

# Get the list of applications. The timeout keeps the cell from hanging
# indefinitely when the driver UI is unreachable, and raise_for_status()
# surfaces HTTP errors instead of failing later while decoding the body as JSON.
response = requests.get(spark_driver_url, timeout=10)
response.raise_for_status()
applications = response.json()

# Get the application ID of the first application
applications[0]["id"]

'app-20250317050259-0002'

In [19]:
# Executor details (driver included) for the first application returned by the
# REST API. The timeout avoids an indefinite hang if the UI is unreachable, and
# raise_for_status() reports HTTP errors before the body is decoded as JSON.
executors_url = f"http://localhost:4040/api/v1/applications/{applications[0]['id']}/executors"
executors_response = requests.get(executors_url, timeout=10)
executors_response.raise_for_status()
executors_response.json()

[{'id': 'driver',
  'hostPort': 'b797753f41f5:46101',
  'isActive': True,
  'rddBlocks': 0,
  'memoryUsed': 857851,
  'diskUsed': 0,
  'totalCores': 0,
  'maxTasks': 0,
  'activeTasks': 0,
  'failedTasks': 0,
  'completedTasks': 0,
  'totalTasks': 0,
  'totalDuration': 548629,
  'totalGCTime': 112,
  'totalInputBytes': 0,
  'totalShuffleRead': 0,
  'totalShuffleWrite': 0,
  'isBlacklisted': False,
  'maxMemory': 455501414,
  'addTime': '2025-03-17T05:02:59.014GMT',
  'executorLogs': {},
  'memoryMetrics': {'usedOnHeapStorageMemory': 857851,
   'usedOffHeapStorageMemory': 0,
   'totalOnHeapStorageMemory': 455501414,
   'totalOffHeapStorageMemory': 0},
  'blacklistedInStages': [],
  'peakMemoryMetrics': {'JVMHeapMemory': 204195840,
   'JVMOffHeapMemory': 174712608,
   'OnHeapExecutionMemory': 0,
   'OffHeapExecutionMemory': 0,
   'OnHeapStorageMemory': 1135791,
   'OffHeapStorageMemory': 0,
   'OnHeapUnifiedMemory': 1135791,
   'OffHeapUnifiedMemory': 0,
   'DirectPoolMemory': 33711040,


In [7]:
import pandas as pd 
import numpy as np

In [20]:
# Disabled alternative: read the text from HDFS as a DataFrame instead.
# df = spark.read.text("hdfs://hadoop-namenode:9000/data/test/Common_Sense.txt")
# Load the text file as an RDD of lines.
common_sense = sc.textFile("/data/common_sense.txt")

In [21]:
# Count how many lines start with each two-character prefix.
common_sense.map(lambda line: line[:2]).countByValue()

                                                                                

defaultdict(int,
            {'Th': 54,
             '  ': 64,
             'mo': 15,
             'wh': 52,
             'of': 58,
             'at': 10,
             'yo': 10,
             'be': 43,
             '': 315,
             'Ti': 2,
             'Au': 1,
             'Re': 3,
             'La': 1,
             'Cr': 2,
             '**': 2,
             'CO': 1,
             'ad': 7,
             'IN': 3,
             'AM': 1,
             'On': 3,
             'SU': 1,
             'A ': 3,
             'ca': 18,
             'Ma': 4,
             'Or': 1,
             'PH': 1,
             'Pr': 8,
             'MD': 1,
             'Co': 10,
             'By': 2,
             'Pe': 5,
             'su': 29,
             'cu': 2,
             're': 49,
             'As': 11,
             'ne': 22,
             'in': 58,
             'ow': 4,
             'th': 159,
             'co': 72,
             'pr': 37,
             'In': 22,
             'ce': 3,
             'wo'

In [None]:
# The `d` dictionary of parsed MovieLens RDDs is only built in a later cell, so
# referencing it here raises NameError on a top-to-bottom run. Read and split
# the ratings file directly so this cell is self-contained.
ratings = (
    sc.textFile("/data/ml-1m/ratings.dat")
    .map(lambda li: li.split("::"))
    .map(lambda x: x[2])
    .countByValue()
)
ratings

## Get list of stopwords to be removed from data

In [4]:
import nltk
# Fetch the NLTK stopword corpus; the captured output shows it reports
# "already up-to-date" when the package is present.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mamma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Movie Rating

In [7]:
# addFile only works with absolute paths
import os
# "file:///" + os.path.abspath("data/Common_Sense.txt").replace("\\",'/')
# Register each local data file with the SparkContext, then show what is registered.
for local_uri in (
    'file:///c:/Users/mamma/my_git/data-science-old/apache_spark/data/Common_Sense.txt',
    'file:///c:/Users/mamma/my_git/data-science-old/apache_spark/data/OntheOriginofSpecies.txt',
    "file:///c:/Users/mamma/my_git/data-science-old/apache_spark/data/IMDBDataset.csv",
):
    sc.addFile(local_uri)
sc.listFiles

['spark://10.0.0.177:51545/files/Common_Sense.txt',
 'spark://10.0.0.177:51545/files/IMDBDataset.csv',
 'spark://10.0.0.177:51545/files/OntheOriginofSpecies.txt']

In [15]:
# Did not work when the driver was running locally.
# Resolve the local path of a file that was registered with sc.addFile.
file_path = SparkFiles.get("Common_Sense.txt")
# NOTE(review): the result of take(2) is discarded because this is not the
# cell's last expression.
sc.textFile(file_path).take(2)
# NOTE(review): the name `file_pathdf` looks like a stray `file_path` fused
# with `df = ...` — confirm the intended variable name.
file_pathdf = spark.read.text("hdfs:///opt/bitnami/spark/data/Common_Sense.txt")

# Display the resolved local path.
file_path

'C:\\Users\\mamma\\AppData\\Local\\Temp\\spark-b75a711a-e447-4739-b810-826a312fb267\\userFiles-077d407d-2529-4444-ac37-88a547fe5a0e\\Common_Sense.txt'

In [23]:
# One RDD per MovieLens file, each line split on the "::" field separator.
d = {
    name: sc.textFile(f"/data/ml-1m/{name}.dat").map(lambda line: line.split("::"))
    for name in ["movies", "ratings", "users"]
}
d["ratings"].take(2)


[['1', '1193', '5', '978300760'], ['1', '661', '3', '978302109']]

In [28]:
# Show the first two parsed rating records (same call that ends the cell
# where `d` is built).
d["ratings"].take(2)

[['1', '1193', '5', '978300760'], ['1', '661', '3', '978302109']]

In [29]:
# Histogram of the third field of every rating record (the values '1'..'5'
# in the output below).
rating_values = d["ratings"].map(lambda fields: fields[2])
ratings = rating_values.countByValue()
ratings

                                                                                

defaultdict(int,
            {'5': 226310, '3': 261197, '4': 348971, '2': 107557, '1': 56174})

In [32]:
# Peek at the (rating, 1) pairs. They are built inline because the `rating2`
# variable is only assigned in a later cell, so referencing it here raises
# NameError on a fresh top-to-bottom run.
d["ratings"].map(lambda x: (x[2], 1)).take(2)

[('5', 1), ('3', 1)]

In [30]:
# Emit a (rating, 1) pair per record, then add the ones up per rating.
rating2 = d["ratings"].map(lambda fields: (fields[2], 1))
rating2syn = rating2.reduceByKey(lambda running_total, increment: running_total + increment)
rating2syn.collect()

                                                                                

[('5', 226310), ('4', 348971), ('3', 261197), ('2', 107557), ('1', 56174)]

In [36]:
# Key each record by its third field and keep the second field as the value,
# then compare the raw pairs with the same pairs after mapValues attaches a
# counter of 1 (mapValues leaves the keys untouched).
ratingKV = d["ratings"].map(lambda fields: (fields[2], fields[1]))
for pair_rdd in (ratingKV, ratingKV.mapValues(lambda value: (value, 1))):
    print(pair_rdd.take(2))

[('5', '1193'), ('3', '661')]
[('5', ('1193', 1)), ('3', ('661', 1))]


In [39]:
# Per key: (sum of the paired values, number of records).
# The value is converted to int in mapValues rather than inside the reducer:
# reduceByKey never calls the reducer for a key that has a single record, so
# converting only in the reducer would leave that key's sum as a string and
# make a later sum / count division fail.
# NOTE(review): the summed values are the second field of each rating record
# (presumably movie ids) — treat the totals as an exercise, not a metric.
rating3 = (
    ratingKV
    .mapValues(lambda x: (int(x), 1))
    .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
)
rating3.take(5)

                                                                                

[('5', (391123353, 226310)),
 ('3', (501107422, 261197)),
 ('4', (654499954, 348971)),
 ('2', (208381312, 107557)),
 ('1', (110817755, 56174))]

In [40]:

# Turn each (sum, count) pair into sum / count, print the averages, and show
# the underlying (sum, count) pairs as the cell output.
averagePerRating = rating3.mapValues(lambda sum_and_count: sum_and_count[0] / sum_and_count[1])
averages = averagePerRating.collect()
print(averages)
rating3.collect()

[('5', 1728.2636781406036), ('3', 1918.5037423860151), ('4', 1875.5138793767962), ('2', 1937.4035348698828), ('1', 1972.758838608609)]


[('5', (391123353, 226310)),
 ('3', (501107422, 261197)),
 ('4', (654499954, 348971)),
 ('2', (208381312, 107557)),
 ('1', (110817755, 56174))]

## Get data from HDFS

 On the Origin of Species, by Charles Darwin

In [76]:
import re
# `stopwords` was used below without ever being imported in this notebook.
from nltk.corpus import stopwords

# Compile the splitter once instead of once per input line.
token_splitter = re.compile(r'\W', re.UNICODE)
# Load the stopword list a single time on the driver and keep it as a set:
# calling stopwords.words() inside the filter re-read the corpus for every
# token and did a linear list scan each time.
english_stopwords = frozenset(stopwords.words("english"))

# Lower-case each line, split it on non-word characters and drop stopwords.
# Empty strings produced by consecutive separators are kept, as before.
rdd1 = (
    sc.textFile("hdfs:///user/hadoop/OntheOriginofSpecies.txt")
    .flatMap(lambda text: token_splitter.split(text.lower()))
    .filter(lambda x: x not in english_stopwords)
)


In [77]:
# Group the tokens by their first four characters and print up to three
# members of the first two groups.
rdd2 = rdd1.groupBy(lambda word: word[:4])
for prefix, members in rdd2.take(2):
    print(prefix, list(members)[:3])

[Stage 65:>                                                         (0 + 2) / 2]

proj ['project', 'project', 'project']
gute ['gutenberg', 'gutenberg', 'gutenberg']


                                                                                

In [83]:
def swapTuple(t):
    """Return a new 2-tuple holding the first two items of ``t`` in reverse order."""
    second, first = t[1], t[0]
    return (second, first)
# Count each token, flip the pairs to (count, token) and sort by count.
token_counts = rdd1.map(lambda token: (token, 1)).reduceByKey(lambda left, right: left + right)
numOccurance = token_counts.map(swapTuple).sortByKey()
numOccurance.take(2)

                                                                                

[(1, 'title'), (1, '1st')]

In [78]:
# Same grouping by four-character prefix, but on de-duplicated tokens, so each
# group lists every distinct word once.
rdd3 = rdd1.distinct()
rdd4 = rdd3.groupBy(lambda word: word[:4])
for prefix, members in rdd4.take(2):
    print(prefix, list(members)[:3])

[Stage 67:>                                                         (0 + 2) / 2]

proj ['project', 'projecting']
gute ['gutenberg']


                                                                                

# File Handling

In [None]:
import os
import subprocess

from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one.
spark = SparkSession.builder.appName("regression").getOrCreate()
sc = spark.sparkContext

# NOTE(review): with three slashes the URI has an empty authority, so
# "192.168.0.233:9000" is parsed as a path rather than host:port —
# presumably "hdfs://192.168.0.233:9000" was intended; confirm before changing.
server_file = "hdfs:///192.168.0.233:9000"
file_uri = "hdfs:///user/hadoop/OntheOriginofSpecies.txt"
# text = sc.textFile("hdfs:///testdata/stockdata2.csv")

# JVM classes reached through the py4j gateway, used to talk to the Hadoop
# FileSystem API directly.
URI           = sc._gateway.jvm.java.net.URI
Path          = sc._gateway.jvm.org.apache.hadoop.fs.Path
FileSystem    = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration

# Log through the JVM's log4j logger.
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)
LOGGER.info("pyspark script logger initialized")

# List the entries of the relative directory "movies/".
fs = FileSystem.get(URI(server_file), Configuration())
status = fs.listStatus(Path('movies/'))

# Read the same location through the DataFrame API (previously the literal was
# repeated here while `file_uri` went unused).
df = spark.read.csv(file_uri)
print('\033[92m')  # ANSI escape: switch terminal text to green
print("test is done ***************************************")
for fileStatus in status:
    print(fileStatus.getPath())
# print(text.take(2))

print('\033[0m')  # ANSI escape: reset terminal colours


# Same listing through the hdfs command-line client.
cmd = 'hdfs dfs -ls movies/'
# text=True makes check_output return str. Without it the result is bytes, and
# splitting bytes with a str separator raises TypeError on Python 3.
files = subprocess.check_output(cmd, shell=True, text=True).strip().splitlines()
for pat in files:
    print(pat)
