In [2]:
import io
import os.path # https://www.freecodecamp.org/news/how-to-check-if-a-file-exists-in-python/
from urllib.parse import urlparse

import pyarrow as pa
import pyarrow.fs
import requests

In [3]:
#q1: how many live DataNodes are in the cluster?
# Print the dfsadmin report for the filesystem at hdfs://boss:9000; the answer is
# the number in parentheses on the "Live datanodes (N):" line of the output.
!hdfs dfsadmin -fs hdfs://boss:9000 -report

Configured Capacity: 51642105856 (48.10 GB)
Present Capacity: 29986103296 (27.93 GB)
DFS Remaining: 29986054144 (27.93 GB)
DFS Used: 49152 (48 KB)
DFS Used%: 0.00%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (2):

Name: 172.19.0.3:9866 (project-4-pandas4-dn-2.project-4-pandas4_default)
Hostname: 8d0f909ee157
Decommission Status : Normal
Configured Capacity: 25821052928 (24.05 GB)
DFS Used: 24576 (24 KB)
Non DFS Used: 10811224064 (10.07 GB)
DFS Remaining: 14993027072 (13.96 GB)
DFS Used%: 0.00%
DFS Remaining%: 58.07%
Conf

In [4]:
#download data
# Fetch the CSV once; if it is already present in the working directory the
# download is skipped so re-running the notebook stays cheap.
path = "./hdma-wi-2021.csv"
if not os.path.isfile(path): # the file doesn't exist in this directory, download it
    url = "https://pages.cs.wisc.edu/~harter/cs544/data/hdma-wi-2021.csv"
    r = requests.get(url, allow_redirects = True)
    # Fail here on an HTTP error instead of saving an error page as if it were the CSV.
    r.raise_for_status()
    # Write to `path` (the same name the existence check uses) and close the
    # handle deterministically via the context manager.
    with open(path, "wb") as csv_out:
        csv_out.write(r.content)


In [5]:
#run before cp commands
# Delete single.csv and double.csv from HDFS before they are (re-)created.
# When a file is not there yet, rm only reports "No such file or directory",
# which is what the recorded output of this cell shows.
!hdfs dfs -rm -r hdfs://boss:9000/single.csv
!hdfs dfs -rm -r hdfs://boss:9000/double.csv

rm: `hdfs://boss:9000/single.csv': No such file or directory
rm: `hdfs://boss:9000/double.csv': No such file or directory


In [6]:
#copy data to directories
!hdfs dfs -D dfs.block.size=1048576 -D dfs.replication=1 -cp hdma-wi-2021.csv hdfs://boss:9000/single.csv
!hdfs dfs -D dfs.block.size=1048576 -D dfs.replication=2 -cp hdma-wi-2021.csv hdfs://boss:9000/double.csv

In [9]:
#Q2: what are the logical and physical sizes of the CSV files?
# du -h prints two sizes per path. In the recorded output the first column is the
# same for both files (166.8 M) while the second is doubled for double.csv
# (333.7 M), i.e. the first is the file size and the second the space consumed
# across all replicas.
!hdfs dfs -du -h hdfs://boss:9000/double.csv
!hdfs dfs -du -h hdfs://boss:9000/single.csv

166.8 M  333.7 M  hdfs://boss:9000/double.csv
166.8 M  166.8 M  hdfs://boss:9000/single.csv


In [7]:
#Q3: what is the file status for single.csv?
# Ask the WebHDFS REST endpoint on boss:9870 for the file's status and keep the
# decoded JSON in file_status_single so it can be inspected and reused.
status_response = requests.get("http://boss:9870/webhdfs/v1/single.csv?op=GETFILESTATUS")
status_response.raise_for_status()  # raise on an HTTP error status
file_status_single = status_response.json()
file_status_single

{'FileStatus': {'accessTime': 1698614894966,
  'blockSize': 1048576,
  'childrenNum': 0,
  'fileId': 16386,
  'group': 'supergroup',
  'length': 174944099,
  'modificationTime': 1698614902446,
  'owner': 'root',
  'pathSuffix': '',
  'permission': '644',
  'replication': 1,
  'storagePolicy': 0,
  'type': 'FILE'}}

In [8]:
#Q4: what is the location for the first block of single.csv
# Issue an OPEN at offset 0 with noredirect=true and read the URL from the
# "Location" field of the JSON response.
url = "http://boss:9870/webhdfs/v1/single.csv?op=OPEN&offset=0&noredirect=true"
r = requests.get(url)
# Must be *called*: a bare `r.raise_for_status` only references the method and
# never raises, so HTTP errors would go unnoticed.
r.raise_for_status()
r.json()["Location"]

'http://0e3fe0795835:9864/webhdfs/v1/single.csv?op=OPEN&namenoderpcaddress=boss:9000&offset=0'

In [9]:
#Q5: how are the blocks of single.csv distributed across the two DataNode containers?
# For every block-aligned offset, request the OPEN location (without following
# the redirect) and count how often each host appears in the returned URL.
blockSize = file_status_single["FileStatus"]["blockSize"]
length = file_status_single["FileStatus"]["length"]
location_occurrences = {}
# offset is the starting byte position (offset = 0 is the 0th byte of the file);
# stepping by blockSize makes every request start at the first byte of a new block.
for offset in range(0, length, blockSize):
    # length in the context of the GET request is the number of bytes to be processed
    url = f"http://boss:9870/webhdfs/v1/single.csv?op=OPEN&offset={offset}&noredirect=true&length=100"
    r = requests.get(url)
    # Must be *called*: a bare `r.raise_for_status` never raises on HTTP errors.
    r.raise_for_status()
    # Parse the host out of the URL instead of slicing fixed character positions,
    # which only works for "http://" URLs whose hostname is exactly 12 characters.
    location = urlparse(r.json()["Location"]).hostname
    location_occurrences[location] = location_occurrences.get(location, 0) + 1
location_occurrences

{'0e3fe0795835': 90, '7148d45c935e': 77}

In [10]:
#Q6: what are the first 10 bytes of single.csv?
# Open a pyarrow HDFS client against boss:9000 and read a fixed-size slice from
# the very beginning of the file.
hdfs = pa.fs.HadoopFileSystem(host="boss", port=9000)
first_10_bytes = None
with hdfs.open_input_file("/single.csv") as single_csv:
    # read_at takes (nbytes, offset): 10 bytes starting at byte 0
    first_10_bytes = single_csv.read_at(10, 0)
first_10_bytes

2023-10-29 21:28:47,596 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


b'activity_y'

In [11]:
#Q7: how many lines of single.csv contain the string "Single Family"?
with hdfs.open_input_file("/single.csv") as single_csv:
    # Wrap the raw HDFS stream so it can be iterated line by line as text.
    text_lines = io.TextIOWrapper(io.BufferedReader(single_csv))
    # Tally every line in which the phrase occurs.
    count = sum(1 for text_line in text_lines if "Single Family" in text_line)
count

444874