In [1]:
import os
import requests
import io
import time

# Exact lines that `hdfs dfs -du -h hdfs://main:9000/` prints once both copies of the
# dataset are stored in HDFS. The upload cell compares the live listing against this
# list to decide whether the copy can be skipped, so spacing must match exactly.
files = ['166.8 M  333.7 M  hdfs://main:9000/double.csv',
 '166.8 M  166.8 M  hdfs://main:9000/single.csv']

In [2]:
#Part 1: HDFS Deployment and Data Upload

In [3]:
if not os.path.exists("hdma-wi-2021.csv"):
    !wget https://pages.cs.wisc.edu/~harter/cs639/data/hdma-wi-2021.csv
else:
    print("csv already downloaded")

csv already downloaded


In [4]:
result = !hdfs dfs -du -h hdfs://main:9000/

if result != files:
    !hdfs dfs -D dfs.block.size=1048576 -D dfs.replication=1 -cp hdma-wi-2021.csv hdfs://main:9000/single.csv
    !hdfs dfs -D dfs.block.size=1048576 -D dfs.replication=2 -cp hdma-wi-2021.csv hdfs://main:9000/double.csv
else:
    print("files already copied")

In [5]:
# List the HDFS root to confirm both uploads. In the expected output the second
# column for double.csv is twice the first, consistent with it being stored with
# dfs.replication=2.
!hdfs dfs -du -h hdfs://main:9000/
# expected output
# 166.8 M  333.7 M  hdfs://main:9000/double.csv
# 166.8 M  166.8 M  hdfs://main:9000/single.csv

166.8 M  333.7 M  hdfs://main:9000/double.csv
166.8 M  166.8 M  hdfs://main:9000/single.csv


In [6]:
#Part 2: Block Locations

In [7]:
# Count how many blocks of single.csv each datanode serves.
# A WebHDFS OPEN request sent to the namenode (without following redirects) is
# answered with a redirect whose Location header names the datanode to read from,
# so probing the first bytes of every block reveals where that block lives.
blocks = {}
blockSize = 1048576  # must match the dfs.block.size used when the file was uploaded
for block_index in range(167):  # 166.8 MB file stored as 1 MB blocks -> 167 blocks
    offset = block_index * blockSize  # byte offset of the first byte of this block
    url = f"http://main:9870/webhdfs/v1/single.csv/?op=OPEN&length=1000&offset={offset}"
    resp = requests.get(url, allow_redirects=False)
    # Surface HTTP errors here rather than as a KeyError on the missing header below.
    resp.raise_for_status()
    # Keep only the part before the query string so the key is the datanode's URL
    # for the file, whatever the length of the datanode's hostname.
    location = resp.headers["Location"].split("?", 1)[0].rstrip("/")
    blocks[location] = blocks.get(location, 0) + 1

blocks

{'http://e26cd7a84cc9:9864/webhdfs/v1/single.csv': 86,
 'http://9bf8706a3306:9864/webhdfs/v1/single.csv': 81}

In [8]:
#Part 3: Reading the Data

In [9]:
class hdfsFile(io.RawIOBase):
    """Read-only raw byte stream over an HDFS file, fetched through WebHDFS.

    Intended to be wrapped in ``io.BufferedReader``; every ``readinto`` call issues
    one WebHDFS ``OPEN`` request for the next chunk of the file.

    Attributes:
        path: HDFS path of the file, relative to the filesystem root.
        offset: Byte position at which the next read starts.
        length: File size in bytes as reported by the namenode.
    """

    def __init__(self, path):
        """Ask the namenode for the file's length and start reading at byte 0.

        Args:
            path: HDFS path relative to the root, e.g. ``"single.csv"``.

        Raises:
            requests.HTTPError: If the status request fails (for example when
                the file does not exist).
        """
        url = f"http://main:9870/webhdfs/v1/{path}/?op=LISTSTATUS"
        resp = requests.get(url, allow_redirects=False)
        # Fail with a clear HTTP error instead of a KeyError on the error payload.
        resp.raise_for_status()
        listing = resp.json()
        self.path = path
        self.offset = 0
        # Listing a plain file yields exactly one FileStatus entry: the file itself.
        self.length = listing['FileStatuses']['FileStatus'][0]['length']

    def readable(self):
        """Report that this stream supports reading."""
        return True

    def readinto(self, b):
        """Fill ``b`` with the next bytes of the file.

        Args:
            b: Writable bytes-like buffer; at most ``len(b)`` bytes are fetched.

        Returns:
            Number of bytes written into ``b``; 0 at end of file.

        Raises:
            requests.HTTPError: On any HTTP error other than 403.
        """
        if len(b) == 0 or self.offset >= self.length:
            return 0

        url = f"http://main:9870/webhdfs/v1/{self.path}/?op=OPEN&length={len(b)}&offset={self.offset}"
        resp = requests.get(url, allow_redirects=True)

        if resp.status_code == 403:
            # Presumably what WebHDFS answers when no live datanode holds the
            # requested block. Emit a lone newline in place of the lost data and
            # skip a whole buffer's worth of the file; this assumes the caller's
            # buffer size lines up with the HDFS block size.
            b[0:1] = b"\n"
            self.offset += len(b)
            return 1

        # Any other failure must not be passed off as file content.
        resp.raise_for_status()

        data = resp.content[:len(b)]
        received = len(data)
        b[:received] = data
        # Advance by what was actually received so a short read never skips data.
        self.offset += received
        return received

In [10]:
def _tally_family_rows(path, buffer_size):
    """Stream `path` through hdfsFile and count lines naming each dwelling type.

    Returns (single_count, multi_count, started, finished); the last two are
    time.time() stamps taken just before the stream is opened and just after the
    final line has been processed.
    """
    single_count = 0
    multi_count = 0
    started = time.time()
    for raw in io.BufferedReader(hdfsFile(path), buffer_size=buffer_size):
        text = raw.decode("utf-8")
        # Checked independently: a line is counted for every label it contains.
        if "Single Family" in text:
            single_count += 1
        if "Multifamily" in text:
            multi_count += 1
    finished = time.time()
    return single_count, multi_count, started, finished


# First pass: buffer as large as one HDFS block (1 MB).
Single_Family, Multi_Family, t0, t1 = _tally_family_rows("single.csv", 1048576)

print("Counts from single.csv with 1MB buffer size")
print("Single Family: " + str(Single_Family))
print("Multi Family: " + str(Multi_Family))
print("Seconds: " + str(t1 - t0))
print()

# Second pass: same file with a 64 KiB buffer, to compare timing against the first.
Single_Family2, Multi_Family2, t2, t3 = _tally_family_rows("single.csv", 65536)

print("Counts from single.csv with 65.536KB buffer size")
print("Single Family: " + str(Single_Family2))
print("Multi Family: " + str(Multi_Family2))
print("Seconds: " + str(t3 - t2))

# Counts from single.csv
# Single Family: 444874
# Multi Family: 2493
# Seconds: 24.33926248550415

Counts from single.csv with 1MB buffer size
Single Family: 444874
Multi Family: 2493
Seconds: 13.823347568511963

Counts from single.csv with 65.536KB buffer size
Single Family: 444874
Multi Family: 2493
Seconds: 49.031872272491455


In [11]:
#Part 4: Disaster Strikes

In [12]:
#wait for worker datanode to die
report = !hdfs dfsadmin -fs hdfs://main:9000/ -report
while("Dead datanodes" not in str(report)):
      time.sleep(20)
      report = !hdfs dfsadmin -fs hdfs://main:9000/ -report
report

['Configured Capacity: 51642105856 (48.10 GB)',
 'Present Capacity: 23979203708 (22.33 GB)',
 'DFS Remaining: 23450218496 (21.84 GB)',
 'DFS Used: 528985212 (504.48 MB)',
 'DFS Used%: 2.21%',
 'Replicated Blocks:',
 '\tUnder replicated blocks: 0',
 '\tBlocks with corrupt replicas: 0',
 '\tMissing blocks: 0',
 '\tMissing blocks (with replication factor 1): 0',
 '\tLow redundancy blocks with highest priority to recover: 0',
 '\tPending deletion blocks: 0',
 'Erasure Coded Block Groups: ',
 '\tLow redundancy block groups: 0',
 '\tBlock groups with corrupt internal blocks: 0',
 '\tMissing block groups: 0',
 '\tLow redundancy blocks with highest priority to recover: 0',
 '\tPending deletion blocks: 0',
 '',
 '-------------------------------------------------',
 'Live datanodes (1):',
 '',
 'Name: 172.18.0.4:9866 (project-3-nateproject3-worker-2.cs544net)',
 'Hostname: 9bf8706a3306',
 'Decommission Status : Normal',
 'Configured Capacity: 25821052928 (24.05 GB)',
 'DFS Used: 261765966 (249.6

In [13]:
#if ConnectionError occurs rerun this cell

# Repeat the count on both files after a datanode has died. Each entry is
# (HDFS path, heading to print, whether a blank line follows the report).
reports = (
    ("double.csv", "Counts from double.csv with 1MB buffer size", True),
    ("single.csv", "Counts from single.csv with 1MB buffer size", False),
)

for csv_name, heading, blank_after in reports:
    Single_Family = 0
    Multi_Family = 0

    for raw in io.BufferedReader(hdfsFile(csv_name), buffer_size=1048576):
        text = raw.decode("utf-8")
        if "Single Family" in text:
            Single_Family += 1
        if "Multifamily" in text:
            Multi_Family += 1

    print(heading)
    print("Single Family: " + str(Single_Family))
    print("Multi Family: " + str(Multi_Family))
    if blank_after:
        print()




Counts from double.csv with 1MB buffer size
Single Family: 444874
Multi Family: 2493

Counts from single.csv with 1MB buffer size
Single Family: 215999
Multi Family: 1170
