In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Install Hadoop

In [3]:
# Installing openssh-server
!apt-get install openssh-server -qq > /dev/null

# Starting our server
!service ssh start

 * Starting OpenBSD Secure Shell server sshd
   ...done.


In [4]:
# Creating a new rsa key pair with empty password
!ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa

Generating public/private rsa key pair.
Created directory '/root/.ssh'.
Your identification has been saved in /root/.ssh/id_rsa.
Your public key has been saved in /root/.ssh/id_rsa.pub.
The key fingerprint is:
SHA256:VQFKd8+XPtruK434I1ESBWeAdgtyI/SDtODOLwducu0 root@6155e2551bb9
The key's randomart image is:
+---[RSA 2048]----+
|     ..o. ++B=   |
|    . o++O =oo  .|
|     . o*o= o o..|
|    o    ..o ... |
|     +  S   o  o |
|    . +    .  o .|
|   . = +    o.o. |
|    + +    o +.. |
|       E    o.++.|
+----[SHA256]-----+


In [5]:
# Copying the public key we just generated to autorized keys
!cat $HOME/.ssh/id_rsa.pub>>$HOME/.ssh/authorized_keys

# Changing the permissions on the key
!chmod 0600 ~/.ssh/authorized_keys

In [6]:
# Conneting with our local machine
!ssh -o StrictHostKeyChecking=no localhost uptime

 20:42:41 up 2 min,  0 users,  load average: 1.03, 0.49, 0.19


In [7]:
import os

In [8]:
# Installing Hadoop and configuring JAVA_HOME: 
!if [ ! -d /usr/local/hadoop-3.3.3/ ]; then \
wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.3/hadoop-3.3.3.tar.gz; \
tar -xzf hadoop-3.3.3.tar.gz; \
cp -r hadoop-3.3.3/ /usr/local/; \
rm -rf hadoop-3.3.3/; \
rm hadoop-3.3.3.tar.gz; \
echo "export JAVA_HOME=$(dirname $(dirname $(realpath $(which java))))" >> /usr/local/hadoop-3.3.3/etc/hadoop/hadoop-env.sh; \
fi

--2022-12-03 20:42:41--  https://dlcdn.apache.org/hadoop/common/hadoop-3.3.3/hadoop-3.3.3.tar.gz
Resolving dlcdn.apache.org (dlcdn.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to dlcdn.apache.org (dlcdn.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 645040598 (615M) [application/x-gzip]
Saving to: ‘hadoop-3.3.3.tar.gz’


2022-12-03 20:42:44 (226 MB/s) - ‘hadoop-3.3.3.tar.gz’ saved [645040598/645040598]



In [9]:
# Setting up some of our environmental variables: 
os.environ['PATH'] = "/usr/local/hadoop-3.3.3/bin/:" + os.environ['PATH']
os.environ["HADOOP_HOME"] = "/usr/local/hadoop-3.3.3"

In [10]:
# Running our config script
# Note: remember to check you have the correct filepath
!bash lab4_config.sh

In [11]:
!$HADOOP_HOME/bin/hdfs namenode -format

2022-12-03 20:43:02,893 INFO namenode.NameNode: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = 6155e2551bb9/172.28.0.12
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 3.3.3
STARTUP_MSG:   classpath = /usr/local/hadoop-3.3.3/etc/hadoop:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/avro-1.7.7.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/commons-io-2.8.0.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/slf4j-api-1.7.36.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/jsr305-3.0.2.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/jakarta.activation-api-1.2.1.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/checker-qual-2.5.2.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/jetty-security-9.4.43.v20210629.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/metrics-core-3.2.4.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/kerby-pkix-1.0.1.jar:/usr/local/hadoop-3.3.3/share/h

In [12]:
# Creating our HDFS environmental variables: 
os.environ["HDFS_NAMENODE_USER"] = "root"
os.environ["HDFS_DATANODE_USER"] = "root"
os.environ["HDFS_SECONDARYNAMENODE_USER"] = "root"
os.environ["YARN_RESOURCEMANAGER_USER"] = "root"
os.environ["YARN_NODEMANAGER_USER"] = "root"

In [13]:
# Launching hdfs daemons
!$HADOOP_HOME/sbin/start-dfs.sh

Starting namenodes on [localhost]
Starting datanodes
Starting secondary namenodes [6155e2551bb9]


In [14]:
# Listing the running daemons
!jps

1504 SecondaryNameNode
1682 Jps
1286 DataNode
1150 NameNode


In [15]:
# Launching our yarn daemons
# nohup causes a process to ignore a "hang-up" signal
!nohup $HADOOP_HOME/sbin/start-yarn.sh

nohup: ignoring input and appending output to 'nohup.out'


In [16]:
#Listing the running daemons
!jps

1504 SecondaryNameNode
1953 NodeManager
1826 ResourceManager
1286 DataNode
2092 Jps
1150 NameNode


In [17]:
# Report the basic file system information and statistics to make sure everything is set up as it should be: 
!$HADOOP_HOME/bin/hdfs dfsadmin -report

Configured Capacity: 115658190848 (107.72 GB)
Present Capacity: 89913307136 (83.74 GB)
DFS Remaining: 89913282560 (83.74 GB)
DFS Used: 24576 (24 KB)
DFS Used%: 0.00%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (1):

Name: 127.0.0.1:9866 (localhost)
Hostname: 6155e2551bb9
Decommission Status : Normal
Configured Capacity: 115658190848 (107.72 GB)
DFS Used: 24576 (24 KB)
Non DFS Used: 25728106496 (23.96 GB)
DFS Remaining: 89913282560 (83.74 GB)
DFS Used%: 0.00%
DFS Remaining%: 77.74%
Configured Cache Capacity: 0 (0 B)
Cache

### You may complete the following tasks using Java or Python scripts. The output for each task in this section should be just one line. Upload the following three books to your HDFS:

In [18]:
# TODO:

# Solution:
!wget http://www.gutenberg.org/cache/epub/1524/pg1524.txt http://www.gutenberg.org/cache/epub/1112/pg1112.txt http://www.gutenberg.org/cache/epub/2267/pg2267.txt

--2022-12-03 20:43:39--  http://www.gutenberg.org/cache/epub/1524/pg1524.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.gutenberg.org/cache/epub/1524/pg1524.txt [following]
--2022-12-03 20:43:39--  https://www.gutenberg.org/cache/epub/1524/pg1524.txt
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 204364 (200K) [text/plain]
Saving to: ‘pg1524.txt’


2022-12-03 20:43:39 (3.26 MB/s) - ‘pg1524.txt’ saved [204364/204364]

--2022-12-03 20:43:39--  http://www.gutenberg.org/cache/epub/1112/pg1112.txt
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.gutenberg.org/cache/epub/1112/pg1112.txt [

In [19]:
# TODO:

# Solution: 
!$HADOOP_HOME/bin/hdfs dfs -mkdir /word_count
!$HADOOP_HOME/bin/hdfs dfs -put *.txt /word_count

In [20]:
# We can see our files now on the HDFS:
!$HADOOP_HOME/bin/hdfs dfs -ls /word_count

Found 3 items
-rw-r--r--   1 root supergroup     179382 2022-12-03 20:43 /word_count/pg1112.txt
-rw-r--r--   1 root supergroup     204364 2022-12-03 20:43 /word_count/pg1524.txt
-rw-r--r--   1 root supergroup     172298 2022-12-03 20:43 /word_count/pg2267.txt


### 1. How many words in the corpus begin with the letter Y/y?

In [22]:
# Writing our mapper.py script:
# The executable will read our input and transform them into key-value 
# pairs which are then passed to our reducer script. 
# Reference: https://github.com/LMAPcoder/Hadoop-on-Colab/blob/main/Hadoop_on_Colab.ipynb

%%writefile mapper1.py

#!/usr/bin/env python

import sys
import re
  
for line in sys.stdin:
  line = line.strip()  # removes whitespace either side of our line
  words = line.split()  # splitting our line into a list of words
    
  for word in words:
    big_y = re.findall("^Y", word)
    small_y=re.findall("^y", word)
    if len(big_y)==1:
      print('%s\t%s' % (word, 1))  # writing our results to STDOUT (this is the input for reducer.py)
    elif len(small_y)==1:
      print('%s\t%s' % (word, 1))  # writing our results to STDOUT (this is the input for reducer.py)

Writing mapper1.py


In [23]:
# Writing our reducer.py script: 
# This executable reads all intermediate key-value pairs generated by our mapper.py
# And aggregates these into our final output result
# Reference: https://github.com/LMAPcoder/Hadoop-on-Colab/blob/main/Hadoop_on_Colab.ipynb

%%writefile reducer1.py

#!/usr/bin/env python
  
import sys
from operator import itemgetter
  
current_word = None
current_count = 0
word = None

y_Y_count=0  
for line in sys.stdin:
  line = line.strip()
  word, count = line.split('\t', 1)  # splitting the data on the basis of tab (see mapper.py)
  
  try:
    count = int(count)  # convert count (currently a string) to int
  except ValueError:
    continue  # silently ignore line if count is not a number
  
  # this IF-switch only works because Hadoop sorts map output
  # by key (here: word) before it is passed to the reducer
  if current_word == word:
    current_count += count
  else:
    current_count = count
    current_word = word
  y_Y_count+=current_count


print(f"{y_Y_count} words in the corpus begin with the letter Y/y")

Writing reducer1.py


In [24]:
# Giving these new files permissions:
!chmod u+x /content/mapper1.py /content/reducer1.py

In [25]:
#Testing our MapReduce job locally (Hadoop does not participate here)
!cat pg2267.txt | python mapper1.py | sort -k1,1 | python reducer1.py | head -50

105521 words in the corpus begin with the letter Y/y


In [26]:
#Running MapReduce programs
!$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.3.jar \
  -input /word_count/*.txt \
  -output /word_count/python_output1 \
  -mapper "python /content/mapper1.py" \
  -reducer "python /content/reducer1.py"

packageJobJar: [/tmp/hadoop-unjar9086846026193386767/] [] /tmp/streamjob8219554400497305437.jar tmpDir=null
2022-12-03 20:17:30,426 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at localhost/127.0.0.1:8032
2022-12-03 20:17:30,727 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at localhost/127.0.0.1:8032
2022-12-03 20:17:31,156 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1670098633286_0001
2022-12-03 20:17:31,541 INFO mapred.FileInputFormat: Total input files to process : 3
2022-12-03 20:17:31,683 INFO mapreduce.JobSubmitter: number of splits:3
2022-12-03 20:17:32,162 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1670098633286_0001
2022-12-03 20:17:32,163 INFO mapreduce.JobSubmitter: Executing with tokens: []
2022-12-03 20:17:32,469 INFO conf.Configuration: resource-types.xml not found
2022-12-03 20:17:32,470 INFO resource.ResourceUtils: Unab

In [27]:
# Checking out our new python_output directory:
!$HADOOP_HOME/bin/hdfs dfs -ls /word_count/python_output1

Found 2 items
-rw-r--r--   1 root supergroup          0 2022-12-03 20:18 /word_count/python_output1/_SUCCESS
-rw-r--r--   1 root supergroup         54 2022-12-03 20:18 /word_count/python_output1/part-00000


In [28]:
# part-00000 contains the ouput this time:
!$HADOOP_HOME/bin/hdfs dfs -cat /word_count/python_output1/part-00000

757759 words in the corpus begin with the letter Y/y	


### 2. What is the total number of unique words in the corpus?

In [29]:
# Writing our mapper.py script:
# The executable will read our input and transform them into key-value 
# pairs which are then passed to our reducer script. 
# Reference: https://github.com/LMAPcoder/Hadoop-on-Colab/blob/main/Hadoop_on_Colab.ipynb

%%writefile mapper2.py

#!/usr/bin/env python

import sys
  
for line in sys.stdin:
  line = line.strip()  # removes whitespace either side of our line
  words = line.split()  # splitting our line into a list of words
    
  for word in words:
    print('%s\t%s' % (word, 1))  # writing our results to STDOUT (this is the input for reducer.py)

Writing mapper2.py


In [30]:
# Writing our reducer.py script: 
# This executable reads all intermediate key-value pairs generated by our mapper.py
# And aggregates these into our final output result
# Reference: https://github.com/LMAPcoder/Hadoop-on-Colab/blob/main/Hadoop_on_Colab.ipynb

%%writefile reducer2.py

#!/usr/bin/env python
  
import sys
from operator import itemgetter
  
current_word = None
current_count = 0
word = None
unique_counter=0  
for line in sys.stdin:
  line = line.strip()
  word, count = line.split('\t', 1)  # splitting the data on the basis of tab (see mapper.py)
  
  try:
    count = int(count)  # convert count (currently a string) to int
  except ValueError:
    continue  # silently ignore line if count is not a number
  
  # this IF-switch only works because Hadoop sorts map output
  # by key (here: word) before it is passed to the reducer
  if current_word == word:
    current_count += count
  else:
    if current_word: # to avoid None values
      unique_counter+=1
    current_count = count
    current_word = word
  
print(f"total number of unique words in the corpus is {unique_counter}")

Writing reducer2.py


In [31]:
# Giving these new files permissions:
!chmod u+x /content/mapper2.py /content/reducer2.py

In [32]:
#Running MapReduce programs
!$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.3.jar \
  -input /word_count/*.txt \
  -output /word_count/python_output2 \
  -mapper "python /content/mapper2.py" \
  -reducer "python /content/reducer2.py"

packageJobJar: [/tmp/hadoop-unjar16131529443164896695/] [] /tmp/streamjob14515511339128030120.jar tmpDir=null
2022-12-03 20:18:17,975 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at localhost/127.0.0.1:8032
2022-12-03 20:18:18,283 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at localhost/127.0.0.1:8032
2022-12-03 20:18:18,656 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1670098633286_0002
2022-12-03 20:18:19,066 INFO mapred.FileInputFormat: Total input files to process : 3
2022-12-03 20:18:19,588 INFO mapreduce.JobSubmitter: number of splits:3
2022-12-03 20:18:20,379 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1670098633286_0002
2022-12-03 20:18:20,380 INFO mapreduce.JobSubmitter: Executing with tokens: []
2022-12-03 20:18:20,596 INFO conf.Configuration: resource-types.xml not found
2022-12-03 20:18:20,596 INFO resource.ResourceUtils: Un

In [33]:
# Checking out our new python_output directory:
!$HADOOP_HOME/bin/hdfs dfs -ls /word_count/python_output2

Found 2 items
-rw-r--r--   1 root supergroup          0 2022-12-03 20:18 /word_count/python_output2/_SUCCESS
-rw-r--r--   1 root supergroup         53 2022-12-03 20:18 /word_count/python_output2/part-00000


In [34]:
# part-00000 contains the ouput this time:
!$HADOOP_HOME/bin/hdfs dfs -cat /word_count/python_output2/part-00000

total number of unique words in the corpus is 17744	


### 3. Which word occurs most frequently in the corpus? How many times does it occur?

In [35]:
# Writing our mapper.py script:
# The executable will read our input and transform them into key-value 
# pairs which are then passed to our reducer script. 
# Reference: https://github.com/LMAPcoder/Hadoop-on-Colab/blob/main/Hadoop_on_Colab.ipynb

%%writefile mapper3.py

#!/usr/bin/env python

import sys
  
for line in sys.stdin:
  line = line.strip()  # removes whitespace either side of our line
  words = line.split()  # splitting our line into a list of words
    
  for word in words:
    print('%s\t%s' % (word, 1))  # writing our results to STDOUT (this is the input for reducer.py)

Writing mapper3.py


In [39]:
# Writing our reducer.py script: 
# This executable reads all intermediate key-value pairs generated by our mapper.py
# And aggregates these into our final output result
# Reference: https://github.com/LMAPcoder/Hadoop-on-Colab/blob/main/Hadoop_on_Colab.ipynb

%%writefile reducer3.py

#!/usr/bin/env python
  
import sys
from operator import itemgetter
  
current_word = None
current_count = 0
word = None
words_counts={}  
for line in sys.stdin:
  line = line.strip()
  word, count = line.split('\t', 1)  # splitting the data on the basis of tab (see mapper.py)
  
  try:
    count = int(count)  # convert count (currently a string) to int
  except ValueError:
    continue  # silently ignore line if count is not a number
  
  # this IF-switch only works because Hadoop sorts map output
  # by key (here: word) before it is passed to the reducer
  if current_word == word:
    current_count += count
  else:
    if current_word: # to avoid None values
      words_counts[current_word]= current_count
    current_count = count
    current_word = word
  
m_freq_item=max(words_counts.items(), key=itemgetter(1))
m_freq_word=m_freq_item[0]
m_freq_word_occur=m_freq_item[1]
print(f"most frequently in the corpus is {m_freq_word}, it occured {m_freq_word_occur} times.")

Overwriting reducer3.py


In [40]:
# Giving these new files permissions:
!chmod u+x /content/mapper3.py /content/reducer3.py

In [41]:
#Testing our MapReduce job locally (Hadoop does not participate here)
!cat pg2267.txt | python mapper3.py | sort -k1,1 | python reducer3.py | head -50

most frequently in the corpus is I, it occured 830 times.


In [42]:
#Running MapReduce programs
!$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.3.jar \
  -input /word_count/*.txt \
  -output /word_count/python_output3 \
  -mapper "python /content/mapper3.py" \
  -reducer "python /content/reducer3.py"

packageJobJar: [/tmp/hadoop-unjar14679185848601467052/] [] /tmp/streamjob8689544642227814113.jar tmpDir=null
2022-12-03 20:20:32,069 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at localhost/127.0.0.1:8032
2022-12-03 20:20:32,321 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at localhost/127.0.0.1:8032
2022-12-03 20:20:32,749 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1670098633286_0003
2022-12-03 20:20:33,160 INFO mapred.FileInputFormat: Total input files to process : 3
2022-12-03 20:20:33,287 INFO mapreduce.JobSubmitter: number of splits:3
2022-12-03 20:20:33,638 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1670098633286_0003
2022-12-03 20:20:33,638 INFO mapreduce.JobSubmitter: Executing with tokens: []
2022-12-03 20:20:33,864 INFO conf.Configuration: resource-types.xml not found
2022-12-03 20:20:33,865 INFO resource.ResourceUtils: Una

In [43]:
# Checking out our new python_output directory:
!$HADOOP_HOME/bin/hdfs dfs -ls /word_count/python_output3

Found 2 items
-rw-r--r--   1 root supergroup          0 2022-12-03 20:21 /word_count/python_output3/_SUCCESS
-rw-r--r--   1 root supergroup         62 2022-12-03 20:21 /word_count/python_output3/part-00000


In [44]:
# part-00000 contains the ouput this time:
!$HADOOP_HOME/bin/hdfs dfs -cat /word_count/python_output3/part-00000

most frequently in the corpus is the, it occured 2576 times.	


### 4. For that most commonly occurring word, what word most frequently follows it in a line?

In [37]:
# Writing our mapper.py script:
# The executable will read our input and transform them into key-value 
# pairs which are then passed to our reducer script. 
# Reference: https://github.com/LMAPcoder/Hadoop-on-Colab/blob/main/Hadoop_on_Colab.ipynb

%%writefile mapper4.py

#!/usr/bin/env python

import sys
  
for line in sys.stdin:
  line = line.strip()  # removes whitespace either side of our line
  words = line.split()  # splitting our line into a list of words
  p_word=None    
  for word in words:
    if p_word=="the":
      print('%s\t%s' % (word, 1))  # writing our results to STDOUT (this is the input for reducer.py)
    p_word=word

Overwriting mapper4.py


In [22]:
# Writing our reducer.py script: 
# This executable reads all intermediate key-value pairs generated by our mapper.py
# And aggregates these into our final output result
# Reference: https://github.com/LMAPcoder/Hadoop-on-Colab/blob/main/Hadoop_on_Colab.ipynb

%%writefile reducer4.py

#!/usr/bin/env python

import sys
from operator import itemgetter
  
current_word = None
current_count = 0
word = None
words_counts={}  
for line in sys.stdin:
  line = line.strip()
  word, count = line.split('\t', 1)  # splitting the data on the basis of tab (see mapper.py)
  
  try:
    count = int(count)  # convert count (currently a string) to int
  except ValueError:
    continue  # silently ignore line if count is not a number
  
  # this IF-switch only works because Hadoop sorts map output
  # by key (here: word) before it is passed to the reducer
  if current_word == word:
    current_count += count
  else:
    if current_word: # to avoid None values
      words_counts[current_word]= current_count
    current_count = count
    current_word = word
  
m_freq_item=max(words_counts.items(), key=itemgetter(1))
m_freq_word=m_freq_item[0]
m_freq_word_occur=m_freq_item[1]
print(f"most commonly occurring word the followed most frequently by {m_freq_word}, {m_freq_word} followed the {m_freq_word_occur} times in a line.")

Writing reducer4.py


In [38]:
# Giving these new files permissions:
!chmod u+x /content/mapper4.py /content/reducer4.py

In [39]:
#Testing our MapReduce job locally (Hadoop does not participate here)
!cat pg2267.txt | python mapper4.py | sort -k1,1 | python reducer4.py | head -50

most commonly occurring word the followed most frequently by Moore, Moore followed the 21 times in a line.


In [40]:
#Running MapReduce programs
!$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.3.jar \
  -input /word_count/*.txt \
  -output /word_count/python_output4 \
  -mapper "python /content/mapper4.py" \
  -reducer "python /content/reducer4.py"

packageJobJar: [/tmp/hadoop-unjar18214177651360519311/] [] /tmp/streamjob5948849691694841749.jar tmpDir=null
2022-12-03 20:50:11,875 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at localhost/127.0.0.1:8032
2022-12-03 20:50:12,109 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at localhost/127.0.0.1:8032
2022-12-03 20:50:12,452 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1670100213453_0001
2022-12-03 20:50:13,207 INFO mapred.FileInputFormat: Total input files to process : 3
2022-12-03 20:50:13,340 INFO mapreduce.JobSubmitter: number of splits:3
2022-12-03 20:50:13,696 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1670100213453_0001
2022-12-03 20:50:13,697 INFO mapreduce.JobSubmitter: Executing with tokens: []
2022-12-03 20:50:13,909 INFO conf.Configuration: resource-types.xml not found
2022-12-03 20:50:13,909 INFO resource.ResourceUtils: Una

In [41]:
# Checking out our new python_output directory:
!$HADOOP_HOME/bin/hdfs dfs -ls /word_count/python_output4

Found 2 items
-rw-r--r--   1 root supergroup          0 2022-12-03 20:50 /word_count/python_output4/_SUCCESS
-rw-r--r--   1 root supergroup        112 2022-12-03 20:50 /word_count/python_output4/part-00000


In [42]:
# part-00000 contains the ouput this time:
!$HADOOP_HOME/bin/hdfs dfs -cat /word_count/python_output4/part-00000

most commonly occurring word the followed most frequently by Project, Project followed the 58 times in a line.	
