# **Prepare - Data collection.**

---



In [1]:
# Create a folder only when it does not exist.
%mkdir -p wikipedia-ml-raw

In [2]:
# Create a script
%%file main.sh
#!/bin/bash

# Give the path of the file as first argument to the script:
input="$1" 

while IFS= read -r var 
do
  # Get each id
  id=`echo $var` 
  link="https://en.wikipedia.org/w/index.php?title=Machine_learning&oldid=$id"
  downloadname="index.php?title=Machine_learning&oldid=$id.html"
  filename="wikipedia-ml-raw/machine-learning-$id.html"
  echo $filename
  # Download the file from wikipedia
  wget -E $link
  # Move the downloaded file to the specific directory and rename it
  mv $downloadname $filename
done <"$input"

Writing main.sh


In [3]:
# Run the script
!bash main.sh article-ids.txt

wikipedia-ml-raw/machine-learning-530966344.html
--2023-04-30 09:57:25--  https://en.wikipedia.org/w/index.php?title=Machine_learning&oldid=530966344
Resolving en.wikipedia.org (en.wikipedia.org)... 208.80.153.224, 2620:0:860:ed1a::1
Connecting to en.wikipedia.org (en.wikipedia.org)|208.80.153.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘index.php?title=Machine_learning&oldid=530966344.html’

index.php?title=Mac     [ <=>                ] 127.78K  --.-KB/s    in 0.1s    

2023-04-30 09:57:26 (1.19 MB/s) - ‘index.php?title=Machine_learning&oldid=530966344.html’ saved [130843]

wikipedia-ml-raw/machine-learning-561799120.html
--2023-04-30 09:57:26--  https://en.wikipedia.org/w/index.php?title=Machine_learning&oldid=561799120
Resolving en.wikipedia.org (en.wikipedia.org)... 208.80.153.224, 2620:0:860:ed1a::1
Connecting to en.wikipedia.org (en.wikipedia.org)|208.80.153.224|:443... connected.
HTTP request sent, awaiting re

In [4]:
# Run the parse_article script
!bash parse_article.sh

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **Installing and Configuring Hadoop**

In [5]:
import os
# Installing Hadoop and configuring JAVA_HOME: 
# Downloading Hadoop
# Upzipping 
# Copying hadoop into our /usr/local folder
# Removing the unused original copy
# Remove the compressed (zip) file, we're not using it anymore. 
# Adding a variable called "JAVA_HOME" to hadoop's environment script which tells it where Java is on our system. 

!if [ ! -d /usr/local/hadoop-3.3.3/ ]; then \
wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.3/hadoop-3.3.3.tar.gz; \
tar -xzf hadoop-3.3.3.tar.gz; \
cp -r hadoop-3.3.3/ /usr/local/; \
rm -rf hadoop-3.3.3/; \
rm hadoop-3.3.3.tar.gz; \
echo "export JAVA_HOME=$(dirname $(dirname $(realpath $(which java))))" >> /usr/local/hadoop-3.3.3/etc/hadoop/hadoop-env.sh; \
fi

--2023-04-30 09:58:40--  https://dlcdn.apache.org/hadoop/common/hadoop-3.3.3/hadoop-3.3.3.tar.gz
Resolving dlcdn.apache.org (dlcdn.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to dlcdn.apache.org (dlcdn.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 645040598 (615M) [application/x-gzip]
Saving to: ‘hadoop-3.3.3.tar.gz’


2023-04-30 09:58:44 (201 MB/s) - ‘hadoop-3.3.3.tar.gz’ saved [645040598/645040598]



In [6]:
# Setting up some of our environmental variables: 
# Here we add Hadoop's location to our path (in Python) and tell our system where hadoop is located. 
os.environ['PATH'] = "/usr/local/hadoop-3.3.3/bin/:" + os.environ['PATH']
os.environ["HADOOP_HOME"] = "/usr/local/hadoop-3.3.3"

In [7]:
# from google.colab import drive
# drive.mount('/content/drive')
# !cp /content/drive/MyDrive/COMP47470_lab4/lab4_config.sh /content/
# !cat lab4_config.sh
# # Running our config script
# # Note: remember to check you have the correct filepath
# !bash lab4_config.sh

In [8]:
!$HADOOP_HOME/bin/hdfs namenode -format
# Creating our HDFS environment variables: 
os.environ["HDFS_NAMENODE_USER"] = "root"
os.environ["HDFS_DATANODE_USER"] = "root"
os.environ["HDFS_SECONDARYNAMENODE_USER"] = "root"
os.environ["YARN_RESOURCEMANAGER_USER"] = "root"
os.environ["YARN_NODEMANAGER_USER"] = "root"

2023-04-30 09:59:23,183 INFO namenode.NameNode: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = 6d8454f771e3/172.28.0.12
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 3.3.3
STARTUP_MSG:   classpath = /usr/local/hadoop-3.3.3/etc/hadoop:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/jettison-1.1.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/hadoop-auth-3.3.3.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/jsch-0.1.55.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/kerb-simplekdc-1.0.1.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/jakarta.activation-api-1.2.1.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/commons-beanutils-1.9.4.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/curator-framework-4.2.0.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/jackson-core-asl-1.9.13.jar:/usr/local/hadoop-3.3.3/share/hadoop/common/lib/jsr305-3.0.2.jar:/usr/local/hadoop-3.3.3/sha

In [9]:
# Launching hdfs daemons
!$HADOOP_HOME/sbin/start-dfs.sh
# Launching our yarn daemons
# nohup causes a process to ignore a "hang-up" signal
!nohup $HADOOP_HOME/sbin/start-yarn.sh

Starting namenodes on [6d8454f771e3]
6d8454f771e3: ssh: connect to host 6d8454f771e3 port 22: Connection refused
Starting datanodes
localhost: ssh: connect to host localhost port 22: Cannot assign requested address
Starting secondary namenodes [6d8454f771e3]
6d8454f771e3: ssh: connect to host 6d8454f771e3 port 22: Connection refused
nohup: ignoring input and appending output to 'nohup.out'


# **Data Processing with Hadoop**

**Move all article files (from all years) from wikipedia-ml into the HDFS.**

In [10]:
# Make a new directory in HDFS
!$HADOOP_HOME/bin/hdfs dfs -mkdir /wikipedia-ml

In [11]:
# Create a script to upload the articles onto Hadoop
%%file upload.sh
#!/bin/bash
input="/content/wikipedia-ml"
for file in $input/*; do
  hdfs dfs -put "$file" /wikipedia-ml
done

Writing upload.sh


In [12]:
!bash upload.sh

In [13]:
!$HADOOP_HOME/bin/hdfs dfs -ls /wikipedia-ml

Found 44 items
-rw-r--r--   1 root root      12305 2023-04-30 10:00 /wikipedia-ml/article_Machine learning - Wikipedia_2013_1_2.txt
-rw-r--r--   1 root root      12048 2023-04-30 10:00 /wikipedia-ml/article_Machine learning - Wikipedia_2013_6_27.txt
-rw-r--r--   1 root root      12391 2023-04-30 10:00 /wikipedia-ml/article_Machine learning - Wikipedia_2014_1_4.txt
-rw-r--r--   1 root root      12396 2023-04-30 10:00 /wikipedia-ml/article_Machine learning - Wikipedia_2014_6_4.txt
-rw-r--r--   1 root root      16236 2023-04-30 10:00 /wikipedia-ml/article_Machine learning - Wikipedia_2015_1_4.txt
-rw-r--r--   1 root root      17678 2023-04-30 10:00 /wikipedia-ml/article_Machine learning - Wikipedia_2015_6_30.txt
-rw-r--r--   1 root root      17526 2023-04-30 10:00 /wikipedia-ml/article_Machine learning - Wikipedia_2016_1_1.txt
-rw-r--r--   1 root root      19147 2023-04-30 10:00 /wikipedia-ml/article_Machine learning - Wikipedia_2016_6_30.txt
-rw-r--r--   1 root root      22051 2023-04-30

**Co-occurance MapReduce**

In [14]:
# Create the mapper script
%%writefile mapper_co_occur.py

#!/usr/bin/env python
import sys,string

# Read the stopwords and store them in a list
stopwords = open("stopwords.txt", "r")
stopwords_list = []
for stopword in stopwords:
  stopwords_list.append(stopword.strip())

# Read articles
for line in sys.stdin:
  line = line.strip()  # removes whitespace either side of each line
  words = line.split()  # splitting each line into a list of words
  n = len(words)
  # Create co-occurance
  for i in range(n):
    for j in range(i+1, n):
      word_i = words[i].strip(string.punctuation.replace("\'","")) # removes punctuations
      word_j = words[j].strip(string.punctuation.replace("\'","")) # removes punctuations
      if not(word_i.lower() in stopwords_list) and not(word_j.lower() in stopwords_list):
        print('%s,%s\t%s' % (word_i,word_j, 1))  # writing our results to STDOUT (this is the input for reducer.py)
    

Writing mapper_co_occur.py


In [15]:
# Writing reducer script: 
# Reference: lab04_solution

%%writefile reducer_co_occur.py

#!/usr/bin/env python
  
import sys
from operator import itemgetter
  
current_word = None
current_count = 0
word = None
  
for line in sys.stdin:
  line = line.strip()
  word, count = line.split('\t', 1)  # splitting the data on the basis of tab (see mapper.py)
  
  try:
    count = int(count)  # convert count (currently a string) to int
  except ValueError:
    continue  # silently ignore line if count is not a number
  
  # this IF-switch only works because Hadoop sorts map output
  # by key (here: word) before it is passed to the reducer
  if current_word == word:
    current_count += count
  else:
    if current_word: # to avoid None values
      print('%s\t%s' % (current_word, current_count))
    current_count = count
    current_word = word
  
# do not forget to output the last word if needed!
if current_word == word:
  print('%s\t%s' % (current_word, current_count))

Writing reducer_co_occur.py


In [16]:
# Giving these mapper&reducer permissions:
!chmod u+x /content/mapper_co_occur.py /content/reducer_co_occur.py

In [17]:
#Running MapReduce programs
!$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.3.jar \
  -input /wikipedia-ml/article*.txt \
  -output /cooccur-outputs \
  -mapper "python /content/mapper_co_occur.py" \
  -reducer "python /content/reducer_co_occur.py"

2023-04-30 10:03:06,783 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2023-04-30 10:03:07,203 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2023-04-30 10:03:07,203 INFO impl.MetricsSystemImpl: JobTracker metrics system started
2023-04-30 10:03:07,252 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!
2023-04-30 10:03:07,677 INFO mapred.FileInputFormat: Total input files to process : 22
2023-04-30 10:03:07,765 INFO mapreduce.JobSubmitter: number of splits:22
2023-04-30 10:03:08,175 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local1265541845_0001
2023-04-30 10:03:08,175 INFO mapreduce.JobSubmitter: Executing with tokens: []
2023-04-30 10:03:08,400 INFO mapreduce.Job: The url to track the job: http://localhost:8080/
2023-04-30 10:03:08,403 INFO mapreduce.Job: Running job: job_local1265541845_0001
2023-04-30 10:03:08,423 INFO mapred.LocalJobRunner: OutputCommitter set in config null
2023-

In [18]:
# Checking output:
!$HADOOP_HOME/bin/hdfs dfs -ls /cooccur-outputs
!$HADOOP_HOME/bin/hdfs dfs -cat /cooccur-outputs/part-00000

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
system,10	1
system,2009	1
system,2015	10
system,2016	10
system,2018	10
system,3	8
system,AI...It's	2
system,AI...Its	7
system,ATT	1
system,BellKor	1
system,CAA	16
system,Chaos	1
system,Concern	9
system,Fei-Fei	9
system,Google	20
system,Inductive	19
system,Language	10
system,Li	9
system,MNIST	7
system,Machine	10
system,Microsoft	10
system,Pragmatic	1
system,Research	1
system,Responsible	12
system,Rule-based	14
system,Similar	10
system,Team	1
system,Theres	7
system,Twitter	10
system,W	8
system,able	20
system,action	8
system,adopted	10
system,advice	8
system,algorithm	8
system,algorithmic	12
system,algorithms	10
system,already	10
system,andmost	9
system,applicants	24
system,applied	14
system,approaches	14
system,artificial	32
system,assessment	10
system,association	14
system,backpropagated	8
system,beat	1
system,beginning	9
system,behaves	8
system,behavior	16
system,behavioral	16
system,bias	21
system,biased	10
system,biases

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



**n-gram MapReduce**

In [19]:
# Create the mapper script
%%writefile mapper_bigram.py

#!/usr/bin/env python
import sys,string

# Read articles
for line in sys.stdin:
  line = line.strip()  # removes whitespace either side of each line
  words = line.split()  # splitting each line into a list of words
  n = len(words)
  # Create bigram
  for i in range(n-1):
    word_i = words[i].strip(string.punctuation.replace("\'","")) # removes punctuations
    word_j = words[i+1].strip(string.punctuation.replace("\'","")) # removes punctuations
    print('%s,%s\t%s' % (word_i,word_j, 1))  # writing our results to STDOUT (this is the input for reducer.py)
    

Writing mapper_bigram.py


In [20]:
# Writing reducer script: 
# Reference: lab04_solution
%%writefile combiner_bigram.py

#!/usr/bin/env python
  
import sys
from operator import itemgetter
  
current_word = None
current_count = 0
word = None
  
for line in sys.stdin:
  line = line.strip()
  word, count = line.split('\t', 1)  # splitting the data on the basis of tab (see mapper.py)
  
  try:
    count = int(count)  # convert count (currently a string) to int
  except ValueError:
    continue  # silently ignore line if count is not a number
  
  # this IF-switch only works because Hadoop sorts map output
  # by key (here: word) before it is passed to the reducer
  if current_word == word:
    current_count += count
  else:
    if current_word: # to avoid None values
      print('%s\t%s' % (current_word, current_count))
    current_count = count
    current_word = word
  
# do not forget to output the last word if needed!
if current_word == word:
  print('%s\t%s' % (current_word, current_count))

Writing combiner_bigram.py


In [21]:
# Writing reducer script: 
# Reference: lab04_solution
%%writefile reducer_bigram.py

#!/usr/bin/env python
  
import sys
from operator import itemgetter
  
current_word = None
current_count = 0
word = None
  
for line in sys.stdin:
  line = line.strip()
  word, count = line.split('\t', 1)  # splitting the data on the basis of tab (see mapper.py)
  
  try:
    count = int(count)  # convert count (currently a string) to int
  except ValueError:
    continue  # silently ignore line if count is not a number
  
  # this IF-switch only works because Hadoop sorts map output
  # by key (here: word) before it is passed to the reducer
  if current_word == word:
    current_count += count
  else:
    if current_word: # to avoid None values
      print('%s\t%s' % (current_word, current_count))
    current_count = count
    current_word = word
  
# do not forget to output the last word if needed!
if current_word == word:
  print('%s\t%s' % (current_word, current_count))

Writing reducer_bigram.py


In [22]:
# Giving these mapper&reducer permissions:
!chmod u+x /content/mapper_bigram.py /content/combiner_bigram.py /content/reducer_bigram.py

In [23]:
#Running MapReduce programs
!$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.3.jar \
  -input /wikipedia-ml/article*.txt \
  -output /bigram-outputs \
  -mapper "python /content/mapper_bigram.py" \
  -combiner "python /content/combiner_bigram.py" \
  -reducer "python /content/reducer_bigram.py"

2023-04-30 10:05:41,901 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2023-04-30 10:05:42,223 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2023-04-30 10:05:42,223 INFO impl.MetricsSystemImpl: JobTracker metrics system started
2023-04-30 10:05:42,283 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!
2023-04-30 10:05:42,824 INFO mapred.FileInputFormat: Total input files to process : 22
2023-04-30 10:05:42,919 INFO mapreduce.JobSubmitter: number of splits:22
2023-04-30 10:05:43,507 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local1540228441_0001
2023-04-30 10:05:43,507 INFO mapreduce.JobSubmitter: Executing with tokens: []
2023-04-30 10:05:44,006 INFO mapreduce.Job: The url to track the job: http://localhost:8080/
2023-04-30 10:05:44,008 INFO mapreduce.Job: Running job: job_local1540228441_0001
2023-04-30 10:05:44,033 INFO mapred.LocalJobRunner: OutputCommitter set in config null
2023-

In [24]:
# Checking output:
!$HADOOP_HOME/bin/hdfs dfs -ls /bigram-outputs
!$HADOOP_HOME/bin/hdfs dfs -cat /bigram-outputs/part-00000

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
electrocardiograms,and	3
eliminate,the	4
eliminated,since	4
elimination,or	5
email,For	3
email,and	10
email,as	1
email,filtering	13
email,message	1
email,messages	8
emails,the	13
embedded,devices	3
embedded,systems	3
emotion,The	8
emotion,toward	8
emotions,about	8
emotions,feelings	8
emphasis,on	18
emphasize,the	4
employ,the	22
employed,especially	18
employed,in	10
employed,today	10
employee,and	3
employs,data	22
employs,various	7
emulate,a	4
emulate,neurons	4
emulate,the	4
enables,it	20
encoding,of	22
encompasses,a	8
encompasses,other	10
encountered,in	8
endeavor,machine	8
endeavour,machine	10
engine,accordingly	17
engineering,and	10
engineering,example	1
engineering,teams	3
engineers,and	5
engineers,need	3
engineers,that	9
engines,and	5
enhance,their	1
enhancements,had	7
enough,is	1
enough,training	4
ensemble,model	21
entails,all	22
entire,set	6
entirely,eliminated	4
entities,and	4
entities,can	22
environment,After	8
en