# Initial definitions

In [4]:
# Hadoop release used by every cell below, and the directory name the cells expect it to live in.
%env HADOOP_VERSION=2.10.1
%env HADOOP_PATH=hadoop-2.10.1

env: HADOOP_VERSION=2.10.1
env: HADOOP_PATH=hadoop-2.10.1


# Preparing the environment

## Downloading Hadoop

In [5]:
# Download from the Apache archive over HTTPS: it keeps every release, whereas
# regional mirrors drop older versions and the link then breaks.
!wget https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -q --show-progress



## Extracting the compressed files and removing the .tar.gz archive

In [6]:
# !rm ${HADOOP_PATH} -r
# Extract quietly, and delete the archive only if the extraction succeeded
# (each "!" line runs in its own shell, so the two steps must share one line).
!tar -xzf hadoop-${HADOOP_VERSION}.tar.gz && rm hadoop-${HADOOP_VERSION}.tar.gz

## Discovering the Java path

In [7]:
# Resolve the javac symlink to its real location, then strip the last two
# path components to print the JDK directory.
!dirname $(dirname $(readlink -f $(which javac)))

/usr/lib/jvm/java-8-openjdk-amd64


## Setting the Java path envvar

We also append it to the user's `.bashrc` (as well as `.profile` and Hadoop's `hadoop-env.sh`) so that it is loaded when the nodes open ssh connections.

In [8]:
# Set JAVA_HOME for this kernel; the value is the JDK directory printed by the discovery cell above.
%env JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64

env: JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64


In [9]:
# Persist JAVA_HOME (as set by the %env cell above) for interactive shells,
# login shells and Hadoop's own environment script.
# grep -qsxF skips the append when the exact line is already present, so
# re-running this cell no longer adds duplicate lines.
!grep -qsxF "export JAVA_HOME=${JAVA_HOME}" ~/.bashrc || echo "export JAVA_HOME=${JAVA_HOME}" >> ~/.bashrc
!grep -qsxF "export JAVA_HOME=${JAVA_HOME}" ~/.profile || echo "export JAVA_HOME=${JAVA_HOME}" >> ~/.profile
!grep -qsxF "export JAVA_HOME=${JAVA_HOME}" ${HADOOP_PATH}/etc/hadoop/hadoop-env.sh || echo "export JAVA_HOME=${JAVA_HOME}" >> ${HADOOP_PATH}/etc/hadoop/hadoop-env.sh

# Hadoop in Standalone Mode (local)

## MapReduce in the local filesystem - word count example

In [10]:
# The job refuses to start if the output directory already exists, so remove
# the result of any previous run first.
!rm -rf ./output
# Run the bundled wordcount example against a local input file.
!${HADOOP_PATH}/bin/hadoop jar ${HADOOP_PATH}/share/hadoop/mapreduce/hadoop-mapreduce-examples-${HADOOP_VERSION}.jar wordcount \
                               ./resources/examples/newyorknewyork.txt ./output

21/11/10 15:15:27 INFO Configuration.deprecation: session.id is deprecated. Instead, use dfs.metrics.session-id
21/11/10 15:15:27 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId=
21/11/10 15:15:27 INFO input.FileInputFormat: Total input files to process : 1
21/11/10 15:15:27 INFO mapreduce.JobSubmitter: number of splits:1
21/11/10 15:15:27 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local170601614_0001
21/11/10 15:15:28 INFO mapreduce.Job: The url to track the job: http://localhost:8080/
21/11/10 15:15:28 INFO mapreduce.Job: Running job: job_local170601614_0001
21/11/10 15:15:28 INFO mapred.LocalJobRunner: OutputCommitter set in config null
21/11/10 15:15:28 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 1
21/11/10 15:15:28 INFO output.FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
21/11/10 15:15:28 INFO mapred.LocalJob

### Listing files in the output folder

In [11]:
# List what the local wordcount job wrote to ./output.
!ls ./output/

part-r-00000  _SUCCESS


### Reading output file

In [12]:
# Print the job's result file (one "word<TAB>count" pair per line).
! cat ./output/part-r-00000

And	3
Come	1
Head	1
I	8
I'll	1
I'm	3
If	2
In	1
It's	1
King	1
My	1
New	13
Right	2
Start	1
That	2
These	2
They	3
Top	2
York	13
You	1
a	3
about	2
all	1
am	2
anywhere	2
are	3
away	2
baby	1
be	1
bet	1
blues	2
brand	2
can	2
city	2
come	1
doesn't	1
find	2
gonna	2
have	1
heap	2
heart	1
hill	3
in	3
it	8
just	1
king	2
know	1
leaving	1
list	1
little	2
longing	1
make	6
melted	1
melting	1
never	1
new	2
news	1
of	10
old	2
on	1
part	1
shoes	1
sleep	1
sleeps	1
spreading	1
start	2
stray	1
that	2
the	8
there	3
through	2
to	6
today	1
town	2
up	3
vagabond	1
very	1
wake	2
want	3
you	2


# Hadoop in Pseudo-Distributed Mode

## Preparing the environment

### Starting sshd server

Check `/binder/postBuild` and `/resources/configs/ssh/sshd_config` files for more details

In [None]:
# Start sshd with the repository's configuration file (-f) instead of the system default.
# NOTE(review): the later cells connect on port 8822, presumably set in this config file - confirm.
!/usr/sbin/sshd -f resources/configs/ssh/sshd_config 

### Adding names to known hosts

The commands below establish ssh connections to the host names/IPs that will be used. This step avoids the interactive yes/no host confirmation later on.

In [None]:
# Connect once to each host name so its key is accepted without a prompt;
# "exit" is the remote command, so the session closes immediately.
# (The previous "-C" only enabled ssh compression; it is not a command flag.)
!ssh -o StrictHostKeyChecking=no -p 8822 $USER@localhost exit
!ssh -o StrictHostKeyChecking=no -p 8822 $USER@0.0.0.0 exit

### Adding ssh options to Hadoop via envvar

* connecting on a different port (`-p 8822`)
* avoiding host key checking (`-o StrictHostKeyChecking=no`)

In [None]:
# ssh options handed to Hadoop: skip host-key confirmation and connect on port 8822.
%env HADOOP_SSH_OPTS=-o StrictHostKeyChecking=no -p 8822

In [None]:
# Select ssh as pdsh's remote-command type.
# NOTE(review): only matters if the Hadoop start scripts in use go through pdsh - confirm.
%env PDSH_RCMD_TYPE=ssh

### Copying configuration files to the Hadoop folder

Use the configuration files that match your Hadoop version.
They are located in `/resources/configs/hadoop/<version>`.

In [None]:
# Overwrite Hadoop's core-site.xml and hdfs-site.xml with the versions kept in this repository.
!cp resources/configs/hadoop/${HADOOP_VERSION}/core-site.xml   ${HADOOP_PATH}/etc/hadoop/
!cp resources/configs/hadoop/${HADOOP_VERSION}/hdfs-site.xml   ${HADOOP_PATH}/etc/hadoop/

## Formatting the filesystem

In [None]:
# Format the NameNode without prompting (-nonInteractive); -force formats even
# when a name directory already exists, which discards any existing HDFS metadata.
!${HADOOP_PATH}/bin/hdfs namenode -format -force -nonInteractive

## Starting DFS (NameNode, SecondaryNameNode, and DataNode daemons)

In [None]:
# Start the HDFS daemons, then list the running Java processes to confirm they came up.
!${HADOOP_PATH}/sbin/start-dfs.sh
!jps

## MapReduce - Word count example 

### Creating folders in the distributed file system

In [None]:
# -p creates the missing parent directories in one call and does not fail if
# they already exist, so this cell can be re-run safely.
!${HADOOP_PATH}/bin/hdfs dfs -mkdir -p /user/matheus/input/

### Copying a file to a folder in the distributed file system

In [None]:
# Upload the sample text; -f overwrites an existing copy so a re-run does not fail with "File exists".
!${HADOOP_PATH}/bin/hdfs dfs -put -f ./resources/examples/newyorknewyork.txt /user/matheus/input/

### Listing files in a folder of the distributed file system

In [None]:
# List the HDFS input directory to confirm the upload.
!${HADOOP_PATH}/bin/hdfs dfs -ls /user/matheus/input/

### Retrieving the contents of a file in the distributed file system

In [None]:
# Print the uploaded file straight from HDFS.
!${HADOOP_PATH}/bin/hdfs dfs -cat /user/matheus/input/newyorknewyork.txt

### Running MapReduce job in Pseudo-Distributed Mode

In [None]:
# The job fails if the output directory already exists, so remove the result of
# any previous run (-f keeps this quiet when there is nothing to delete).
!${HADOOP_PATH}/bin/hdfs dfs -rm -r -f /user/matheus/output
# Run the bundled wordcount example on the HDFS input directory.
!${HADOOP_PATH}/bin/hadoop jar ${HADOOP_PATH}/share/hadoop/mapreduce/hadoop-mapreduce-examples-${HADOOP_VERSION}.jar wordcount \
                                /user/matheus/input /user/matheus/output

### Listing files in the output folder

In [None]:
# Show the files the job wrote to its HDFS output directory.
!${HADOOP_PATH}/bin/hdfs dfs -ls /user/matheus/output/

### Reading output file

In [None]:
# Print the word counts from the job's result file in HDFS.
!${HADOOP_PATH}/bin/hdfs dfs -cat /user/matheus/output/part-r-00000

# Starting YARN in Pseudo-Distributed Mode

## Preparing the environment

### Copying configuration files to the Hadoop folder

In [None]:
# Overwrite Hadoop's mapred-site.xml and yarn-site.xml with the versions kept in this repository.
!cp resources/configs/hadoop/${HADOOP_VERSION}/mapred-site.xml ${HADOOP_PATH}/etc/hadoop/
!cp resources/configs/hadoop/${HADOOP_VERSION}/yarn-site.xml   ${HADOOP_PATH}/etc/hadoop/

## Starting YARN

In [None]:
# Start the YARN daemons, then list the running Java processes to confirm they came up.
!${HADOOP_PATH}/sbin/start-yarn.sh
!jps

## MapReduce via YARN - Word count example 

In [None]:
# The job fails if the output directory already exists, so remove the result of
# any previous run (-f keeps this quiet when there is nothing to delete).
!${HADOOP_PATH}/bin/hdfs dfs -rm -r -f /user/matheus/output2
# Submit the bundled wordcount example through YARN.
!${HADOOP_PATH}/bin/yarn jar ${HADOOP_PATH}/share/hadoop/mapreduce/hadoop-mapreduce-examples-${HADOOP_VERSION}.jar wordcount \
                                /user/matheus/input /user/matheus/output2

### Listing files in the output folder

In [None]:
# Show the files the YARN job wrote to its HDFS output directory.
!${HADOOP_PATH}/bin/hdfs dfs -ls /user/matheus/output2/

### Reading output file

In [None]:
# Print the word counts from the YARN job's result file in HDFS.
!${HADOOP_PATH}/bin/hdfs dfs -cat /user/matheus/output2/part-r-00000