# How to run commands on Hadoop using Jupyter

## Import utility functions
`run_in_master` runs the provided command on the namenode. Use it to start a MapReduce, Pig, or Spark application.

`run_in_hive` runs the provided command on the hive-server. Use it to start a Hive application.

In [1]:
from utils import run_in_master, run_in_hive, print_hdfs_output, hdfs_upload

In [2]:
# Sanity check: send the same trivial shell command through each helper
# (namenode first, then hive-server) and print whatever each one returns.
for run_remote in (run_in_master, run_in_hive):
    print(run_remote("echo 'test'"))

(['test\n'], [])
(['test\n'], [])


# MapReduce

In [3]:
# Run the `pi` program from the bundled Hadoop MapReduce examples jar via YARN.
# The trailing arguments `2 5` appear in the output below as
# "Number of Maps = 2" and "Samples per Map = 5"; with so few samples the
# estimated value of Pi is only a rough approximation.
run_in_master("yarn jar /opt/hadoop-3.3.1/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.1.jar pi 2 5")

(['Number of Maps  = 2\n',
  'Samples per Map = 5\n',
  'Wrote input for Map #0\n',
  'Wrote input for Map #1\n',
  'Starting Job\n',
  'Job Finished in 13.097 seconds\n',
  'Estimated value of Pi is 3.60000000000000000000\n'],
 ['2023-06-19 22:27:47,595 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at resourcemanager/172.19.0.2:8032\n',
  '2023-06-19 22:27:47,662 INFO client.AHSProxy: Connecting to Application History server at historyserver/172.19.0.9:10200\n',
  '2023-06-19 22:27:47,747 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1687213661089_0001\n',
  '2023-06-19 22:27:47,820 INFO input.FileInputFormat: Total input files to process : 2\n',
  '2023-06-19 22:27:47,865 INFO mapreduce.JobSubmitter: number of splits:2\n',
  '2023-06-19 22:27:47,938 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1687213661089_0001\n',
  '2023-06-19 22:27:47,938 INFO mapreduce.JobSubmitter:

# Pig

In [4]:
# Upload the Pig script and the JSONL data file to HDFS, one call per file.
# Per the output below, each call runs `hdfs dfs -put` into /examples.
for example_path in ("examples/pig.pig", "examples/data.jsonl"):
    hdfs_upload(example_path)

hdfs dfs -put /data/master_volume/examples/pig.pig /examples
exit code []
[]
hdfs dfs -put /data/master_volume/examples/data.jsonl /examples
exit code []
[]


In [5]:
# Execute the Pig script in MapReduce mode (`-x mapreduce`).
# The script path given here is the same /data/master_volume path that the
# upload step used as its `hdfs dfs -put` source.
run_in_master("pig -x mapreduce /data/master_volume/examples/pig.pig")

(['(Michał,24)\n'],
 ['2023-06-19 22:28:06,352 INFO pig.ExecTypeProvider: Trying ExecType : LOCAL\n',
  '2023-06-19 22:28:06,352 INFO pig.ExecTypeProvider: Trying ExecType : MAPREDUCE\n',
  '2023-06-19 22:28:06,352 INFO pig.ExecTypeProvider: Picked MAPREDUCE as the ExecType\n',
  '2023-06-19 22:28:06,376 [main] INFO  org.apache.pig.Main - Apache Pig version 0.17.0 (r1797386) compiled Jun 02 2017, 15:41:58\n',
  '2023-06-19 22:28:06,376 [main] INFO  org.apache.pig.Main - Logging error messages to: /app/pig_1687213686372.log\n',
  '2023-06-19 22:28:06,512 [main] INFO  org.apache.pig.impl.util.Utils - Default bootup file /root/.pigbootup not found\n',
  '2023-06-19 22:28:06,532 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.address\n',
  '2023-06-19 22:28:06,532 [main] INFO  org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: hdfs://namenode:9000\n',
  '20

# Hive

In [6]:
# Upload the CSV file for the Hive example. Per the output below this runs
# `hdfs dfs -put` from /data/master_volume/examples/ into /examples on HDFS.
hdfs_upload("examples/employee.csv")

hdfs dfs -put /data/master_volume/examples/employee.csv /examples
exit code []
[]


In [7]:
# Run the first Hive script with `hive -f` on the hive-server.
# NOTE(review): judging by its file name this script creates the employee
# table; the .hql file itself is not shown in this notebook — confirm.
run_in_hive("hive -f /data/master_volume/examples/employee_table.hql")

([],
 ['SLF4J: Class path contains multiple SLF4J bindings.\n',
  'SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: Found binding in [jar:file:/opt/hadoop-3.3.1/share/hadoop/common/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n',
  'SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n',
  'Hive Session ID = a2330ac2-ec8a-4772-83a9-db4085f23bf0\n',
  '\n',
  'Logging initialized using configuration in file:/opt/hive/conf/hive-log4j2.properties Async: true\n',
  'Hive Session ID = d339b115-9a29-483e-bf81-98eaa4b5cfa4\n',
  'OK\n',
  'Time taken: 0.483 seconds\n',
  'OK\n',
  'Time taken: 0.027 seconds\n',
  'OK\n',
  'Time taken: 0.291 seconds\n'])

In [8]:
# Run the load script; the log below reports
# "Loading data to table testdb.employee".
run_in_hive("hive -f /data/master_volume/examples/employee_load.hql")

([],
 ['SLF4J: Class path contains multiple SLF4J bindings.\n',
  'SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: Found binding in [jar:file:/opt/hadoop-3.3.1/share/hadoop/common/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n',
  'SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n',
  'Hive Session ID = c71a58b7-1c74-4d88-834e-fb06fa189138\n',
  '\n',
  'Logging initialized using configuration in file:/opt/hive/conf/hive-log4j2.properties Async: true\n',
  'Hive Session ID = 2bf7c717-364c-42c3-bbb8-50d9428c7b98\n',
  'OK\n',
  'Time taken: 0.398 seconds\n',
  'Loading data to table testdb.employee\n',
  'OK\n',
  'Time taken: 0.48 seconds\n'])

In [9]:
# Run the query script; the (truncated) log below shows Hive launching one job for it.
# NOTE(review): presumably this writes the result file read in the next cell — confirm against test_group.hql.
run_in_hive("hive -f /data/master_volume/examples/test_group.hql")

([],
 ['SLF4J: Class path contains multiple SLF4J bindings.\n',
  'SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: Found binding in [jar:file:/opt/hadoop-3.3.1/share/hadoop/common/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
  'SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n',
  'SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n',
  'Hive Session ID = 45219d87-b12f-40ae-8678-73b9848d64b3\n',
  '\n',
  'Logging initialized using configuration in file:/opt/hive/conf/hive-log4j2.properties Async: true\n',
  'Hive Session ID = bf781ccc-0084-4a2d-a6a3-e1ffeeae9093\n',
  'OK\n',
  'Time taken: 0.395 seconds\n',
  'Query ID = root_20230619222833_d2f7a4ae-4e7b-4c0b-ae8b-186d28c2daf8\n',
  'Total jobs = 1\n',
  'Launching Job 1 out of 1\n',
  'Number of reduce tasks not specified. Estimated from input data size: 1\n',

In [10]:
# Print the contents of a result file stored under the Hive warehouse directory on HDFS.
# NOTE(review): presumably produced by the test_group.hql run above — verify.
print_hdfs_output("/user/hive/warehouse/results/000000_0")

8824933

14555152

13918264

22224443

83793498

77255506

92432520

23576545

37365571

26259585



# Spark

In [11]:
# Submit the PySpark example to YARN in cluster deploy mode.
# The first list of the returned tuple is empty in the output below, so the
# job's results are inspected on HDFS in the following cells instead.
run_in_master("spark-submit --master yarn --deploy-mode cluster /data/master_volume/examples/spark.py")

([],
 ['23/06/19 22:28:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n',
  '23/06/19 22:28:51 INFO DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at resourcemanager/172.19.0.2:8032\n',
  '23/06/19 22:28:51 INFO AHSProxy: Connecting to Application History server at historyserver/172.19.0.9:10200\n',
  '23/06/19 22:28:51 INFO Configuration: resource-types.xml not found\n',
  "23/06/19 22:28:51 INFO ResourceUtils: Unable to find 'resource-types.xml'.\n",
  '23/06/19 22:28:51 INFO Client: Verifying our application has not requested more than the maximum memory capability of the cluster (4096 MB per container)\n',
  '23/06/19 22:28:51 INFO Client: Will allocate AM container, with 1408 MB memory including 384 MB overhead\n',
  '23/06/19 22:28:51 INFO Client: Setting up container launch context for our AM\n',
  '23/06/19 22:28:51 INFO Client: Setting up the launch environment for our AM contai

In [13]:
# Print every file matching the glob under /spark-result/dataframe-select/ on HDFS.
# NOTE(review): presumably written by spark.py's DataFrame-API select — verify.
print_hdfs_output("/spark-result/dataframe-select/*")

name

Michał



In [14]:
# Print every file matching the glob under /spark-result/sql-select/ on HDFS.
# NOTE(review): presumably written by spark.py's SQL select — verify.
print_hdfs_output("/spark-result/sql-select/*")

name

Michał

