-
Notifications
You must be signed in to change notification settings - Fork 0
/
hive.hql
59 lines (37 loc) · 1.61 KB
/
hive.hql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
-- This is a Hive program. Hive is an SQL-like language that compiles
-- into Hadoop Map/Reduce jobs. It's very popular among data analysts
-- because it allows them to query enormous Hadoop data stores using
-- a language much like SQL.
-- Logs are stored on the Hadoop Distributed File System, in the
-- directory /root/logs/. They're ordinary Apache logs in *.gz format.
--
-- We want to pretend that these gzipped log files are a database table,
-- and use a regular expression to split lines into database columns.
Options: CLI, JDBC; ODBC;
Data: Tables / Partitions / Buckets
hive
hive -f script.q
hive -e 'SELECT * FROM dummy'
hive -e "CREATE TABLE dummy (value STRING); LOAD DATA LOCAL INPATH '/tmp/dummy.txt' OVERWRITE INTO TABLE dummy"
SHOW TABLES;
DROP TABLE <table>;
CREATE TABLE logs(
host STRING,
identity STRING,
user STRING,
time STRING,
request STRING,
status STRING,
size STRING,
referer STRING,
agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'input.regex'='^(\\S+) \\S+ \\S+ \\[([^\\[]+)\\] "(\\w+) (\\S+) (\\S+)" (\\d+) (\\d+) "([^"]+)" "([^"]+)".*'
);
LOAD DATA LOCAL INPATH '/root/data/hive.log' OVERWRITE INTO TABLE logs;
SELECT * from logs;
IO Formats:
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"