# Setup

In [9]:
#!docker-compose up

In [1]:
!pip install boto3 pandas sqlalchemy sqlalchemy-access pyhive thrift requests sasl thrift_sasl



# Example query

1. Uploading the sample data on s3
2. Creating a table and update metadata with MSCK REPAIR
3. Retrieving results and save them in a dataframe.

In [2]:
import sqlalchemy_access as sa_a
from sqlalchemy import *
from sqlalchemy.engine import create_engine
import boto3
import botocore
import pandas as pd

# 0. Global set up
mock_s3_client = boto3.client("s3", region_name="eu-west-1", endpoint_url="http://localhost:5000")
mock_hive_engine = create_engine('hive://localhost:10000')

# 1. Create mothena bucket 
try:
    mock_s3_client.create_bucket(
        Bucket="mothena", CreateBucketConfiguration={"LocationConstraint": "eu-west-1"}
    )
except botocore.exceptions.ClientError:
    pass

# 2. Upload file
file = open(
        "sample_data/ny_data.parquet",
        "rb",
    )
mock_s3_client.put_object(
        Bucket="mothena",
        Key="ny/downloaded_at=2020-10-15/properties.parquet",
        Body=file.read(),
    )

# 3. Create table
mock_hive_engine.execute("DROP TABLE IF EXISTS ny_data")
mock_hive_engine.execute(
    """
    CREATE EXTERNAL TABLE ny_data(
        address string, 
        residential_units int, 
        sale_price int, 
        sale_date string
    )
    PARTITIONED BY ( 
        downloaded_at string
    )
    STORED AS PARQUET LOCATION 's3a://mothena/ny'
    """
)

# 4. Update metadata
mock_hive_engine.execute("MSCK REPAIR TABLE ny_data")

# 5. Query results
df = pd.read_sql_query("SELECT * FROM ny_data", mock_hive_engine)
df.head()

Unnamed: 0,address,residential_units,sale_price,sale_date,downloaded_at
0,231 EAST 7TH,2,0,02/10/2020,2020-10-15
1,243 EAST 7TH STREET,3,4350000,16/07/2020,2020-10-15
2,262 EAST 7TH STREET,4,600000,12/12/2019,2020-10-15
3,272 EAST 7TH STREET,24,1000000,26/06/2020,2020-10-15


# 1. Quering apache logs

Athena monitoruje wybrany folder s3 z logami serwera Apache. Pliki logów są rozproszone (wiele plików). Athene saknuje je w poszukiwaniu zdefiniowanych wzorów i pozwala na relacyjne wyszukiwania.

1. Tworzymy definicje schematu
2. Uzupełniamy logi
3. Tworzymy zapytanie

In [3]:
!head sample_data/apache_logs

83.149.9.216 - - [17/May/2015:10:05:03 +0000] "GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1" 200 203023 "http://semicomplete.com/presentations/logstash-monitorama-2013/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"
83.149.9.216 - - [17/May/2015:10:05:43 +0000] "GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png HTTP/1.1" 200 171717 "http://semicomplete.com/presentations/logstash-monitorama-2013/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"
83.149.9.216 - - [17/May/2015:10:05:47 +0000] "GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1" 200 26185 "http://semicomplete.com/presentations/logstash-monitorama-2013/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"
83.149.9.216 -

In [16]:
first_file = open(
        "sample_data/apache_logs",
        "rb",
)
mock_s3_client.put_object(
        Bucket="mothena",
        Key="apache-log-folder/first_logs",
        Body=first_file.read(),
)

second_file = open(
        "sample_data/single_log",
        "rb",
)
mock_s3_client.put_object(
        Bucket="mothena",
        Key="apache-log-folder/second_logs",
        Body=second_file.read(),
)

{'ResponseMetadata': {'RequestId': 'EPDT99OLSN6F1J0W2VQ8TJS7D58SVQF59D8VSV0L94EF6CUOLYMZ',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'etag': '"52d2d7a35c6d2b1310511e2c0393ed73"',
   'last-modified': 'Sat, 08 Jan 2022 11:15:22 GMT',
   'content-length': '0',
   'x-amzn-requestid': 'EPDT99OLSN6F1J0W2VQ8TJS7D58SVQF59D8VSV0L94EF6CUOLYMZ',
   'content-type': 'text/html; charset=utf-8',
   'access-control-allow-origin': '*',
   'server': 'Werkzeug/2.0.2 Python/3.7.12',
   'date': 'Sat, 08 Jan 2022 11:15:22 GMT'},
  'RetryAttempts': 0},
 'ETag': '"52d2d7a35c6d2b1310511e2c0393ed73"'}

In [17]:

mock_hive_engine.execute("DROP TABLE IF EXISTS apache_logs")
mock_hive_engine.execute(
    """
    CREATE EXTERNAL TABLE apache_logs(
        client_ip string,
        request_received_time string,
        client_request string,
        server_status string,
        returned_obj_size string,
        metadata string
    )
    ROW FORMAT SERDE 
        'org.apache.hadoop.hive.serde2.RegexSerDe'
    WITH SERDEPROPERTIES (
       'input.regex'='(.*) - - \\\\[(.*)\\\\] "(.*)" (\\\\d*) (\\\\d*) (.*)'
    )
    STORED AS INPUTFORMAT
       'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT
       'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    LOCATION
        's3a://mothena/apache-log-folder'
    """
)
mock_hive_engine.execute("MSCK REPAIR TABLE apache_logs")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x109bf2520>

In [18]:
df = pd.read_sql_query("SELECT * FROM apache_logs", mock_hive_engine)
df.head(10)

Unnamed: 0,client_ip,request_received_time,client_request,server_status,returned_obj_size,metadata
0,83.149.9.216,17/May/2015:10:05:03 +0000,GET /presentations/logstash-monitorama-2013/im...,200,203023,"""http://semicomplete.com/presentations/logstas..."
1,83.149.9.216,17/May/2015:10:05:43 +0000,GET /presentations/logstash-monitorama-2013/im...,200,171717,"""http://semicomplete.com/presentations/logstas..."
2,83.149.9.216,17/May/2015:10:05:47 +0000,GET /presentations/logstash-monitorama-2013/pl...,200,26185,"""http://semicomplete.com/presentations/logstas..."
3,83.149.9.216,17/May/2015:10:05:12 +0000,GET /presentations/logstash-monitorama-2013/pl...,200,7697,"""http://semicomplete.com/presentations/logstas..."
4,83.149.9.216,17/May/2015:10:05:07 +0000,GET /presentations/logstash-monitorama-2013/pl...,200,2892,"""http://semicomplete.com/presentations/logstas..."
5,83.149.9.216,17/May/2015:10:05:34 +0000,GET /presentations/logstash-monitorama-2013/im...,200,430406,"""http://semicomplete.com/presentations/logstas..."
6,83.149.9.216,17/May/2015:10:05:57 +0000,GET /presentations/logstash-monitorama-2013/cs...,200,38720,"""http://semicomplete.com/presentations/logstas..."
7,83.149.9.216,17/May/2015:10:05:50 +0000,GET /presentations/logstash-monitorama-2013/cs...,200,41820,"""http://semicomplete.com/presentations/logstas..."
8,83.149.9.216,17/May/2015:10:05:24 +0000,GET /presentations/logstash-monitorama-2013/im...,200,52878,"""http://semicomplete.com/presentations/logstas..."
9,83.149.9.216,17/May/2015:10:05:50 +0000,GET /presentations/logstash-monitorama-2013/im...,200,321631,"""http://semicomplete.com/presentations/logstas..."


## 404 requests

In [19]:
df = pd.read_sql_query(
    """
    SELECT request_received_time, client_ip, server_status
    FROM apache_logs
    WHERE server_status = '404'
    """, mock_hive_engine)
df.head(10)

Unnamed: 0,request_received_time,client_ip,server_status
0,17/May/2015:10:05:22 +0000,66.249.73.185,404
1,17/May/2015:11:05:05 +0000,208.91.156.11,404
2,17/May/2015:13:05:25 +0000,111.199.235.239,404
3,17/May/2015:13:05:32 +0000,111.199.235.239,404
4,17/May/2015:13:05:04 +0000,208.91.156.11,404
5,17/May/2015:13:05:28 +0000,144.76.194.187,404
6,17/May/2015:13:05:37 +0000,144.76.194.187,404
7,17/May/2015:15:05:00 +0000,208.91.156.11,404
8,17/May/2015:16:05:23 +0000,208.91.156.11,404
9,17/May/2015:16:05:50 +0000,74.208.180.23,404


## distributed files search

In [20]:
!head sample_data/single_log

0.0.0.0 - - [17/May/2015:10:05:03 +0000] "GET /test/request" 200 203023 "http://semicomplete.com/presentations/logstash-monitorama-2013/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"

In [21]:
df = pd.read_sql_query("SELECT * FROM apache_logs WHERE client_ip = '0.0.0.0'", mock_hive_engine)
df.head()

Unnamed: 0,client_ip,request_received_time,client_request,server_status,returned_obj_size,metadata
0,0.0.0.0,17/May/2015:10:05:03 +0000,GET /test/request,200,203023,"""http://semicomplete.com/presentations/logstas..."
1,0.0.0.0,17/May/2015:10:05:03 +0000,GET /test/request,200,203023,"""http://semicomplete.com/presentations/logstas..."


# 2. Analyzing Wikipedia Clickstreams

In [11]:
!wget -O 5036380.tsv.gz https://figshare.com/ndownloader/files/5036380

--2022-01-08 12:12:14--  https://figshare.com/ndownloader/files/5036380
Resolving figshare.com (figshare.com)... 54.76.172.109, 52.210.36.187
Connecting to figshare.com (figshare.com)|54.76.172.109|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/5036380/2015_01_en_clickstream.tsv.gz?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20220108/eu-west-1/s3/aws4_request&X-Amz-Date=20220108T111214Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=ca315e6e1d574d58f41690bd459032e5b8d243d147408e47c9aeb159379be74b [following]
--2022-01-08 12:12:14--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/5036380/2015_01_en_clickstream.tsv.gz?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20220108/eu-west-1/s3/aws4_request&X-Amz-Date=20220108T111214Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=ca315e6e1d574d58f41690bd459032e5b8d243d147408e47c9aeb15

PING wp.pl (212.77.98.9): 56 data bytes
64 bytes from 212.77.98.9: icmp_seq=0 ttl=59 time=21.884 ms
64 bytes from 212.77.98.9: icmp_seq=1 ttl=59 time=20.465 ms
64 bytes from 212.77.98.9: icmp_seq=2 ttl=59 time=17.819 ms
64 bytes from 212.77.98.9: icmp_seq=3 ttl=59 time=17.756 ms
64 bytes from 212.77.98.9: icmp_seq=4 ttl=59 time=18.613 ms
64 bytes from 212.77.98.9: icmp_seq=5 ttl=59 time=29.478 ms
64 bytes from 212.77.98.9: icmp_seq=6 ttl=59 time=20.018 ms
64 bytes from 212.77.98.9: icmp_seq=7 ttl=59 time=15.413 ms
64 bytes from 212.77.98.9: icmp_seq=8 ttl=59 time=16.894 ms
^C

--- wp.pl ping statistics ---
9 packets transmitted, 9 packets received, 0.0% packet loss
round-trip min/avg/max/stddev = 15.413/19.816/29.478/3.883 ms
