# Introduction to Big Data Modern Technologies course

## FINAL PROJECT: lab work
### Part 1. Object storage and database pipeline (Serverless)

### 1. Libraries and credentials

In [14]:
!pip install pandas
!pip install clickhouse-connect
!pip install names



In [15]:
import os
import sys
import json
import boto3
import names
import datetime
import clickhouse_connect
import pandas as pd

In [16]:
def access_data(file_path):
    with open(file_path) as file:
        access_data = json.load(file)
    return access_data

creds = access_data(file_path='Selezneva_access_bucket.json')
print(creds.keys())

dict_keys(['aws_access_key_id', 'aws_secret_access_key'])


### 2. Raw data

In [17]:
session = boto3.session.Session()
s3 = session.client(
    service_name='s3',
    aws_access_key_id=creds['aws_access_key_id'],
    aws_secret_access_key=creds['aws_secret_access_key'],
    endpoint_url='https://storage.yandexcloud.net'
)

In [20]:
DATA_BUCKET = 'shelkoviydivan'

In [21]:
s3.download_file(DATA_BUCKET, 'YandexInternalRootCA.crt', 'YandexInternalRootCA.crt')
s3.download_file(DATA_BUCKET, 'access_ch.json', 'access_ch.json')

access_ch = access_data('access_ch.json')
print(access_ch.keys())

dict_keys(['host', 'port', 'dbname', 'user', 'password', 'sslrootcert'])


In [22]:
get_object_response = s3.get_object(
    Bucket=DATA_BUCKET, 
    Key='jhub_logs_large.csv'
)

In [23]:
df = pd.read_csv(get_object_response['Body'], sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699065 entries, 0 to 699064
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        699065 non-null  object
 1   kubernetes  699065 non-null  object
 2   log         699065 non-null  object
 3   stream      699065 non-null  object
 4   time        699065 non-null  object
dtypes: object(5)
memory usage: 26.7+ MB


### 3. Preprocessing data

#### 3.1. Parce the data

In [24]:
def row_info(rin):
    """
    Extracts names of:
      - docker image
      - id of the Jupyter application
      - name of the host, where Jupyter runs
    
    """
    img = rin[rin.find('container_image='):].split('\'')[1]
    hub = rin[rin.find('pod_name='):].split('\'')[1]
    host = rin[rin.find('host='):].split('\'')[1]
    return img, hub, host

In [25]:
def sq_brackets(sin):
    """
    Split log string amd extracts:
      - timestamp of the event
      - name of application
      - type of logs
      - code of event
      - description
    
    """
    try:
        s = sin.split('[', 1)[1].split(']')[0]
        msg = sin[len(s) + 2 :].strip()
        s = s.split()
        head = s[0]
        ts = ' '.join(s[1:3])
        svc = s[3]
        typ = s[4].split(':')[0]
        code = s[4].split(':')[1]
    except:
        head, ts, svc, typ, code = '', '', '', '', ''
        msg = sin
    return head, ts, svc, typ, code, msg

In [26]:
df['img'], df['hub'], df['host'] = zip(*df['kubernetes'].map(row_info))
df.head()

Unnamed: 0,date,kubernetes,log,stream,time,img,hub,host
0,2022-12-09T04:50:48.335844Z,Row(annotations=Row(checksum/config-map='ce892...,[I 2022-12-09 04:50:48.335 JupyterHub log:181]...,stderr,2022-12-09T04:50:48.33584421Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc
1,2022-12-09T04:50:48.359937Z,Row(annotations=Row(checksum/config-map='ce892...,[W 2022-12-09 04:50:48.359 JupyterHub log:181]...,stderr,2022-12-09T04:50:48.359937031Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc
2,2022-12-09T04:50:55.940651Z,Row(annotations=Row(checksum/config-map='ce892...,[I 2022-12-09 04:50:55.940 JupyterHub log:181]...,stderr,2022-12-09T04:50:55.940651688Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc
3,2022-12-09T04:50:55.968410Z,Row(annotations=Row(checksum/config-map='ce892...,[W 2022-12-09 04:50:55.968 JupyterHub log:181]...,stderr,2022-12-09T04:50:55.968410334Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc
4,2022-12-09T04:50:51.758320Z,Row(annotations=Row(checksum/config-map='ce892...,[I 2022-12-09 04:50:51.758 JupyterHub log:181]...,stderr,2022-12-09T04:50:51.758320284Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc


In [27]:
df['head'], df['timestamp'], df['service'], \
    df['event_type'], df['event_code'], df['message'] \
    = zip(*df['log'].map(sq_brackets))
df.head()

Unnamed: 0,date,kubernetes,log,stream,time,img,hub,host,head,timestamp,service,event_type,event_code,message
0,2022-12-09T04:50:48.335844Z,Row(annotations=Row(checksum/config-map='ce892...,[I 2022-12-09 04:50:48.335 JupyterHub log:181]...,stderr,2022-12-09T04:50:48.33584421Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc,I,2022-12-09 04:50:48.335,JupyterHub,log,181,302 GET /utilities/login/index.php -> /hub/uti...
1,2022-12-09T04:50:48.359937Z,Row(annotations=Row(checksum/config-map='ce892...,[W 2022-12-09 04:50:48.359 JupyterHub log:181]...,stderr,2022-12-09T04:50:48.359937031Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc,W,2022-12-09 04:50:48.359,JupyterHub,log,181,404 GET /hub/utilities/login/index.php (@10.11...
2,2022-12-09T04:50:55.940651Z,Row(annotations=Row(checksum/config-map='ce892...,[I 2022-12-09 04:50:55.940 JupyterHub log:181]...,stderr,2022-12-09T04:50:55.940651688Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc,I,2022-12-09 04:50:55.940,JupyterHub,log,181,302 GET /test-output/ -> /hub/test-output/ (@1...
3,2022-12-09T04:50:55.968410Z,Row(annotations=Row(checksum/config-map='ce892...,[W 2022-12-09 04:50:55.968 JupyterHub log:181]...,stderr,2022-12-09T04:50:55.968410334Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc,W,2022-12-09 04:50:55.968,JupyterHub,log,181,404 GET /hub/test-output/ (@10.112.128.1) 1.19ms
4,2022-12-09T04:50:51.758320Z,Row(annotations=Row(checksum/config-map='ce892...,[I 2022-12-09 04:50:51.758 JupyterHub log:181]...,stderr,2022-12-09T04:50:51.758320284Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc,I,2022-12-09 04:50:51.758,JupyterHub,log,181,302 GET /admin.pl -> /hub/admin.pl (@10.112.12...


#### 3.2. Parce users' activities

In [28]:
def parce_users_activities(row):
    """
    Ugly function.
    
    You may use dictionary to make it
    more pythonic or something else.
    
    """
    code = row['event_code']
    msg = row['message']
    if code == '43':
        user = msg.split()[-1]
        log = 'logged out'
    elif code == '757':
        user = msg.split()[-1]
        log = 'logged in'
    elif code == '402':
        user = msg.split()[0]
        log = 'pending spawn'
    elif code == '1875':
        user = msg.split()[4].replace('claim-', '').replace(',', '')
        log = 'attempt to create pvc with timeout'
    elif code == '1887':
        user = msg.split()[1].replace('claim-', '')
        log = 'pvc already exists'
    elif code == '1840':
        user = msg.split()[4].replace('jupyter-', '').replace(',', '')
        log = 'attempting to create pod with timeout'
    elif code == '1344':
        user = msg.split('/')[3]
        log = 'failing suspected api request to not-running server'
    elif code == '380':
        user = msg.split()[3]
        log = 'previous spawn failed'
    elif code == '567':
        user = msg.split('/')[4]
        log = 'stream closed while handling '
    elif code == '681':
        user = msg.split()[0].replace('\'s', '')
        log = 'server failed to start'
    elif code == '1997':
        user = msg.split('-')[-1]
        log = 'deleting pod'
    elif code == '689':
        user = msg.split()[3].replace('\'s', '')
        log = 'unhandled error starting with timeout'
    elif code == '1961' or code == '2044':
        user = msg.split()[1].replace('jupyter-', '')
        log = 'restarting pod reflector'
    elif code == '257':
        user = msg.split()[2]
        log = 'adding user to proxy'
    elif code == '664':
        user = msg.split()[1]
        log = 'server is ready'
    elif code == '61' or code == '85':
        user = msg.split()[3]
        log = 'spawning sever with advanced configuration option'
    elif code == '1143':
        user = msg.split()[1].replace(':', '')
        log = 'server is slow to stop'
    elif code == '2077':
        user = msg.split()[0]
        log = 'still running'
    elif code == '167':
        user = msg.split()[1]
        log = 'server is already active'
    elif code == '1067' or code == '2022':
        user = msg.split()[1]
        log = 'user server stopped with exit code 1'
    elif code == '1857':
        user = msg.split()[3].replace('jupyter-', '').replace(',', '')
        log = 'found existing pod and attempting to kill'
    elif code == '1861':
        user = msg.split()[2].replace('jupyter-', '').replace(',', '')
        log = 'killed pod and will try starting singleuser pod again'
    elif code == '738':
        user = msg.split()[0].replace(',', '').replace('\'s', '')
        log = 'server never showed up and giving up'
    elif code == '2069':
        user = msg.split()[0].replace(',', '')
        log = 'user does not appear to be running and shutting it down'  
    elif code == '148':
        user = msg.split()[-1]
        log = 'user is running'
    elif code == '1415':
        user = msg.split()[-1]
        log = 'admin requesting spawn on behalf'
    elif code == '1437':
        user = msg.split()[5].replace(',', '')
        log = 'user requested server which user do not own'
    elif code == '626':
        user = msg.split()[1]
        log = 'server is already started'
    elif code == '2085':
        user = msg.split()[0]
        log = 'server appears to have stopped while the hub was down'
    else:
        user, log = '', ''
    return user, log

In [29]:
df['user'], df['log'] = zip(*df.apply(parce_users_activities, axis=1))
df.head()

Unnamed: 0,date,kubernetes,log,stream,time,img,hub,host,head,timestamp,service,event_type,event_code,message,user
0,2022-12-09T04:50:48.335844Z,Row(annotations=Row(checksum/config-map='ce892...,,stderr,2022-12-09T04:50:48.33584421Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc,I,2022-12-09 04:50:48.335,JupyterHub,log,181,302 GET /utilities/login/index.php -> /hub/uti...,
1,2022-12-09T04:50:48.359937Z,Row(annotations=Row(checksum/config-map='ce892...,,stderr,2022-12-09T04:50:48.359937031Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc,W,2022-12-09 04:50:48.359,JupyterHub,log,181,404 GET /hub/utilities/login/index.php (@10.11...,
2,2022-12-09T04:50:55.940651Z,Row(annotations=Row(checksum/config-map='ce892...,,stderr,2022-12-09T04:50:55.940651688Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc,I,2022-12-09 04:50:55.940,JupyterHub,log,181,302 GET /test-output/ -> /hub/test-output/ (@1...,
3,2022-12-09T04:50:55.968410Z,Row(annotations=Row(checksum/config-map='ce892...,,stderr,2022-12-09T04:50:55.968410334Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc,W,2022-12-09 04:50:55.968,JupyterHub,log,181,404 GET /hub/test-output/ (@10.112.128.1) 1.19ms,
4,2022-12-09T04:50:51.758320Z,Row(annotations=Row(checksum/config-map='ce892...,,stderr,2022-12-09T04:50:51.758320284Z,jupyterhub/k8s-hub:0.11.1,hub-5c66c6c96f-p5xcc,cl1flrrk4hvdbm084md4-ahoc,I,2022-12-09 04:50:51.758,JupyterHub,log,181,302 GET /admin.pl -> /hub/admin.pl (@10.112.12...,


In [30]:
df = df.loc[df.user != '', [
    'timestamp',
    'hub',
    'img',
    'host',
    'event_code',
    'event_type',
    'log',
    'user'
]].reset_index(drop=True)

df.head()

Unnamed: 0,timestamp,hub,img,host,event_code,event_type,log,user
0,2023-02-13 09:30:12.727,hub-57c88d997b-xh654,jupyterhub/k8s-hub:0.11.1,cl1flrrk4hvdbm084md4-elef,257,proxy,adding user to proxy,st107874
1,2023-02-13 09:30:12.729,hub-57c88d997b-xh654,jupyterhub/k8s-hub:0.11.1,cl1flrrk4hvdbm084md4-elef,664,users,server is ready,st107874
2,2023-02-13 09:30:12.730,hub-57c88d997b-xh654,jupyterhub/k8s-hub:0.11.1,cl1flrrk4hvdbm084md4-elef,664,users,server is ready,st107874
3,2023-02-13 09:30:13.534,hub-57c88d997b-xh654,jupyterhub/k8s-hub:0.11.1,cl1flrrk4hvdbm084md4-elef,257,proxy,adding user to proxy,st112224
4,2023-02-13 09:30:13.536,hub-57c88d997b-xh654,jupyterhub/k8s-hub:0.11.1,cl1flrrk4hvdbm084md4-elef,664,users,server is ready,st112224


### 4. Normalize data

#### 4.1. Users table

In [31]:
logins = df.user.unique()
print(len(logins))

165


In [32]:
users = []
for login in logins:
    user = {}
    user['login'] = login
    user['name'] = names.get_full_name()
    user['email'] = login + '@gsom.spbu.ru'
    users.append(user)

In [33]:
df_users = pd.DataFrame(users)
df_users.head()

Unnamed: 0,login,name,email
0,st107874,Brandon Kuhns,st107874@gsom.spbu.ru
1,st112224,Peter Thornhill,st112224@gsom.spbu.ru
2,st112364,Steve Kicker,st112364@gsom.spbu.ru
3,st107860,Alexandria Culhane,st107860@gsom.spbu.ru
4,st105940,Milton Lekan,st105940@gsom.spbu.ru


In [34]:
client = clickhouse_connect.get_client(
    host=access_ch['host'], 
    username=access_ch['user'], 
    password=access_ch['password'],
    port=access_ch['port'],
    verify=f'{access_ch["sslrootcert"]}',
    #verify=f'/home/jovyan/ibdt_course_miba_23/topics_labs/{access_ch["sslrootcert"]}'
)

In [35]:
result = client.query('DROP TABLE IF EXISTS db1.users')
query = '''
CREATE TABLE IF NOT EXISTS db1.users
(
    email String,
    login String,
    name String
) ENGINE = MergeTree
ORDER BY email;
'''
result = client.query(query)
result.result_rows

[['']]

In [36]:
client.insert_df(
    'db1.users',
    df_users
)

In [37]:
result = client.query(
    "SELECT * FROM db1.users LIMIT 5"
)
result.result_rows

[('aasoloviev@gsom.spbu.ru', 'aasoloviev', 'Lenny Artrip'),
 ('ab2216206@gsom.spbu.ru', 'ab2216206', 'Guy Shaw'),
 ('ab2219090@gsom.spbu.ru', 'ab2219090', 'Robert Gomez'),
 ('abulatov@gsom.spbu.ru', 'abulatov', 'Susan Richard'),
 ('albulatov@gsom.spbu.ru', 'albulatov', 'Laura Mickolick')]

#### 4.2. JupyterHub instances table

In [38]:
df_instances = df[[
    'hub',
    'img',
    'host'
]].reset_index(drop=True)

In [39]:
df_instances.drop_duplicates(inplace=True)
df_instances.reset_index(drop=True, inplace=True)
df_instances.head()

Unnamed: 0,hub,img,host
0,hub-57c88d997b-xh654,jupyterhub/k8s-hub:0.11.1,cl1flrrk4hvdbm084md4-elef
1,hub-5bb9b9c56c-s958d,jupyterhub/k8s-hub:0.11.1,cl1flrrk4hvdbm084md4-ahoc
2,hub-d9c54fff9-tzxn2,jupyterhub/k8s-hub:0.11.1,cl1flrrk4hvdbm084md4-eqix
3,hub-5bb9b9c56c-5k7nq,jupyterhub/k8s-hub:0.11.1,cl1flrrk4hvdbm084md4-akec
4,hub-864f997456-fxk67,jupyterhub/k8s-hub:0.11.1,cl1flrrk4hvdbm084md4-eqix


In [40]:
result = client.query('DROP TABLE IF EXISTS db1.instances')
query = '''
CREATE TABLE IF NOT EXISTS db1.instances
(
    hub String,
    img String,
    host String
) ENGINE = MergeTree
ORDER BY hub;
'''
result = client.query(query)
result.result_rows

[['']]

In [41]:
client.insert_df(
    'db1.instances',
    df_instances
)

In [42]:
result = client.query(
    "SELECT * FROM db1.instances LIMIT 5"
)
result.result_rows

[('hub-56bbc6d5f7-5wsph',
  'jupyterhub/k8s-hub:0.11.1',
  'cl1flrrk4hvdbm084md4-ahoc'),
 ('hub-56bbc6d5f7-rgsct',
  'jupyterhub/k8s-hub:0.11.1',
  'cl1flrrk4hvdbm084md4-ahoc'),
 ('hub-57c88d997b-xh654',
  'jupyterhub/k8s-hub:0.11.1',
  'cl1flrrk4hvdbm084md4-elef'),
 ('hub-58f6d59b46-jfwm9',
  'jupyterhub/k8s-hub:0.11.1',
  'cl1flrrk4hvdbm084md4-ahoc'),
 ('hub-59778cfbc5-kb9tb',
  'jupyterhub/k8s-hub:0.11.1',
  'cl1flrrk4hvdbm084md4-elef')]

#### 4.4. JupyterHub logs table

In [43]:
df_logs = df[[
    'timestamp',
    'hub',
    'event_code',
    'event_type',
    'log',
    'user'
]].reset_index(drop=True)

In [44]:
df_logs.rename({'user': 'login'}, axis='columns', inplace=True)

In [45]:
result = client.query('DROP TABLE IF EXISTS db1.logs')
query = '''
CREATE TABLE IF NOT EXISTS db1.logs
(
    timestamp String,
    hub String,
    event_code String,
    event_type String,
    log String,
    login String
) ENGINE = MergeTree
ORDER BY hub;
'''
result = client.query(query)
result.result_rows

[['']]

In [46]:
client.insert_df(
    'db1.logs',
    df_logs
)

In [47]:
result = client.query(
    "SELECT * FROM db1.logs LIMIT 5"
)
result.result_rows

[('2022-11-15 10:02:17.181',
  'hub-56bbc6d5f7-5wsph',
  '1344',
  'base',
  'failing suspected api request to not-running server',
  'st061467'),
 ('2022-11-15 10:02:17.183',
  'hub-56bbc6d5f7-5wsph',
  '1344',
  'base',
  'failing suspected api request to not-running server',
  'st061467'),
 ('2022-11-15 10:02:17.238',
  'hub-56bbc6d5f7-5wsph',
  '1344',
  'base',
  'failing suspected api request to not-running server',
  'st061467'),
 ('2022-11-15 10:02:17.309',
  'hub-56bbc6d5f7-5wsph',
  '1344',
  'base',
  'failing suspected api request to not-running server',
  'st061467'),
 ('2022-11-15 10:02:28.368',
  'hub-56bbc6d5f7-5wsph',
  '1344',
  'base',
  'failing suspected api request to not-running server',
  'st061467')]

### 5. Drop everything (if necessary)

In [48]:
client.query('DROP TABLE IF EXISTS db1.users')
client.query('DROP TABLE IF EXISTS db1.instances')
client.query('DROP TABLE IF EXISTS db1.logs')

<clickhouse_connect.driver.query.QueryResult at 0x7f9706ca20d0>