# Exercise 03. Aggregations

In [329]:
import pandas as pd
import sqlite3

## Create a connection to the database using the `sqlite3` library

In [330]:
# Open the SQLite database file that holds the checking logs; every later
# cell runs its queries through this connection.
con = sqlite3.connect(database="../data/checking-logs.sqlite")

## Get the schema of the `test` table

In [331]:
# PRAGMA table_info returns one row per column of `test`
# (cid, name, type, notnull, dflt_value, pk).
query = "PRAGMA table_info(test);"
df = pd.read_sql(sql=query, con=con)
df

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,uid,TEXT,0,,0
2,2,labname,TEXT,0,,0
3,3,first_commit_ts,TIMESTAMP,0,,0
4,4,first_view_ts,TIMESTAMP,0,,0


## Get only the first ten rows of the `test` table to see what it looks like

In [332]:
# Peek at the first ten rows of `test` to get a feel for the data.
query = """
SELECT *
FROM test
LIMIT 10
"""
df = pd.read_sql(sql=query, con=con)
df

Unnamed: 0,index,uid,labname,first_commit_ts,first_view_ts
0,3,user_17,project1,2020-04-18 07:56:45.408648,2020-04-18 10:56:55.833899
1,4,user_30,laba04,2020-04-18 13:36:53.971502,2020-04-17 22:46:26.785035
2,7,user_30,laba04s,2020-04-18 14:51:37.498399,2020-04-17 22:46:26.785035
3,8,user_14,laba04,2020-04-18 15:14:00.312338,2020-04-18 10:53:52.623447
4,11,user_14,laba04s,2020-04-18 22:30:30.247628,2020-04-18 10:53:52.623447
5,18,user_19,laba04,2020-04-20 19:05:01.297780,2020-04-21 20:30:38.034966
6,19,user_25,laba04,2020-04-20 19:16:50.673054,2020-05-09 23:54:54.260791
7,20,user_21,laba04,2020-04-21 17:48:00.487806,2020-04-22 22:40:36.824081
8,21,user_30,project1,2020-04-22 12:36:24.053518,2020-04-17 22:46:26.785035
9,23,user_21,laba04s,2020-04-22 20:09:21.857747,2020-04-22 22:40:36.824081


## Find the minimum delta (in hours) between the first commit and the deadline of the corresponding lab across all users, using only one query. The dataframe name is `df_min`

In [333]:
# Smallest delta between a user's first commit and the deadline of the same
# lab, excluding 'project1'.
#
# Inner query: strftime('%s', ...) turns the commit timestamp into Unix
# seconds; subtracting `deadlines` and dividing by 3600 expresses the gap in
# hours (presumably `deadlines` is stored as integer Unix seconds, in which
# case the division is integer division -- confirm against the table).
#
# Outer query: an aggregate SELECT without GROUP BY always yields exactly one
# row, so no ORDER BY / LIMIT is needed. The `uid` shown next to MIN(delta)
# relies on SQLite's documented "bare column" rule: with a single MIN() or
# MAX() aggregate, bare columns take their value from the row that holds the
# extreme. This is SQLite-specific and not portable SQL.
query = """
SELECT uid, MIN(delta)
FROM
(
    SELECT uid,
        (CAST(strftime('%s', first_commit_ts) AS INT) - deadlines) / 3600
        AS delta
    FROM test t
    JOIN deadlines dl
    ON t.labname = dl.labs
    WHERE NOT t.labname = 'project1'
)
"""
df_min = pd.read_sql(query, con)
df_min

Unnamed: 0,uid,MIN(delta)
0,user_30,-202


## Do the same thing for the maximum, but use only one query. The dataframe name is `df_max`

In [334]:
# Largest delta between a user's first commit and the deadline of the same
# lab, excluding 'project1'.
#
# Inner query: strftime('%s', ...) turns the commit timestamp into Unix
# seconds; subtracting `deadlines` and dividing by 3600 expresses the gap in
# hours (presumably `deadlines` is stored as integer Unix seconds, in which
# case the division is integer division -- confirm against the table).
#
# Outer query: an aggregate SELECT without GROUP BY always yields exactly one
# row, so sorting the subquery and adding LIMIT 1 had no effect on the result
# (and an ascending sort wrongly suggested the *first* row was being picked).
# The `uid` shown next to MAX(delta) relies on SQLite's documented "bare
# column" rule: with a single MIN() or MAX() aggregate, bare columns take
# their value from the row that holds the extreme. SQLite-specific.
query = """
SELECT uid, MAX(delta)
FROM
(
    SELECT uid,
        (CAST(strftime('%s', first_commit_ts) AS INT) - deadlines) / 3600
        AS delta
    FROM test t
    JOIN deadlines dl
    ON t.labname = dl.labs
    WHERE NOT t.labname = 'project1'
)
"""
df_max = pd.read_sql(query, con)
df_max

Unnamed: 0,uid,MAX(delta)
0,user_25,-2


## Do the same thing, but for the average. Use only one query. This time, your dataframe should not include the uid column. The dataframe name is `df_avg`

In [335]:
# Average delta between first commit and lab deadline over all (user, lab)
# rows, excluding 'project1'. No uid column is selected here.
#
# Inner query: strftime('%s', ...) turns the commit timestamp into Unix
# seconds; subtracting `deadlines` and dividing by 3600 expresses the gap in
# hours (presumably `deadlines` is stored as integer Unix seconds, in which
# case each delta is a whole number of hours before averaging -- confirm).
#
# AVG() does not depend on row order and an aggregate without GROUP BY returns
# a single row, so the subquery needs no ORDER BY and the outer query needs no
# LIMIT.
query = """
SELECT AVG(delta)
FROM
(
    SELECT
        (CAST(strftime('%s', first_commit_ts) AS INT) - deadlines) / 3600
        AS delta
    FROM test t
    JOIN deadlines dl
    ON t.labname = dl.labs
    WHERE NOT t.labname = 'project1'
)
"""
df_avg = pd.read_sql(query, con)
df_avg

Unnamed: 0,AVG(delta)
0,-89.125


## We want to test the hypothesis that users who visited the newsfeed just a few times have a lower delta between the first commit and the deadline. To do this, calculate the correlation coefficient between the number of pageviews and the difference

In [336]:
# Per user: the average commit-to-deadline delta (seconds difference divided
# by 3600, 'project1' excluded) alongside that user's row count in
# `pageviews`, obtained with a correlated subquery.
query = """
SELECT uid,
    AVG((CAST(strftime('%s', first_commit_ts) AS INT) - deadlines) / 3600)
    AS avg_diff,
    (
        SELECT COUNT(*)
        FROM pageviews pv
        WHERE t.uid = pv.uid
    )
    AS pageviews
FROM test t
JOIN deadlines dl
ON t.labname = dl.labs
WHERE NOT t.labname = 'project1'
GROUP BY uid
"""
# Index by uid so that only the two numeric columns take part in the
# correlation matrix below.
views_diff = pd.read_sql(sql=query, con=con).set_index('uid')
print(views_diff)
# Pairwise correlation between avg_diff and pageviews.
views_diff.corr()

           avg_diff  pageviews
uid                           
user_1   -64.400000         28
user_10  -74.800000         89
user_14 -159.000000        143
user_17  -61.600000         47
user_18   -5.666667          3
user_19  -98.750000         16
user_21  -95.500000         10
user_25  -92.600000        179
user_28  -86.400000        149
user_3  -105.400000        317
user_30 -145.250000          3


Unnamed: 0,avg_diff,pageviews
avg_diff,1.0,-0.279736
pageviews,-0.279736,1.0


## Close the connection

In [337]:
# Release the SQLite connection opened at the top of the notebook; any query
# issued through `con` after this point will fail.
con.close()