# Exercise 03

In [96]:
import pandas as pd
import sqlite3

## 1. Create a connection to the database using the library sqlite3

In [97]:
connection = sqlite3.connect('../data/checking-logs.sqlite')
cursor = connection.cursor()

## 2. Get the schema of the table test

In [98]:
sql_query = 'PRAGMA table_info(test)'

df = pd.read_sql(sql_query, connection)
df

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,uid,TEXT,0,,0
1,1,labname,TEXT,0,,0
2,2,first_commit_ts,TIMESTAMP,0,,0
3,3,first_view_ts,TIMESTAMP,0,,0


## 3. Get only the first 10 rows of the table test to check what the table looks like

In [99]:
sql_query = 'SELECT * FROM test LIMIT 10'

pd.read_sql(sql_query, connection)

Unnamed: 0,uid,labname,first_commit_ts,first_view_ts
0,user_17,project1,2020-04-18 07:56:45.408648,2020-04-18 10:56:55.833899
1,user_30,laba04,2020-04-18 13:36:53.971502,2020-04-17 22:46:26.785035
2,user_30,laba04s,2020-04-18 14:51:37.498399,2020-04-17 22:46:26.785035
3,user_14,laba04,2020-04-18 15:14:00.312338,2020-04-18 10:53:52.623447
4,user_14,laba04s,2020-04-18 22:30:30.247628,2020-04-18 10:53:52.623447
5,user_19,laba04,2020-04-20 19:05:01.297780,2020-04-21 20:30:38.034966
6,user_25,laba04,2020-04-20 19:16:50.673054,2020-05-09 23:54:54.260791
7,user_21,laba04,2020-04-21 17:48:00.487806,2020-04-22 22:40:36.824081
8,user_30,project1,2020-04-22 12:36:24.053518,2020-04-17 22:46:26.785035
9,user_21,laba04s,2020-04-22 20:09:21.857747,2020-04-22 22:40:36.824081


## 4. Find among all the users the minimum value of the delta between the first commit of the user and the deadline of the corresponding lab using only one query


- do this by joining the table with the table deadlines
- the difference should be displayed in hours
- do not take the lab ’project1’ into account, it has longer deadlines and will be an outlier
- the value should be stored in the dataframe df_min with the corresponding uid

In [100]:
sql_query = '''
SELECT uid, MIN(df_min) as df_min
FROM (SELECT uid, CAST((strftime('%s', datetime(dl.deadlines, 'unixepoch')) - strftime('%s', t.first_commit_ts)) / 3600 AS INTEGER) AS df_min
FROM test t
JOIN deadlines dl ON t.labname = dl.labs
WHERE NOT t.labname = 'project1')
'''

df_min = pd.read_sql(sql_query, connection)
df_min

Unnamed: 0,uid,df_min
0,user_25,2


## 5. Do the same thing, but for the maximum, using only one query, the dataframe name is df_max

In [101]:
sql_query = '''
SELECT uid, MAX(df_max) as df_max
FROM (SELECT uid, CAST((strftime('%s', datetime(dl.deadlines, 'unixepoch')) - strftime('%s', t.first_commit_ts)) / 3600 AS INTEGER) AS df_max
FROM test t
JOIN deadlines dl ON t.labname = dl.labs
WHERE NOT t.labname = 'project1')
'''

df_max = pd.read_sql(sql_query, connection)
df_max

Unnamed: 0,uid,df_max
0,user_30,202


## 6. Do the same thing but for the average, using only one query, this time your dataframe should not include the uid column, and the dataframe name is df_avg

In [102]:
sql_query = '''
SELECT AVG(df_avg) as df_avg
FROM (SELECT uid, CAST((strftime('%s', datetime(dl.deadlines, 'unixepoch')) - strftime('%s', t.first_commit_ts)) / 3600 AS INTEGER) AS df_avg
FROM test t
JOIN deadlines dl ON t.labname = dl.labs
WHERE NOT t.labname = 'project1')
'''

df_avg = pd.read_sql(sql_query, connection)
df_avg

Unnamed: 0,df_avg
0,89.125


## 7. We want to test the hypothesis that the users who visited the newsfeed just a few times have the lower delta between the first commit and the deadline. To do this, you need to calculate the correlation coefficient between the number of pageviews and the difference

- Using only one query, create a table with the columns: uid, avg_diff, pageviews

In [103]:
sql_query = '''
SELECT uid, AVG(diff) AS avg_diff, COUNT(uid) AS pageviews
FROM (
SELECT t.uid, CAST((strftime('%s', datetime(dl.deadlines, 'unixepoch')) - strftime('%s', t.first_commit_ts)) / 3600 AS INTEGER) AS diff
FROM test AS t
JOIN deadlines AS dl ON t.labname = dl.labs
WHERE NOT t.labname = 'project1'
)
GROUP BY uid
'''

views_diff = pd.read_sql(sql_query, connection)
views_diff

Unnamed: 0,uid,avg_diff,pageviews
0,user_1,64.4,5
1,user_10,74.8,5
2,user_14,159.0,3
3,user_17,61.6,5
4,user_18,5.666667,3
5,user_19,98.75,4
6,user_21,95.5,4
7,user_25,92.6,5
8,user_28,86.4,5
9,user_3,105.4,5


In [104]:
views_diff.corr(numeric_only=True)

Unnamed: 0,avg_diff,pageviews
avg_diff,1.0,-0.117685
pageviews,-0.117685,1.0


## 8. Close the connection

In [105]:
connection.close()