In [1]:
import pandas as pd
import sqlite3

## Create a connection to the database using the library `sqlite3`

In [2]:
# Open the course database read-only through a SQLite URI.
# A plain `sqlite3.connect(path)` silently creates a new, empty database when
# the file does not exist, so a wrong path would only surface later as missing
# tables. With `mode=ro` a missing file raises `sqlite3.OperationalError` here,
# and the notebook (which only reads) cannot modify the data by accident.
con = sqlite3.connect('file:../data/checking-logs.sqlite?mode=ro', uri=True)

## Get the schema of the table `test`

In [3]:
# Column metadata of `test` (one row per column), indexed by the column id.
pd.read_sql(
    sql="""
    pragma table_info(test)
    """,
    con=con,
    index_col='cid',
)

Unnamed: 0_level_0,name,type,notnull,dflt_value,pk
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,index,INTEGER,0,,0
1,uid,TEXT,0,,0
2,labname,TEXT,0,,0
3,first_commit_ts,TIMESTAMP,0,,0
4,first_view_ts,TIMESTAMP,0,,0


In [4]:
# Column metadata of `deadlines` (one row per column), indexed by the column id.
pd.read_sql(
    sql="""
    pragma table_info(deadlines)
    """,
    con=con,
    index_col='cid',
)

Unnamed: 0_level_0,name,type,notnull,dflt_value,pk
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,index,INTEGER,0,,0
1,labs,TEXT,0,,0
2,deadlines,INTEGER,0,,0


## Get only the first 10 rows of the table `test` to check what the table looks like

In [5]:
# Peek at the first rows of `test`; the stored `index` column becomes the
# dataframe index.
pd.read_sql(
    sql="""
    select *
    from test
    limit 10
    """,
    con=con,
    index_col='index',
)

Unnamed: 0_level_0,uid,labname,first_commit_ts,first_view_ts
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,user_1,laba04,2020-04-26 17:06:18.462708,2020-04-26 21:53:59.624136
1,user_1,laba04s,2020-04-26 17:12:11.843671,2020-04-26 21:53:59.624136
2,user_1,laba05,2020-05-02 19:15:18.540185,2020-04-26 21:53:59.624136
3,user_1,laba06,2020-05-17 16:26:35.268534,2020-04-26 21:53:59.624136
4,user_1,laba06s,2020-05-20 12:23:37.289724,2020-04-26 21:53:59.624136
5,user_1,project1,2020-05-14 20:56:08.898880,2020-04-26 21:53:59.624136
6,user_10,laba04,2020-04-25 08:24:52.696624,2020-04-18 12:19:50.182714
7,user_10,laba04s,2020-04-25 08:37:54.604222,2020-04-18 12:19:50.182714
8,user_10,laba05,2020-05-01 19:27:26.063245,2020-04-18 12:19:50.182714
9,user_10,laba06,2020-05-19 11:39:28.885637,2020-04-18 12:19:50.182714


In [6]:
# Peek at `deadlines`. The `deadlines` column is declared INTEGER in the
# schema (presumably Unix epoch seconds), so `parse_dates` is used to show it
# as timestamps.
pd.read_sql(
    sql="""
    select *
    from deadlines
    limit 10
    """,
    con=con,
    index_col='index',
    parse_dates=['deadlines'],
)

Unnamed: 0_level_0,labs,deadlines
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,laba04,2020-04-26 23:59:59
1,laba04s,2020-04-26 23:59:59
2,laba05,2020-05-03 23:59:59
4,laba06,2020-05-24 23:59:59
5,laba06s,2020-05-24 23:59:59
3,project1,2020-05-16 23:59:59


## Find among all the users the minimum value of the delta between the first commit of the user and the deadline of the corresponding lab using only one query

- do this by joining the table `test` with the table `deadlines`

- the difference should be displayed in hours

- do not take the lab `'project1'` into account, it has longer deadlines and will be an outlier

- the value should be stored in the dataframe `df_min` with the corresponding `uid`

In [7]:
# Smallest gap, in hours, between a lab deadline and the first commit, over
# all rows of `test` except `project1`.
# - `strftime('%s', ...)` converts the commit timestamp to epoch seconds so it
#   can be subtracted from the integer deadline; `/ 3600` turns seconds into
#   hours.
# - `uid` is a bare (non-aggregated) column: with a single `min()` aggregate
#   SQLite takes it from the row that holds the minimum. This is
#   SQLite-specific behaviour.
df_min = pd.read_sql(
    sql="""
    select
        test.uid,
        min((deadlines.deadlines - strftime('%s', test.first_commit_ts)) / 3600) as diff
    from test
    left join deadlines
    on test.labname = deadlines.labs
    where test.labname <> 'project1'
    """,
    con=con,
)

In [8]:
# Display the user with the minimum delta and the delta itself (in hours).
df_min

Unnamed: 0,uid,diff
0,user_25,2


## Do the same thing, but for the maximum, using only one query, the dataframe name is `df_max`

In [9]:
# Largest gap, in hours, between a lab deadline and the first commit, over
# all rows of `test` except `project1`.
# - `strftime('%s', ...)` converts the commit timestamp to epoch seconds so it
#   can be subtracted from the integer deadline; `/ 3600` turns seconds into
#   hours.
# - `uid` is a bare (non-aggregated) column: with a single `max()` aggregate
#   SQLite takes it from the row that holds the maximum. This is
#   SQLite-specific behaviour.
df_max = pd.read_sql(
    sql="""
    select
        test.uid,
        max((deadlines.deadlines - strftime('%s', test.first_commit_ts)) / 3600) as diff
    from test
    left join deadlines
    on test.labname = deadlines.labs
    where test.labname <> 'project1'
    """,
    con=con,
)

In [10]:
# Display the user with the maximum delta and the delta itself (in hours).
df_max

Unnamed: 0,uid,diff
0,user_30,202


## Do the same thing but for the average, using only one query, this time your dataframe should not include the `uid` column, and the dataframe name is `df_avg`

In [11]:
# Average gap, in hours, between a lab deadline and the first commit, over
# all rows of `test` except `project1`; no `uid` column is selected.
# NOTE(review): the min/max queries return whole numbers, so `/ 3600` appears
# to be integer division here; each delta would then be truncated to whole
# hours before averaging. Confirm this is intended.
df_avg = pd.read_sql(
    sql="""
    select
        avg((deadlines.deadlines - strftime('%s', test.first_commit_ts)) / 3600) as diff
    from test
    left join deadlines
    on test.labname = deadlines.labs
    where test.labname <> 'project1'
    """,
    con=con,
)

In [12]:
# Display the average delta (in hours).
df_avg

Unnamed: 0,diff
0,89.125


## We want to test the hypothesis that users who visited the newsfeed only a few times have a lower delta between the first commit and the deadline. To do this, you need to calculate the correlation coefficient between the number of pageviews and the difference

- using only one query, create a table with the columns: `uid`, `avg_diff`, `pageviews`

- `uid` contains the uids that exist in the table `test`

- `avg_diff` is the average delta between the first commit and the lab deadline per user

- `pageviews` is the number of Newsfeed visits per user

- do not take the lab `'project1'` into account

- store it to the dataframe `views_diff`

- use the `pandas` method `corr()` to calculate the correlation coefficient between the number of pageviews and the difference

In [13]:
# Per-user table for the correlation check, built in a single query:
# - `diff`: average deadline-minus-first-commit delta (hours) per `uid` from
#   `test`, with `project1` excluded;
# - `views`: number of rows per `uid` in the `pageviews` table.
# `diff` is the left side of the join, so every user with a delta is kept even
# when no pageviews row matches (their `pageviews` would then be NULL).
views_diff = pd.read_sql(
    sql="""
    select
        diff.uid,
        avg_diff,
        pageviews
    from (
        select
            uid,
            avg((deadlines.deadlines - strftime('%s', test.first_commit_ts)) / 3600) as avg_diff
        from test
        left join deadlines
        on test.labname = deadlines.labs
        where test.labname <> 'project1'
        group by uid
    ) as diff
    left join (
        select
            uid,
            count(*) as pageviews
        from pageviews
        group by uid
    ) as views
    on diff.uid = views.uid
    """,
    con=con,
)

In [14]:
# Display the per-user average delta next to the pageview count.
views_diff

Unnamed: 0,uid,avg_diff,pageviews
0,user_1,64.4,28
1,user_10,74.8,89
2,user_14,159.0,143
3,user_17,61.6,47
4,user_18,5.666667,3
5,user_19,98.75,16
6,user_21,95.5,10
7,user_25,92.6,179
8,user_28,86.4,149
9,user_3,105.4,317


In [15]:
# Correlation matrix between the average delta and the number of pageviews.
# Only the two numeric columns are passed to `corr()`: `uid` holds strings,
# and since pandas 2.0 `DataFrame.corr()` no longer drops non-numeric columns
# silently but raises on them. Selecting the columns explicitly gives the same
# matrix on every pandas version.
views_diff[['avg_diff', 'pageviews']].corr()

Unnamed: 0,avg_diff,pageviews
avg_diff,1.0,0.279736
pageviews,0.279736,1.0


## Close the connection

In [16]:
# Release the SQLite connection now that all queries have been run.
con.close()