In [1]:
import pandas as pd
import sqlite3

## Create a connection to the database using the library `sqlite3`

In [2]:
# Location of the SQLite database, relative to the notebook's working directory.
DB_PATH = '../data/checking-logs.sqlite'

# Single shared connection used by every query cell below.
con = sqlite3.connect(DB_PATH)

## Using only one query for each of the groups, create two dataframes: `test_results` and `control_results` with the columns `time` and `avg_diff` and only two rows

- `time` should have the values: `after` and `before`

- `avg_diff` contains the average delta (the number of hours between a first commit and the lab deadline) across all users, computed separately for the period before each user's first visit to the page and for the period after it

- only take into account the users that have observations before and after

- we are still not using the lab `'project1'`

In [3]:
# Test group: average delta (deadline minus first commit, divided by 3600)
# for first commits made 'before' vs. 'after' the user's first page view.
#
# * Inner query `tmp`: one row per `test` row (excluding 'project1') with its
#   delta and a 'before'/'after' label. Anything that is not strictly
#   "commit < view" is labelled 'after' by the `else` branch.
# * Outer filter: keep only users that have at least one commit strictly
#   before AND at least one commit strictly after their first view.
#   `exists` expresses "at least one matching row" directly and can stop at the
#   first match, unlike `count(*) > 0`; the selected rows are the same.
# * NOTE(review): strftime('%s', ...) yields text that SQLite coerces to a
#   number; if `deadlines.deadlines` is stored as an integer the `/ 3600` is an
#   integer division (whole hours) — confirm this is intended.
test_results = pd.read_sql(
    """
    select
        tmp.time,
        avg(tmp.diff) as avg_diff
    from (
        select
            test.uid,
            (deadlines.deadlines - strftime('%s', test.first_commit_ts)) / 3600 as diff,
            case
                when test.first_commit_ts < test.first_view_ts then 'before'
                else 'after'
            end as time
        from test
        left join deadlines
            on test.labname = deadlines.labs
        where test.labname <> 'project1'
    ) as tmp
    -- the user has at least one commit before the first page view ...
    where exists (
        select 1
        from test as earlier
        where earlier.uid = tmp.uid
          and earlier.labname <> 'project1'
          and earlier.first_commit_ts < earlier.first_view_ts
    )
    -- ... and at least one commit after it
    and exists (
        select 1
        from test as later
        where later.uid = tmp.uid
          and later.labname <> 'project1'
          and later.first_commit_ts > later.first_view_ts
    )
    group by tmp.time
    """,
    con,
)

In [4]:
# Display the test-group result: one row per `time` value with its `avg_diff`.
test_results

Unnamed: 0,time,avg_diff
0,after,104.6
1,before,60.5625


In [5]:
# Control group: average delta (deadline minus first commit, divided by 3600)
# for first commits made 'before' vs. 'after' the `first_view_ts` stored in
# the `control` table.
#
# * Inner query `tmp`: one row per `control` row (excluding 'project1') with
#   its delta and a 'before'/'after' label. Anything that is not strictly
#   "commit < view" is labelled 'after' by the `else` branch.
# * Outer filter: keep only users that have at least one commit strictly
#   before AND at least one commit strictly after `first_view_ts`.
#   `exists` expresses "at least one matching row" directly and can stop at the
#   first match, unlike `count(*) > 0`; the selected rows are the same.
# * NOTE(review): strftime('%s', ...) yields text that SQLite coerces to a
#   number; if `deadlines.deadlines` is stored as an integer the `/ 3600` is an
#   integer division (whole hours) — confirm this is intended.
control_results = pd.read_sql(
    """
    select
        tmp.time,
        avg(tmp.diff) as avg_diff
    from (
        select
            control.uid,
            (deadlines.deadlines - strftime('%s', control.first_commit_ts)) / 3600 as diff,
            case
                when control.first_commit_ts < control.first_view_ts then 'before'
                else 'after'
            end as time
        from control
        left join deadlines
            on control.labname = deadlines.labs
        where control.labname <> 'project1'
    ) as tmp
    -- the user has at least one commit before first_view_ts ...
    where exists (
        select 1
        from control as earlier
        where earlier.uid = tmp.uid
          and earlier.labname <> 'project1'
          and earlier.first_commit_ts < earlier.first_view_ts
    )
    -- ... and at least one commit after it
    and exists (
        select 1
        from control as later
        where later.uid = tmp.uid
          and later.labname <> 'project1'
          and later.first_commit_ts > later.first_view_ts
    )
    group by tmp.time
    """,
    con,
)

In [6]:
# Display the control-group result: one row per `time` value with its `avg_diff`.
control_results

Unnamed: 0,time,avg_diff
0,after,117.636364
1,before,99.464286


## Close the connection

In [7]:
# Both dataframes are already loaded into memory, so the SQLite connection
# is no longer needed and can be released.
con.close()

## Answer the question: did the hypothesis turn out to be true, i.e. does the page affect the students' behavior?

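#### In the test group the average delta grew by ~44 hours after the first page view (60.6 → 104.6), while in the control group it grew by only ~18 hours (99.5 → 117.6), so the hypothesis turned out to be true: the page does affect the students' behavior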