In [1]:
import pandas as pd
import numpy as np

#### Подключение к бд и заливка данных

In [2]:
import sqlalchemy
import pyodbc
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Open the notebook-wide pyodbc connection via the ODBC data source "TestDB".
# The connection string uses Trusted_Connection=yes, so no user name or
# password is embedded here. Every later cell reuses this `conn` object.
conn = pyodbc.connect('DSN=TestDB;Trusted_Connection=yes;')

In [4]:
def select(sql):
    """Run ``sql`` on the notebook-wide ``conn`` and return the rows as a DataFrame.

    Parameters
    ----------
    sql : str
        Query text handed unchanged to ``pd.read_sql``.

    Returns
    -------
    pandas.DataFrame
        One row per result row, columns named after the result set.
    """
    frame = pd.read_sql(sql, con=conn)
    return frame

#### Нарастающий итог:

In [5]:
# Three consecutive days of revenue for the first running-total example.
revenue_days = ['2021-04-01', '2021-04-02', '2021-04-03']
t = pd.DataFrame({
    'dt': pd.to_datetime(revenue_days, format='%Y-%m-%d'),
    'revenue': [1, 2, 3],
})

In [6]:
# Rebuild the `revenue` table from scratch and load every row of `t` into it.
sql = '''
drop table if exists revenue;
CREATE TABLE revenue (
    dt        datetime,
    revenue   int
);
'''
# Pick the columns explicitly so each tuple lines up with the INSERT's
# parameter order regardless of the frame's column order.
rows = list(t[['dt', 'revenue']].itertuples(index=False, name=None))
cur = conn.cursor()
try:
    cur.execute(sql)
    conn.commit()
    if rows:  # pyodbc's executemany rejects an empty parameter list
        cur.executemany('''INSERT INTO revenue(
                    [dt],[revenue]
                    ) 
                    values (?,?)
    ''', rows)
    conn.commit()
finally:
    # Release the cursor even if the DDL or an INSERT raises.
    cur.close()
# Read the table back to confirm what was loaded.
sql = '''select * from revenue t'''
select(sql)

Unnamed: 0,dt,revenue
0,2021-04-01,1
1,2021-04-02,2
2,2021-04-03,3


In [7]:
# Running total without window functions: every row of `revenue` (alias t) is
# joined to all rows whose dt is earlier or equal (alias r), and r.revenue is
# summed per t row. The query has no ORDER BY, so row order is not guaranteed.
sql = '''select t.dt,t.revenue, 
sum(r.revenue) as cumsum
from revenue t
join revenue r on r.dt <= t.dt 
group by t.dt, t.revenue
'''

In [8]:
# Execute the self-join query; for revenues 1, 2, 3 the cumsum column is 1, 3, 6.
select(sql)

Unnamed: 0,dt,revenue,cumsum
0,2021-04-01,1,1
1,2021-04-02,2,3
2,2021-04-03,3,6


# 7. Оконные функции

## 1. Что такое оконная функция

### Нарастающий итог:

In [9]:
# Same running total expressed with a window function: sum(revenue) over rows
# ordered by dt, with no partitioning (one window over the whole table).
sql = '''select t.*,
sum(t.revenue) over (order by t.dt) as cum_sum
from revenue t'''

In [10]:
# Execute the window-function query; cum_sum matches the self-join result (1, 3, 6).
select(sql)

Unnamed: 0,dt,revenue,cum_sum
0,2021-04-01,1,1
1,2021-04-02,2,3
2,2021-04-03,3,6


In [11]:
# The same three days for two users, so totals can be computed per user_id.
april_days = ['2021-04-01', '2021-04-02', '2021-04-03']
t = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2, 2],
    'dt': pd.to_datetime(april_days * 2, format='%Y-%m-%d'),
    'revenue': [1, 2, 3, 2, 3, 4],
})

In [12]:
# Rebuild the `revenue` table (now with user_id) and load every row of `t`.
sql = '''
drop table if exists revenue;
CREATE TABLE revenue (
    user_id   int,
    dt        datetime,
    revenue   int
);
'''
# Pick the columns explicitly so each tuple lines up with the INSERT's
# parameter order regardless of the frame's column order.
rows = list(t[['user_id', 'dt', 'revenue']].itertuples(index=False, name=None))
cur = conn.cursor()
try:
    cur.execute(sql)
    conn.commit()
    if rows:  # pyodbc's executemany rejects an empty parameter list
        cur.executemany('''INSERT INTO revenue(
                    [user_id],[dt],[revenue]
                    ) 
                    values (?,?,?)
    ''', rows)
    conn.commit()
finally:
    # Release the cursor even if the DDL or an INSERT raises.
    cur.close()
# Read the table back to confirm what was loaded.
sql = '''select * from revenue t'''
select(sql)

Unnamed: 0,user_id,dt,revenue
0,1,2021-04-01,1
1,1,2021-04-02,2
2,1,2021-04-03,3
3,2,2021-04-01,2
4,2,2021-04-02,3
5,2,2021-04-03,4


In [13]:
# Running total per user: PARTITION BY user_id restarts the sum for each user,
# ORDER BY dt accumulates it in date order inside the partition.
sql = '''select t.*,
sum(t.revenue) over (partition by t.user_id order by t.dt) as cum_sum
from revenue t'''

In [14]:
# Execute the per-user running total; cum_sum restarts at each user's first row.
select(sql)

Unnamed: 0,user_id,dt,revenue,cum_sum
0,1,2021-04-01,1,1
1,1,2021-04-02,2,3
2,1,2021-04-03,3,6
3,2,2021-04-01,2,2
4,2,2021-04-02,3,5
5,2,2021-04-03,4,9


## 2. rank и row_number

In [15]:
# (user_id, day, revenue) records; user 1 has two rows on 2021-04-03 so that
# rank() and row_number() behave differently on the tie.
activity = [
    (1, '2021-04-01', 1),
    (1, '2021-04-02', 2),
    (1, '2021-04-03', 3),
    (1, '2021-04-03', 1),
    (2, '2021-04-03', 2),
    (2, '2021-04-04', 3),
    (2, '2021-04-05', 4),
]
user_ids, days, amounts = zip(*activity)
t = pd.DataFrame({
    'user_id': list(user_ids),
    'dt': pd.to_datetime(list(days), format='%Y-%m-%d'),
    'revenue': list(amounts),
})

In [16]:
# Rebuild the `revenue` table and load the rows of `t` (including the tied dates).
sql = '''
drop table if exists revenue;
CREATE TABLE revenue (
    user_id   int,
    dt        datetime,
    revenue   int
);
'''
# Pick the columns explicitly so each tuple lines up with the INSERT's
# parameter order regardless of the frame's column order.
rows = list(t[['user_id', 'dt', 'revenue']].itertuples(index=False, name=None))
cur = conn.cursor()
try:
    cur.execute(sql)
    conn.commit()
    if rows:  # pyodbc's executemany rejects an empty parameter list
        cur.executemany('''INSERT INTO revenue(
                    [user_id],[dt],[revenue]
                    ) 
                    values (?,?,?)
    ''', rows)
    conn.commit()
finally:
    # Release the cursor even if the DDL or an INSERT raises.
    cur.close()
# Read the table back to confirm what was loaded.
sql = '''select * from revenue t'''
select(sql)

Unnamed: 0,user_id,dt,revenue
0,1,2021-04-01,1
1,1,2021-04-02,2
2,1,2021-04-03,3
3,1,2021-04-03,1
4,2,2021-04-03,2
5,2,2021-04-04,3
6,2,2021-04-05,4


### последняя дата активности каждого пользователя:

***rank():***

In [17]:
# rank() per user over dt descending: the latest date gets rank 1, tied dates
# share a rank and the following rank is skipped.
sql = '''select t.*,
rank() over (partition by t.user_id order by t.dt desc) as rnk
from revenue t
'''

In [18]:
# Execute the rank() query; user 1's two rows on 2021-04-03 both get rnk 1 and
# the next row gets rnk 3.
select(sql)

Unnamed: 0,user_id,dt,revenue,rnk
0,1,2021-04-03,3,1
1,1,2021-04-03,1,1
2,1,2021-04-02,2,3
3,1,2021-04-01,1,4
4,2,2021-04-05,4,1
5,2,2021-04-04,3,2
6,2,2021-04-03,2,3


In [19]:
# Keep only rows on each user's latest date: rank the rows in a CTE, then
# filter rnk = 1. With rank(), every row tied on the latest date is returned.
sql = '''with 
dt_rank as (
    select t.*,
    rank() over (partition by t.user_id order by t.dt desc) as rnk
    from revenue t
)
select * from dt_rank t
where t.rnk = 1
'''

In [20]:
# Execute the filter; user 1 comes back twice because of the tie on 2021-04-03.
select(sql)

Unnamed: 0,user_id,dt,revenue,rnk
0,1,2021-04-03,3,1
1,1,2021-04-03,1,1
2,2,2021-04-05,4,1


***row_number():***

In [21]:
# row_number() per user over dt descending: numbers are always consecutive, so
# tied dates get different numbers. The ORDER BY has no tie-breaker, so which
# of the tied rows gets 1 is up to the database.
sql = '''select t.*,
row_number() over (partition by t.user_id order by t.dt desc) as rnk
from revenue t
'''

In [22]:
# Execute the row_number() query; user 1's tied rows are numbered 1 and 2.
select(sql)

Unnamed: 0,user_id,dt,revenue,rnk
0,1,2021-04-03,3,1
1,1,2021-04-03,1,2
2,1,2021-04-02,2,3
3,1,2021-04-01,1,4
4,2,2021-04-05,4,1
5,2,2021-04-04,3,2
6,2,2021-04-03,2,3


In [23]:
# Exactly one "latest" row per user: number the rows in a CTE with
# row_number(), then keep rnk = 1. Unlike rank(), ties yield a single row.
sql = '''with 
dt_rank as (
    select t.*,
    row_number() over (partition by t.user_id order by t.dt desc) as rnk
    from revenue t
)
select * from dt_rank t
where t.rnk = 1
'''

In [24]:
# Execute the filter; each user appears once even though user 1 has a tie.
select(sql)

Unnamed: 0,user_id,dt,revenue,rnk
0,1,2021-04-03,3,1
1,2,2021-04-05,4,1


#### стандартным способом:

In [25]:
# (user_id, day, revenue) records without duplicate dates per user, used for
# the GROUP BY / JOIN variant of "latest activity date".
activity = [
    (1, '2021-04-01', 1),
    (1, '2021-04-02', 2),
    (1, '2021-04-03', 3),
    (2, '2021-04-03', 2),
    (2, '2021-04-04', 3),
    (2, '2021-04-05', 4),
]
user_ids, days, amounts = zip(*activity)
t = pd.DataFrame({
    'user_id': list(user_ids),
    'dt': pd.to_datetime(list(days), format='%Y-%m-%d'),
    'revenue': list(amounts),
})

In [26]:
# Rebuild the `revenue` table and load the tie-free rows of `t`.
sql = '''
drop table if exists revenue;
CREATE TABLE revenue (
    user_id   int,
    dt        datetime,
    revenue   int
);
'''
# Pick the columns explicitly so each tuple lines up with the INSERT's
# parameter order regardless of the frame's column order.
rows = list(t[['user_id', 'dt', 'revenue']].itertuples(index=False, name=None))
cur = conn.cursor()
try:
    cur.execute(sql)
    conn.commit()
    if rows:  # pyodbc's executemany rejects an empty parameter list
        cur.executemany('''INSERT INTO revenue(
                    [user_id],[dt],[revenue]
                    ) 
                    values (?,?,?)
    ''', rows)
    conn.commit()
finally:
    # Release the cursor even if the DDL or an INSERT raises.
    cur.close()
# Read the table back to confirm what was loaded.
sql = '''select * from revenue t'''
select(sql)

Unnamed: 0,user_id,dt,revenue
0,1,2021-04-01,1
1,1,2021-04-02,2
2,1,2021-04-03,3
3,2,2021-04-03,2
4,2,2021-04-04,3
5,2,2021-04-05,4


In [27]:
# Classic aggregate approach: the latest date per user via max(dt) + GROUP BY.
# This returns only user_id and the date, not the rest of the row.
sql = '''select t.user_id, 
max(t.dt) as max_dt from revenue t
group by t.user_id'''

In [28]:
# Execute the aggregate; one row per user with its max_dt.
select(sql)

Unnamed: 0,user_id,max_dt
0,1,2021-04-03
1,2,2021-04-05


In [29]:
# Full latest row per user without window functions: compute max(dt) per user
# in a CTE, then join it back to `revenue` on (user_id, dt = max_dt).
sql = '''with 
last_dt as (
    select t.user_id, max(t.dt) as max_dt from revenue t
    group by t.user_id
)
select t.* from revenue t
join last_dt ld on t.user_id = ld.user_id and t.dt = ld.max_dt
order by t.user_id
'''

In [30]:
# Execute the join; each user's row on its latest date is returned.
select(sql)

Unnamed: 0,user_id,dt,revenue
0,1,2021-04-03,3
1,2,2021-04-05,4


## 3. Топ 3 зарплаты в отделе (задача на интервью)

In [31]:
# Two departments with the same five employees; department "a" has a tie on
# the top salary (5, 5), department "b" has five distinct salaries.
staff = ['aa', 'bb', 'cc', 'dd', 'ee']
t = pd.DataFrame({
    'dep': ['a'] * 5 + ['b'] * 5,
    'emp': staff * 2,
    'sal': [5, 5, 3, 2, 1] + [5, 4, 3, 2, 1],
})

In [32]:
# Rebuild the `salary` table from scratch and load every row of `t` into it.
sql = '''
drop table if exists salary;
CREATE TABLE salary (
    dep       varchar(max),
    emp       varchar(max),
    sal       int
);
'''
# Pick the columns explicitly so each tuple lines up with the INSERT's
# parameter order regardless of the frame's column order.
rows = list(t[['dep', 'emp', 'sal']].itertuples(index=False, name=None))
cur = conn.cursor()
try:
    cur.execute(sql)
    conn.commit()
    if rows:  # pyodbc's executemany rejects an empty parameter list
        cur.executemany('''INSERT INTO salary(
                    [dep],[emp],[sal]
                    ) 
                    values (?,?,?)
    ''', rows)
    conn.commit()
finally:
    # Release the cursor even if the DDL or an INSERT raises.
    cur.close()
# Read the table back to confirm what was loaded.
sql = '''select * from salary t'''
select(sql)

Unnamed: 0,dep,emp,sal
0,a,aa,5
1,a,bb,5
2,a,cc,3
3,a,dd,2
4,a,ee,1
5,b,aa,5
6,b,bb,4
7,b,cc,3
8,b,dd,2
9,b,ee,1


In [33]:
# Compare rank() and dense_rank() per department over salary descending:
# rank() leaves a gap after ties, dense_rank() continues without gaps.
sql = '''select t.*,
rank() over (partition by t.dep order by t.sal desc) as rnk_rank,
dense_rank() over (partition by t.dep order by t.sal desc) as rnk
from salary t
'''

In [34]:
# Execute the comparison; in department "a" salary 3 is rnk_rank 3 but rnk 2.
select(sql)

Unnamed: 0,dep,emp,sal,rnk_rank,rnk
0,a,aa,5,1,1
1,a,bb,5,1,1
2,a,cc,3,3,2
3,a,dd,2,4,3
4,a,ee,1,5,4
5,b,aa,5,1,1
6,b,bb,4,2,2
7,b,cc,3,3,3
8,b,dd,2,4,4
9,b,ee,1,5,5


In [35]:
# Top 3 distinct salary levels per department: dense_rank() in a CTE, then
# keep rnk <= 3. Employees sharing a salary level are all returned, so a
# department can yield more than three rows.
sql = '''with 
salary_rnk as (
    select t.*,
    dense_rank() over (partition by t.dep order by t.sal desc) as rnk
    from salary t
)
select * from salary_rnk t
where t.rnk <= 3
'''

In [36]:
# Execute the top-3 query; department "a" returns four rows because of the tie at 5.
select(sql)

Unnamed: 0,dep,emp,sal,rnk
0,a,aa,5,1
1,a,bb,5,1
2,a,cc,3,2
3,a,dd,2,3
4,b,aa,5,1
5,b,bb,4,2
6,b,cc,3,3


## 4. Расчет сессий клиентов (задача из тестового)

действия клиентов по времени:

In [37]:
# Action log of user 1: three actions within an hour of each other, one about
# four hours later, and one two days later.
user1_actions = [
    '2021-04-01 07:31',
    '2021-04-01 07:35',
    '2021-04-01 08:20',
    '2021-04-01 12:31',
    '2021-04-03 07:31',
]
user1 = pd.DataFrame({
    'user_id': [1] * len(user1_actions),
    'dt': pd.to_datetime(user1_actions, format='%Y-%m-%d %H:%M'),
})

In [38]:
# Action log of user 2: four actions, each less than an hour after the previous one.
user2_actions = [
    '2021-04-01 07:31',
    '2021-04-01 07:35',
    '2021-04-01 08:20',
    '2021-04-01 9:10',
]
user2 = pd.DataFrame({
    'user_id': [2] * len(user2_actions),
    'dt': pd.to_datetime(user2_actions, format='%Y-%m-%d %H:%M'),
})

In [39]:
# Action log of user 3: three actions on three different days.
user3_actions = [
    '2021-04-01 07:31',
    '2021-04-02 07:35',
    '2021-04-03 08:20',
]
user3 = pd.DataFrame({
    'user_id': [3] * len(user3_actions),
    'dt': pd.to_datetime(user3_actions, format='%Y-%m-%d %H:%M'),
})

In [40]:
# Stack the three per-user logs into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each source frame's labels.
t = pd.concat([user1, user2, user3], ignore_index=True)

In [41]:
# Rebuild the `client_log` table from scratch and load every row of `t` into it.
sql = '''
drop table if exists client_log;
CREATE TABLE client_log (
    user_id   int,
    dt        datetime
);
'''
# Pick the columns explicitly so each tuple lines up with the INSERT's
# parameter order regardless of the frame's column order.
rows = list(t[['user_id', 'dt']].itertuples(index=False, name=None))
cur = conn.cursor()
try:
    cur.execute(sql)
    conn.commit()
    if rows:  # pyodbc's executemany rejects an empty parameter list
        cur.executemany('''INSERT INTO client_log(
                    [user_id],[dt]
                    ) 
                    values (?,?)
    ''', rows)
    conn.commit()
finally:
    # Release the cursor even if the DDL or an INSERT raises.
    cur.close()
# Read the table back to confirm what was loaded.
sql = '''select * from client_log t'''
select(sql)

Unnamed: 0,user_id,dt
0,1,2021-04-01 07:31:00
1,1,2021-04-01 07:35:00
2,1,2021-04-01 08:20:00
3,1,2021-04-01 12:31:00
4,1,2021-04-03 07:31:00
5,2,2021-04-01 07:31:00
6,2,2021-04-01 07:35:00
7,2,2021-04-01 08:20:00
8,2,2021-04-01 09:10:00
9,3,2021-04-01 07:31:00


### Надо посчитать количество сессий клиентов:

Одна сессия, если между действиями проходит меньше часа. Надо посчитать количество сессий клиентов.  
(для 1 клиента 2-я сессия начинается в 12:31... = 3 сессии  
2: 1 сессия, 3: 3 сессии — между его действиями проходит около суток)

На каждое действие показать предыдущее действие:

### ***lag():***

In [42]:
# For every action, fetch the same user's previous action time with lag(),
# ordered by dt within each user_id partition.
sql = '''select *,
lag(t.dt) over (partition by t.user_id order by t.dt) as prev_dt
from client_log t
'''

In [43]:
# Execute the lag() query; each user's first action has no predecessor (NaT).
select(sql)

Unnamed: 0,user_id,dt,prev_dt
0,1,2021-04-01 07:31:00,NaT
1,1,2021-04-01 07:35:00,2021-04-01 07:31:00
2,1,2021-04-01 08:20:00,2021-04-01 07:35:00
3,1,2021-04-01 12:31:00,2021-04-01 08:20:00
4,1,2021-04-03 07:31:00,2021-04-01 12:31:00
5,2,2021-04-01 07:31:00,NaT
6,2,2021-04-01 07:35:00,2021-04-01 07:31:00
7,2,2021-04-01 08:20:00,2021-04-01 07:35:00
8,2,2021-04-01 09:10:00,2021-04-01 08:20:00
9,3,2021-04-01 07:31:00,NaT


#### Сколько времени прошло между текущей активностью и предыдущей:

<a href="https://learn.microsoft.com/ru-RU/sql/t-sql/functions/datediff-transact-sql?view=sql-server-ver15&viewFallbackFrom=sqlallproducts-allversions">
    DATEDIFF (Transact-SQL) - SQL Server | Microsoft Learn</a>

In [44]:
# Standalone DATEDIFF check: seconds between 07:31 and 07:35 on the same day
# (4 minutes = 240 seconds). The result column has no alias.
sql = '''SELECT 
DATEDIFF(second, '2021-04-01 07:31:00.0000000', '2021-04-01 07:35:00.0000000');
'''
select(sql)

Unnamed: 0,Unnamed: 1
0,240


In [45]:
# Add dt_diff: seconds elapsed since the same user's previous action, computed
# as DATEDIFF(second, lag(dt), dt) over the same per-user window.
sql = '''select *,
lag(t.dt) over (partition by t.user_id order by t.dt) as prev_dt,
DATEDIFF(second, lag(t.dt) over (partition by t.user_id order by t.dt), t.dt) as dt_diff
from client_log t
'''

In [46]:
# Execute the query; dt_diff is empty for each user's first action because
# prev_dt is missing there.
select(sql)

Unnamed: 0,user_id,dt,prev_dt,dt_diff
0,1,2021-04-01 07:31:00,NaT,
1,1,2021-04-01 07:35:00,2021-04-01 07:31:00,240.0
2,1,2021-04-01 08:20:00,2021-04-01 07:35:00,2700.0
3,1,2021-04-01 12:31:00,2021-04-01 08:20:00,15060.0
4,1,2021-04-03 07:31:00,2021-04-01 12:31:00,154800.0
5,2,2021-04-01 07:31:00,NaT,
6,2,2021-04-01 07:35:00,2021-04-01 07:31:00,240.0
7,2,2021-04-01 08:20:00,2021-04-01 07:35:00,2700.0
8,2,2021-04-01 09:10:00,2021-04-01 08:20:00,3000.0
9,3,2021-04-01 07:31:00,NaT,


#### Работаем с сессиями (номер сессии, начиная с 0):

In [47]:
# Number the sessions of each user, starting from 0:
#   1. CTE new_session flags an action with 1 when at least 3600 seconds have
#      passed since the same user's previous action, otherwise 0 (this covers
#      the first action too, as the output below shows).
#   2. The outer query takes a per-user running sum of that flag, ordered by
#      dt, which yields session_id.
# The "--" lines inside the string are SQL comments kept as toggles for
# inspecting the intermediate CTE.
sql = '''with 
new_session as (
    select *,
    --lag(t.dt) over (partition by t.user_id order by t.dt) as prev_dt,
    --DATEDIFF(second, lag(t.dt) over (partition by t.user_id order by t.dt), t.dt) as dt_diff,
    --условия сессий:
    case when DATEDIFF(second, lag(t.dt) over (partition by t.user_id order by t.dt), t.dt) >= 3600
        then 1 else 0 end as new_session
    from client_log t
)
--select * from new_session t
--/*
select t.*,
--нарастающий итог (номер сессии, начиная с 0):
sum(t.new_session) over (partition by t.user_id order by t.dt) as session_id
from new_session t
--*/
'''

In [48]:
# Execute the query; user 1's session_id steps to 1 at 12:31 and to 2 on 2021-04-03.
select(sql)

Unnamed: 0,user_id,dt,new_session,session_id
0,1,2021-04-01 07:31:00,0,0
1,1,2021-04-01 07:35:00,0,0
2,1,2021-04-01 08:20:00,0,0
3,1,2021-04-01 12:31:00,1,1
4,1,2021-04-03 07:31:00,1,2
5,2,2021-04-01 07:31:00,0,0
6,2,2021-04-01 07:35:00,0,0
7,2,2021-04-01 08:20:00,0,0
8,2,2021-04-01 09:10:00,0,0
9,3,2021-04-01 07:31:00,0,0


#### кол-во активностей в каждой сессии:

In [49]:
# Count actions per session: reuse the new_session flag and the running-sum
# session_id (CTE client_sessions), then GROUP BY (user_id, session_id).
sql = '''with 
new_session as (
    select *,
    case when DATEDIFF(second, lag(t.dt) over (partition by t.user_id order by t.dt), t.dt) >= 3600
        then 1 else 0 end as new_session
    from client_log t
),
client_sessions as (
    select t.*,
    sum(t.new_session) over (partition by t.user_id order by t.dt) as session_id
    from new_session t 
) 
--select * from client_sessions t
--/*
select t.user_id, t.session_id, count(1) as action_cnt from client_sessions t
group by t.user_id, t.session_id
order by t.user_id, t.session_id
--*/
'''

In [50]:
# Execute the aggregation; users 1 and 3 have three sessions each, user 2 has one.
select(sql)

Unnamed: 0,user_id,session_id,action_cnt
0,1,0,3
1,1,1,1
2,1,2,1
3,2,0,4
4,3,0,1
5,3,1,1
6,3,2,1


#### всего количество сессий:

In [51]:
# Total number of sessions across all users: client_sessions_agg has one row
# per (user_id, session_id), so count(*) over it is the session count.
# The final count(*) column has no alias.
sql = '''with 
new_session as (
    select *,
    case when DATEDIFF(second, lag(t.dt) over (partition by t.user_id order by t.dt), t.dt) >= 3600
        then 1 else 0 end as new_session
    from client_log t
),
client_sessions as (
    select t.*,
    sum(t.new_session) over (partition by t.user_id order by t.dt) as session_id
    from new_session t 
),
client_sessions_agg as (
    select t.user_id, t.session_id, 
    count(1) as action_cnt 
    from client_sessions t
    group by t.user_id, t.session_id
)  
--select * from client_sessions_agg t order by t.user_id, t.session_id
--/*
select count(*) from client_sessions_agg t
--*/
'''

In [52]:
# Execute the count; 3 + 1 + 3 sessions give 7.
select(sql)

Unnamed: 0,Unnamed: 1
0,7


## 6. Скользящее среднее

In [53]:
# Integer "day numbers" instead of dates: six days for user 1, five for user 2.
# The leading 1.0 makes the whole revenue column float in the frame.
t = pd.DataFrame({
    'user_id': [1] * 6 + [2] * 5,
    'dt': list(range(1, 7)) + list(range(1, 6)),
    'revenue': [1.0, 2, 3, 4, 5, 6] + [3, 4, 5, 6, 7],
})

In [54]:
# Rebuild the `revenue` table (dt is an int day number here) and load `t`.
sql = '''
drop table if exists revenue;
CREATE TABLE revenue (
    user_id   int,
    dt        int,
    revenue   int
);
'''
# Pick the columns explicitly so each tuple lines up with the INSERT's
# parameter order. Column-wise tuples also keep user_id and dt as integers;
# row-wise iteration would upcast them to float next to the float revenue column.
rows = list(t[['user_id', 'dt', 'revenue']].itertuples(index=False, name=None))
cur = conn.cursor()
try:
    cur.execute(sql)
    conn.commit()
    if rows:  # pyodbc's executemany rejects an empty parameter list
        cur.executemany('''INSERT INTO revenue(
                    [user_id],[dt],[revenue]
                    ) 
                    values (?,?,?)
    ''', rows)
    conn.commit()
finally:
    # Release the cursor even if the DDL or an INSERT raises.
    cur.close()
# Read the table back to confirm what was loaded.
sql = '''select * from revenue t'''
select(sql)

Unnamed: 0,user_id,dt,revenue
0,1,1,1
1,1,2,2
2,1,3,3
3,1,4,4
4,1,5,5
5,1,6,6
6,2,1,3
7,2,2,4
8,2,3,5
9,2,4,6


#### Среднее для каждой строчки, включая саму строчку и две предыдущие:

In [55]:
# Moving average per user over the current row and the two preceding rows
# (ROWS BETWEEN 2 PRECEDING AND CURRENT ROW). revenue is an int column, so it
# is multiplied by 1.0 to get a fractional average instead of an integer one.
sql = '''select t.*,
avg(t.revenue * 1.0) over (
        partition by t.user_id order by t.dt rows between 2 preceding and current row
    ) as moving_avg
from revenue t'''

In [56]:
# Execute the query; the first two rows of each user average over fewer than
# three values (1.0, then 1.5 for user 1).
select(sql)

Unnamed: 0,user_id,dt,revenue,moving_avg
0,1,1,1,1.0
1,1,2,2,1.5
2,1,3,3,2.0
3,1,4,4,3.0
4,1,5,5,4.0
5,1,6,6,5.0
6,2,1,3,3.0
7,2,2,4,3.5
8,2,3,5,4.0
9,2,4,6,5.0


----------------

In [57]:
# Close the notebook-wide connection; select() and the load cells cannot be
# re-run after this without reconnecting.
conn.close()