In [1]:
import pandas as pd
import numpy as np

#### Подключение к бд и заливка данных

In [2]:
import sqlalchemy
import pyodbc
import warnings
warnings.filterwarnings('ignore')

In [3]:
conn = pyodbc.connect('DSN=TestDB;Trusted_Connection=yes;')

In [4]:
def select(sql):
  return pd.read_sql(sql,conn)

In [5]:
sql = '''select * from german_credit t'''
select(sql)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default,contract_dt,client_id
0,33,male,2,own,,,3074,9,radio/TV,0,2008-06-29 18:52:00,210
1,43,male,1,own,little,little,1344,12,car,0,2007-05-20 18:30:19,929
2,52,male,2,own,quite rich,,936,9,education,0,2008-04-27 08:23:07,200
3,35,female,3,own,little,,1393,11,car,0,2007-05-06 10:58:22,45
4,28,male,2,own,little,,776,12,radio/TV,0,2007-07-21 13:22:14,358
...,...,...,...,...,...,...,...,...,...,...,...,...
995,65,male,2,free,little,little,2600,18,radio/TV,1,2007-12-16 20:17:19,624
996,30,male,3,own,little,moderate,4455,36,business,1,2007-07-12 14:08:58,181
997,33,male,2,own,little,moderate,6403,24,radio/TV,0,2008-04-08 03:24:26,730
998,29,female,2,own,,,5003,21,car,1,2007-11-29 15:51:45,557


# 4. Group By

## 1. Сводная таблица

В сводных таблицах всегда дожен быть *count*

In [6]:
sql = '''select 
t.sex,

count(*) as cnt,

-- поля FLOAT должны, поэтому и не точность
avg(t.credit_amount * 1.0) as credit_amount_avg

from german_credit t
group by t.sex
'''

In [7]:
select(sql)

Unnamed: 0,sex,cnt,credit_amount_avg
0,female,310,2877.774193
1,male,690,3448.040579


### Уникальные значения:

In [8]:
sql = '''select 
count(distinct t.housing), count(t.housing) 
from german_credit t
'''

In [9]:
select(sql)

Unnamed: 0,Unnamed: 1,Unnamed: 2
0,3,1000


In [10]:
sql = '''select 
t.housing,

count(*) as cnt,
avg(t.credit_amount * 1.0) as credit_amount_avg

from german_credit t
group by t.housing
'''

In [11]:
select(sql) 

Unnamed: 0,housing,cnt,credit_amount_avg
0,free,108,4906.212962
1,own,713,3060.939691
2,rent,179,3122.553072


## 2. Пропущенные значения (null)

In [12]:
sql = '''select 
count(t.checking_account), count(0) 
from german_credit t
'''

In [13]:
select(sql)

Unnamed: 0,Unnamed: 1,Unnamed: 2
0,606,1000


In [14]:
sql = '''select 
t.checking_account,

count(*) as cnt,
avg(t.credit_amount) as credit_amount_avg

from german_credit t
group by t.checking_account
'''

In [15]:
select(sql)

Unnamed: 0,checking_account,cnt,credit_amount_avg
0,,394,3133
1,little,274,3175
2,moderate,269,3827
3,rich,63,2177


In [16]:
sql = '''select 
sum(case when t.checking_account is null then 1 else 0 end) as is_null,
count(case when t.checking_account is null then 1 else null end) as is_null2
from german_credit t
'''

In [17]:
select(sql)

Unnamed: 0,is_null,is_null2
0,394,394


#### потренируемся:

In [18]:
t = pd.DataFrame({'col1':[1,np.nan,2]})
t = t.replace({np.nan:None})
# t

In [19]:
cur = conn.cursor()
sql = '''
drop table if exists null_test;
CREATE TABLE null_test (
    col1   money
);
'''
cur.execute(sql)
conn.commit()

for index,row in t.iterrows():
    cur.execute('''INSERT INTO null_test(
                    [col1]
                    ) 
                    values (?)
    ''', 
                
                row['col1']
               )
    
conn.commit()
cur.close()

sql = '''select * from null_test t'''
select(sql)

Unnamed: 0,col1
0,1.0
1,
2,2.0


In [20]:
(1 + 2) / 2

1.5

In [21]:
(1 + 0 + 2) / 3

1.0

In [22]:
sql = '''select avg(t.col1) from null_test t'''

In [23]:
select(sql)

Unnamed: 0,Unnamed: 1
0,1.5


### заменим пропуски:

In [24]:
sql = '''select 
t.checking_account,
coalesce(t.checking_account,'no_info')
from german_credit t
'''

In [25]:
select(sql)

Unnamed: 0,checking_account,Unnamed: 2
0,,no_info
1,little,little
2,,no_info
3,,no_info
4,,no_info
...,...,...
995,little,little
996,moderate,moderate
997,moderate,moderate
998,,no_info


### coalesce:

In [26]:
t = pd.DataFrame({'col1':[1,np.nan,2],
                  'col2':[np.nan,np.nan,1],
                  'col3':[1,2,3]})
t = t.replace({np.nan:None})
# t

In [27]:
cur = conn.cursor()
sql = '''
drop table if exists null_test;
CREATE TABLE null_test (
    col1        INTEGER,
    col2        INTEGER,
    col3        INTEGER
);
'''
cur.execute(sql)
conn.commit()

for index,row in t.iterrows():
    cur.execute('''INSERT INTO null_test(
                    [col1],[col2],[col3]
                    ) 
                    values (?,?,?)
    ''', 
                row['col1'], 
                row['col2'], 
                row['col3'],
               )
    
conn.commit()
cur.close()

sql = '''select * from null_test t'''
select(sql)

Unnamed: 0,col1,col2,col3
0,1.0,,1
1,,,2
2,2.0,1.0,3


In [28]:
sql = '''select t.*, 
coalesce(t.col1, t.col2, t.col3) as res
from null_test t
'''

In [29]:
select(sql)

Unnamed: 0,col1,col2,col3,res
0,1.0,,1,1
1,,,2,2
2,2.0,1.0,3,2


## 3. Дубликаты

In [30]:
t = pd.DataFrame({'id':[1,1,2],'name':['a','a','b']})
# t

In [31]:
cur = conn.cursor()
sql = '''
drop table if exists dupl_test;
CREATE TABLE dupl_test (
    id        INTEGER,
    name      VARCHAR(max)
);
'''
cur.execute(sql)
conn.commit()

for index,row in t.iterrows():
    cur.execute('''INSERT INTO dupl_test(
                    [id],[name]
                    ) 
                    values (?,?)
    ''', 
                row['id'], 
                row['name']
               )
    
conn.commit()
cur.close()

sql = '''select * from dupl_test t'''
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,a
2,2,b


### группируем на все поля и посчитаем строки:

In [32]:
sql = '''select 
t.id, t.name, 
count(1) as cnt 
from dupl_test t
group by t.id, t.name
'''

In [33]:
select(sql)

Unnamed: 0,id,name,cnt
0,1,a,2
1,2,b,1


In [34]:
sql = '''select t.id, t.name, 
count(1) as cnt 
from dupl_test t
group by t.id, t.name
having count(1) > 1
'''

In [35]:
select(sql)

Unnamed: 0,id,name,cnt
0,1,a,2


### дубликат Id:

In [36]:
t = pd.DataFrame({'id':[1,1,2,2,3],
                  'name':['a','b','c','d','e']})
# t

In [37]:
cur = conn.cursor()
sql = '''
drop table if exists dupl_test;
CREATE TABLE dupl_test (
    id        INTEGER,
    name      VARCHAR(max)
);
'''
cur.execute(sql)
conn.commit()

for index,row in t.iterrows():
    cur.execute('''INSERT INTO dupl_test(
                    [id],[name]
                    ) 
                    values (?,?)
    ''', 
                row['id'], 
                row['name']
               )
    
conn.commit()
cur.close()

sql = '''select * from dupl_test t'''
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d
4,3,e


In [38]:
sql = '''select t.id, 
count(1) as cnt from dupl_test t
group by t.id
having count(1) > 1
'''

In [39]:
select(sql)

Unnamed: 0,id,cnt
0,1,2
1,2,2


In [None]:
sql = '''
select * from dupl_test t
where t.id in (1,2)
'''

In [None]:
select(sql)

#### Используя подзапросы:

In [40]:
sql = '''select 
t.id as cnt 
from dupl_test t
group by t.id
having count(1) > 1
'''

In [41]:
select(sql)

Unnamed: 0,cnt
0,1
1,2


In [42]:
sql = '''select * 
from dupl_test t
where t.id in (
    select t.id as cnt from dupl_test t
    group by t.id
    having count(1) > 1
)
'''

In [43]:
select(sql)

Unnamed: 0,id,name
0,1,a
1,1,b
2,2,c
3,2,d


## 4. Агрегация

In [44]:
sql = '''select 
year(t.contract_dt) as year,  month(t.contract_dt) as month,

count(1) as credit_cnt,
count(distinct t.client_id) as client_id_unique,
sum(t.credit_amount) as credit_amount_sum,
avg(t.credit_amount * 1.0) as credit_amount_avg

from german_credit t
group by year(t.contract_dt),  month(t.contract_dt)
order by year(t.contract_dt),  month(t.contract_dt)
'''

In [45]:
select(sql)

Unnamed: 0,year,month,credit_cnt,client_id_unique,credit_amount_sum,credit_amount_avg
0,2007,5,81,81,207663,2563.74074
1,2007,6,74,74,239594,3237.756756
2,2007,7,71,71,224333,3159.619718
3,2007,8,57,57,178569,3132.789473
4,2007,9,58,58,186909,3222.568965
5,2007,10,70,70,188534,2693.342857
6,2007,11,87,87,300504,3454.068965
7,2007,12,77,77,273973,3558.090909
8,2008,1,93,93,288080,3097.634408
9,2008,2,55,55,211128,3838.690909


## 5. Создание интервалов (или бинов или бакетов)

#### Уникальные значения:

In [46]:
sql = '''select 
count(distinct t.credit_amount) 
from german_credit t
'''

In [47]:
select(sql)

Unnamed: 0,Unnamed: 1
0,921


#### Введём диапозоны:

In [48]:
sql = '''select t.credit_amount,

case when t.credit_amount < 1000 then '1. <1000'
when t.credit_amount < 2000 then '2. 1000-2000' 
when t.credit_amount < 3000 then '3. 2000-3000'
when t.credit_amount >= 3000 then '4. >= 3000'
else 'other' end as credit_amount_bin

from german_credit t
'''

In [49]:
select(sql)

Unnamed: 0,credit_amount,credit_amount_bin
0,3074,4. >= 3000
1,1344,2. 1000-2000
2,936,1. <1000
3,1393,2. 1000-2000
4,776,1. <1000
...,...,...
995,2600,3. 2000-3000
996,4455,4. >= 3000
997,6403,4. >= 3000
998,5003,4. >= 3000


In [50]:
sql = '''select 

case 
when t.credit_amount < 1000 then '1. <1000'
when t.credit_amount < 2000 then '2. 1000-2000' 
when t.credit_amount < 3000 then '3. 2000-3000'
when t.credit_amount >= 3000 then '4. >= 3000'
else 'other' end as credit_amount_bin,

count(1) as credit_cnt

from german_credit t

group by case 
when t.credit_amount < 1000 then '1. <1000'
when t.credit_amount < 2000 then '2. 1000-2000' 
when t.credit_amount < 3000 then '3. 2000-3000'
when t.credit_amount >= 3000 then '4. >= 3000'
else 'other' end

order by case 
when t.credit_amount < 1000 then '1. <1000'
when t.credit_amount < 2000 then '2. 1000-2000' 
when t.credit_amount < 3000 then '3. 2000-3000'
when t.credit_amount >= 3000 then '4. >= 3000'
else 'other' end
'''

In [51]:
select(sql)

Unnamed: 0,credit_amount_bin,credit_cnt
0,1. <1000,116
1,2. 1000-2000,316
2,3. 2000-3000,188
3,4. >= 3000,380


## 6. Переменные в столбцах сводной таблицы

### Pivot таблицы:

In [52]:
sql = '''select 
t.housing, 

count(case when t.sex = 'female' then 1 else null end) as female,
count(case when t.sex = 'male' then 1 else null end) as male,

count(1) as cnt 

from german_credit t
group  by t.housing
'''

In [53]:
select(sql)

Unnamed: 0,housing,female,male,cnt
0,free,19,89,108
1,own,196,517,713
2,rent,95,84,179


#### автоматизируем в Python:

In [54]:
sql = '''select distinct 
t.purpose 
from german_credit t
'''

In [55]:
select(sql)

Unnamed: 0,purpose
0,business
1,car
2,domestic appliances
3,education
4,furniture/equipment
5,radio/TV
6,repairs
7,vacation/others


In [56]:
purpose = list(select(sql)['purpose'].values)
purpose

['business',
 'car',
 'domestic appliances',
 'education',
 'furniture/equipment',
 'radio/TV',
 'repairs',
 'vacation/others']

In [57]:
for p in purpose:
  print(f"count(case when t.purpose = '{p}' then 1 else null end) as {p.lower().replace(' ','').replace('/','')},")

count(case when t.purpose = 'business' then 1 else null end) as business,
count(case when t.purpose = 'car' then 1 else null end) as car,
count(case when t.purpose = 'domestic appliances' then 1 else null end) as domesticappliances,
count(case when t.purpose = 'education' then 1 else null end) as education,
count(case when t.purpose = 'furniture/equipment' then 1 else null end) as furnitureequipment,
count(case when t.purpose = 'radio/TV' then 1 else null end) as radiotv,
count(case when t.purpose = 'repairs' then 1 else null end) as repairs,
count(case when t.purpose = 'vacation/others' then 1 else null end) as vacationothers,


In [58]:
sql = '''select t.housing, 

count(case when t.purpose = 'radio/TV' then 1 else null end) as radiotv,
count(case when t.purpose = 'car' then 1 else null end) as car,
count(case when t.purpose = 'education' then 1 else null end) as education,
count(case when t.purpose = 'furniture/equipment' then 1 else null end) as furnitureequipment,
count(case when t.purpose = 'repairs' then 1 else null end) as repairs,
count(case when t.purpose = 'business' then 1 else null end) as business,
count(case when t.purpose = 'domestic appliances' then 1 else null end) as domesticappliances,
count(case when t.purpose = 'vacation/others' then 1 else null end) as vacationothers,
count(1) as cnt 

from german_credit t
group  by t.housing
'''

In [59]:
select(sql)

Unnamed: 0,housing,radiotv,car,education,furnitureequipment,repairs,business,domesticappliances,vacationothers,cnt
0,free,15,55,15,11,3,5,0,4,108
1,own,227,219,34,122,17,76,10,8,713
2,rent,38,63,10,48,2,16,2,0,179


## 7. Создание категорий из текстовых данных (like)

#### пример разрозненных данных:

In [60]:
t = pd.DataFrame({'purpose':['машина','машина','машина','на машину','на покупку машины',
                             'автомобиль','на возвращение 2007', 
                             'на свадьбу','свадьба','свадьба','свадьба','для свадьбы',
                             'недвижимость','на покупку недвижимости']})
# t

In [61]:
cur = conn.cursor()
sql = '''
drop table if exists purpose;
CREATE TABLE purpose (
    purpose      VARCHAR(max)
);
'''
cur.execute(sql)
conn.commit()

for index,row in t.iterrows():
    cur.execute('''INSERT INTO purpose(
                    [purpose]
                    ) 
                    values (?)
    ''', 
                    row['purpose']
               )
    
conn.commit()
cur.close()

sql = '''select * from purpose t'''
select(sql)

Unnamed: 0,purpose
0,машина
1,машина
2,машина
3,на машину
4,на покупку машины
5,автомобиль
6,на возвращение 2007
7,на свадьбу
8,свадьба
9,свадьба


#### проверим на уникальные значения:

In [62]:
sql = '''select 
t.purpose, count(1) from purpose t
group by t.purpose
order by count(1) desc
'''

In [63]:
select(sql)

Unnamed: 0,purpose,Unnamed: 2
0,машина,3
1,свадьба,3
2,автомобиль,1
3,для свадьбы,1
4,на возвращение 2007,1
5,на машину,1
6,на покупку машины,1
7,на покупку недвижимости,1
8,на свадьбу,1
9,недвижимость,1


#### выберем общее:

In [64]:
cat = '''select t.purpose,

case when t.purpose like '%свадьб%' then 'свадьба'
when t.purpose like '%машин%' or t.purpose like '%авто%' then 'машина'
when t.purpose like '%недвиж%' then 'недвижимость'

else 'другое' end as purpose_cat

from purpose t
'''

In [65]:
select(cat)

Unnamed: 0,purpose,purpose_cat
0,машина,машина
1,машина,машина
2,машина,машина
3,на машину,машина
4,на покупку машины,машина
5,автомобиль,машина
6,на возвращение 2007,другое
7,на свадьбу,свадьба
8,свадьба,свадьба
9,свадьба,свадьба


In [66]:
sql = f'''select 
t.purpose_cat,

count(1)

from (
    select t.purpose,

    case when t.purpose like '%свадьб%' then 'свадьба'
    when t.purpose like '%машин%' or t.purpose like '%авто%' then 'машина'
    when t.purpose like '%недвиж%' then 'недвижимость'

    else 'другое' end as purpose_cat

    from purpose t
) t
group by t.purpose_cat
'''

In [67]:
select(sql)

Unnamed: 0,purpose_cat,Unnamed: 2
0,другое,1
1,машина,6
2,недвижимость,2
3,свадьба,5


In [68]:
sql = f'''select 
t.purpose_cat,
count(1)

from ({cat}) t
group by t.purpose_cat
'''

In [69]:
select(sql)

Unnamed: 0,purpose_cat,Unnamed: 2
0,другое,1
1,машина,6
2,недвижимость,2
3,свадьба,5


In [70]:
sql = f'''select 
t.purpose, 
count(1) 

from ({cat}) t
where t.purpose_cat = 'другое'

group by t.purpose
order by count(1) desc'''

In [71]:
select(sql)

Unnamed: 0,purpose,Unnamed: 2
0,на возвращение 2007,1


----------------

In [72]:
conn.close()