In [1]:
import numpy as np
import pandas as pd

import sqlite3

**Database: Chinook**

In [3]:
# Kết nối đến CSDL và tạo cursor
conn = sqlite3.connect('database/Chinook_Sqlite.sqlite')
cursor = conn.cursor()

# 1. Gom nhóm

## 1.1. Mệnh đề `GROUP BY` cơ bản
(Lưu ý: `GROUP BY` cần kết hợp với `MAX`, `MIN`, `SUM`, `COUNT`, `AVG`)

**Thử gom nhóm bảng `track` theo `AlbumID`**

In [4]:
query = """
        SELECT *
        FROM track
        GROUP BY albumid
        --LIMIT (5)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,TrackId,Name,AlbumId,MediaTypeId,GenreId,Composer,Milliseconds,Bytes,UnitPrice
0,1,For Those About To Rock (We Salute You),1,1,1,"Angus Young, Malcolm Young, Brian Johnson",343719,11170334,0.99
1,2,Balls to the Wall,2,2,1,,342562,5510424,0.99
2,3,Fast As a Shark,3,2,1,"F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho...",230619,3990994,0.99
3,15,Go Down,4,1,1,AC/DC,331180,10847611,0.99
4,23,Walk On Water,5,1,1,"Steven Tyler, Joe Perry, Jack Blades, Tommy Shaw",295680,9719579,0.99
...,...,...,...,...,...,...,...,...,...
342,3499,Pini Di Roma (Pinien Von Rom) \ I Pini Della V...,343,2,24,,286741,4718950,0.99
343,3500,"String Quartet No. 12 in C Minor, D. 703 ""Quar...",344,2,24,Franz Schubert,139200,2283131,0.99
344,3501,"L'orfeo, Act 3, Sinfonia (Orchestra)",345,2,24,Claudio Monteverdi,66639,1189062,0.99
345,3502,"Quintet for Horn, Violin, 2 Violas, and Cello ...",346,2,24,Wolfgang Amadeus Mozart,221331,3665114,0.99


**Mỗi album có bao nhiêu track?**

In [5]:
query = """
        SELECT albumid, count(trackid) as count
        FROM track
        GROUP BY AlbumId        
        ORDER BY count DESC
        LIMIT (10)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,AlbumId,count
0,141,57
1,23,34
2,73,30
3,229,26
4,230,25
5,251,25
6,83,24
7,231,24
8,253,24
9,24,23


**Tương tự, mỗi GenreID có bao nhiêu track**

In [6]:
query = """
        SELECT GenreID, COUNT(trackID) as Count_Track
        FROM track
        GROUP BY GenreID
        ORDER BY Count_Track DESC
        --LIMIT (5)
"""
cursor.execute(query)
fetch_data = cursor.fetchall()
pd.DataFrame(fetch_data, columns=[item[0] for item in cursor.description])

Unnamed: 0,GenreId,Count_Track
0,1,1297
1,7,579
2,3,374
3,4,332
4,2,130
5,19,93
6,6,81
7,24,74
8,21,64
9,14,61


**Trung bình thời lượng mỗi bài trong các album**

In [7]:
query = """
        SELECT AlbumID, AVG(Milliseconds) as Mean_Time
        FROM track
        -- WHERE GenreID = 1
        
        GROUP BY AlbumID
        --LIMIT (5)
"""
cursor.execute(query)
fetch_data = cursor.fetchall()
pd.DataFrame(fetch_data, columns=[item[0] for item in cursor.description])

Unnamed: 0,AlbumId,Mean_Time
0,1,240041.500000
1,2,342562.000000
2,3,286029.333333
3,4,306657.375000
4,5,294113.933333
...,...,...
342,343,286741.000000
343,344,139200.000000
344,345,66639.000000
345,346,221331.000000


**Tổng dung lượng của các track trong từng album**

In [8]:
query = """
        SELECT albumid, SUM(Bytes) as sum_bytes, MIN(Bytes), MAX(bytes)
        FROM track
        GROUP BY AlbumId        
        ORDER BY sum_bytes DESC
        LIMIT (10)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,AlbumId,sum_bytes,MIN(Bytes),MAX(bytes)
0,229,13917603291,486675063,1059546140
1,253,12872621850,487899692,587051735
2,231,12344960921,457364940,574325829
3,228,11781321607,475996611,549353481
4,227,10059916535,462818231,1054423946
5,261,7708725642,20831818,526865050
6,251,7652731262,245378749,515301752
7,250,5711964665,244626927,327642458
8,230,5280909854,183867185,228896396
9,249,1610359572,257879716,290482361


**Group theo nhiều cột: mỗi loại media có bao nhiêu genre, trong đó có bao nhiêu track?**

In [10]:
query = """
        SELECT MediaTypeId, GenreId, count(trackid) as count
        FROM track
        -- WHERE MediaTypeId BETWEEN 3 and 5
        
        GROUP BY MediaTypeId, GenreId
        ORDER BY MediaTypeId ASC
        --LIMIT (10)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,MediaTypeId,GenreId,count
0,1,1,1211
1,1,2,127
2,1,3,374
3,1,4,332
4,1,5,12
5,1,6,81
6,1,7,578
7,1,8,58
8,1,9,14
9,1,10,42


## 1.2. Kết hợp `GROUP BY` và `HAVING`

**Lọc ra các album thuộc MediaTypeID=2 và có tổng thời gian nằm trong khoảng 500k - 1000k milisec**

In [11]:
query = """
        SELECT albumid, AVG(Milliseconds) as AVG_TIME, SUM(Milliseconds) as SUM_TIME
        FROM track
        WHERE MediaTypeId=2
        
        GROUP BY AlbumId
        HAVING SUM_TIME BETWEEN 500000 and 1000000
        
        ORDER BY AVG_TIME DESC
        -- LIMIT (10)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,AlbumId,AVG_TIME,SUM_TIME
0,294,596519.0,596519
1,279,582029.0,582029
2,330,567494.0,567494
3,312,561967.0,561967
4,301,560342.0,560342
5,299,545203.0,545203
6,311,526696.0,526696
7,292,522099.0,522099
8,273,501503.0,501503
9,173,347972.0,695944


# 2. Truy vấn dữ liệu từ nhiều tables

## 2.1. Các mệnh đề JOIN

**(1) Mỗi album là của artist nào?**

In [12]:
# Thử với JOIN, INNER JOIN, CROSS JOIN, LEFT JOIN
query = """
        SELECT title, name
        FROM album
        JOIN artist
            ON album.artistid = artist.artistid
        LIMIT(10)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,Title,Name
0,For Those About To Rock We Salute You,AC/DC
1,Balls to the Wall,Accept
2,Restless and Wild,Accept
3,Let There Be Rock,AC/DC
4,Big Ones,Aerosmith
5,Jagged Little Pill,Alanis Morissette
6,Facelift,Alice In Chains
7,Warner 25 Anos,Antônio Carlos Jobim
8,Plays Metallica By Four Cellos,Apocalyptica
9,Audioslave,Audioslave


In [13]:
# Đặt tên viết tắt cho bảng truy vấn
query = """
        SELECT title as 'Album Title', name as 'Artist Name'
        FROM album as a1
        JOIN artist as a2
            ON a1.artistID = a2.artistID
        --LIMIT(10)
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,Album Title,Artist Name
0,For Those About To Rock We Salute You,AC/DC
1,Balls to the Wall,Accept
2,Restless and Wild,Accept
3,Let There Be Rock,AC/DC
4,Big Ones,Aerosmith
...,...,...
342,Respighi:Pines of Rome,Eugene Ormandy
343,Schubert: The Late String Quartets & String Qu...,Emerson String Quartet
344,Monteverdi: L'Orfeo,"C. Monteverdi, Nigel Rogers - Chiaroscuro; Lon..."
345,Mozart: Chamber Music,Nash Ensemble


**(2) Mỗi artist có những albums nào?**

In [14]:
# Dùng JOIN với USING thay cho ON
query = """
        SELECT ar.name AS 'Artist Name', al.title AS 'Album Title'
        FROM Artist ar
        LEFT JOIN Album al 
            USING(artistid)
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,Artist Name,Album Title
0,AC/DC,For Those About To Rock We Salute You
1,AC/DC,Let There Be Rock
2,Accept,Balls to the Wall
3,Accept,Restless and Wild
4,Aerosmith,Big Ones
...,...,...
413,"Mela Tenenbaum, Pro Musica Prague & Richard Kapp","Locatelli: Concertos for Violin, Strings and C..."
414,Emerson String Quartet,Schubert: The Late String Quartets & String Qu...
415,"C. Monteverdi, Nigel Rogers - Chiaroscuro; Lon...",Monteverdi: L'Orfeo
416,Nash Ensemble,Mozart: Chamber Music


## 2.2. Kết hợp các lệnh khác trong SQL

**(1) Kết hợp ORDER BY**

In [15]:
query = """
        SELECT ar.name AS 'Artist Name', al.title AS 'Album Title'
        FROM Artist ar
        LEFT JOIN Album al
            ON ar.artistid = al.artistid
        ORDER BY ar.name ASC
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,Artist Name,Album Title
0,A Cor Do Som,
1,AC/DC,For Those About To Rock We Salute You
2,AC/DC,Let There Be Rock
3,Aaron Copland & London Symphony Orchestra,"A Copland Celebration, Vol. I"
4,Aaron Goldberg,Worlds
...,...,...
413,Xis,
414,Yehudi Menuhin,Bartok: Violin & Viola Concertos
415,Yo-Yo Ma,Bach: The Cello Suites
416,Youssou N'Dour,


**(2) Kết hợp GROUP BY**

**Mỗi artist có bao nhiêu album? 10 artist có số album nhiều nhất?**

In [16]:
# Cách 1: Dùng lệnh count trực tiếp
query = """
        SELECT ar.name AS 'Artist Name', count(al.artistid) as NoAlbums
        FROM Artist ar
        INNER JOIN Album al
            ON ar.artistid = al.artistid
        
        GROUP BY ar.name
        -- HAVING NoAlbums > 10
        ORDER BY NoAlbums DESC
        LIMIT (10)
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,Artist Name,NoAlbums
0,Iron Maiden,21
1,Led Zeppelin,14
2,Deep Purple,11
3,U2,10
4,Metallica,10
5,Ozzy Osbourne,6
6,Pearl Jam,5
7,Various Artists,4
8,Van Halen,4
9,Lost,4


## 2.3. Lồng ghép kết quả SELECT vào 1 SELECT khác

**Cách tạo các lệnh SELECT**

In [17]:
# Bước 1: Thử tạo truy vấn đếm xem mỗi artistID có bao nhiêu album
query = """
        SELECT al.artistid, count(al.albumid) as count
        FROM Album al
        GROUP BY al.artistid
        ORDER BY count DESC
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,ArtistId,count
0,90,21
1,22,14
2,58,11
3,50,10
4,150,10
...,...,...
199,271,1
200,272,1
201,273,1
202,274,1


In [18]:
# Bước 2: Nối kết quả trên với bảng Artist để lấy tên thay cho ArtistID
query = """
        SELECT ar.name, ar1.count
        FROM
            (SELECT artistid, count(albumid) as count
             FROM Album
             GROUP BY artistid
             ORDER BY count DESC
            ) ar1
        JOIN artist ar
            ON ar.artistid = ar1.artistid
        LIMIT(10)
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,Name,count
0,Iron Maiden,21
1,Led Zeppelin,14
2,Deep Purple,11
3,Metallica,10
4,U2,10
5,Ozzy Osbourne,6
6,Pearl Jam,5
7,Various Artists,4
8,Faith No More,4
9,Foo Fighters,4


**Có thể kết hợp thêm với các lệnh khác**

In [19]:
query = """ 
        SELECT ar.name, ar1.count
        FROM
            (SELECT artistid, count(albumid) as count
             FROM Album
             GROUP BY artistid
             ORDER BY count DESC
            ) ar1
        JOIN artist ar
            ON ar.artistid = ar1.artistid
        
        -- Ví dụ: kết hợp với WHERE
        WHERE count > 10
"""
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])
df

Unnamed: 0,Name,count
0,Iron Maiden,21
1,Led Zeppelin,14
2,Deep Purple,11


**Tìm tên của 10 album có số track nhiều nhất?**

In [20]:
query = """
        SELECT album.title, count
        FROM
            (SELECT albumid, count(trackid) as count
             FROM track
             GROUP BY AlbumId        
             ORDER BY count DESC
             LIMIT (10)
            ) AS tr
        JOIN album
            ON album.albumid = tr.albumid
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,Title,count
0,Greatest Hits,57
1,Minha Historia,34
2,Unplugged,30
3,"Lost, Season 3",26
4,"Lost, Season 1",25
5,"The Office, Season 3",25
6,My Way: The Best Of Frank Sinatra [Disc 1],24
7,"Lost, Season 2",24
8,"Battlestar Galactica (Classic), Season 1",24
9,Afrociberdelia,23


## 2.4. Các mệnh đề UNION

In [21]:
query = """
        SELECT LastName, FirstName, 'Emp' AS Type
        FROM employee
        --LIMIT (5)
        
        UNION
        SELECT LastName, FirstName, 'Cus'
        FROM Customer
        ORDER BY Type DESC
        
        --LIMIT (5)
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[item[0] for item in cursor.description])

Unnamed: 0,LastName,FirstName,Type
0,Adams,Andrew,Emp
1,Callahan,Laura,Emp
2,Edwards,Nancy,Emp
3,Johnson,Steve,Emp
4,King,Robert,Emp
...,...,...,...
62,Tremblay,François,Cus
63,Van der Berg,Johannes,Cus
64,Wichterlová,František,Cus
65,Wójcik,Stanisław,Cus


# 3. Demo: NoSQL

**Tạo một database sử dụng kiểu `dict`**

In [22]:
# Ví dụ một danh sách học viên đơn giản
{'danh_sach': [{'ten': "Nam", 'tuoi': 23}, {'ten': "Lan", 'tuoi': 26}]}

{'danh_sach': [{'ten': 'Nam', 'tuoi': 23}, {'ten': 'Lan', 'tuoi': 26}]}

In [23]:
# Ví dụ: kết hợp danh sách trên với các thông tin khác, tạo ra một tập phức tạp
db = {'lop': {'id': 7, 'ten': 'Data Science',
              'giang-vien': {'id':5, 'ten': 'Khai', 'tuoi': 33},
              'danh-sach':[{'hoc-vien': {'id':3, 'ten': 'Nam'}},
                           {'hoc-vien': {'id':5, 'ten': 'Lan'}}
                          ]
             }
     }
db

{'lop': {'id': 7,
  'ten': 'Data Science',
  'giang-vien': {'id': 5, 'ten': 'Khai', 'tuoi': 33},
  'danh-sach': [{'hoc-vien': {'id': 3, 'ten': 'Nam'}},
   {'hoc-vien': {'id': 5, 'ten': 'Lan'}}]}}

**Bản thân CSDL này là một dict với cặp key-value**

In [24]:
db.keys()

dict_keys(['lop'])

In [25]:
db.values()

dict_values([{'id': 7, 'ten': 'Data Science', 'giang-vien': {'id': 5, 'ten': 'Khai', 'tuoi': 33}, 'danh-sach': [{'hoc-vien': {'id': 3, 'ten': 'Nam'}}, {'hoc-vien': {'id': 5, 'ten': 'Lan'}}]}])

**Truy cập vào các phân cấp trong db**

In [26]:
db['lop']['giang-vien']['ten']

'Khai'

In [27]:
db['lop']['danh-sach'][0]

{'hoc-vien': {'id': 3, 'ten': 'Nam'}}

**Dùng phương thức `get`()**

In [28]:
print(db['lop'].get('giang-vien'))

{'id': 5, 'ten': 'Khai', 'tuoi': 33}


In [29]:
print(db['lop'].get('mentor'))

None
