# SQL Joins

These are my notes on SQL joins. I will be using Python's `sqlite3` module along with pandas to run the queries.


(c) Miradiz Rakhmatov

In [1]:
import pandas as pd 
import sqlite3 

In [2]:
## open one shared connection to the factbook SQLite file; the queries below reuse it
con = sqlite3.connect(database='data/factbook.db')

def run(query, params=None, connection=None):
    """Execute an SQL query and return the result set as a pandas DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    params : sequence or mapping, optional
        Values bound to the placeholders in ``query``. Passing values here
        instead of formatting them into the SQL text lets the database
        driver handle quoting.
    connection : optional
        Database connection to run the query on. When omitted, the
        notebook-level ``con`` connection is used.

    Returns
    -------
    pandas.DataFrame
        One row per result row, one column per selected column.
    """
    # Look up the global connection only when the caller did not supply one.
    target = con if connection is None else connection
    return pd.read_sql(query, target, params=params)

# Schema diagram:
![](data/schema.svg)

## 1) Query to explore facts table:

In [3]:
## Select every column of every row in the facts table.
## Only the first rows returned by DataFrame.head() are displayed.
q1 = '''
SELECT *
FROM facts
'''

run(q1).head()

Unnamed: 0,id,code,name,area,area_land,area_water,population,population_growth,birth_rate,death_rate,migration_rate
0,1,af,Afghanistan,652230.0,652230.0,0.0,32564342.0,2.32,38.57,13.89,1.51
1,2,al,Albania,28748.0,27398.0,1350.0,3029278.0,0.3,12.92,6.58,3.3
2,3,ag,Algeria,2381741.0,2381741.0,0.0,39542166.0,1.84,23.67,4.31,0.92
3,4,an,Andorra,468.0,468.0,0.0,85580.0,0.12,8.13,6.96,0.0
4,5,ao,Angola,1246700.0,1246700.0,0.0,19625353.0,2.78,38.78,11.49,0.46


## 2) Query to explore cities table:

In [4]:
## Select every column of every row in the cities table.
## facts_id is the column later joined against facts.id.
q2 = '''
SELECT *
FROM cities
'''

run(q2).head()

Unnamed: 0,id,name,population,capital,facts_id
0,1,Oranjestad,37000,1,216
1,2,Saint John'S,27000,1,6
2,3,Abu Dhabi,942000,1,184
3,4,Dubai,1978000,0,184
4,5,Sharjah,983000,0,184


## Refer to the diagram below to identify each join type that I'm going to demonstrate

# Types of joins 
![](data/joins.png)

# Note:
## SQLite versions before 3.39.0 don't support RIGHT and FULL OUTER joins (support was added in 3.39.0); these joins are also used less often in practice

## 3) Query to inner join cities with facts.
* Include all the columns from cities
* Include the country name (the `name` column) from the facts table

## INNER JOIN:

In [5]:
## A plain JOIN is an INNER JOIN: each cities row is paired with the facts row
## whose id equals the city's facts_id. All cities columns are returned, plus
## the facts name column exposed as country_name.
q3 = '''
SELECT 
    c.*, 
    f.name AS country_name
FROM cities c
JOIN facts f ON c.facts_id=f.id
'''

run(q3)


Unnamed: 0,id,name,population,capital,facts_id,country_name
0,1,Oranjestad,37000,1,216,Aruba
1,2,Saint John'S,27000,1,6,Antigua and Barbuda
2,3,Abu Dhabi,942000,1,184,United Arab Emirates
3,4,Dubai,1978000,0,184,United Arab Emirates
4,5,Sharjah,983000,0,184,United Arab Emirates
...,...,...,...,...,...,...
392,393,Mbabane,66000,1,167,Swaziland
393,394,Sanaa,2419000,1,193,Yemen
394,395,Aden,784000,0,193,Yemen
395,396,Lusaka,1802000,1,194,Zambia


##  4) Query to inner join the two tables to return country names with their corresponding capital names 
## INNER JOIN:

In [6]:
## Inner join of facts with cities on facts.id = cities.facts_id.
## The WHERE clause keeps only the city rows whose capital flag is 1, so each
## returned row is a country name paired with a capital city name.
q4 = '''
SELECT 
    f.name AS country, 
    c.name AS capital_city
FROM facts f
JOIN cities c ON f.id=c.facts_id
WHERE c.capital = 1
'''
run(q4)

Unnamed: 0,country,capital_city
0,Aruba,Oranjestad
1,Antigua and Barbuda,Saint John'S
2,United Arab Emirates,Abu Dhabi
3,Afghanistan,Kabul
4,Algeria,Algiers
...,...,...
203,Samoa,Apia
204,Swaziland,Mbabane
205,Yemen,Sanaa
206,Zambia,Lusaka


## 5) Query to left join the tables to show BOTH the countries that HAVE matching rows in the cities table and the ones that DON'T
## LEFT INCLUSIVE JOIN (Regular left join):

In [7]:
## LEFT JOIN keeps every facts row. A facts row with matching cities appears
## once per city; a facts row with no matching city appears once with c.name
## set to NULL.
q5 = '''
SELECT 
    f.name AS country,
    c.name AS city
FROM facts f
LEFT JOIN cities c ON f.id=c.facts_id
'''
run(q5)

## facts rows that have no matching row in the cities table (e.g. the oceans in the output) have their city shown as "None" (NULL values)

Unnamed: 0,country,city
0,Afghanistan,Kabul
1,Albania,Tirana
2,Algeria,Algiers
3,Algeria,Oran
4,Andorra,Andorra La Vella
...,...,...
443,Atlantic Ocean,
444,Indian Ocean,
445,Pacific Ocean,
446,Southern Ocean,


## 6) Query to left join the tables to show ONLY the countries that have no matching row in the cities table
## LEFT EXCLUSIVE JOIN: please refer to the Venn diagram above for clarification


In [13]:
## Left exclusive join: after the LEFT JOIN, facts rows with no matching city
## have NULL in every cities column, so filtering on c.facts_id IS NULL keeps
## only those unmatched facts rows.
q6 = '''
SELECT 
    f.name AS country,
    c.name
FROM facts f
LEFT JOIN cities c ON f.id=c.facts_id
WHERE c.facts_id IS NULL
'''
## Execute the query once and reuse the DataFrame for both the row count and
## the preview of the last rows.
unmatched = run(q6)
print("Number of countries that don't have corresponding city in cities table:", len(unmatched) )
unmatched.tail()

Number of countries that don't have corresponding city in cities table: 51


Unnamed: 0,country,name
46,Atlantic Ocean,
47,Indian Ocean,
48,Pacific Ocean,
49,Southern Ocean,
50,World,


## 7) Query that returns 10 CAPITAL cities with the highest population ranked from biggest to smallest population:

In [9]:
## Inner join restricted to capital cities (capital = 1).
## ORDER BY 3 DESC sorts by the third selected column (c.population), largest
## first; LIMIT 10 keeps the first ten rows of that ordering.
q7='''
SELECT 
    c.name city, 
    f.name country, 
    c.population
FROM cities c
JOIN facts f ON c.facts_id=f.id
WHERE c.capital = 1
ORDER BY 3 DESC
LIMIT 10
'''

run(q7)

Unnamed: 0,city,country,population
0,Tokyo,Japan,37217000
1,New Delhi,India,22654000
2,Mexico City,Mexico,20446000
3,Beijing,China,15594000
4,Dhaka,Bangladesh,15391000
5,Buenos Aires,Argentina,13528000
6,Manila,Philippines,11862000
7,Moscow,Russia,11621000
8,Cairo,Egypt,11169000
9,Jakarta,Indonesia,9769000


## 8) Query that returns CAPITAL cities with populations of OVER 10 million ordered from largest to smallest.

This time I will be using a subquery.



In [10]:
## The subquery (aliased sub) filters cities down to capitals with a population
## above 10,000,000 before the join; facts is then inner-joined to it on
## facts.id = sub.facts_id. ORDER BY 3 DESC sorts by the third selected column
## (sub.population), largest first.
q8 = '''
SELECT 
    sub.city, 
    f.name country, 
    sub.population
FROM facts f
JOIN 
    (SELECT 
        name AS city, 
        population, 
        facts_id
    FROM cities
    WHERE capital = 1 AND population > 10000000) sub  ON f.id = sub.facts_id
ORDER BY 3 DESC
'''

run(q8)

Unnamed: 0,city,country,population
0,Tokyo,Japan,37217000
1,New Delhi,India,22654000
2,Mexico City,Mexico,20446000
3,Beijing,China,15594000
4,Dhaka,Bangladesh,15391000
5,Buenos Aires,Argentina,13528000
6,Manila,Philippines,11862000
7,Moscow,Russia,11621000
8,Cairo,Egypt,11169000


## 9) Query to find the countries where the combined population of their cities in the cities table is more than half of the country's total population:

In [11]:
## GROUP BY 1 groups the joined rows by the first selected column (f.name), and
## SUM(c.population) totals the city populations within each group.
## CAST(f.population AS FLOAT) makes the division floating-point, so urban_pct
## is a fraction (e.g. 0.5 means half), not a percentage.
## HAVING keeps only the groups whose city total exceeds half of f.population;
## ORDER BY 4 sorts by the fourth selected column (urban_pct), ascending.
## NOTE(review): f.population is selected and used in HAVING without being
## aggregated or listed in GROUP BY. SQLite accepts this; other SQL engines
## may reject it -- confirm before porting.
q9='''
SELECT 
    f.name, 
    SUM(c.population) pop_of_major_cities, 
    f.population country_population, 
    SUM(c.population)/CAST(f.population AS FLOAT) urban_pct
FROM cities c
JOIN facts f ON c.facts_id=f.id
GROUP BY 1
HAVING SUM(c.population) > f.population/2
ORDER BY 4 
'''
run(q9)

Unnamed: 0,name,pop_of_major_cities,country_population,urban_pct
0,Uruguay,1672000,3341893,0.500315
1,"Congo, Republic of the",2445000,4755097,0.514185
2,Brunei,241000,429646,0.560927
3,New Caledonia,157000,271615,0.578024
4,Virgin Islands,60000,103574,0.579296
5,Falkland Islands (Islas Malvinas),2000,3361,0.595061
6,Djibouti,496000,828324,0.5988
7,Australia,13789000,22751014,0.606083
8,Iceland,206000,331918,0.620635
9,Israel,5226000,8049314,0.649248


## 10) The same query as above, this time using a subquery:


In [12]:
## The subquery (aliased c) pre-aggregates cities into one row per facts_id
## with the summed city population (urban_pop); facts is then inner-joined to
## it on facts.id. urban_pct is urban_pop divided by the country population
## cast to FLOAT, i.e. a fraction rather than a percentage.
## ORDER BY 4 ASC sorts by the fourth selected column (urban_pct), ascending.
## NOTE(review): the WHERE clause refers to the SELECT alias urban_pct. SQLite
## accepts this; standard SQL does not guarantee it -- confirm before porting.
q10='''
SELECT
    f.name country,
    c.urban_pop,
    f.population total_pop,
    (c.urban_pop / CAST(f.population AS FLOAT)) urban_pct
FROM facts f
INNER JOIN (
            SELECT
                facts_id,
                SUM(population) urban_pop
            FROM cities
            GROUP BY 1) c ON c.facts_id = f.id
WHERE urban_pct > 0.5
ORDER BY 4 ASC;
'''

run(q10)

Unnamed: 0,country,urban_pop,total_pop,urban_pct
0,Uruguay,1672000,3341893,0.500315
1,"Congo, Republic of the",2445000,4755097,0.514185
2,Brunei,241000,429646,0.560927
3,New Caledonia,157000,271615,0.578024
4,Virgin Islands,60000,103574,0.579296
5,Falkland Islands (Islas Malvinas),2000,3361,0.595061
6,Djibouti,496000,828324,0.5988
7,Australia,13789000,22751014,0.606083
8,Iceland,206000,331918,0.620635
9,Israel,5226000,8049314,0.649248


# THE END 