# Overview

<!-- file:///home/strokach/documents/teaching/csc343/2018-fall/slides/SQL-DML.pdf#page=43 -->

I'm covering slides 41-90 for Sina.

- Table joins:
  - Cross join vs. natural join vs. theta join.
  - Inner join vs. full / left / right outer join.


- Impact of having null values:
  - Be especially careful when the columns used in a `JOIN` or `WHERE` condition contain nulls.


- Subqueries:
  - In `FROM`
  - In `WHERE` (`ANY`, `ALL`, `IN`, `EXISTS`).

# Imports

In [5]:
import os

import pandas as pd
import sqlalchemy as sa

In [6]:
# Execute the companion notebook inside this kernel. Presumably this is what
# registers the `%%sql` cell magic used by the cells below — TODO confirm.
%run sql_magic.ipynb

<IPython.core.display.Javascript object>

In [7]:
# Identifier for this notebook. It is not referenced by any other visible cell
# (possibly read by the code loaded from sql_magic.ipynb — TODO confirm).
NOTEBOOK_NAME = "lecture_5"

# Start database

Install PostgreSQL locally, start a Docker container, or use a hosted PostgreSQL service (SaaS) such as ElephantSQL.

In [None]:
# !docker run --name csc343-postgres -e POSTGRES_PASSWORD=postgres-pass -d -p 5432:5432 postgres:9.5

# Configure database

In [8]:
# Connection URL for a local PostgreSQL server; the password and port match
# the `docker run` command in the "Start database" section.
# Note: the next cell assigns DB_URL again.
DB_URL = "postgresql://postgres:postgres-pass@localhost:5432"

In [9]:
# ElephantSQL: PostgreSQL as a Service (https://www.elephantsql.com/)
# A hosted connection string embeds a live username and password, so it must
# not be committed with the notebook. Export it as CSC343_DB_URL before
# starting Jupyter; when the variable is unset, the local PostgreSQL URL is
# used instead.
DB_URL = os.environ.get(
    "CSC343_DB_URL",
    "postgresql://postgres:postgres-pass@localhost:5432",
)

## Set up `University` schema

In [10]:
# Run the schema DDL script through psql. stdout is sent to /dev/null, so only
# what psql writes to stderr (NOTICE and error messages) shows up as output.
!psql {DB_URL} -f data/University/coursesDDL.txt >/dev/null

psql:data/University/coursesDDL.txt:1: NOTICE:  drop cascades to 9 other objects
DETAIL:  drop cascades to type university.grade
drop cascades to type university.cgpa
drop cascades to type university.campus
drop cascades to type university.department
drop cascades to table university.student
drop cascades to table university.course
drop cascades to table university.offering
drop cascades to table university.took
drop cascades to table university.student_2


In [11]:
# Run the data script through psql (presumably the statements that populate the
# tables — TODO confirm). stdout is discarded; errors would still be shown.
!psql {DB_URL} -f data/University/coursesData.txt >/dev/null

## Create database engine

In [12]:
# Every connection made by this engine passes `-csearch_path=University` as a
# connection option, so the queries below can use unqualified table names.
engine = sa.create_engine(
    DB_URL, connect_args=dict(options="-csearch_path=University")
)

In [13]:
# `Engine.table_names()` is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the inspector returns the same list of table names for the default schema.
sa.inspect(engine).get_table_names()

['student', 'course', 'offering', 'took']

# Examples from lecture

## Avoid natural joins

In [279]:
%%sql
-- One row per course taken by each student: the student id, the offering id
-- and the instructor of that offering.
-- NATURAL JOIN matches rows on every column name the two inputs share.
SELECT sID, oID, instructor
FROM Student
    NATURAL JOIN Took
    NATURAL JOIN Offering
LIMIT 5;

Unnamed: 0,sid,oid,instructor
0,99132,1,Horton
1,99132,11,Zorich
2,99132,14,Percy
3,99132,15,Reisman
4,99132,16,Atwood


In [280]:
%%sql
-- Peek at the first two rows of Offering to see which columns it has.
SELECT *
FROM Offering
LIMIT 2;

Unnamed: 0,oid,cnum,dept,term,instructor
0,1,343,CSC,20089,Horton
1,2,343,CSC,20089,Truta


In [281]:
%%sql
-- Give Offering a nullable `campus` column. Student already has a column with
-- that name, so a NATURAL JOIN involving both would now also match on campus.
ALTER TABLE Offering
    ADD COLUMN campus VARCHAR(255) DEFAULT NULL;

This result object does not return rows. It has been closed automatically.


In [282]:
%%sql
-- Remove the extra `campus` column again, restoring Offering's original columns.
ALTER TABLE Offering
    DROP COLUMN campus;

This result object does not return rows. It has been closed automatically.


## Dangling tuples

## Null is special

In [22]:
%%sql
-- Show the whole Student table as the baseline for the null experiments.
SELECT *
FROM Student;

Unnamed: 0,sid,firstname,surname,campus,email,cgpa
0,99132,Avery,Marchmount,StG,avery@cs,3.13
1,98000,William,Fairgrieve,StG,will@cs,4.0
2,99999,Afsaneh,Ali,UTSC,aali@cs,2.98
3,157,Leilani,Lakemeyer,UTM,lani@cs,3.42
4,11111,Homer,Simpson,StG,doh@gmail,0.4


In [23]:
%%sql
-- Rebuild student_2 as a copy of Student, then set the cgpa of two students
-- (157 and 11111) to null so the cells below have nulls to work with.
DROP TABLE IF EXISTS student_2;
CREATE TABLE student_2 AS
    SELECT * FROM Student;
UPDATE student_2
SET cgpa = NULL
WHERE sid IN (157, 11111);

This result object does not return rows. It has been closed automatically.


In [341]:
%%sql
-- Show student_2, including the rows whose cgpa is now null.
SELECT *
FROM student_2;

Unnamed: 0,sid,firstname,surname,campus,email,cgpa
0,99132,Avery,Marchmount,StG,avery@cs,3.13
1,98000,William,Fairgrieve,StG,will@cs,4.0
2,99999,Afsaneh,Ali,UTSC,aali@cs,2.98
3,157,Leilani,Lakemeyer,UTM,lani@cs,
4,11111,Homer,Simpson,StG,doh@gmail,


In [342]:
%%sql
-- AVG skips nulls: only the non-null cgpa values contribute to the mean.
SELECT AVG(cgpa)
FROM student_2;

Unnamed: 0,avg
0,3.37


In [344]:
%%sql
-- DISTINCT treats nulls as duplicates of each other: they collapse to one row.
SELECT DISTINCT cgpa
FROM student_2;

Unnamed: 0,cgpa
0,
1,2.98
2,4.0
3,3.13


In [346]:
%%sql
-- COUNT(DISTINCT ...) ignores nulls, so only non-null values are counted.
SELECT COUNT(DISTINCT cgpa)
FROM student_2;

Unnamed: 0,count
0,3


In [352]:
%%sql
-- Set operations also treat nulls as duplicates of each other.
-- Swap UNION for one of the commented operators to compare the results.
SELECT cgpa FROM student_2
UNION
-- INTERSECT
-- EXCEPT
SELECT cgpa FROM student_2;

Unnamed: 0,cgpa
0,
1,3.13
2,2.98
3,4.0


In [353]:
%%sql
-- The two conditions look exhaustive, but for a null cgpa both comparisons are
-- unknown, so those rows are dropped unless the commented condition is added.
SELECT *
FROM student_2
WHERE cgpa <= 3.6
   OR cgpa > 3.6
-- OR cgpa IS NULL;

Unnamed: 0,sid,firstname,surname,campus,email,cgpa
0,99132,Avery,Marchmount,StG,avery@cs,3.13
1,98000,William,Fairgrieve,StG,will@cs,4.0
2,99999,Afsaneh,Ali,UTSC,aali@cs,2.98


In [290]:
%%sql
-- Nulls are skipped in joins: null = null is not true, so rows with a null
-- cgpa never find a partner. The commented ON clause pairs them explicitly.
SELECT *
FROM student_2 AS s1
JOIN student_2 AS s2
    ON s1.cgpa = s2.cgpa
--  ON (s1.cgpa = s2.cgpa OR (s1.cgpa IS NULL AND s2.cgpa IS NULL))

Unnamed: 0,sid,firstname,surname,campus,email,cgpa,sid.1,firstname.1,surname.1,campus.1,email.1,cgpa.1
0,99132,Avery,Marchmount,StG,avery@cs,3.13,99132,Avery,Marchmount,StG,avery@cs,3.13
1,98000,William,Fairgrieve,StG,will@cs,4.0,98000,William,Fairgrieve,StG,will@cs,4.0
2,99999,Afsaneh,Ali,UTSC,aali@cs,2.98,99999,Afsaneh,Ali,UTSC,aali@cs,2.98


In [291]:
%%sql
-- Require the cgpa values stored in student_2 to be unique.
ALTER TABLE student_2
    ADD CONSTRAINT unqiue_cgpa UNIQUE (cgpa);

This result object does not return rows. It has been closed automatically.


In [292]:
%%sql
-- Can't insert a tuple with a duplicate value for cgpa: 3.13 is already
-- present, so the unique constraint rejects this row (the error is expected).
insert into student_2 values (1, 'Hello', 'World', 'StG', null, 3.13);

(psycopg2.IntegrityError) duplicate key value violates unique constraint "unqiue_cgpa"
DETAIL:  Key (cgpa)=(3.13) already exists.
 [SQL: "insert into student_2 values (1, 'Hello', 'World', 'StG', null, 3.13);"]


In [293]:
%%sql
-- **Can** insert multiple tuples with cgpa = null: two nulls are never
-- considered equal, so they do not violate the unique constraint.
INSERT INTO student_2
VALUES (1, 'Hello', 'World', 'StG', NULL, NULL);

This result object does not return rows. It has been closed automatically.


In [294]:
%%sql
-- Confirm which of the attempted inserts actually made it into student_2.
SELECT * FROM student_2;

Unnamed: 0,sid,firstname,surname,campus,email,cgpa
0,99132,Avery,Marchmount,StG,avery@cs,3.13
1,98000,William,Fairgrieve,StG,will@cs,4.0
2,99999,Afsaneh,Ali,UTSC,aali@cs,2.98
3,157,Leilani,Lakemeyer,UTM,lani@cs,
4,11111,Homer,Simpson,StG,doh@gmail,
5,1,Hello,World,StG,,


https://www.postgresql.org/docs/8.2/static/ddl-constraints.html#AEN2058

> In general, a unique constraint is violated when there are two or more rows in the table where the values of all of the columns included in the constraint are equal. However, two null values are not considered equal in this comparison. **That means even in the presence of a unique constraint it is possible to store duplicate rows that contain a null value in at least one of the constrained columns. This behavior conforms to the SQL standard, but we have heard that other SQL databases may not follow this rule.** So be careful when developing applications that are intended to be portable.



## Subqueries

### Worksheet, Q1

In [250]:
%%sql
-- Subquery in FROM: narrow Offering down to Horton's offerings first, then
-- pair each of them with the Took rows for that offering.
SELECT sid, dept || cnum AS course, grade
FROM Took
JOIN (
    SELECT *
    FROM Offering
    WHERE instructor = 'Horton'
) AS Hoffering
    ON Took.oid = Hoffering.oid;

Unnamed: 0,sid,course,grade
0,99132,CSC343,79
1,98000,CSC343,82
2,98000,CSC263,78
3,99999,CSC343,89
4,157,CSC343,99


### Worksheet, Q2

In [24]:
%%sql
-- Scalar subquery in WHERE: students whose cgpa is higher than the cgpa of
-- student 99999 (swap in 11111 to compare against a different student).
SELECT sid, surname
FROM Student
WHERE cgpa > (
    SELECT cgpa
    FROM Student
    WHERE sid = 99999  -- 11111
);

Unnamed: 0,sid,surname
0,99132,Marchmount
1,98000,Fairgrieve
2,157,Lakemeyer


In [30]:
%%sql
-- Reminder of which students have a null cgpa in student_2.
SELECT *
FROM student_2;

Unnamed: 0,sid,firstname,surname,campus,email,cgpa
0,99132,Avery,Marchmount,StG,avery@cs,3.13
1,98000,William,Fairgrieve,StG,will@cs,4.0
2,99999,Afsaneh,Ali,UTSC,aali@cs,2.98
3,157,Leilani,Lakemeyer,UTM,lani@cs,
4,11111,Homer,Simpson,StG,doh@gmail,


In [28]:
%%sql
-- Be careful with nulls: the subquery yields the null cgpa that student 11111
-- has in student_2, so `cgpa > null` is unknown for every row and the result
-- is empty.
SELECT sid, surname
FROM Student
WHERE cgpa > (
    SELECT cgpa
    FROM student_2
    WHERE sid = 11111
);

Unnamed: 0,sid,surname


### The operator `ANY` / `ALL`

In [35]:
%%sql
-- Students whose cgpa is strictly greater than the cgpa of every StG student.
-- The comparison must hold against all rows the subquery returns.
SELECT sid, surname
FROM Student
WHERE cgpa > ALL (
    SELECT cgpa
    FROM Student
    WHERE campus = 'StG'
);

Unnamed: 0,sid,surname


In [338]:
%%sql
-- Look at the raw cgpa values per campus to interpret the ALL query.
SELECT *
FROM Student

Unnamed: 0,sid,firstname,surname,campus,email,cgpa
0,99132,Avery,Marchmount,StG,avery@cs,3.13
1,98000,William,Fairgrieve,StG,will@cs,4.0
2,99999,Afsaneh,Ali,UTSC,aali@cs,2.98
3,157,Leilani,Lakemeyer,UTM,lani@cs,3.42
4,11111,Homer,Simpson,StG,doh@gmail,0.4


### Worksheet, Q3

In [31]:
%%sql
-- Grades of 80 or more, in any course (cnum, dept) that Lakemeyer has taken.
-- The joins name their key columns explicitly (USING) instead of relying on
-- NATURAL JOIN, so the result cannot silently change if these tables later
-- gain another column with a shared name.
-- NOTE(review): assumes Took shares only `oid` with Offering and only `sid`
-- with Student — confirm against the schema.
SELECT sid, dept || cnum AS course, grade
FROM Took
JOIN Offering USING (oid)
WHERE grade >= 80
  AND (cnum, dept) IN (
    SELECT cnum, dept
    FROM Took
    JOIN Offering USING (oid)
    JOIN Student USING (sid)
    WHERE surname = 'Lakemeyer'
);

Unnamed: 0,sid,course,grade
0,157,CSC343,99
1,99999,CSC343,89
2,98000,CSC343,82
3,157,CSC343,82
4,98000,CSC207,89
5,157,CSC148,89
6,98000,CSC148,89
7,99999,CSC148,91
8,98000,CSC148,93
9,99999,ANT200,99


### The Operator `EXISTS`

In [None]:
%%sql
-- Correlated subquery: keep each student for whom at least one Took row with
-- a grade above 85 exists.
SELECT surname, cgpa
FROM Student
WHERE EXISTS (
    SELECT *
    FROM Took
    WHERE Took.sid = Student.sid
      AND grade > 85
);

### Worksheet, Q5

In [33]:
%%sql
-- Instructors of an offering for which no other Offering row (different oid)
-- has the same instructor.
SELECT instructor
FROM Offering AS Off1
WHERE NOT EXISTS (
    SELECT *
    FROM Offering AS Off2
    WHERE Off2.instructor = Off1.instructor
      AND Off2.oid <> Off1.oid
);

Unnamed: 0,instructor
0,Truta
1,Heap
2,Chechik
3,Davies
4,Johancsik
5,Reisman
6,Dow
7,Miller
8,Mendel
9,Richler


### Worksheet, Q6

In [36]:
%%sql
-- Offerings taken by at least one student who also took some *other* offering
-- whose department is CSC.
SELECT DISTINCT oid
FROM Took
WHERE EXISTS (
    SELECT *
    FROM Took AS t
    JOIN Offering AS o
        ON o.oid = t.oid
    WHERE t.sid = Took.sid
      AND t.oid <> Took.oid
      AND o.dept = 'CSC'
);

Unnamed: 0,oid
0,14
1,34
2,27
3,8
4,17
5,28
6,15
7,1
8,26
9,31
