In [23]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings("ignore")

In [25]:
import sys
sys.path.append("../../../") 

from utils.paths import make_dir_line

modality = 'c'
project = 'Intermediate SQL'
data = make_dir_line(modality, project)

raw = data('raw')

In [26]:
import sqlite3

conn = sqlite3.connect(":memory:")  ## aca se indica el nombre de la db.
cur = conn.cursor()

# 4.3.0 Aggregate Functions

In [27]:
conn.executescript(
    """
DROP TABLE IF EXISTS films;

CREATE TABLE films (
    id            INT,
    title         STRING,
    release_year  INT,
    country       STRING,
    duration      INT,
    language      STRING,
    certification STRING,
    gross         DOUBLE,
    budget        DOUBLE
);

DROP TABLE IF EXISTS people;

CREATE TABLE people (
    id        INT,
    name      STRING,
    birthdate STRING,
    deathdate STRING
);

DROP TABLE IF EXISTS reviews;

CREATE TABLE reviews (
    id             INT,
    film_id        INT,
    num_user       INT,
    num_critic     INT,
    imdb_score     DOUBLE,
    num_votes      DOUBLE,
    facebook_likes DOUBLE
);

DROP TABLE IF EXISTS roles;

CREATE TABLE roles (
    id        INT,
    film_id   INT,
    person_id INT,
    role      STRING
);

"""
)
conn.commit()

In [28]:
df = pd.read_csv(raw / 'films.csv', sep = ',', decimal = '.', header = None, encoding = 'utf-8')
df.columns = ['id','title','release_year','country','duration','language','certification','gross','budget']
films = list(zip(df.id, df.title, df.release_year, df.country, df.duration, df.language, df.certification, df.gross, df.budget))
cur.executemany("INSERT INTO films VALUES  (?,?,?,?,?,?,?,?,?)", films)

<sqlite3.Cursor at 0x7f93d20fcb40>

In [29]:
df = pd.read_csv(raw / 'people.csv', sep = ',', decimal = '.', header = None, encoding = 'utf-8')
df.columns = ['id','name','birthdate','deathdate']
people = list(zip(df.id, df.name, df.birthdate, df.deathdate))
cur.executemany("INSERT INTO people VALUES  (?,?,?,?)", people)

<sqlite3.Cursor at 0x7f93d20fcb40>

In [30]:
df = pd.read_csv(raw / 'reviews.csv', sep = ',', decimal = '.', header = 0, encoding = 'utf-8')
df.columns = ['id','film_id','num_user','num_critic','imdb_score','num_votes','facebook_likes']
df = df.loc[:,['id','film_id','num_user','num_critic','imdb_score','num_votes','facebook_likes']]
reviews = list(zip(df.id, df.film_id, df.num_user, df.num_critic, df.imdb_score, df.num_votes, df.facebook_likes))
cur.executemany("INSERT INTO reviews VALUES  (?,?,?,?,?,?,?)", reviews)

<sqlite3.Cursor at 0x7f93d20fcb40>

In [31]:
df = pd.read_csv(raw / 'roles.csv', sep = ',', decimal = '.', header = None, encoding = 'utf-8')
df.columns = ['id','film_id','person_id','role']
roles = list(zip(df.id, df.film_id, df.person_id, df.role))
cur.executemany("INSERT INTO roles VALUES  (?,?,?,?)", roles)

<sqlite3.Cursor at 0x7f93d20fcb40>

## 4.3.3 Practice with aggregate functions

In [32]:
# -- Query the sum of film durations
cur.execute("""
            
            SELECT SUM(duration) AS total_duration 
            FROM films;
            
            """).fetchall()

[(534882,)]

In [33]:
# -- Query the sum of film durations
cur.execute("""
            
            SELECT avg(duration) AS average_duration 
            FROM films;
            
            """).fetchall()

[(107.94793138244198,)]

In [34]:
# -- Find the latest release_year
cur.execute("""
            
            SELECT MAX(release_year) AS latest_year 
            FROM films;
            
            """).fetchall()

[(2016,)]

In [35]:
# -- Find the duration of the shortest film
cur.execute("""
            
            SELECT MIN(duration) AS shortest_film 
            FROM films;
            
            """).fetchall()

[(7,)]

## 4.3.5 Combining aggregate functions with WHERE

In [36]:
# -- Calculate the sum of gross from the year 2000 or later
cur.execute("""
            
            SELECT SUM(gross) AS total_gross 
            FROM films 
            WHERE release_year >= 2000;
            
            """).fetchall()

[(150900926358.0,)]

In [37]:
# -- Calculate the average gross of films that start with A
cur.execute("""
            
            SELECT AVG(gross) AS avg_gross_A 
            FROM films 
            WHERE title LIKE 'A%';
            
            """).fetchall()

[(47893236.42248062,)]

In [38]:
# -- Calculate the lowest gross film in 1994
cur.execute("""
            
            SELECT MIN(gross) AS lowest_gross 
            FROM films 
            WHERE release_year = 1994;
            
            """).fetchall()

[(125169.0,)]

In [39]:
# -- Calculate the highest gross film released between 2000-2012
cur.execute("""
            
            SELECT MAX(gross) AS highest_gross 
            FROM films 
            WHERE release_year BETWEEN 2000 AND 2012;
            
            """).fetchall()

[(760505847.0,)]

## 4.3.6 Using ROUND()

In [40]:
# -- Round the average number of facebook_likes to one decimal place
cur.execute("""
            
            SELECT ROUND(AVG(facebook_likes),1) AS avg_facebook_likes 
            FROM reviews;
            
            """).fetchall()

[(7802.9,)]

In [41]:
# -- Calculate the average budget rounded to the thousands
cur.execute("""
            
            SELECT ROUND(AVG(budget),-3) AS avg_budget_thousands 
            FROM films;
            
            """).fetchall()

[(39902826.0,)]

## 4.3.9 Aliasing with functions

In [42]:
# -- Calculate the title and duration_hours from films
cur.execute("""
            
            SELECT title, (duration/60.0) AS duration_hours 
            FROM films;
            
            """).fetchall()

[("Intolerance: Love's Struggle Throughout the Ages", 2.05),
 ('Over the Hill to the Poorhouse', 1.8333333333333333),
 ('The Big Parade', 2.5166666666666666),
 ('Metropolis', 2.4166666666666665),
 ("Pandora's Box", 1.8333333333333333),
 ('The Broadway Melody', 1.6666666666666667),
 ("Hell's Angels", 1.6),
 ('A Farewell to Arms', 1.3166666666666667),
 ('42nd Street', 1.4833333333333334),
 ('She Done Him Wrong', 1.1),
 ('It Happened One Night', 1.0833333333333333),
 ('Top Hat', 1.35),
 ('Modern Times', 1.45),
 ('The Charge of the Light Brigade', 1.6666666666666667),
 ('Snow White and the Seven Dwarfs', 1.3833333333333333),
 ('The Prisoner of Zenda', 1.6833333333333333),
 ("Alexander's Ragtime Band", 1.7666666666666666),
 ("You Can't Take It with You", 2.1),
 ('Gone with the Wind', 3.7666666666666666),
 ('Mr. Smith Goes to Washington', 2.0),
 ('The Wizard of Oz', 1.7),
 ('Boom Town', 1.9833333333333334),
 ('Fantasia', 2.0),
 ('Pinocchio', 1.4666666666666666),
 ('Rebecca', 2.16666666666666

In [43]:
# -- Calculate the percentage of people who are no longer alive
cur.execute("""
            
            SELECT COUNT(deathdate) * 100.0 / COUNT(*) AS percentage_dead 
            FROM people;
            
            """).fetchall()

[(9.372394902941526,)]

In [44]:
# -- Find the number of decades in the films table
cur.execute("""
            
            SELECT (MAX(release_year) - MIN(release_year)) / 10.0 AS number_of_decades 
            FROM films;
            
            """).fetchall()

[(10.0,)]

## 4.3.10 Rounding results

In [45]:
# -- Round duration_hours to two decimal places
cur.execute("""
            
            SELECT title, ROUND(duration / 60.0,2) AS duration_hours 
            FROM films;
            
            """).fetchall()

[("Intolerance: Love's Struggle Throughout the Ages", 2.05),
 ('Over the Hill to the Poorhouse', 1.83),
 ('The Big Parade', 2.52),
 ('Metropolis', 2.42),
 ("Pandora's Box", 1.83),
 ('The Broadway Melody', 1.67),
 ("Hell's Angels", 1.6),
 ('A Farewell to Arms', 1.32),
 ('42nd Street', 1.48),
 ('She Done Him Wrong', 1.1),
 ('It Happened One Night', 1.08),
 ('Top Hat', 1.35),
 ('Modern Times', 1.45),
 ('The Charge of the Light Brigade', 1.67),
 ('Snow White and the Seven Dwarfs', 1.38),
 ('The Prisoner of Zenda', 1.68),
 ("Alexander's Ragtime Band", 1.77),
 ("You Can't Take It with You", 2.1),
 ('Gone with the Wind', 3.77),
 ('Mr. Smith Goes to Washington', 2.0),
 ('The Wizard of Oz', 1.7),
 ('Boom Town', 1.98),
 ('Fantasia', 2.0),
 ('Pinocchio', 1.47),
 ('Rebecca', 2.17),
 ('The Blue Bird', 1.38),
 ('How Green Was My Valley', 1.97),
 ('Bambi', 1.17),
 ('Casablanca', 1.37),
 ('A Guy Named Joe', 2.03),
 ('Bathing Beauty', 1.68),
 ('Spellbound', 1.58),
 ('State Fair', 1.67),
 ('The Lost Wee

In [46]:
print('Ok_')

Ok_
