In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np

import warnings

# Set both pandas display limits to None so that displayed frames are not
# truncated by column count or by row count.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Install a blanket "ignore" filter: warnings are not shown from here on.
warnings.filterwarnings("ignore")

In [4]:
import sys

# Add the parent directory to the import search path so the `utils` package
# imported below can be found.
sys.path.append("../")

from utils.paths import make_dir_line

# Arguments handed to the project path helper.
# NOTE(review): the meaning of modality 'c' is defined by utils.paths, which is
# not visible from this notebook - confirm there.
modality, project = 'c', 'Intermediate SQL'

# `data` is called with a stage name and returns a path-like object; the loader
# cells join file names onto `raw` with the `/` operator.
data = make_dir_line(modality, project)
raw = data('raw')

In [5]:
import sqlite3

# The argument to connect() is the database name. ":memory:" is SQLite's
# special name for a database that lives in RAM instead of in a file.
conn = sqlite3.connect(database=":memory:")

# Cursor shared by the loading and query cells below.
cur = conn.cursor()

# 3.4.0 Sorting and Grouping

In [6]:
# (Re)create the four empty tables used by the exercises: films, people,
# reviews and roles. Each table is dropped first so the cell can be re-run.
#
# Text columns are declared TEXT, not STRING. SQLite derives a column's type
# affinity from the declared type name, and "STRING" matches none of the text
# rules (CHAR / CLOB / TEXT), so it falls through to NUMERIC affinity. With
# NUMERIC affinity a text value that looks like a number - for example a film
# title consisting only of digits - is converted to a number when inserted.
# Declaring the columns TEXT keeps every such value as text.
conn.executescript(
    """
DROP TABLE IF EXISTS films;

CREATE TABLE films (
    id            INT,
    title         TEXT,
    release_year  INT,
    country       TEXT,
    duration      INT,
    language      TEXT,
    certification TEXT,
    gross         DOUBLE,
    budget        DOUBLE
);

DROP TABLE IF EXISTS people;

CREATE TABLE people (
    id        INT,
    name      TEXT,
    birthdate TEXT,
    deathdate TEXT
);

DROP TABLE IF EXISTS reviews;

CREATE TABLE reviews (
    id             INT,
    film_id        INT,
    num_user       INT,
    num_critic     INT,
    imdb_score     DOUBLE,
    num_votes      DOUBLE,
    facebook_likes DOUBLE
);

DROP TABLE IF EXISTS roles;

CREATE TABLE roles (
    id        INT,
    film_id   INT,
    person_id INT,
    role      TEXT
);

"""
)
# Commit explicitly once the schema script has run.
conn.commit()

In [7]:
# Read films.csv with header=None (no line is used as a header), then name the
# columns explicitly.
df = pd.read_csv(
    raw / 'films.csv',
    sep=',',
    decimal='.',
    header=None,
    encoding='utf-8',
)
df.columns = [
    'id', 'title', 'release_year', 'country', 'duration',
    'language', 'certification', 'gross', 'budget',
]

# One tuple per CSV row, with the values in column order, ready to bind to the
# nine positional placeholders of the INSERT below.
films = list(zip(*(df[column] for column in df.columns)))
cur.executemany("INSERT INTO films VALUES  (?,?,?,?,?,?,?,?,?)", films)

<sqlite3.Cursor at 0x20fcf267140>

In [8]:
# Read people.csv with header=None (no line is used as a header), then name the
# columns explicitly.
df = pd.read_csv(
    raw / 'people.csv',
    sep=',',
    decimal='.',
    header=None,
    encoding='utf-8',
)
df.columns = ['id', 'name', 'birthdate', 'deathdate']

# One tuple per CSV row, with the values in column order, ready to bind to the
# four positional placeholders of the INSERT below.
people = list(zip(*(df[column] for column in df.columns)))
cur.executemany("INSERT INTO people VALUES  (?,?,?,?)", people)

<sqlite3.Cursor at 0x20fcf267140>

In [9]:
# Read reviews.csv. With header=0 the first line of the file is consumed as the
# header, and the names it provides are then replaced by the assignment below.
# NOTE(review): the other loaders in this notebook use header=None - confirm
# that reviews.csv really starts with a header line, otherwise its first data
# row is being discarded here.
df = pd.read_csv(
    raw / 'reviews.csv',
    sep=',',
    decimal='.',
    header=0,
    encoding='utf-8',
)
df.columns = [
    'id', 'film_id', 'num_user', 'num_critic',
    'imdb_score', 'num_votes', 'facebook_likes',
]

# One tuple per CSV row, with the values in column order, ready to bind to the
# seven positional placeholders of the INSERT below.
reviews = list(zip(*(df[column] for column in df.columns)))
cur.executemany("INSERT INTO reviews VALUES  (?,?,?,?,?,?,?)", reviews)

<sqlite3.Cursor at 0x20fcf267140>

In [10]:
# Read roles.csv with header=None (no line is used as a header), then name the
# columns explicitly.
df = pd.read_csv(
    raw / 'roles.csv',
    sep=',',
    decimal='.',
    header=None,
    encoding='utf-8',
)
df.columns = ['id', 'film_id', 'person_id', 'role']

# One tuple per CSV row, with the values in column order, ready to bind to the
# four positional placeholders of the INSERT below.
roles = list(zip(*(df[column] for column in df.columns)))
cur.executemany("INSERT INTO roles VALUES  (?,?,?,?)", roles)

<sqlite3.Cursor at 0x20fcf267140>

## 3.4.2 Sorting single fields

In [11]:
# -- Select name from people and sort by name in ascending order (the ORDER BY default).
# The comparison is case-sensitive: in the output 'AJ Michalka' comes before 'Aaliyah'.
# fetchall() returns every matching row, each as a one-element tuple.
cur.execute("SELECT name FROM people ORDER BY name;").fetchall()

[('50 Cent',),
 ('A. Michael Baldwin',),
 ('A. Raven Cruz',),
 ('A.J. Buckley',),
 ('A.J. DeLucia',),
 ('A.J. Langer',),
 ('AJ Michalka',),
 ('Aaliyah',),
 ('Aaron Ashmore',),
 ('Aaron Hann',),
 ('Aaron Hill',),
 ('Aaron Hughes',),
 ('Aaron Kwok',),
 ('Aaron Schneider',),
 ('Aaron Seltzer',),
 ('Aaron Stanford',),
 ('Aaron Staton',),
 ('Aaron Yoo',),
 ('Aasheekaa Bathija',),
 ('Aasif Mandvi',),
 ('Abbie Cornish',),
 ('Abby Elliott',),
 ('Abby Mukiibi Nkaaga',),
 ('Abel Ferrara',),
 ('Abhishek Bachchan',),
 ('Abigail Evans',),
 ('Abigail Spencer',),
 ('Abraham Benrubi',),
 ('Ace Marrero',),
 ('Adam Alexi-Malle',),
 ('Adam Arkin',),
 ('Adam Baldwin',),
 ('Adam Boyer',),
 ('Adam Brooks',),
 ('Adam Brown',),
 ('Adam Butcher',),
 ('Adam Carolla',),
 ('Adam Copeland',),
 ('Adam DiMarco',),
 ('Adam Garcia',),
 ('Adam Goldberg',),
 ('Adam Green',),
 ('Adam Hicks',),
 ('Adam Jay Epstein',),
 ('Adam Lamberg',),
 ('Adam LeFevre',),
 ('Adam Marcus',),
 ('Adam McKay',),
 ('Adam Rapp',),
 ('Adam Rat

In [12]:
# -- Select title and duration from films, sorted by duration from longest to shortest (DESC).
# fetchall() returns every matching row as a (title, duration) tuple.
cur.execute("Select title, duration FROM films ORDER BY duration DESC;").fetchall()

[('Carlos', 334),
 ('Blood In, Blood Out', 330),
 ("Heaven's Gate", 325),
 ('The Legend of Suriyothai', 300),
 ('Das Boot', 293),
 ('Apocalypse Now', 289),
 ('The Company', 286),
 ('Gods and Generals', 280),
 ('Gettysburg', 271),
 ('Arn: The Knight Templar', 270),
 ('Cleopatra', 251),
 ('Once Upon a Time in America', 251),
 ('Gandhi', 240),
 ('The Wolf of Wall Street', 240),
 ('Emma', 240),
 ('Dances with Wolves', 236),
 ('Lawrence of Arabia', 227),
 ('Gone with the Wind', 226),
 ('The Greatest Story Ever Told', 225),
 ('The Godfather: Part II', 220),
 ('All the Pretty Horses', 220),
 ('The Last Emperor', 219),
 ('Gangs of New York', 216),
 ('Woodstock', 215),
 ('The Thin Red Line', 215),
 ('Watchmen', 215),
 ('Wyatt Earp', 212),
 ('Nixon', 212),
 ('JFK', 206),
 ('Alexander', 206),
 ('Seven Samurai', 202),
 ('Malcolm X', 202),
 ('King Kong', 201),
 ('King Kong', 201),
 ('King Kong', 201),
 ('Doctor Zhivago', 200),
 ("It's a Mad, Mad, Mad, Mad World", 197),
 ('Deadline Gallipoli', 197),