In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings("ignore")

In [10]:
import sys
sys.path.append("../") 

from utils.paths import make_dir_line

modality = 'c'
project = 'Intermediate SQL'
data = make_dir_line(modality, project)

raw = data('raw')

In [11]:
import sqlite3

conn = sqlite3.connect(":memory:")  ## aca se indica el nombre de la db.
cur = conn.cursor()

In [12]:
conn.executescript(
    """
DROP TABLE IF EXISTS films;

CREATE TABLE films (
    id            INT,
    title         STRING,
    release_year  INT,
    country       STRING,
    duration      INT,
    language      STRING,
    certification STRING,
    gross         DOUBLE,
    budget        DOUBLE
);

DROP TABLE IF EXISTS people;

CREATE TABLE people (
    id        INT,
    name      STRING,
    birthdate STRING,
    deathdate STRING
);

DROP TABLE IF EXISTS reviews;

CREATE TABLE reviews (
    id             INT,
    film_id        INT,
    num_user       INT,
    num_critic     INT,
    imdb_score     DOUBLE,
    num_votes      DOUBLE,
    facebook_likes DOUBLE
);

DROP TABLE IF EXISTS roles;

CREATE TABLE roles (
    id        INT,
    film_id   INT,
    person_id INT,
    role      STRING
);

"""
)
conn.commit()

In [13]:
df = pd.read_csv(raw / 'films.csv', sep = ',', decimal = '.', header = None, encoding = 'utf-8')
df.columns = ['id','title','release_year','country','duration','language','certification','gross','budget']
films = list(zip(df.id, df.title, df.release_year, df.country, df.duration, df.language, df.certification, df.gross, df.budget))
cur.executemany("INSERT INTO films VALUES  (?,?,?,?,?,?,?,?,?)", films)

<sqlite3.Cursor at 0x7f15ebf264c0>

In [14]:
df = pd.read_csv(raw / 'people.csv', sep = ',', decimal = '.', header = None, encoding = 'utf-8')
df.columns = ['id','name','birthdate','deathdate']
people = list(zip(df.id, df.name, df.birthdate, df.deathdate))
cur.executemany("INSERT INTO people VALUES  (?,?,?,?)", people)

<sqlite3.Cursor at 0x7f15ebf264c0>

In [15]:
df = pd.read_csv(raw / 'reviews.csv', sep = ',', decimal = '.', header = 0, encoding = 'utf-8')
df.columns = ['id','film_id','num_user','num_critic','imdb_score','num_votes','facebook_likes']
df = df.loc[:,['id','film_id','num_user','num_critic','imdb_score','num_votes','facebook_likes']]
reviews = list(zip(df.id, df.film_id, df.num_user, df.num_critic, df.imdb_score, df.num_votes, df.facebook_likes))
cur.executemany("INSERT INTO reviews VALUES  (?,?,?,?,?,?,?)", reviews)

<sqlite3.Cursor at 0x7f15ebf264c0>

In [16]:
df = pd.read_csv(raw / 'roles.csv', sep = ',', decimal = '.', header = None, encoding = 'utf-8')
df.columns = ['id','film_id','person_id','role']
roles = list(zip(df.id, df.film_id, df.person_id, df.role))
cur.executemany("INSERT INTO roles VALUES  (?,?,?,?)", roles)

<sqlite3.Cursor at 0x7f15ebf264c0>

## 4.1.3 Practice with COUNT()

In [17]:
# -- Count the number of records in the people table
cur.execute("""
            
            SELECT COUNT(*) AS count_records 
            FROM people;
            
            """).fetchall()

[(8397,)]

In [18]:
# -- Count the number of birthdates in the people table
cur.execute("""
            
            SELECT COUNT(birthdate) AS count_birthdate 
            FROM people;
            
            """).fetchall()

[(6152,)]

In [19]:
# -- Count the languages and countries represented in the films table
cur.execute("""
            
            SELECT COUNT(language) AS count_languages, COUNT(country) AS count_countries 
            FROM films;
            
            """).fetchall()

[(4955, 4966)]

## 4.1.4 SELECT DISTINCT

In [20]:
# -- Return the unique countries from the films table
cur.execute("""
            
            SELECT DISTINCT country 
            FROM films;
            
            """).fetchall()

[('USA',),
 ('Germany',),
 ('Japan',),
 ('Denmark',),
 ('UK',),
 ('Italy',),
 ('France',),
 ('West Germany',),
 ('Sweden',),
 ('Soviet Union',),
 ('Iran',),
 ('Australia',),
 ('Libya',),
 ('Canada',),
 ('South Korea',),
 ('Brazil',),
 ('Netherlands',),
 ('China',),
 ('Norway',),
 ('Switzerland',),
 ('New Zealand',),
 ('Hong Kong',),
 ('Peru',),
 ('India',),
 ('Spain',),
 ('Aruba',),
 ('Mexico',),
 ('Czech Republic',),
 ('Taiwan',),
 ('Argentina',),
 ('Thailand',),
 ('New Line',),
 ('Afghanistan',),
 ('Russia',),
 ('Ireland',),
 ('Colombia',),
 ('Romania',),
 ('Philippines',),
 ('Hungary',),
 ('Cameroon',),
 ('South Africa',),
 ('Israel',),
 ('Poland',),
 ('Turkey',),
 ('Slovakia',),
 ('Greece',),
 ('Iceland',),
 ('Georgia',),
 ('Finland',),
 ('Belgium',),
 ('Indonesia',),
 ('Nigeria',),
 ('Dominican Republic',),
 ('United Arab Emirates',),
 ('Egypt',),
 ('Bulgaria',),
 (None,),
 ('Bahamas',),
 ('Cambodia',),
 ('Kyrgyzstan',),
 ('Kenya',),
 ('Slovenia',),
 ('Pakistan',),
 ('Chile',),
 (

In [21]:
# -- Count the distinct countries from the films table
cur.execute("""
            
            SELECT COUNT (DISTINCT country) AS count_distinct_countries 
            FROM films;
            
            """).fetchall()

[(64,)]

## 4.1.7 Debugging errors

In [22]:
# -- Debug this code
cur.execute("""
            
            SELECT certification 
            FROM films 
            LIMIT 5;
            
            """).fetchall()

[('Not Rated',), (None,), ('Not Rated',), ('Not Rated',), ('Not Rated',)]

In [23]:
# -- Debug this code
cur.execute("""
            
            SELECT film_id, imdb_score, num_votes 
            FROM reviews;
            
            """).fetchall()

[(3934, 7.0999999, 203461.0),
 (3405, 6.4000001, 149998.0),
 (478, 3.20000005, 8465.0),
 (74, 7.5999999, 7071.0),
 (1254, 8.0, 241030.0),
 (740, 6.4000001, 64742.0),
 (4841, 8.10000038, 479047.0),
 (2869, 6.80000019, 18442.0),
 (3252, 7.19999981, 49855.0),
 (1181, 7.30000019, 16995.0),
 (2020, 6.69999981, 91092.0),
 (4152, 5.9000001, 108242.0),
 (3220, 6.19999981, 15780.0),
 (2312, 5.80000019, 63912.0),
 (1820, 7.0, 8535.0),
 (718, 6.4000001, 76850.0),
 (831, 6.5999999, 40126.0),
 (1231, 6.5, 26034.0),
 (1746, 6.5999999, 17261.0),
 (3508, 5.30000019, 8598.0),
 (1621, 6.0, 44913.0),
 (3197, 4.80000019, 33088.0),
 (4464, 4.80000019, 241.0),
 (3929, 7.0999999, 774.0),
 (2183, 4.5, 3119.0),
 (3298, 7.4000001, 6091.0),
 (2392, 5.0, 51252.0),
 (978, 6.0999999, 22748.0),
 (3176, 6.5999999, 55597.0),
 (272, 6.0999999, 18140.0),
 (2414, 6.30000019, 10052.0),
 (2873, 5.9000001, 8867.0),
 (2744, 7.4000001, 46076.0),
 (4013, 5.4000001, 133.0),
 (4707, 7.4000001, 232187.0),
 (4592, 5.30000019, 5693

In [24]:
# -- Debug this code
cur.execute("""
            
            SELECT COUNT(birthdate) AS count_birthdays 
            FROM people;
            
            """).fetchall()

[(6152,)]

## 4.1.10 Formatting

In [25]:
# -- Rewrite this query
cur.execute("""
            
            SELECT COUNT(birthdate) AS count_birthdays 
            FROM people;
            
            """).fetchall()

[(6152,)]

In [26]:
print('Ok_')

Ok_
