In [27]:
import sqlite3
import os
import pandas as pd
import numpy as np

In [51]:
DB_FILE = 'netflix.db'
# Check for the file *before* connecting: sqlite3.connect() creates a missing
# database file, so testing afterwards would always report that it exists.
db_is_new = not os.path.exists(DB_FILE)
# make the database connection unconditionally; the cells below use `dbcon`
# whether or not the database file was already present
dbcon = sqlite3.connect(DB_FILE)
# if DB file does not exist use the csv data to populate it
if db_is_new:
    titles = pd.read_csv('titles.csv')
    # SQL for empty tables with the correct relationships for population with the csv data.
    # NOTE: these statements are only defined in this cell; nothing here executes them.
    # Scores and popularity are declared REAL because the data holds fractional
    # values (e.g. 7.2, 1.736).
    create_titles = '''CREATE TABLE IF NOT EXISTS titles (
                                            id TEXT PRIMARY KEY,
                                            title TEXT,
                                            type TEXT,
                                            description TEXT,
                                            release_year INTEGER,
                                            age_certification TEXT,
                                            runtime INTEGER,
                                            genres TEXT,
                                            production_countries TEXT,
                                            seasons INTEGER,
                                            imdb_id TEXT,
                                            imdb_score REAL,
                                            imdb_votes INTEGER,
                                            tmdb_popularity REAL,
                                            tmdb_score REAL
                                        );'''
    create_people = '''CREATE TABLE IF NOT EXISTS people (
                                            person_id INTEGER PRIMARY KEY,
                                            name TEXT
                                        );'''
    # Column definitions and table constraints must be comma-separated, and the
    # first foreign key has to point at the `titles` table defined above.
    create_role = '''CREATE TABLE IF NOT EXISTS roles (
                                            id TEXT NOT NULL,
                                            person_id INTEGER NOT NULL,
                                            character TEXT,
                                            role TEXT,
                                            FOREIGN KEY (id) REFERENCES titles(id),
                                            FOREIGN KEY (person_id) REFERENCES people(person_id)
                                        );'''
    

Index(['id', 'title', 'type', 'description', 'release_year',
       'age_certification', 'runtime', 'genres', 'production_countries',
       'seasons', 'imdb_id', 'imdb_score', 'imdb_votes', 'tmdb_popularity',
       'tmdb_score'],
      dtype='object')

In [47]:


# use pandas to read the title data for easy storage into the SQLite db
titles = pd.read_csv('titles.csv')
try:
    titles.to_sql('titles', con=dbcon, if_exists='fail', index=False, dtype={'id': 'TEXT PRIMARY KEY'})
except ValueError:
    # with if_exists='fail', to_sql raises ValueError when the table is already there
    print('The titles table already exists...')

# The credits data is a little harder, since it makes more sense to have a
# relation table referencing the actor/director in a title, rather than a
# single table with duplicated people.
credits = pd.read_csv('credits.csv')
# the IN relationship describes each person's role in a title
in_relation = credits[['person_id', 'id', 'character', 'role']]
# the new people table is a table of each unique person; de-duplicate on the
# key column itself so the PRIMARY KEY declared below cannot be violated
people = (
    credits
    .drop(columns=['id', 'character', 'role'])
    .drop_duplicates(subset='person_id', keep='first')
)
print(in_relation)
# add both these new tables to the database
try:
    people.to_sql('people', con=dbcon, if_exists='fail', index=False, dtype={'person_id': 'INTEGER PRIMARY KEY'})
    # No PRIMARY KEY on person_id here: the same person appears once per credit,
    # so person_id is not unique in this relation and a key constraint would
    # make the insert fail.
    # NOTE(review): this table is written as `role`, while the schema string in
    # the setup cell and the message below say `roles` - confirm the intended name.
    in_relation.to_sql('role', con=dbcon, if_exists='fail', index=False)
except ValueError:
    print('The people and roles tables already exist...')

The TITLES table already exists...
       person_id         id                character      role
0           3748    tm84618            Travis Bickle     ACTOR
1          14658    tm84618            Iris Steensma     ACTOR
2           7064    tm84618                      Tom     ACTOR
3           3739    tm84618  Matthew 'Sport' Higgins     ACTOR
4          48933    tm84618                    Betsy     ACTOR
...          ...        ...                      ...       ...
77208    1347054  tm1098060                   Afinni     ACTOR
77209     157590  tm1098060           Dr. Ian Stones     ACTOR
77210     129059  tm1098060                     Jane     ACTOR
77211    2050199  tm1098060               Nurse Titi     ACTOR
77212    2050423  tm1098060                      NaN  DIRECTOR

[77213 rows x 4 columns]


In [34]:
# Read every row of the titles table, then display the row at position 23
# as a spot check of what was stored.
cur = dbcon.cursor()
title_rows = cur.execute('''SELECT * FROM TITLES''').fetchall()
title_rows[23]

('tm14350',
 'Alexandria… Why?',
 'MOVIE',
 'Set against the panoramic backdrop of war-torn Egypt, director Youssef Chahine tells a highly personal tale of love and determination. Amid the poverty, death and suffering caused by World War II, 18 year-old Yehia, retreats into a private world of fantasy and longing. Obsessed with Hollywood, he dreams of one day studying filmmaking in America, but after falling in love and discovering the lies of European occupation, Yehia profoundly reevaluates his identity and allegiances.',
 1979,
 None,
 133,
 "['drama']",
 "['EG']",
 None,
 'tt0077751',
 7.2,
 1689.0,
 1.736,
 6.0)

In [20]:
# Rows whose person_id has already appeared earlier in the credits frame
repeat_mask = credits.duplicated(subset=['person_id'])
dupes = credits.loc[repeat_mask]
# Print a slice of the raw credits rows (positions 70-89) for inspection
sample_rows = credits.iloc[70:90]
print(sample_rows)

    person_id        id             name  \
70    1464056  tm127384    Gloria Graham   
71    1671807  tm127384    Sylvia Taylor   
72      11490  tm127384     Julian Doyle   
73      84047  tm127384  Margarita Doyle   
74    1463016  tm127384     Zack Matalon   
75    1463018  tm127384       Scott Mike   
76    1463021  tm127384    William Palin   
77      11475  tm127384      Terry Jones   
78     919694  tm127384      Tom Raeburn   
79     400172  tm127384       Brian Ross   
80      14908  tm127384  Roy Forge Smith   
81       8327  tm127384    John Thornton   
82     156072  tm127384    Maggie Weston   
83      11475  tm127384      Terry Jones   
84      11473  tm127384    Terry Gilliam   
85      11472   tm70993   Graham Chapman   
86       1549   tm70993      John Cleese   
87      11473   tm70993    Terry Gilliam   
88      11474   tm70993        Eric Idle   
89      11475   tm70993      Terry Jones   

                                            character      role  
70       