In [1]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
import pymysql
pymysql.install_as_MySQLdb()

In [2]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from urllib.parse import quote_plus
from sqlalchemy import text
import json

In [3]:
# Confirm folder was created and files added successfully
os.listdir("Data/")


['.ipynb_checkpoints',
 'basics.csv',
 'IMDB Movie Dataset Info.docx',
 'ratings.csv',
 'title-akas-us-only.csv',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz',
 'tmdb_api_results_2010.json']

## Importing title.akas

In [4]:
akas = pd.read_csv('Data/title-akas-us-only.csv', low_memory=False)



In [5]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452564 non-null  object
 3   region           1452564 non-null  object
 4   language         1452564 non-null  object
 5   types            1452564 non-null  object
 6   attributes       1452564 non-null  object
 7   isOriginalTitle  1452564 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


## Importing title.basics

In [6]:
basics = pd.read_csv('Data/title.basics.tsv.gz', sep='\t', low_memory=False)


In [7]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10017011 entries, 0 to 10017010
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 687.8+ MB


In [8]:
# Filter the basics table down to only include the US by using the filter akas dataframe
filter_us_titles = basics['tconst'].isin(akas['titleId'])
basics = basics[filter_us_titles]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016872,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10016901,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10016939,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10016962,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


In [9]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"


## Preprocessing steps for title basics

In [10]:
## changing \N to actual Nan values

In [11]:
basics = basics.replace({'\\N':np.nan})

In [12]:
## dropping Nan's from runtimeMinutes and genres columns

In [13]:
basics.dropna(subset=['runtimeMinutes', 'genres'], inplace=True)

In [14]:
## creating filters to keep only movie in titletype, and remove documentary from genres

In [15]:
filter_titletype = basics['titleType'].str.contains('movie')
filter_documentaries = basics['genres'].str.contains('Documentary')

In [16]:
basics = basics[~filter_documentaries]

In [17]:
basics = basics[filter_titletype]

  basics = basics[filter_titletype]


In [18]:
# verifying changes

In [19]:
basics['genres'].value_counts()

Drama                        30948
Comedy                       13170
Comedy,Drama                  6332
Horror                        5535
Drama,Romance                 5479
                             ...  
Action,Mystery,War               1
Animation,Musical,Romance        1
Drama,News,Thriller              1
Fantasy,Music,Mystery            1
Biography,Fantasy,Musical        1
Name: genres, Length: 1058, dtype: int64

In [20]:
basics['titleType'].value_counts()

movie    162615
Name: titleType, dtype: int64

In [21]:
# change startYear dtype to float

In [22]:
basics['startYear'] = basics['startYear'].astype(float)

In [23]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162615 entries, 8 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          162615 non-null  object 
 1   titleType       162615 non-null  object 
 2   primaryTitle    162615 non-null  object 
 3   originalTitle   162615 non-null  object 
 4   isAdult         162615 non-null  object 
 5   startYear       159998 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  162615 non-null  object 
 8   genres          162615 non-null  object 
dtypes: float64(1), object(8)
memory usage: 12.4+ MB


In [24]:
# filter to only keep movies between startyear 2000 and 2022

In [25]:
filter_startyear = (basics['startYear'] >=2000.0) & (basics['startYear'] <=2022.0)

In [26]:
basics = basics[filter_startyear]

In [27]:
## final info check to view rows count

In [28]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86979 entries, 34802 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86979 non-null  object 
 3   originalTitle   86979 non-null  object 
 4   isAdult         86979 non-null  object 
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  86979 non-null  object 
 8   genres          86979 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.6+ MB


In [29]:
## export dataframe to csv in data folder

In [30]:
basics.to_csv(r'Data\basics.csv')

## Load and filter the title ratings file

In [31]:
ratings = pd.read_csv('Data/title.ratings.tsv.gz', sep='\t', low_memory=False)

In [32]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1331492 entries, 0 to 1331491
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1331492 non-null  object 
 1   averageRating  1331492 non-null  float64
 2   numVotes       1331492 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.5+ MB


In [33]:
filter_basics = ratings['tconst'].isin(basics['tconst'])
ratings = ratings[filter_basics]


In [34]:
ratings = ratings.replace({'\\N':np.nan})

In [35]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71900 entries, 17961 to 1331462
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         71900 non-null  object 
 1   averageRating  71900 non-null  float64
 2   numVotes       71900 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.2+ MB


In [36]:
ratings.to_csv(r'Data\ratings.csv')

## Part 2

### Hiding MySQL password

In [37]:
with open('/Users/Admin/.secret/sql_password.json') as f:
    login = json.load(f)
login.keys()

dict_keys(['password'])

### Setting up connection and engine

In [38]:
username = 'root'
password = login['password']
db_name = 'movie'
connection = f'mysql+pymysql://{username}:{password}@localhost/{db_name}'
engine = create_engine(connection)
conn = engine.connect()

In [39]:
q = """SHOW TABLES;"""
pd.read_sql(q, conn)

Unnamed: 0,Tables_in_movie
0,genres
1,ratings
2,title_basics
3,title_genres


In [40]:
q = '''DESCRIBE title_basics;'''
describe_genres = pd.read_sql(q, conn)
describe_genres

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,char(10),NO,PRI,,
1,primary_title,varchar(250),YES,,,
2,start_year,float,YES,,,
3,created_on,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED
4,updated_on,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP
5,runtime,int,YES,,,


In [41]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86979 entries, 34802 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86979 non-null  object 
 3   originalTitle   86979 non-null  object 
 4   isAdult         86979 non-null  object 
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  86979 non-null  object 
 8   genres          86979 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.6+ MB


### Verifying lengths of features for SQL Script

In [42]:
basics['tconst'].map(len).max()

10

In [43]:
basics['primaryTitle'].map(len).max()

242

In [44]:
basics['genres'].map(len).max()

29

In [45]:
rename_map = {'runtimeMinutes':'runtime',
              'primaryTitle':'primary_title',
              'startYear':'start_year',}

In [46]:
basics = basics.rename(rename_map,axis=1)

In [47]:
basics.duplicated().sum()

0

In [48]:
basics = basics.drop(columns='titleType')
basics = basics.drop(columns='originalTitle')
basics = basics.drop(columns='isAdult')
basics = basics.drop(columns='endYear')
basics = basics.drop(columns='genres')

In [49]:
q = '''DESCRIBE ratings;'''
describe_ratings = pd.read_sql(q, conn)
describe_ratings

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,char(10),NO,,,
1,number_of_votes,int,YES,,,
2,created_on,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED
3,updated_on,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP
4,average_rating,float,YES,,,


In [50]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71900 entries, 17961 to 1331462
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         71900 non-null  object 
 1   averageRating  71900 non-null  float64
 2   numVotes       71900 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.2+ MB


In [51]:
ratings.duplicated().sum()

0

In [52]:
ratings['tconst'].value_counts()

tt0035423     1
tt3322940     1
tt3324494     1
tt3323940     1
tt3323920     1
             ..
tt13356020    1
tt13356884    1
tt13357042    1
tt13357236    1
tt9916362     1
Name: tconst, Length: 71900, dtype: int64

In [53]:
rename_map_2 = {'averageRating':'average_rating',
              'numVotes':'number_of_votes'}

In [54]:
ratings = ratings.rename(rename_map_2,axis=1)

In [55]:
ratings.head()

Unnamed: 0,tconst,average_rating,number_of_votes
17961,tt0035423,6.4,87153
40764,tt0062336,6.4,175
46645,tt0069049,6.7,7754
63640,tt0088751,5.2,336
69953,tt0096056,5.6,846


In [56]:
basics.head()

Unnamed: 0,tconst,primary_title,start_year,runtime
34802,tt0035423,Kate & Leopold,2001.0,118
61114,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70
67666,tt0069049,The Other Side of the Wind,2018.0,122
86793,tt0088751,The Naked Monster,2005.0,100
93930,tt0096056,Crime and Punishment,2002.0,126


In [57]:
basics.loc[34802]

tconst                tt0035423
primary_title    Kate & Leopold
start_year               2001.0
runtime                     118
Name: 34802, dtype: object

### Filling SQL Tables with DataFrames

In [58]:
basics.to_sql("title_basics",conn,index=False, if_exists='append')

86979

In [59]:
ratings.to_sql("ratings",conn,index=False, if_exists='append')

71900

In [60]:
q = """SHOW TABLES"""
pd.read_sql(q, conn)

Unnamed: 0,Tables_in_movie
0,genres
1,ratings
2,title_basics
3,title_genres


In [61]:
q = '''DESCRIBE ratings;'''
describe_ratings = pd.read_sql(q, conn)
describe_ratings

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,char(10),NO,,,
1,number_of_votes,int,YES,,,
2,created_on,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED
3,updated_on,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP
4,average_rating,float,YES,,,


In [62]:
q = '''DESCRIBE title_basics;'''
describe_ratings = pd.read_sql(q, conn)
describe_ratings

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,char(10),NO,PRI,,
1,primary_title,varchar(250),YES,,,
2,start_year,float,YES,,,
3,created_on,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED
4,updated_on,datetime,YES,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP
5,runtime,int,YES,,,


In [63]:
q = '''SELECT * FROM ratings;'''
describe_ratings = pd.read_sql(q, conn)
describe_ratings

Unnamed: 0,tconst,number_of_votes,created_on,updated_on,average_rating
0,tt0035423,87153,2023-10-05 18:18:13,2023-10-05 18:18:13,6.4
1,tt0062336,175,2023-10-05 18:18:13,2023-10-05 18:18:13,6.4
2,tt0069049,7754,2023-10-05 18:18:13,2023-10-05 18:18:13,6.7
3,tt0088751,336,2023-10-05 18:18:13,2023-10-05 18:18:13,5.2
4,tt0096056,846,2023-10-05 18:18:13,2023-10-05 18:18:13,5.6
...,...,...,...,...,...
71895,tt9914942,178,2023-10-05 18:18:16,2023-10-05 18:18:16,6.6
71896,tt9915872,9,2023-10-05 18:18:16,2023-10-05 18:18:16,6.4
71897,tt9916170,7,2023-10-05 18:18:16,2023-10-05 18:18:16,7.0
71898,tt9916190,243,2023-10-05 18:18:16,2023-10-05 18:18:16,3.7


In [64]:
q = '''SELECT * FROM title_basics;'''
describe_ratings = pd.read_sql(q, conn)
describe_ratings

Unnamed: 0,tconst,primary_title,start_year,created_on,updated_on,runtime
0,tt0035423,Kate & Leopold,2001.0,2023-10-05 18:18:08,2023-10-05 18:18:08,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,2023-10-05 18:18:08,2023-10-05 18:18:08,70
2,tt0069049,The Other Side of the Wind,2018.0,2023-10-05 18:18:08,2023-10-05 18:18:08,122
3,tt0088751,The Naked Monster,2005.0,2023-10-05 18:18:08,2023-10-05 18:18:08,100
4,tt0096056,Crime and Punishment,2002.0,2023-10-05 18:18:08,2023-10-05 18:18:08,126
...,...,...,...,...,...,...
86974,tt9914942,Life Without Sara Amat,2019.0,2023-10-05 18:18:11,2023-10-05 18:18:11,74
86975,tt9915872,The Last White Witch,2019.0,2023-10-05 18:18:11,2023-10-05 18:18:11,97
86976,tt9916170,The Rehearsal,2019.0,2023-10-05 18:18:11,2023-10-05 18:18:11,51
86977,tt9916190,Safeguard,2020.0,2023-10-05 18:18:11,2023-10-05 18:18:11,95


### Test Query for Top 10 Moves With Most Votes

In [65]:
q = """SELECT primary_title, number_of_votes
FROM title_basics
JOIN ratings 
ON title_basics.tconst = ratings.tconst
WHERE number_of_votes > 500000
ORDER BY number_of_votes DESC
LIMIT 10"""
pd.read_sql(q,conn)

Unnamed: 0,primary_title,number_of_votes
0,The Dark Knight,2741213
1,Inception,2432540
2,Interstellar,1938559
3,The Lord of the Rings: The Fellowship of the Ring,1927480
4,The Lord of the Rings: The Return of the King,1899060
5,The Dark Knight Rises,1754338
6,The Lord of the Rings: The Two Towers,1713825
7,Django Unchained,1613387
8,Gladiator,1546218
9,Batman Begins,1514561


In [75]:
basics.to_csv(r'MovieData\title-basics.csv')

In [76]:
ratings.to_csv(r'MovieData\ratings.csv')

# Part 3

In [66]:
import matplotlib.pyplot as plt
import seaborn as sns
import os, json, time
from tqdm.notebook import tqdm_notebook

## Setting up API

In [68]:
import json
with open('/Users/Admin/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['API Key'])

In [69]:
# Importing tmdbsimple and setting the API_KEY
import tmdbsimple as tmdb
tmdb.API_KEY =  login['API Key']

In [70]:
# Create the folder for saving files (if it doesn't exist)
FOLDER = "MovieData/"
os.makedirs(FOLDER, exist_ok=True)

# Show the list of files included in the folder
sorted(os.listdir(FOLDER))

[]

## Defining Functions

In [122]:
def get_movie_with_rating(movie_id):
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    
    # save the .info .releases dictionaries
    movie_info = movie.info()
    releases = movie.releases()
    
    # Loop through countries in releases
    for c in releases['countries']:
        # if the country abbreviation==US
        if c['iso_3166_1' ] =='US':
            ## save a "certification" key in the info dict with the certification
            movie_info['certification'] = c['certification']
    return movie_info


def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [78]:
YEARS_TO_GET = [2001,2002]
errors = [ ]

In [123]:
# Define the JSON file to store results for the year
JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'


# Check if the JSON file exists
file_exists = os.path.isfile(JSON_FILE)

# If it does not exist: create it
if file_exists == False:
    print(f"Creating {JSON_FILE} for API results for year={YEAR}.")
    
    # save an empty dict with just "imdb_id" to the new json file.
    with open(JSON_FILE,'w') as f:
        json.dump([{'imdb_id':0}],f)

# If it exists, print a message
else:
    print(f'The file {JSON_FILE} already exists.')

Creating MovieData/tmdb_api_results_2001.json for API results for year=2001.


In [93]:
# Filtering for movies from selected startYear
df = basics.loc[ basics['startYear']==2001].copy()
# saving movie ids to list
movie_ids = df['tconst']
movie_ids.head()

0     tt0035423
10    tt0114447
14    tt0116916
18    tt0118589
19    tt0118652
Name: tconst, dtype: object

In [94]:
# Filtering for movies from selected startYear
df = basics.loc[ basics['startYear']==2002].copy()
# saving movie ids to list
movie_ids = df['tconst']
movie_ids.head()

4     tt0096056
23    tt0118926
32    tt0119980
44    tt0120679
51    tt0120804
Name: tconst, dtype: object

In [95]:
# Load existing data from json into a dataframe called "previous_df"
previous_df = pd.read_json(JSON_FILE)
previous_df

Unnamed: 0,imdb_id
0,0


In [96]:
# filter out any ids that are already in the JSON_FILE
movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

In [None]:
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    # Loop through movie_ids_to_get with a tqdm progress bar
    for movie_id in tqdm_notebook(movie_ids_to_get, f"Movies from {YEAR}"):

    # Attempt to retrieve then data for the movie id
        try:
            temp = get_movie_with_rating(movie_id) 
        # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
        # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)

    # If it fails,  make a dict with just the id and None for certification.
        except Exception as e:
            errors.append([movie_id, e])

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/1572 [00:00<?, ?it/s]

In [98]:
print(f"- Total errors: {len(errors)}")

- Total errors: 283


In [104]:
csv_fname = f"{FOLDER}final_tmdb_data_{YEARS_TO_GET}.csv.gz"
final_year_df.to_csv(csv_fname, compression="gzip", index=False)

In [102]:
import glob
# Use glob to get all filepaths that match the pattern (*=wildcard)
tmdb_files = sorted(glob.glob("Data/final_tmdb_data*.csv.gz"))
tmdb_files


[]