## hollywood_network/hollywood_network.ipynb

External Dependencies:
- IMDb Non-Commercial Datasets

Returns:
- hollywood_network.html: An HTML file that displays an interactive network graph

In [1]:
import pandas as pd
import numpy as np
import pickle

import pypyodbc as podbc
from pyvis.network import Network

import warnings

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Connection settings for the SQL Server instance that holds the IMDb tables.
DRIVER_NAME = "SQL SERVER"
# Raw string: the backslash between host and instance name must stay literal
# ("\S" is an invalid escape sequence in a normal string literal).
SERVER_NAME = r"NicksComputer\SQLEXPRESS"
DATABASE_NAME = "movie_recsys"

# Trusted_Connection is the ODBC keyword for Windows integrated authentication;
# the previous "Trust_Connection" spelling is not a recognised keyword.
connection_string = f"""
DRIVER={{{DRIVER_NAME}}};
SERVER={SERVER_NAME};
DATABASE={DATABASE_NAME};
Trusted_Connection=yes;
"""

# Shared database connection used by every query cell below.
conn = podbc.connect(connection_string)

## Print top 5 rows from each table

In [3]:
# Display label -> table name for every table previewed below.
# A single mapping replaces six copy-pasted queries and two parallel lists.
preview_tables = {
    "Name Basics": "nameBasics",
    "Title Basics": "titleBasics",
    "Title Crew": "titleCrew",
    "Title Episodes": "titleEpisode",
    "Title Principals": "titlePrincipals",
    "Title Ratings": "titleRatings",
}

# Kept as lists under their original names so later cells can still use them.
table_str = list(preview_tables)
sql = [f"SELECT TOP(5) * FROM {table_name}" for table_name in preview_tables.values()]

# Print each table's label followed by its first five rows as a markdown table.
for table_label, preview_query in zip(table_str, sql):
    print(table_label)
    print(pd.read_sql_query(preview_query, conn).to_markdown() + "\n")

Name Basics
|    | nconst    | primaryname   |   birthyear |   deathyear | primaryprofession             | knownfortitles                          |
|---:|:----------|:--------------|------------:|------------:|:------------------------------|:----------------------------------------|
|  0 | nm0649057 | Wim Opbrouck  |        1969 |           0 | actor,writer                  | tt0275154,tt5258726,tt3892822,tt0126764 |
|  1 | nm0649059 | Carina Opdam  |           0 |           0 | art_director,costume_designer | tt0138207                               |
|  2 | nm0649060 | Travis Opdyke |           0 |           0 | actor                         | tt0165833                               |
|  3 | nm0649061 | Opec          |           0 |           0 | actress                       | tt0181377                               |
|  4 | nm0649062 | Nancy Opel    |           0 |           0 | actress,costume_designer      | tt1515193,tt0203259,tt0266747,tt0119349 |

Title Basics
|    | tconst  

## Query some basic information from the database

In [None]:
# The name is defined once so the printed question and the query cannot drift
# apart (the query previously searched for a different person than the one
# named in the question).
person_name = "Kate Winslet"
print(f"Is {person_name} in the database?")

# The name is bound as a query parameter ("?" placeholder) instead of being
# pasted into the SQL text.
sql = """
SELECT TOP(10) *
FROM nameBasics
WHERE primaryName LIKE ?
"""
pd.read_sql_query(sql, conn, params=[person_name])

In [None]:
# The name is defined once and used for both the printed question and the
# search pattern; the previous pattern '%%' contained no name at all and
# therefore matched every row.
person_name = "Morgan Freeman"
print(f"Is {person_name} in the database?")

# Substring match: the wildcards are added around the bound parameter.
sql = """
SELECT TOP(10) *
FROM nameBasics
WHERE primaryName LIKE ?
"""

pd.read_sql_query(sql, conn, params=[f"%{person_name}%"]).head()

In [None]:
print("Is King of Queens in the database?")

# Substring search on the title; at most one matching row is requested.
# NOTE(review): the lowercase pattern presumably relies on a case-insensitive
# collation to match "The King of Queens" - confirm against the database.
sql = """
SELECT TOP(1) tconst, titleType, primaryTitle
FROM titleBasics
WHERE primaryTitle LIKE '%king of queens%'
"""

king_of_queens_matches = pd.read_sql_query(sql, conn)
king_of_queens_matches.head()

In [None]:
print("Are there any duplicate actors/actresses in the database?")

# Count how many nameBasics rows share each primaryName among people whose
# profession list mentions acting, most frequent names first.
# GROUP BY already yields one row per name, so DISTINCT is not needed, and
# TOP(50) lets the server return only the rows that are displayed instead of
# shipping the whole grouped result to pandas.
sql = """
SELECT TOP(50) primaryName, COUNT(primaryName) AS countActor
FROM nameBasics
WHERE primaryProfession LIKE '%actor%' OR
      primaryProfession LIKE '%actress%'
GROUP BY primaryName
ORDER BY countActor DESC
"""
print(pd.read_sql_query(sql, conn).to_markdown())

In [None]:
question = "What is the longest running show on television? sort by seasonNumber and then episode number"
print(question)

# Per title: highest season number (SN), highest episode number (EN) and their
# product (EpCount); only titles whose product exceeds 8000 are kept and the
# five largest are returned.
# NOTE(review): EpCount is max(season) * max(episode), not a count of episode
# rows - confirm this estimate is what is wanted.
# NOTE(review): grouping by primaryTitle would merge different shows that share
# a title - confirm whether grouping by tconst was intended.
sql = """
SELECT TOP(5) tB.primaryTitle, MAX(tE.seasonNumber) as SN, MAX(tE.episodeNumber) as EN,  
    MAX(tE.episodeNumber)*MAX(tE.seasonNumber) AS EpCount
FROM titleEpisode AS tE INNER JOIN titleBasics as tB ON tE.parentTconst=tB.tconst
GROUP BY tB.primaryTitle
HAVING MAX(tE.episodeNumber)*MAX(tE.seasonNumber) > 8000
ORDER BY EpCount DESC
"""
longest_running = pd.read_sql_query(sql, conn)
longest_running.head()

In [None]:
print("IMDB top 50")

# Movies with more than 50,000 votes, best rated first (ties broken by vote
# count). TOP(50) makes the server return only the rows that are displayed
# rather than every qualifying movie.
# numVotes is cast in the filter as well, to match the cast already used in
# the ORDER BY clause.
sql = """
SELECT TOP(50) tb.primaryTitle, tr.averageRating, tr.numVotes, tb.titleType
FROM titleRatings AS tr JOIN titleBasics AS tb ON tr.tconst = tb.tconst
WHERE CAST(tr.numVotes AS INT) > 50000 AND tb.titleType LIKE 'movie'
ORDER BY CAST(tr.averageRating AS FLOAT) DESC, CAST(tr.numVotes AS INT) DESC
"""

print(pd.read_sql_query(sql, conn).to_markdown())

## Create actor-actor network

In [None]:
print("Which two actors have worked together the most")
print(
    "Takes 25 minutes to execute All rows from CTE0 (~7million total) yields 21,565,150 Rows"
)

# CTE0: principals of movies / TV movies / videos after 1950, non-adult,
#       longer than 20 minutes.
# CTE1: the same rows with the person's primaryName attached.
# CTE2: every unordered pair of different names on the same title
#       (name1 < name2 keeps each pair once).
# Final select: number of shared titles per pair, most frequent first.
# NOTE(review): pairs are built on primaryName, so different people who share
# a name are counted as one person - confirm this is acceptable.
sql = """
With CTE0 as (
	SELECT tp.tconst, tp.nconst 
	FROM titlePrincipals as tp JOIN titleBasics as tb on tb.tconst = tp.tconst
	WHERE (tb.titleType LIKE 'movie' OR tb.titleType LIKE 'tvMovie' OR tb.titleType LIKE 'video') 
	AND tb.startYear > 1950 AND tb.isAdult = 0 AND tb.runtimeMinutes > 20
),
CTE1 as (
	SELECT tp.tconst as tconst, nb.primaryName as primaryName
	FROM CTE0 as tp JOIN nameBasics as nb on nb.nconst = tp.nconst
),
CTE2 AS (
	SELECT c1.primaryName as name1, c2.primaryName as name2
	FROM CTE1 as c1 JOIN CTE1 as c2 on c1.tconst=c2.tconst
	WHERE c1.primaryName != c2.primaryName AND c1.primaryName < c2.primaryName 
)
	SELECT name1, name2, COUNT(name1) as num_appearances
	FROM CTE2
	GROUP BY name1, name2
    ORDER BY num_appearances DESC
    """

# Reuse the cached result when it exists; otherwise run the slow query and
# write the cache. Previously the write was commented out, so the later
# read of "df_network.csv" failed on a fresh run.
# Delete df_network.csv to force the query to run again.
try:
    df_network = pd.read_csv("df_network.csv", index_col=[0])
except FileNotFoundError:
    df_network = pd.read_sql_query(sql, conn)
    df_network.to_csv("df_network.csv")

print(df_network.head(50).to_markdown())

In [2]:
# Load the actor-pair counts from the CSV file; the first CSV column is used
# as the row index.
network_csv_path = "df_network.csv"
df_network = pd.read_csv(network_csv_path, index_col=[0])

In [54]:
# Work on a relabelled copy so df_network itself stays untouched:
# the three columns become the edge-list fields source / target / weight.
df = df_network.set_axis(["source", "target", "weight"], axis="columns")

In [55]:
# Load the list of top 500 hollywood celebs (as generated by ChatGPT).
# NOTE(review): the file is unpickled despite its .txt name; pickle.load can
# execute arbitrary code, so only use a file from a trusted source.
with open("hollywood_celebs.txt", "rb") as celeb_file:
    hollywood_celebs = pickle.load(celeb_file)

# Keep an edge only when both of its endpoints are in the celeb list.
source_is_celeb = df["source"].isin(hollywood_celebs)
target_is_celeb = df["target"].isin(hollywood_celebs)
df = df[source_is_celeb & target_is_celeb]

In [56]:
# NOTE: this cell rebinds df, so it is not idempotent - rerun from the cell
# that rebuilds df from df_network before running it again.

# drop edges with only one connection
df = df[df["weight"] > 1]

# keep only celebs that take part in more than nkeep edges
# (strictly greater than, matching the comparison below)
nkeep = 5
num_edges_per_celeb = pd.concat(
    [df["source"], df["target"]], ignore_index=True
).value_counts()
num_edges_per_celeb = num_edges_per_celeb[num_edges_per_celeb > nkeep]

# get list of celeb_names
celeb_names = num_edges_per_celeb.index
num_celebs = len(celeb_names)

# map actor names to a unique integer
celeb_encoding = list(range(num_celebs))
map_key = dict(zip(celeb_names, celeb_encoding))

# assign() builds a new frame instead of writing columns into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
# Names that were filtered out map to NaN and those edges are dropped; the
# surviving ids are therefore stored as floats.
df = df.assign(
    source=df["source"].map(map_key),
    target=df["target"].map(map_key),
).dropna()

In [57]:
# form edges - tuples with entries (source, target, weight)
# Iterating the three columns together replaces the per-row df.iloc[j][...]
# lookups, which rebuilt a row Series three to four times per edge.
# Node ids are emitted as native ints so they match the integer node ids in
# celeb_encoding exactly; the weight stays a float, as before.
edges = [
    (int(source_id), int(target_id), float(edge_weight))
    for source_id, target_id, edge_weight in zip(
        df["source"], df["target"], df["weight"]
    )
    if edge_weight > 1
]

In [58]:
# Node size is proportional to the total weight of all edges touching a celeb,
# scaled so that the best-connected celeb gets size 120.
node_size = np.array(
    [
        df.loc[(df["target"] == celeb_id) | (df["source"] == celeb_id), "weight"].sum()
        for celeb_id in range(num_celebs)
    ]
)
node_size = 120 * node_size / node_size.max()

In [59]:
# Build the interactive graph: dark background, white labels, full-width canvas.
net = Network(
    height="1000px",
    width="100%",
    bgcolor="#222222",
    font_color="white",
    notebook=True,
    cdn_resources="remote",
    select_menu=True,
    filter_menu=True,
)

# One node per encoded celeb (labelled with the celeb's name, sized by
# node_size), then the weighted edges between them.
net.add_nodes(celeb_encoding, label=celeb_names[celeb_encoding], size=node_size)
net.add_edges(edges)

# Layout settings and output file.
net.repulsion(node_distance=2400, spring_length=400)
net.show_buttons()
net.show("hollywood_network.html")

# Report how many nodes and edges ended up in the graph.
node_count, edge_count = len(celeb_encoding), len(edges)
print(node_count, edge_count)

hollywood_network.html
182 755
