In [17]:
import pandas as pd
from imdb import Cinemagoer
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt


#### Preprocessing basics

In [11]:
basics_df = pd.read_csv("data/title.basics.tsv", sep='\t')
basics_df.head()

  basics_df = pd.read_csv("data/title.basics.tsv", sep='\t')


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [12]:
basics_df = basics_df[basics_df['titleType'] == "movie"]
basics_df = basics_df[basics_df['isAdult'] == 0]
basics_df['tconst'] = basics_df['tconst'].apply(lambda x: x[2:])
basics_df = basics_df[['tconst', 'primaryTitle', 'startYear', 'runtimeMinutes', 'genres']]
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 626775 entries, 8 to 9699537
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          626775 non-null  object
 1   primaryTitle    626775 non-null  object
 2   startYear       626775 non-null  object
 3   runtimeMinutes  626775 non-null  object
 4   genres          626775 non-null  object
dtypes: object(5)
memory usage: 28.7+ MB


In [13]:
basics_df.shape

(626775, 5)

In [18]:
basics_df['startYear'] = basics_df['startYear'].replace('\\N', np.nan)

In [20]:
basics_df['startYear'] = basics_df['startYear'].astype(float).astype('Int64')

In [24]:
basics_df = basics_df[basics_df['startYear'] > 1980]

In [22]:
basics_df[basics_df['startYear'] > 1990].shape

(332741, 5)

#### Fetching from API (in progress)

In [None]:
allMovieIDs = [id[2:] for id in basics_df.tconst]

In [9]:
cg = Cinemagoer()

In [None]:
basics_df.shape

(626775, 10)

In [31]:
basics_df.head()['tconst'].values

array(['0011801', '0013274', '0015414', '0015724', '0035423'],
      dtype=object)

In [35]:
cg.get_imdbMovieID('0011801')

'0011801'

In [38]:
movies = cg.get_movie_list("ls058726648")

In [39]:
len(movies)

129

In [27]:
cgInfo = cg.get_movie("0011801")

In [30]:
cgInfo.keys()


['localized title',
 'cast',
 'genres',
 'countries',
 'country codes',
 'language codes',
 'color info',
 'aspect ratio',
 'sound mix',
 'certificates',
 'original air date',
 'cover url',
 'imdbID',
 'languages',
 'title',
 'year',
 'kind',
 'original title',
 'director',
 'writer',
 'producer',
 'cinematographer',
 'art direction',
 'make up',
 'production manager',
 'akas',
 'production companies',
 'canonical title',
 'long imdb title',
 'long imdb canonical title',
 'smart canonical title',
 'smart long imdb canonical title',
 'full-size cover url']

In [25]:
i = 0
for idx, movieInfo in tqdm(basics_df.iterrows(), total=len(basics_df)):
    i += 1
    if i % 100_000 == 0:
        basics_df.to_csv(f'data_idx_{idx}')
    # print(idx)
    cgInfo = cg.get_movie(movieInfo['tconst'])
    
    # update columns with missing or incomplete data
    if movieInfo['startYear'] == '\\N' and 'original air date' in cgInfo:
        basics_df.at[idx, 'startYear'] = cgInfo['original air date'].split(' ')[2]
    if movieInfo['runtimeMinutes'] == '\\N' and 'runtimes' in cgInfo:
        basics_df.at[idx, 'runtimeMinutes'] = cgInfo['runtimes']
    if movieInfo['genres'] == '\\N' and 'genres' in cgInfo:
        basics_df.at[idx, 'genres'] = cgInfo['genres']
    
    # update other columns
    if 'plot outline' in cgInfo:
        basics_df.at[idx, 'plotInfo'] = cgInfo['plot outline']
    if 'rating' in cgInfo:
        basics_df.at[idx, 'avgRating'] = cgInfo['rating']
    if 'cast' in cgInfo:
        basics_df.at[idx, 'cast'] = ", ".join([str(name) for name in cgInfo['cast']])
    if 'director' in cgInfo:
        basics_df.at[idx, 'directors'] = ", ".join([str(name) for name in cgInfo['director']])
    if 'languages' in cgInfo:
        basics_df.at[idx, 'languages'] = ", ".join([str(language) for language in cgInfo['languages']])


  0%|          | 13/373197 [00:39<314:12:14,  3.03s/it]


KeyboardInterrupt: 

In [1]:
print()




#### Manual Fetching

In [5]:
principals_df = pd.read_csv("data/title.principals.tsv", sep='\t')
names_df = pd.read_csv("data/name.basics.tsv", sep='\t')


In [18]:
merged_df = basics_df.copy()
merged_df.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
8,9,Miss Jerry,1894,45,Romance
144,147,The Corbett-Fitzsimmons Fight,1897,100,"Documentary,News,Sport"
498,502,Bohemios,1905,100,\N
570,574,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography"
587,591,The Prodigal Son,1907,90,Drama


actors

In [6]:
actors = principals_df[principals_df['category'].isin(["actor", "actress"])]
actors = actors.merge(names_df[['nconst', 'primaryName', 'knownForTitles']], on = "nconst", how = "left")
actors['tconst'] = actors['tconst'].apply(lambda x: x[2:])
actors.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters,primaryName,knownForTitles
0,5,1,nm0443482,actor,\N,"[""Blacksmith""]",Charles Kayser,tt0000005
1,5,2,nm0653042,actor,\N,"[""Assistant""]",John Ott,tt0000005
2,7,1,nm0179163,actor,\N,\N,James J. Corbett,"tt0163131,tt0003730,tt0034778,tt0003116"
3,7,2,nm0183947,actor,\N,\N,Peter Courtney,tt0000007
4,8,1,nm0653028,actor,\N,"[""Sneezing Man""]",Fred Ott,"tt0000008,tt0285265,tt0240514"


In [19]:
merged_df['main_cast'] = [[] for _ in range(len(merged_df))]
# Iterate over each row in actors
for index, row in tqdm(actors.head(1000).iterrows(), total = len(actors)):
    known_for = row['knownForTitles']
    actor_name = row['primaryName']
    if ',' in known_for:
        known_for_list = known_for.split(',')
    else:
        known_for_list = [known_for]
    for tconst in known_for_list:
        try:
            idx = merged_df[merged_df['tconst'] == tconst[2:]].index[0]
            merged_df.at[idx, 'main_cast'].append(actor_name)
        except Exception as e:
            # print(tconst)
            # print(e)
            pass


  0%|          | 1000/21615234 [01:22<498:15:52, 12.05it/s]


directors

In [20]:
directors = principals_df[principals_df['category'].isin(["director"])]
directors = directors.merge(names_df[['nconst', 'primaryName', 'knownForTitles']], on = "nconst", how = "left")
directors['tconst'] = directors['tconst'].apply(lambda x: x[2:])
directors.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters,primaryName,knownForTitles
0,1,2,nm0005690,director,\N,\N,William K.L. Dickson,"tt0219560,tt1496763,tt0308254,tt1428455"
1,2,1,nm0721526,director,\N,\N,Émile Reynaud,"tt16763674,tt0000003,tt16763740,tt13125956"
2,3,1,nm0721526,director,\N,\N,Émile Reynaud,"tt16763674,tt0000003,tt16763740,tt13125956"
3,4,1,nm0721526,director,\N,\N,Émile Reynaud,"tt16763674,tt0000003,tt16763740,tt13125956"
4,5,3,nm0005690,director,\N,\N,William K.L. Dickson,"tt0219560,tt1496763,tt0308254,tt1428455"


In [21]:
merged_df['directors'] = [[] for _ in range(len(merged_df))]
# Iterate over each row in actors
for index, row in tqdm(directors.head(1000).iterrows(), total = len(directors)):
    known_for = row['knownForTitles']
    director_name = row['primaryName']
    if ',' in known_for:
        known_for_list = known_for.split(',')
    else:
        known_for_list = [known_for]
    for tconst in known_for_list:
        try:
            idx = merged_df[merged_df['tconst'] == tconst[2:]].index[0]
            merged_df.at[idx, 'directors'].append(director_name)
        except Exception as e:
            # print(tconst)
            # print(e)
            pass


  0%|          | 1000/6384020 [01:31<162:46:30, 10.89it/s]


ratings

In [31]:
ratings_df = pd.read_csv("data/title.ratings.tsv", sep='\t')
ratings_df['tconst'] = ratings_df['tconst'].apply(lambda x: x[2:])
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,1,5.7,1959
1,2,5.8,263
2,3,6.5,1799
3,4,5.6,179
4,5,6.2,2596


In [33]:
merged_df = merged_df.merge(ratings_df[['tconst', 'averageRating']], on = "tconst")

languages

In [34]:
akas_df = pd.read_csv("data/title.akas.tsv", sep='\t')
akas_df.head()

  akas_df = pd.read_csv("data/title.akas.tsv", sep='\t')


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [39]:
akas_df['region'].value_counts()

DE    4222774
FR    4218950
JP    4218381
IN    4157983
ES    4140375
       ...   
CC          1
TV          1
NU          1
PW          1
NR          1
Name: region, Length: 248, dtype: int64

In [38]:
akas_df[akas_df['titleId'] == 'tt0000001']

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
6,tt0000001,7,Carmencita,\N,\N,original,\N,1
7,tt0000001,8,カルメンチータ,JP,ja,imdbDisplay,\N,0


In [40]:
merged_df.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,main_cast,directors,averageRating
0,9,Miss Jerry,1894,45,Romance,"[Blanche Bayliss, William Courtenay, Chauncey ...",[Alexander Black],5.3
1,147,The Corbett-Fitzsimmons Fight,1897,100,"Documentary,News,Sport",[],[Enoch J. Rector],5.3
2,502,Bohemios,1905,100,\N,"[Antonio del Pozo, El Mochuelo]",[],4.1
3,574,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography","[Orrie Perry, Reg Perry, Elizabeth Tait, John ...",[Charles Tait],6.0
4,591,The Prodigal Son,1907,90,Drama,"[Christiane Mandelys, Gilberte Sergy]",[Michel Carré],4.4


Postprocessing

In [54]:
merged_df['startYear'] = merged_df['startYear'].replace('\\N', np.nan)
merged_df['startYear'] = merged_df['startYear'].astype(float).fillna(0).astype(int)
min_year = merged_df['startYear'].min()
max_year = merged_df['startYear'].max()

# Define bin edges
bin_edges = pd.interval_range(start=min_year, end=max_year, periods=10)



Creating "Era" column

In [62]:
merged_df['startYear']

0         1894
1         1897
2         1905
3         1906
4         1907
          ... 
283547    2020
283548    2020
283549    2019
283550    2019
283551    2017
Name: startYear, Length: 283552, dtype: int32

In [56]:

# Replace the "\N" values with np.nan to treat them as missing values
merged_df['startYear'] = merged_df['startYear'].replace("\\N", np.nan)

# Define the bins and labels
bins = [-np.inf, 1900, 1920, 1940, 1960, 1980, 2000, 2020, np.inf]
labels = ['Before 1900', '1900-1920', '1920-1940', '1940-1960', '1960-1980', '1980-2000', '2000-2020', 'NA']

# Create a new column called "era" by binning the "startYear" column
merged_df['era'] = pd.cut(merged_df['startYear'], bins=bins, labels=labels, include_lowest=True)


In [84]:
# Replace the "\N" values with np.nan to treat them as missing values
merged_df['startYear'] = merged_df['startYear'].replace("\\N", np.nan)

# Compute the bin edges using quantiles
bin_edges = merged_df['startYear'].quantile([0, 0.25, 0.5, 0.75, 1]).tolist()

# Add negative and positive infinity to bin edges
bin_edges = [-np.inf] + bin_edges + [np.inf]

# Define the labels for the bins
labels = ['Before ' + str(int(bin_edges[1])), 
          str(int(bin_edges[1])) + ' to ' + str(int(bin_edges[2])),
          str(int(bin_edges[2])) + ' to ' + str(int(bin_edges[3])),
          str(int(bin_edges[3])) + ' to ' + str(int(bin_edges[4])),
          'After ' + str(int(bin_edges[4])), 'NA']

# Create a new column called "era" by binning the "startYear" column
merged_df['era'] = pd.cut(merged_df['startYear'], bins=bin_edges, labels=labels, include_lowest=True)


In [85]:
merged_df['era'].value_counts()

1978 to 2005    73800
2005 to 2015    72835
0 to 1978       70840
After 2015      66027
Before 0           50
NA                  0
Name: era, dtype: int64

Runtime bins

In [86]:
# Replace the "\N" values with np.nan to treat them as missing values
merged_df['runtimeMinutes'] = merged_df['runtimeMinutes'].replace("\\N", np.nan)

# Define the number of bins you want
num_bins = 5

# Define the labels for each bin
labels = ['Very Short', 'Short', 'Medium', 'Long', 'Very Long']

# Create bins based on the distribution of runtimeMinutes
bins = pd.qcut(merged_df['runtimeMinutes'], q=num_bins, duplicates='drop', labels=labels)

# Replace any null values with "NA" label
bins = bins.cat.add_categories(['NA']).fillna('NA')

# Create a new column called "runtime_bins" using the bins
merged_df['runtime_bins'] = bins


In [87]:
merged_df['runtime_bins'].value_counts()

Short         59381
Very Short    58923
Long          57742
Very Long     55641
Medium        51865
NA                0
Name: runtime_bins, dtype: int64

In [97]:
# Concatenate relevant columns for each movie into a single string
merged_df['text'] = merged_df['era'].astype(str) + ' ' + \
                    merged_df['runtime_bins'].astype(str) + ' ' + merged_df['genres'].apply(lambda x: ''.join(x)) + ' ' + \
                    merged_df['main_cast'].apply(lambda x: '  '.join(x)) + ' ' + merged_df['directors'].apply(lambda x: ' '.join(x))

In [98]:
merged_df.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,main_cast,directors,averageRating,era,runtime_bins,text
0,9,Miss Jerry,1894,45.0,Romance,"[Blanche Bayliss, William Courtenay, Chauncey ...",[Alexander Black],5.3,0 to 1978,Very Short,0 to 1978 Very Short Romance Blanche Bayliss ...
1,147,The Corbett-Fitzsimmons Fight,1897,100.0,"Documentary,News,Sport",[],[Enoch J. Rector],5.3,0 to 1978,Long,"0 to 1978 Long Documentary,News,Sport Enoch J..."
2,502,Bohemios,1905,100.0,\N,"[Antonio del Pozo, El Mochuelo]",[],4.1,0 to 1978,Long,0 to 1978 Long \N Antonio del Pozo El Mochuelo
3,574,The Story of the Kelly Gang,1906,70.0,"Action,Adventure,Biography","[Orrie Perry, Reg Perry, Elizabeth Tait, John ...",[Charles Tait],6.0,0 to 1978,Very Short,"0 to 1978 Very Short Action,Adventure,Biograph..."
4,591,The Prodigal Son,1907,90.0,Drama,"[Christiane Mandelys, Gilberte Sergy]",[Michel Carré],4.4,0 to 1978,Medium,0 to 1978 Medium Drama Christiane Mandelys Gi...


In [100]:
merged_df.iloc[0]['text']

'0 to 1978 Very Short Romance Blanche Bayliss  William Courtenay  Chauncey Depew Alexander Black'

In [101]:
merged_df.to_csv("sample_df.csv")

### Modeling

In [3]:
merged_df = pd.read_csv("sample_df.csv")

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Define the choices for runtime_bins, era, and genres
runtime_choices = merged_df['runtime_bins'].unique()
era_choices = merged_df['era'].unique()
genre_choices = merged_df['genres'].str.split(',', expand=True).stack().unique().tolist()

In [6]:
def query_preferences(merged_df):
    # Define the choices for runtime_bins, era, genres, main_cast, and directors

    runtime_choices = merged_df['runtime_bins'].unique()
    era_choices = merged_df['era'].unique()
    genre_choices = merged_df['genres'].str.split(',', expand=True).stack().unique().tolist()
    main_cast_choices = merged_df['main_cast'].explode().str.strip().unique().astype(str).tolist()
    directors_choices = merged_df['directors'].explode().str.strip().unique().astype(str).tolist()

    # Ask for user preferences
    era_pref = input(f"Enter your preferred era ({', '.join(era_choices)}): ")
    runtime_pref = input(f"Enter your preferred runtime ({', '.join(runtime_choices)}): ")
    genre_pref = input(f"Enter your preferred genres ({', '.join(genre_choices)}): ")
    main_cast_pref = input(f"Enter your preferred main cast members ({', '.join(main_cast_choices)}), separated by commas: ")
    directors_pref = input(f"Enter your preferred directors ({', '.join(directors_choices)}), separated by commas: ")

    # Print user preferences
    print("Your preferences:")
    print(f"Era: {era_pref}")
    print(f"Runtime: {runtime_pref}")
    print(f"Genres: {genre_pref}")
    print(f"Main cast members: {main_cast_pref}")
    print(f"Directors: {directors_pref}")
    print("~-"*50)

    # Generate similarities based on user preferences
    generate_similarities(era_pref, runtime_pref, genre_pref, main_cast_pref, directors_pref)


def generate_similarities(era_pref, runtime_pref, genre_pref, main_cast_pref, directors_pref):
    # Filter movies based on user preferences
    filtered_df = merged_df.copy()
    # Compute TF-IDF vectors for movies and user input
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(filtered_df['text'].values.astype('U'))
    user_input = era_pref + ' ' + runtime_pref + ' ' + genre_pref + ' ' + main_cast_pref + ' ' + directors_pref
    user_tfidf = tfidf.transform([user_input])
    cosine_similarities = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

    # Get indices of top 5 movies with highest similarity scores
    top_indices = cosine_similarities.argsort()[::-1][:5]

    # Print top 5 recommended movies
    print("Recommended movies:")
    for idx in top_indices:
        # if idx in filtered_df.index:
        print(filtered_df.loc[idx, 'primaryTitle'], '(' + str(filtered_df.loc[idx, 'startYear']) + ')', '- Similarity score:', round(cosine_similarities[idx], 2))


In [7]:
query_preferences(merged_df)

Your preferences:
Era: 2005 to 2015
Runtime: Medium
Genres: Adventure, Action
Main cast members: 
Directors: 
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
Recommended movies:
Camel Caravan (2012) - Similarity score: 1.0
Mercenaries (2014) - Similarity score: 1.0
Wild Desert (2015) - Similarity score: 1.0
Big Game (2014) - Similarity score: 1.0
Viking: The Berserkers (2014) - Similarity score: 1.0


In [8]:
query_preferences(merged_df)

Your preferences:
Era: After 2015
Runtime: Medium
Genres: Thriller, Horror
Main cast members: Walter Edwin
Directors: Alexander Black
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
Recommended movies:
Who Will Marry Mary? (1913) - Similarity score: 0.64
Gloria's Romance (1916) - Similarity score: 0.64
Miss Jerry (1894) - Similarity score: 0.36
The White Pearl (1915) - Similarity score: 0.32
The Requin (2022) - Similarity score: 0.23
