In [1]:
import numpy as np
import pandas as pd
import gzip
import re
import sys

### extract information function

In [2]:
def extract_cast(row):
    '''
    Extract every performer name from a stringified "cast" list.

    The previous pattern only accepted names shaped like "First Last"
    (optionally with one hyphen), so single-word names, names with three or
    more words, initials ("Samuel L. Jackson") and names written in double
    quotes because they contain an apostrophe were silently dropped.

    :param:
        row(str): string of all cast; presumably the repr of a list of dicts
                  with a 'name' key -- verify against credits.csv
    :return:
        results(list): a list of all names in str, in order of appearance.
                       The sentinel string "None" is passed through unchanged.
    '''
    assert isinstance(row, str)
    if row == "None":
        return "None"
    # Group 1 captures the opening quote so the name must be closed by the same
    # quote character; this lets a double-quoted name contain an apostrophe.
    matches = re.findall(r"'name': (['\"])(.*?)\1", row)
    # Skip empty names so an entry like 'name': '' does not yield ''.
    return [name for _quote, name in matches if name]
def extract_director(row):
    '''
    Extract the director name(s) from a stringified "crew" list.

    Only entries whose 'job' is exactly 'Director' and is immediately followed
    by the 'name' key are matched. The previous name pattern required exactly
    two words, which dropped directors such as "Francis Ford Coppola" or "McG".

    :param:
        row(str): string of all crew; presumably the repr of a list of dicts
                  where 'job' directly precedes 'name' -- verify against
                  credits.csv
    :return:
        results(list): a list of all director names in str, in order of
                       appearance. The sentinel string "None" is passed
                       through unchanged.
    '''
    assert isinstance(row, str)
    if row == "None":
        return "None"
    # Group 1 captures the opening quote so the name must be closed by the same
    # quote character; this lets a double-quoted name contain an apostrophe.
    matches = re.findall(r"'job': 'Director', 'name': (['\"])(.*?)\1", row)
    return [name for _quote, name in matches if name]
def extract_keywords(row):
    '''
    Extract every keyword from a stringified "keywords" list.

    Compared with the previous pattern this also accepts keywords written in
    double quotes (used when the keyword itself contains an apostrophe) and no
    longer runs past an empty 'name': '' entry into the following one.

    :param:
        row(str): string of all keywords; presumably the repr of a list of
                  dicts with a 'name' key -- verify against keywords.csv
    :return:
        results(list): a list of all keywords in str, in order of appearance.
                       The sentinel string "None" is passed through unchanged.
    '''
    assert isinstance(row, str)
    if row == "None":
        return "None"
    # Group 1 captures the opening quote so the value must be closed by the
    # same quote character. `.*?` (rather than `.+?`) lets an empty value match
    # as empty instead of swallowing text up to the next quote.
    matches = re.findall(r"'name': (['\"])(.*?)\1", row)
    return [keyword for _quote, keyword in matches if keyword]
def extract_genres(row):
    '''
    Extract every genre from a stringified "genres" list.

    The previous pattern used \\w+ for the value, which cannot match a space,
    so multi-word genres such as "Science Fiction" or "TV Movie" were dropped.

    :param:
        row(str): string of all genres; presumably the repr of a list of
                  dicts with a 'name' key -- verify against the metadata CSV
    :return:
        results(list): a list of all genres in str, in order of appearance.
                       The sentinel string "None" is passed through unchanged.
    '''
    assert isinstance(row, str)
    if row == "None":
        return "None"
    # Group 1 captures the opening quote so the value must be closed by the
    # same quote character; anything in between (including spaces) is kept.
    matches = re.findall(r"'name': (['\"])(.*?)\1", row)
    return [genre for _quote, genre in matches if genre]
def extract_month_year(row):
    '''
    Collapse a "release_date" string into one running month counter.

    The first two runs of digits in the string are read as year and month;
    any further digits (e.g. the day) are ignored.

    :param:
        row(str): date string whose first number is the year and second
                  number is the month
    :return:
        results(int): year*12 + month. The sentinel string "None" is passed
                      through unchanged.
    '''
    assert isinstance(row, str)
    if row == "None":
        return "None"
    numbers = re.findall(r"\d+", row)
    # Indexing (not unpacking) on purpose: fewer than two numbers raises IndexError.
    year = int(numbers[0])
    month = int(numbers[1])
    return year * 12 + month
    
def remove_empty_list(x):
    '''
    Turn an empty list into None so that a later dropna can discard the row.

    Intended for the combined cast + keywords + genres + director list: if
    all four are empty the combination is empty as well.

    :param:
        x(list): concatenation of the four list-valued features
    :return:
        None when x is empty
        x itself (the same object) when it holds at least one item
    '''
    assert isinstance(x, list)
    return x if x else None
    
def change_id(row, movie_id):
    '''
    Look up the "movieId" that corresponds to one movie-metadata row.

    The IMDb id is tried first. If it is missing ("None", 'nan' or not a
    string), contains no digits, or is not present in the links table, the
    TMDB id (the "id" column of the movie metadata) is used instead.

    Unlike the earlier version, the result is always a scalar: it used to be
    an int on an IMDb hit but a (possibly empty) Series on the TMDB fallback.

    :param:
        row(pd.Series or mapping): one metadata row; 'imdb_id' and 'id' are read
        movie_id(pd.DataFrame): links table with 'movieId', 'imdbId' and
                                'tmdbId' columns
    :return:
        results(int or None): the matching movieId (the first one if several
                              rows match), or None when neither id is found.
                              pandas treats None as missing, so a later
                              dropna removes such rows.
    '''
    def _first_match(column, value):
        # First movieId whose `column` equals `value`, or None if there is none.
        hits = movie_id.loc[movie_id[column] == value, 'movieId']
        return None if hits.empty else int(hits.iloc[0])

    imdb_raw = row['imdb_id']
    # isinstance guards against a real NaN (float) reaching re.findall.
    if isinstance(imdb_raw, str) and imdb_raw not in ("None", "nan"):
        digits = re.findall(r"\d+", imdb_raw)
        if digits:
            results = _first_match('imdbId', int(digits[0]))
            if results is not None:
                return results
    # IMDb id unusable or unknown to the links table: fall back to the TMDB id.
    return _first_match('tmdbId', int(row['id']))

### Handling movie id issue

In [3]:
# Id-mapping table, read from one directory above the working directory.
# It is later passed to change_id(), which looks up its 'movieId', 'imdbId'
# and 'tmdbId' columns.
movie_links = pd.read_csv('../links.csv')

### target columns

In [4]:
# Columns kept from the merged metadata / credits / keywords table.
columns = [
    'imdb_id',
    'id',
    'title',
    'cast',
    'crew',
    'keywords',
    'genres',
    'vote_average',
    'vote_count',
    'overview',
    'release_date',
    'popularity',
    'tagline',
]

### Merge movie metadata with cast and keywords

In [5]:
# The three CSV files are expected one directory above the working directory.
# Everything is read as str so that ids keep their exact text and the merge
# keys compare as strings.
try:
    data_movie = pd.read_csv('../movies_metadata.csv', dtype=str)
    cast = pd.read_csv('../credits.csv', dtype=str)
    keywords = pd.read_csv('../keywords.csv', dtype=str)
except FileNotFoundError:
    print("Path is not correct, please check.")
    # Re-raise: every later cell needs `tmp`, so carrying on would only fail
    # further down with an unrelated-looking NameError, and the traceback
    # names the file that is missing.
    raise
else:
    # Inner joins keep only movies present in all three files; identical rows
    # produced by the joins are removed.
    meta = (
        data_movie
        .merge(cast, on='id', how='inner')
        .merge(keywords, on='id', how='inner')
        .drop_duplicates()
    )
    tmp = meta[columns].copy()
    print(tmp.shape)

(45456, 13)


### change ID

In [6]:
# Replace missing imdb_id values with the string 'nan', which change_id() treats
# as "no IMDb id" and answers via the TMDB id instead.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: with pandas Copy-on-Write the selected column is a temporary
# object, so an in-place fill on it would not change `tmp`.
tmp['imdb_id'] = tmp['imdb_id'].fillna('nan')
# From here on the 'imdb_id' column holds the looked-up movie id.
tmp.loc[:, 'imdb_id'] = tmp.apply(change_id, axis=1, movie_id=movie_links)
print(tmp.shape)

(45456, 13)


### extracting information

In [7]:
# (target column, source column, extractor). Only 'director' is a new column;
# it is derived from 'crew', the others are overwritten with their parsed lists.
extraction_plan = [
    ('cast', 'cast', extract_cast),
    ('director', 'crew', extract_director),
    ('keywords', 'keywords', extract_keywords),
    ('genres', 'genres', extract_genres),
]
for target_col, source_col, extractor in extraction_plan:
    tmp.loc[:, target_col] = tmp.loc[:, source_col].apply(extractor)

# Keep the original target columns plus the freshly added 'director'.
processed_movie = tmp[columns + ['director']].copy()
print(processed_movie.shape)

(45456, 14)


### check empty cast, keywords, genres and director

In [8]:
# Concatenate the four list-valued columns row by row; the combined list is
# empty only when cast, keywords, genres and director are all empty.
merged_lists = (
    processed_movie["cast"]
    + processed_movie["keywords"]
    + processed_movie["genres"]
    + processed_movie["director"]
)
# Empty lists become missing values. Assigning back aligns on the index, so the
# entries removed by dropna() reappear as NaN in the column; the rows themselves
# are only removed by the DataFrame-level dropna further down.
processed_movie["check_empty"] = merged_lists.apply(remove_empty_list).dropna()

### fill NAN and drop NAN

In [9]:
# Fill the columns where a missing value is acceptable first, so that the
# dropna() below does not discard rows just because one of them is absent.
# Each result is assigned back rather than using fillna(inplace=True) on the
# selected column: with pandas Copy-on-Write the selected column is a temporary
# object, so an in-place fill on it would not change `processed_movie`.
# NOTE(review): 'popularity' was read as str but is filled with the int 0 --
# confirm that mixing the two is acceptable downstream.
processed_movie['popularity'] = processed_movie['popularity'].fillna(0)
processed_movie['overview'] = processed_movie['overview'].fillna('')
processed_movie['tagline'] = processed_movie['tagline'].fillna('')
# Drops every row that still has a missing value in any remaining column,
# including rows whose 'check_empty' is missing.
processed_movie.dropna(inplace=True)
print(processed_movie.shape)

(45116, 15)


### change data type

In [10]:
# Change data types for future use.
# Columns are replaced with `df[col] = ...` instead of `df.loc[:, col] = ...`:
# since pandas 2.0 the .loc form writes the new values into the existing
# (object-dtype) column, so the column would keep dtype object and the
# conversion would not show up in the dtypes.
processed_movie['vote_average'] = processed_movie['vote_average'].astype(float)
processed_movie['vote_count'] = processed_movie['vote_count'].astype(int)
# release_date becomes a single integer month counter (year*12 + month).
processed_movie['release_date'] = processed_movie['release_date'].apply(extract_month_year)
processed_movie.drop(columns=['crew'], inplace=True)  # We only need director from crew information so drop
processed_movie.rename(columns={'imdb_id':'movieId'}, inplace=True)  # rename imdb_id as movieId to match rating_small.csv
print(processed_movie.shape)

(45116, 14)


In [11]:
# Display the processed table for a visual check of the final columns.
processed_movie

Unnamed: 0,movieId,id,title,cast,keywords,genres,vote_average,vote_count,overview,release_date,popularity,tagline,director,check_empty
0,1,862,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[jealousy, toy, boy, friendship, friends, riva...","[Animation, Comedy, Family]",7.7,5415,"Led by Woody, Andy's toys live happily in his ...",23950,21.946943,,[John Lasseter],"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney..."
1,2,8844,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[board game, disappearance, new home, recluse,...","[Adventure, Fantasy, Family]",6.9,2413,When siblings Judy and Peter discover an encha...,23952,17.015539,Roll the dice and unleash the excitement!,[Joe Johnston],"[Robin Williams, Jonathan Hyde, Kirsten Dunst,..."
2,3,15602,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Sophia Loren, Da...","[fishing, best friend, duringcreditsstinger, o...","[Romance, Comedy]",6.5,92,A family wedding reignites the ancient feud be...,23952,11.7129,Still Yelling. Still Fighting. Still Ready for...,[Howard Deutch],"[Walter Matthau, Jack Lemmon, Sophia Loren, Da..."
3,4,31357,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devi...","[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]",6.1,34,"Cheated on, mistreated and stepped on, the wom...",23952,3.859495,Friends are the people who let you be yourself...,[Forest Whitaker],"[Whitney Houston, Angela Bassett, Loretta Devi..."
4,5,11862,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[baby, midlife crisis, confidence, aging, daug...",[Comedy],5.7,173,Just when George Banks has recovered from his ...,23942,8.387519,Just When His World Is Back To Normal... He's ...,[Charles Shyer],"[Steve Martin, Diane Keaton, Martin Short, Kim..."
5,6,949,Heat,"[Al Pacino, Val Kilmer, Jon Voight, Tom Sizemo...","[robbery, detective, bank, obsession, chase, s...","[Action, Crime, Drama, Thriller]",7.7,1886,"Obsessive master thief, Neil McCauley leads a ...",23952,17.924927,A Los Angeles Crime Saga,[Michael Mann],"[Al Pacino, Val Kilmer, Jon Voight, Tom Sizemo..."
6,7,11860,Sabrina,"[Harrison Ford, Julia Ormond, Greg Kinnear, An...","[paris, brother brother relationship, chauffeu...","[Comedy, Romance]",6.2,141,An ugly duckling having undergone a remarkable...,23952,6.677277,You are cordially invited to the most surprisi...,[Sydney Pollack],"[Harrison Ford, Julia Ormond, Greg Kinnear, An..."
7,8,45325,Tom and Huck,"[Brad Renfro, Michael McShane, Amy Wright, Eri...",[],"[Action, Adventure, Drama, Family]",5.4,45,"A mischievous young boy, Tom Sawyer, witnesses...",23952,2.561161,The Original Bad Boys.,[Peter Hewitt],"[Brad Renfro, Michael McShane, Amy Wright, Eri..."
8,9,9091,Sudden Death,"[Powers Boothe, Dorian Harewood, Ross Malinger...","[terrorist, hostage, explosive, vice president]","[Action, Adventure, Thriller]",5.5,174,International action superstar Jean Claude Van...,23952,5.23158,Terror goes into overtime.,[Peter Hyams],"[Powers Boothe, Dorian Harewood, Ross Malinger..."
9,10,710,GoldenEye,"[Pierce Brosnan, Sean Bean, Izabella Scorupco,...","[cuba, falsely accused, secret identity, compu...","[Adventure, Action, Thriller]",6.6,1194,James Bond must unmask the mysterious head of ...,23951,14.686036,No limits. No fears. No substitutes.,[Martin Campbell],"[Pierce Brosnan, Sean Bean, Izabella Scorupco,..."


### save processed data

In [12]:
# Write the processed table to a gzip-compressed Parquet file; the path is
# relative, so the file lands in the current working directory.
processed_movie.to_parquet('processed_data.parquet', compression='gzip')

### read data

In [13]:
# Read the file back to check the round trip; the result is only displayed,
# it is not stored in a variable.
pd.read_parquet('processed_data.parquet')

Unnamed: 0_level_0,movieId,id,title,cast,keywords,genres,vote_average,vote_count,overview,release_date,popularity,tagline,director,check_empty
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1,862,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[jealousy, toy, boy, friendship, friends, riva...","[Animation, Comedy, Family]",7.7,5415,"Led by Woody, Andy's toys live happily in his ...",23950,21.946943,,[John Lasseter],"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney..."
1,2,8844,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[board game, disappearance, new home, recluse,...","[Adventure, Fantasy, Family]",6.9,2413,When siblings Judy and Peter discover an encha...,23952,17.015539,Roll the dice and unleash the excitement!,[Joe Johnston],"[Robin Williams, Jonathan Hyde, Kirsten Dunst,..."
2,3,15602,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Sophia Loren, Da...","[fishing, best friend, duringcreditsstinger, o...","[Romance, Comedy]",6.5,92,A family wedding reignites the ancient feud be...,23952,11.7129,Still Yelling. Still Fighting. Still Ready for...,[Howard Deutch],"[Walter Matthau, Jack Lemmon, Sophia Loren, Da..."
3,4,31357,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devi...","[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]",6.1,34,"Cheated on, mistreated and stepped on, the wom...",23952,3.859495,Friends are the people who let you be yourself...,[Forest Whitaker],"[Whitney Houston, Angela Bassett, Loretta Devi..."
4,5,11862,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[baby, midlife crisis, confidence, aging, daug...",[Comedy],5.7,173,Just when George Banks has recovered from his ...,23942,8.387519,Just When His World Is Back To Normal... He's ...,[Charles Shyer],"[Steve Martin, Diane Keaton, Martin Short, Kim..."
5,6,949,Heat,"[Al Pacino, Val Kilmer, Jon Voight, Tom Sizemo...","[robbery, detective, bank, obsession, chase, s...","[Action, Crime, Drama, Thriller]",7.7,1886,"Obsessive master thief, Neil McCauley leads a ...",23952,17.924927,A Los Angeles Crime Saga,[Michael Mann],"[Al Pacino, Val Kilmer, Jon Voight, Tom Sizemo..."
6,7,11860,Sabrina,"[Harrison Ford, Julia Ormond, Greg Kinnear, An...","[paris, brother brother relationship, chauffeu...","[Comedy, Romance]",6.2,141,An ugly duckling having undergone a remarkable...,23952,6.677277,You are cordially invited to the most surprisi...,[Sydney Pollack],"[Harrison Ford, Julia Ormond, Greg Kinnear, An..."
7,8,45325,Tom and Huck,"[Brad Renfro, Michael McShane, Amy Wright, Eri...",[],"[Action, Adventure, Drama, Family]",5.4,45,"A mischievous young boy, Tom Sawyer, witnesses...",23952,2.561161,The Original Bad Boys.,[Peter Hewitt],"[Brad Renfro, Michael McShane, Amy Wright, Eri..."
8,9,9091,Sudden Death,"[Powers Boothe, Dorian Harewood, Ross Malinger...","[terrorist, hostage, explosive, vice president]","[Action, Adventure, Thriller]",5.5,174,International action superstar Jean Claude Van...,23952,5.23158,Terror goes into overtime.,[Peter Hyams],"[Powers Boothe, Dorian Harewood, Ross Malinger..."
9,10,710,GoldenEye,"[Pierce Brosnan, Sean Bean, Izabella Scorupco,...","[cuba, falsely accused, secret identity, compu...","[Adventure, Action, Thriller]",6.6,1194,James Bond must unmask the mysterious head of ...,23951,14.686036,No limits. No fears. No substitutes.,[Martin Campbell],"[Pierce Brosnan, Sean Bean, Izabella Scorupco,..."
