# Imports

In [3]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.5f' % x)

movies = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/movies.csv')
tags = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/tags.csv')
ratings = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/ratings.csv')

movies = movies.iloc[: , 1:]
tags = tags.iloc[: , 1:]

# imdb_ratings = pd.read_csv("title.ratings.tsv", sep='\t')
# imdb_titles = pd.read_csv("title.basics.tsv", sep='\t')

In [103]:
# drop non movie titleTypes 

non_movie_titleTypes = ['tvEpisode', 'tvSeries', 'tvMiniSeries', 'video', 'videoGame', 'tvPilot']

for titleType in non_movie_titleTypes: 
    imdb_titles = imdb_titles[imdb_titles['titleType'] != titleType]

In [4]:
movies_full = movies.copy(deep=True)
movies_full['title'] = movies_full['title'].apply(lambda x: x.lower())

imdb_full = imdb_ratings.copy(deep=True)
imdb_full = pd.merge(imdb_full, imdb_titles, on='tconst')

imdb_full = imdb_full.rename(columns={'primaryTitle': 'title'})
imdb_full = imdb_full.rename(columns={'startYear': 'year'})
imdb_full = imdb_full.rename(columns={'genres': 'imdb_genres'})
imdb_full = imdb_full.drop(columns='endYear')
imdb_full['title'] = imdb_full['title'].apply(lambda x: x.lower())

In [5]:
movies_full = movies_full.astype({'title':'string'})

imdb_full = imdb_full.replace('\\N', None)
imdb_full = imdb_full.astype({'title':'string', 
                              'year':'int64'})


# Match IMDB & MovieLens

In [106]:
ml_unmatched = movies_full.copy(deep='True')
ml_unmatched = ml_unmatched.drop(columns = ['genres'])

imdb = imdb_full.copy(deep='True')
imdb = imdb.drop(columns = ['averageRating','numVotes','titleType','isAdult','runtimeMinutes', 'imdb_genres'])

unmatched_movie_ids = ml_unmatched['movie_id']

print("movies to match: " + str(ml_unmatched.shape))

movies to match: (3883, 3)


In [24]:
import string 
from string import punctuation

def reposition_movielens_article(title): 
    articles = {', the', ', a', ', an', ', le', ', la', ', l\'', ', el', ', dir', ', der'}
    for article in articles: 
        if(title[-len(article):] == article):
            title = article[2:] + ' ' + title[:title.index(article)]
    return title 

def remove_movielens_articles(title):
    new_title = title
    articles = {', the', ', a', ', an', ', le', ', la', ', l\'', ', el', ', dir', ', der'}
    for article in articles: 
        if article in title: 
            new_title = title[:title.index(article)]
    return new_title 

def remove_imdb_articles(title): 
    new_title = title
    articles = {'the ', 'a ', 'an ', 'le ', 'la ', 'l\' ', 'el ', 'dir', 'der'}
    for article in articles: 
        if article in title: 
            new_title = title[title.index(article) + len(article):] 
    return new_title

def remove_punctuation(title): 
    new_title = title
    if (any(p in title for p in punctuation)):
        new_title = title.translate(str.maketrans('', '', string.punctuation))
    return new_title 

def remove_numbers(title):
    no_digits = []
    for i in title:
        if not i.isdigit():
            no_digits.append(i)
    return ''.join(no_digits)

def remove_secondary(title):
    new_title = title
    if ('(' in title and ')' in title):
        new_title = title[: title.index('(')-1]
    
    return new_title 

def extract_secondary(title):
    new_title = title
    if ('(' in title and ')' in title):
        new_title = title[title.index('(') + 1: title.index(')')]
    return new_title  

In [108]:
# match on title + year 
match = pd.merge(ml_unmatched, imdb, on=['title', 'year'])
matched_df_full = match.copy(deep='True')


matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* title + year")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# match on original title + year 
imdb['originalTitle'] = imdb['originalTitle'].apply(lambda x: x.lower())

match = pd.merge(ml_unmatched, imdb, 
                 left_on=['title', 'year'],
                 right_on=['originalTitle', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* original title + year")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

* title + year
new matches: (2455, 5)
all matches: (2455, 5)
left to match: (1460, 3)
* original title + year
new matches: (36, 6)
all matches: (2491, 7)
left to match: (1424, 3)


In [109]:
# match on original title + (year + 1)
ml_unmatched['year+1'] = ml_unmatched['year'] + 1
ml_unmatched['year-1'] = ml_unmatched['year'] - 1


match = pd.merge(ml_unmatched, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['originalTitle', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* original title + (year+1) ")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# match on title + (year+1)

ml_unmatched['year+1'] = ml_unmatched['year'] + 1
ml_unmatched['year-1'] = ml_unmatched['year'] - 1


match = pd.merge(ml_unmatched, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* title + (year+1) ")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

* original title + (year+1) 
new matches: (65, 9)
all matches: (2556, 11)
left to match: (1361, 5)
* title + (year+1) 
new matches: (1, 8)
all matches: (2557, 11)
left to match: (1360, 5)


In [110]:
# match on a year that is 1 off 
# match on original title 
match = pd.merge(ml_unmatched, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['originalTitle', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("original title + year - 1")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# match on a year that is 1 off 
# match on title 
match = pd.merge(ml_unmatched, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("title + year - 1")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

original title + year - 1
new matches: (82, 9)
all matches: (2639, 11)
left to match: (1278, 5)
title + year - 1
new matches: (8, 8)
all matches: (2647, 11)
left to match: (1270, 5)


In [111]:
# fix the article notation movielens has
# The notation is i.e. Contender, The 

ml_unmatched_articles = ml_unmatched.copy(deep='True')

ml_unmatched_articles['title'] = ml_unmatched_articles['title'].apply(lambda x: reposition_movielens_article(x))

match = pd.merge(ml_unmatched_articles, imdb, 
                 left_on=['title', 'year'],
                 right_on=['originalTitle', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("fixed article + original title")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

ml_unmatched_articles = ml_unmatched_articles[ml_unmatched_articles['movie_id'].isin(unmatched_movie_ids)]

# article notation change but match with title instead of originalTitle 

match = pd.merge(ml_unmatched_articles, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)


matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("fixed article + title")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

fixed article + original title
new matches: (715, 8)
all matches: (3362, 11)
left to match: (560, 5)
fixed article + title
new matches: (19, 7)
all matches: (3381, 11)
left to match: (541, 5)


In [112]:
# let's try the +- year again 

ml_unmatched_articles = ml_unmatched_articles[ml_unmatched_articles['movie_id'].isin(unmatched_movie_ids)]

ml_unmatched_articles['year+1'] = ml_unmatched_articles['year']+1
ml_unmatched_articles['year-1'] = ml_unmatched_articles['year']-1

match = pd.merge(ml_unmatched_articles, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['originalTitle', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* fixed article + year mismatch")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

ml_unmatched_articles = ml_unmatched_articles[ml_unmatched_articles['movie_id'].isin(unmatched_movie_ids)]

# article notation change but match with title instead of originalTitle 

match = pd.merge(ml_unmatched_articles, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* fixed article + year mismatch")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

* fixed article + year mismatch
new matches: (17, 9)
all matches: (3398, 11)
left to match: (525, 5)
* fixed article + year mismatch
new matches: (1, 8)
all matches: (3399, 11)
left to match: (524, 5)


In [113]:
# let's try the +- year again 

ml_unmatched_articles = ml_unmatched_articles[ml_unmatched_articles['movie_id'].isin(unmatched_movie_ids)]

ml_unmatched_articles['year+1'] = ml_unmatched_articles['year']+1
ml_unmatched_articles['year-1'] = ml_unmatched_articles['year']-1

match = pd.merge(ml_unmatched_articles, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['originalTitle', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* fixed article + year mismatch")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

ml_unmatched_articles = ml_unmatched_articles[ml_unmatched_articles['movie_id'].isin(unmatched_movie_ids)]

# article notation change but match with title instead of originalTitle 

match = pd.merge(ml_unmatched_articles, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* fixed article + year mismatch")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

* fixed article + year mismatch
new matches: (20, 9)
all matches: (3419, 11)
left to match: (504, 5)
* fixed article + year mismatch
new matches: (1, 8)
all matches: (3420, 11)
left to match: (503, 5)


In [114]:
# no numbers + year

ml_unmatched_numbers = ml_unmatched.copy(deep='True')
ml_unmatched_numbers['year+1'] = ml_unmatched_numbers['year']+1
ml_unmatched_numbers['year+1'] = ml_unmatched_numbers['year']+1

imdb_numbers = imdb.copy(deep='True')

ml_unmatched_numbers['title'] = ml_unmatched_numbers['title'].apply(lambda x: remove_punctuation(x))
imdb_numbers['title'] = imdb_numbers['title'].apply(lambda x: remove_punctuation(x))

match = pd.merge(ml_unmatched_numbers, imdb_numbers, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* no punctuation + year mismatch")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_unmatched_numbers = ml_unmatched.copy(deep='True')
ml_unmatched_numbers['year+1'] = ml_unmatched_numbers['year']+1

imdb_numbers = imdb.copy(deep='True')

ml_unmatched_numbers['title'] = ml_unmatched_numbers['title'].apply(lambda x: remove_punctuation(x))
imdb_numbers['title'] = imdb_numbers['title'].apply(lambda x: remove_punctuation(x))

match = pd.merge(ml_unmatched_numbers, imdb_numbers, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* no punctuation + year mismatch")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_unmatched_numbers = ml_unmatched.copy(deep='True')
ml_unmatched_numbers['year-1'] = ml_unmatched_numbers['year']-1

imdb_numbers = imdb.copy(deep='True')

ml_unmatched_numbers['title'] = ml_unmatched_numbers['title'].apply(lambda x: remove_punctuation(x))
imdb_numbers['title'] = imdb_numbers['title'].apply(lambda x: remove_punctuation(x))

match = pd.merge(ml_unmatched_numbers, imdb_numbers, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* no punctuation + year mismatch")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

* no punctuation + year mismatch
new matches: (28, 7)
all matches: (3448, 11)
left to match: (475, 5)
* no punctuation + year mismatch
new matches: (1, 8)
all matches: (3449, 11)
left to match: (474, 5)
* no punctuation + year mismatch
new matches: (1, 8)
all matches: (3450, 11)
left to match: (473, 5)


In [115]:
# no numbers + year

ml_unmatched_numbers = ml_unmatched.copy(deep='True')
ml_unmatched_numbers['year+1'] = ml_unmatched_numbers['year']+1
ml_unmatched_numbers['year+1'] = ml_unmatched_numbers['year']+1

imdb_numbers = imdb.copy(deep='True')

ml_unmatched_numbers['title'] = ml_unmatched_numbers['title'].apply(lambda x: remove_punctuation(x))
imdb_numbers['title'] = imdb_numbers['title'].apply(lambda x: remove_punctuation(x))

match = pd.merge(ml_unmatched_numbers, imdb_numbers, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* no numbers + year mismatch")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_unmatched_numbers = ml_unmatched.copy(deep='True')
ml_unmatched_numbers['year+1'] = ml_unmatched_numbers['year']+1

imdb_numbers = imdb.copy(deep='True')

ml_unmatched_numbers['title'] = ml_unmatched_numbers['title'].apply(lambda x: remove_punctuation(x))
imdb_numbers['title'] = imdb_numbers['title'].apply(lambda x: remove_punctuation(x))

match = pd.merge(ml_unmatched_numbers, imdb_numbers, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* no numbers + year mismatch")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_unmatched_numbers = ml_unmatched.copy(deep='True')
ml_unmatched_numbers['year-1'] = ml_unmatched_numbers['year']-1

imdb_numbers = imdb.copy(deep='True')

ml_unmatched_numbers['title'] = ml_unmatched_numbers['title'].apply(lambda x: remove_punctuation(x))
imdb_numbers['title'] = imdb_numbers['title'].apply(lambda x: remove_punctuation(x))

match = pd.merge(ml_unmatched_numbers, imdb_numbers, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* no numbers + year mismatch")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

* no numbers + year mismatch
new matches: (0, 7)
all matches: (3450, 11)
left to match: (473, 5)
* no numbers + year mismatch
new matches: (0, 8)
all matches: (3450, 11)
left to match: (473, 5)
* no numbers + year mismatch
new matches: (0, 8)
all matches: (3450, 11)
left to match: (473, 5)


In [116]:
# secondary title

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1
ml_secondary_title['year+1'] = ml_secondary_title['year']+1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: extract_secondary(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]
print("* extract secondary title ")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no st + (year+1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: extract_secondary(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* extract secondary title ")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year-1'] = ml_secondary_title['year']-1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: extract_secondary(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* extract secondary title ")

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

* extract secondary title 
new matches: (20, 7)
all matches: (3470, 11)
left to match: (453, 5)
* extract secondary title 
new matches: (1, 8)
all matches: (3471, 11)
left to match: (452, 5)
* extract secondary title 
new matches: (1, 8)
all matches: (3472, 11)
left to match: (451, 5)


In [117]:
# secondary title

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1
ml_secondary_title['year+1'] = ml_secondary_title['year']+1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_secondary(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* no secondary title")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no secondary + (year+1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_secondary(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* no secondary title")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year-1'] = ml_secondary_title['year']-1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_secondary(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* no secondary title")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

* no secondary title
new matches: (109, 7)
all matches: (3581, 11)
left to match: (342, 5)
* no secondary title
new matches: (8, 8)
all matches: (3589, 11)
left to match: (334, 5)
* no secondary title
new matches: (8, 8)
all matches: (3597, 11)
left to match: (326, 5)


In [118]:
# secondary title + article reposition

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1
ml_secondary_title['year+1'] = ml_secondary_title['year']+1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_secondary(x))
ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: reposition_movielens_article(x))


match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* secondary title + article reposition")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_secondary(x))
ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: reposition_movielens_article(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* secondary title + article reposition")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year-1'] = ml_secondary_title['year']-1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_secondary(x))
ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: reposition_movielens_article(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* secondary title + article reposition")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

* secondary title + article reposition
new matches: (60, 7)
all matches: (3657, 11)
left to match: (268, 5)
* secondary title + article reposition
new matches: (4, 8)
all matches: (3661, 11)
left to match: (264, 5)
* secondary title + article reposition
new matches: (3, 8)
all matches: (3664, 11)
left to match: (261, 5)


In [119]:
# replace & with and 

def replace_ampersand(title): 
    if ('&' in title): 
        title = title.replace('&', 'and')
# secondary title + article reposition

ml_secondary_title = ml_unmatched.copy(deep='True')

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: replace_ampersand(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* remove ampersand")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: replace_ampersand(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* remove ampersand")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year-1'] = ml_secondary_title['year']-1

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: replace_ampersand(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* remove ampersand")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

* remove ampersand
new matches: (0, 7)
all matches: (3664, 11)
left to match: (261, 5)
* remove ampersand
new matches: (0, 8)
all matches: (3664, 11)
left to match: (261, 5)
* remove ampersand
new matches: (0, 8)
all matches: (3664, 11)
left to match: (261, 5)


In [120]:
# remvove article from movielens 

ml_secondary_title = ml_unmatched.copy(deep='True')

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_movielens_articles(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* no article")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))


ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_movielens_articles(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* no article")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))


ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year-1'] = ml_secondary_title['year']-1

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_movielens_articles(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("* no article")
print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

* no article
new matches: (17, 7)
all matches: (3681, 11)
left to match: (244, 5)
* no article
new matches: (1, 8)
all matches: (3682, 11)
left to match: (243, 5)
* no article
new matches: (2, 8)
all matches: (3684, 11)
left to match: (241, 5)


In [121]:
manually_fixing_df = pd.read_csv('https://raw.githubusercontent.com/jennyzhang0215/MovieLens-IMDB/master/movielens/statistics/manually_fixed_title_name', delimiter='|')

# remvove article from movielens 
def replace_manually(title): 
    fixed_title = title
    if (len(manually_fixing_df[manually_fixing_df['wrong_title_name'] == title] == 1)): 
        fixed_title = manually_fixing_df[manually_fixing_df['wrong_title_name'] == title]['correct_title_name'].values[0]
    return fixed_title

# secondary title + article reposition

ml_secondary_title = ml_unmatched.copy(deep='True')

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: replace_manually(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_movielens_articles(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year-1'] = ml_secondary_title['year']-1

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_movielens_articles(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (51, 7)
all matches: (3735, 11)
left to match: (190, 5)
new matches: (0, 8)
all matches: (3735, 11)
left to match: (190, 5)
new matches: (0, 8)
all matches: (3735, 11)
left to match: (190, 5)


In [122]:
manual_matching_dict = {'dream man': 'tt0101770', 
               'new york cop': 'tt0368893',
               'true crime': 'tt0139668',
               'costa brava': 'tt0109489',
               'victor/victoria': 'tt0265987',
               'drunks': 'tt0112907',
               'farmer & chase': 'tt0113031',
               'kids of survival': 'tt0107314',
               'blood & wine': 'tt0859643',
               'hearts and minds': 'tt0071604',
               'alien escape': 'tt0112318',
               'other voices, other rooms': 'tt0119845',
               'time tracers': 'tt0128755',
               'follow the bitch': 'tt0119139',
               '101 dalmatians': 'tt011543',
               'steamboat willie': 'tt0019422',
               'henry: portrait of a serial killer': 'tt0099763',
               'attack of the killer tomatoes!': 'tt0080391',
               'little nemo: adventures in slumberland': 'tt0104740',
               'ten benny': 'tt0114008',
               'daddy long legs': 'tt0021775',
               'train ride to hollywood': 'tt0078412',
               'santitos': 'tt0126651',
               'it happened here': 'tt0055024',
               'last resort': 'tt0091387',
               'solar crisis': 'tt0100649',
               'kronos': 'tt0050610',
               'misérables, les': 'tt0077936', 
               'shadows (cienie)': 'tt0245718',
               'castle freak': 'tt10701458',
               'dumb & dumber': 'tt0109686',
               'farinelli: il castrato': 'tt0109771',
               'interview with the vampire': 'tt1860252',
               'enfer l\'': 'tt13124824',
               'robert a. heinlein\'s the puppet masters': 'tt0111003',
               'harlem': 'tt0034950',
               'wedding gift, the': 'tt0847585',
               'ciao, professore! (io speriamo che me la cavo )': 'tt0107225',
               'dear diary (caro diario)': 'tt0109382',
               'superweib, das': 'tt0117788',
               'promise, the (versprechen, das)': 'tt0111613',
               'under the domin tree (etz hadomim tafus)': 'tt0109751',
               'two friends': '1986',
               'rendezvous in paris (rendez-vous de paris, les)': 'tt0176090',
               'crude oasis, the': 'tt0112746',
               'godzilla 2000 (gojira ni-sen mireniamu)': 'tt0120685',
               'broken hearts club, the': 'tt1194103',
               'crime and punishment in suburbia': 'tt0096056',
               'mad max 2 (a.k.a. the road warrior)': 'tt0079501',
               'toxic avenger, part ii, the': 'tt0090190',
               'spring fever usa (a.k.a. lauderdale)': 'tt0097717',
               'i am cuba (soy cuba/ya kuba)': 'tt0058604',
               'wisdom of crocodiles, the (a.k.a. immortality)': 'tt0120894',
               'mr. death: the rise and fall of fred a. leuchter, jr.': 'tt0192335',
               'pink floyd - the wall': 'tt0084503', 
               'nosferatu a venezia': 'tt0091651',
               'good, the bad and the ugly, the': 'tt5083572',
               'two women (la ciociara)': 'tt0054749',
               'robert a. heinlein\'s the puppet masters': 'tt0111003',
                'the players club':'tt0119905',
                'big bang theory, the': 'tt1147717',
'jungle2jungle (a.k.a. jungle 2 jungle)' : 'tt0119432',
'boys, les' : 'tt0118764',
'prophecy ii, the' : 'tt0114194',
'machine, the':  'tt0933079',
'friday the 13th part 3: 3d' : 'tt0080761',
'karate kid, part ii, the' : 'tt0426060',
'empty mirror, the':  'tt0116192',
'citizen\'s band (a.k.a. handle with care)':  'tt0359987',
'hard 8 (a.k.a. sydney, a.k.a. hard eight)' : 'tt0119256' ,
'poison ivy: new seduction' : 'tt0105156' ,
'hard-boiled (lashou shentan)' : 'tt0104684',
'trial, the (le procés)':  'tt0057427',
'horror hotel (a.k.a. the city of the dead)' : 'tt0053719',
'two or three things i know about her' : 'tt0060304',
'vacation' : 'tt0015452',
'slaughterhouse 2':  'tt0093990'	,
'meatballs iii':  'tt0079540',
'children of the corn iii':  'tt0087050',
'seven beauties (pasqualino settebellezze)': 'tt0075040',
'lodger, the':  'tt0037024',
'vie est belle, la (life is rosey)' : 'tt0161066',
'communion (a.k.a. alice, sweet alice/holy terror)' : 'tt0188223',}
               

for index, row in ml_unmatched.iterrows():
    title = row['title']
    if(manual_matching_dict.get(title) != None): 
        row['tconst'] = manual_matching_dict.get(title)
        matched_df_full = matched_df_full.append(row)
        
matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]   
        
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))


all matches: (3808, 11)
left to match: (117, 5)


In [123]:
match_without_year = pd.merge(ml_unmatched, imdb, 
                 on=['title'])

In [124]:
match_without_year

Unnamed: 0,movie_id,title,year_x,year+1,year-1,tconst,originalTitle,year_y
0,28,persuasion,1995,1996,1994,tt0844330,persuasion,2007
1,28,persuasion,1995,1996,1994,tt1217062,persuasion,2008
2,51,guardian angel,1994,1995,1993,tt0173642,andjeo cuvar,1987
3,51,guardian angel,1994,1995,1993,tt2992538,guardian angel,2014
4,51,guardian angel,1994,1995,1993,tt5228142,guardian angel,2016
5,51,guardian angel,1994,1995,1993,tt8145386,guardian angel,2019
6,2311,2010,1984,1985,1983,tt0883356,2010,2006
7,3660,puppet master,1989,1990,1988,tt1815887,puppet master,2010
8,3660,puppet master,1989,1990,1988,tt2331990,puppet master,2012
9,3660,puppet master,1989,1990,1988,tt8164188,puppet master,2018


In [125]:
match_without_year = pd.merge(ml_unmatched, imdb, 
                 left_on=['title'], right_on=['originalTitle'])

In [126]:
match_without_year

Unnamed: 0,movie_id,title_x,year_x,year+1,year-1,tconst,title_y,originalTitle,year_y
0,28,persuasion,1995,1996,1994,tt0844330,persuasion,persuasion,2007
1,28,persuasion,1995,1996,1994,tt1217062,persuasion,persuasion,2008
2,51,guardian angel,1994,1995,1993,tt2992538,guardian angel,guardian angel,2014
3,51,guardian angel,1994,1995,1993,tt5228142,guardian angel,guardian angel,2016
4,51,guardian angel,1994,1995,1993,tt8145386,guardian angel,guardian angel,2019
5,2311,2010,1984,1985,1983,tt0883356,2010,2010,2006
6,3660,puppet master,1989,1990,1988,tt1815887,puppet master,puppet master,2010
7,3660,puppet master,1989,1990,1988,tt2331990,puppet master,puppet master,2012
8,3660,puppet master,1989,1990,1988,tt8164188,puppet master,puppet master,2018


In [127]:
matched_df_full = matched_df_full.drop(columns=['originalTitle', 'title_x', 'title_y', 'year_x','year+1','year-1', 'year_y'])

In [128]:
matched_df_full.head()

Unnamed: 0,movie_id,title,year,tconst
0,1,toy story,1995.0,tt0114709
1,2,jumanji,1995.0,tt0113497
2,3,grumpier old men,1995.0,tt0113228
3,4,waiting to exhale,1995.0,tt0114885
4,5,father of the bride part ii,1995.0,tt0113041


In [130]:
matched_df_full.to_csv('matched_movies_ML_IMDB.csv', index=False)

# Match The Movie Database (TMBD) and MovieLens

In [135]:
ml = pd.read_csv('./matched_movies_ML_IMDB.csv')

In [136]:
ml.head()

Unnamed: 0,movie_id,title,year,tconst
0,1,toy story,1995.0,tt0114709
1,2,jumanji,1995.0,tt0113497
2,3,grumpier old men,1995.0,tt0113228
3,4,waiting to exhale,1995.0,tt0114885
4,5,father of the bride part ii,1995.0,tt0113041


In [139]:
import tmdbsimple as tmdb
tmdb.API_KEY = 'MY_API_KEY'

In [140]:
ml_tmbd = movies_full.copy(deep='True')
ml_tmbd = ml_tmbd.astype({'title':'string'})

In [141]:
search = tmdb.Search()
ml_unfound_ids = []

In [None]:
ml_tmbd['original_language'] = ''
ml_tmbd['budget'] = -1
ml_tmbd['overview'] = ''
ml_tmbd['popularity'] = -1
ml_tmbd['revenue'] = -1
ml_tmbd['tagline'] = ''
ml_tmbd['tmdb_vote_avg'] = -1
ml_tmbd['tmdb_vote_count'] = -1
ml_tmbd['tmdb_id'] = -1


for index, row in ml_tmbd.iterrows(): 
    if (index % 100 == 0): 
        print(index)
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    if (len(info) > 0):
        info = info[0] 
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd['tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

In [144]:
ml_tmbd.head()

Unnamed: 0,movie_id,title,genres,year,original_language,budget,overview,popularity,revenue,tagline,tmdb_vote_avg,tmdb_vote_count,tmdb_id
0,1,toy story,animation children's comedy,1995,en,30000000,"Led by Woody, Andy's toys live happily in his ...",165,373554033,,8,15156,6521
1,2,jumanji,adventure children's fantasy,1995,en,125000000,As the gang return to Jumanji to rescue one of...,122,800059707,,7,6769,6521
2,3,grumpier old men,comedy romance,1995,en,25000000,A family wedding reignites the ancient feud be...,9,71500000,Still Yelling. Still Fighting. Still Ready for...,6,288,6521
3,4,waiting to exhale,comedy drama,1995,en,16000000,"Cheated on, mistreated and stepped on, the wom...",8,81452156,Friends are the people who let you be yourself...,6,120,6521
4,5,father of the bride part ii,comedy,1995,en,0,Just when George Banks has recovered from his ...,14,76594107,Just When His World Is Back To Normal... He's ...,6,561,6521


In [145]:
len(ml_unfound_ids)

791

In [146]:
movies_full[movies_full['movie_id'].isin(ml_unfound_ids)]

Unnamed: 0,movie_id,title,genres,year
10,11,"american president, the",comedy drama romance,1995
28,29,"city of lost children, the",adventure sci-fi,1995
29,30,shanghai triad (yao a yao yao dao waipo qiao),drama,1995
58,59,"confessional, the (le confessionnal)",drama mystery,1995
59,60,"indian in the cupboard, the",adventure children's fantasy,1995
...,...,...,...,...
3860,3930,"creature from the black lagoon, the",horror,1954
3861,3931,"giant gila monster, the",horror sci-fi,1959
3863,3933,"killer shrews, the",horror sci-fi,1959
3869,3939,"slumber party massacre ii, the",horror,1987


In [148]:
movies_fix_articles = movies_full.copy(deep=True)
movies_fix_articles = movies_fix_articles[movies_fix_articles['movie_id'].isin(ml_unfound_ids)]

In [149]:
movies_fix_articles['title'] = movies_fix_articles['title'].apply(lambda x: reposition_movielens_article(x))

In [150]:
movies_fix_articles.head()

Unnamed: 0,movie_id,title,genres,year
10,11,the american president,comedy drama romance,1995
28,29,the city of lost children,adventure sci-fi,1995
29,30,shanghai triad (yao a yao yao dao waipo qiao),drama,1995
58,59,"confessional, the (le confessionnal)",drama mystery,1995
59,60,the indian in the cupboard,adventure children's fantasy,1995


In [151]:
ml_unfound_ids = []

for index, row in movies_fix_articles.iterrows(): 
    if (index % 100 == 0): 
        print(index)
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    if (len(info) > 0):
        info = info[0] 
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd['tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

300
1000
1100
1200
1500
1900
2000
2800
2900
3000
3100
3300
3500


In [154]:
movies_fix_articles = movies_fix_articles[movies_fix_articles['movie_id'].isin(ml_unfound_ids)]
movies_fix_articles

Unnamed: 0,movie_id,title,genres,year
29,30,shanghai triad (yao a yao yao dao waipo qiao),drama,1995
58,59,"confessional, the (le confessionnal)",drama mystery,1995
67,68,french twist (gazon maudit),comedy romance,1995
79,80,"white balloon, the (badkonake sefid )",drama,1995
81,82,antonia's line (antonia),drama,1995
...,...,...,...,...
3832,3902,goya in bordeaux (goya en bodeos),drama,1999
3850,3920,"faraway, so close (in weiter ferne, so nah!)",drama fantasy,1993
3860,3930,the creature from the black lagoon,horror,1954
3869,3939,the slumber party massacre ii,horror,1987


In [155]:
movies_extract_title = movies_fix_articles.copy(deep=True)
movies_extract_title['title'] = movies_extract_title['title'].apply(lambda x: remove_secondary(x))

ml_unfound_ids = []

for index, row in movies_extract_title.iterrows(): 
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    if (len(info) > 0):
        info = info[0] 
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd['tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

In [158]:
movies_extract_title = movies_extract_title[movies_extract_title['movie_id'].isin(ml_unfound_ids)]
movies_extract_title

Unnamed: 0,movie_id,title,genres,year
79,80,"white balloon, the",drama,1995
122,124,"star maker, the",drama,1995
125,127,"silence of the palace, the",drama,1994
197,199,"umbrellas of cherbourg, the",drama musical,1964
359,363,"wonderful, horrible life of leni riefenstahl, the",documentary,1993
...,...,...,...,...
3780,3850,whatever happened to aunt alice?,crime thriller,1969
3785,3855,"affair of love, an",drama romance,1999
3860,3930,the creature from the black lagoon,horror,1954
3869,3939,the slumber party massacre ii,horror,1987


In [159]:
movies_extract_title['title'] = movies_extract_title['title'].apply(lambda x: reposition_movielens_article(x))
movies_extract_title

Unnamed: 0,movie_id,title,genres,year
79,80,the white balloon,drama,1995
122,124,the star maker,drama,1995
125,127,the silence of the palace,drama,1994
197,199,the umbrellas of cherbourg,drama musical,1964
359,363,"the wonderful, horrible life of leni riefenstahl",documentary,1993
...,...,...,...,...
3780,3850,whatever happened to aunt alice?,crime thriller,1969
3785,3855,an affair of love,drama romance,1999
3860,3930,the creature from the black lagoon,horror,1954
3869,3939,the slumber party massacre ii,horror,1987


In [160]:
ml_unfound_ids = []

for index, row in movies_extract_title.iterrows(): 
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    if (len(info) > 0):
        info = info[0] 
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd['tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

In [161]:
len(ml_unfound_ids)

34

In [163]:
movies_extract_secondary = movies_fix_articles.copy(deep=True)
movies_extract_secondary = movies_extract_secondary[movies_extract_secondary['movie_id'].isin(ml_unfound_ids)]
movies_extract_secondary['title'] = movies_extract_secondary['title'].apply(lambda x: extract_secondary(x))

In [164]:
ml_unfound_ids = []

for index, row in movies_extract_secondary.iterrows(): 
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    if (len(info) > 0):
        info = info[0] 
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd['tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

In [169]:
movies_full_latest = movies.copy(deep=True)
movies_full_latest = movies_full_latest[movies_full_latest['movie_id'].isin(ml_unfound_ids)]
movies_full_latest

Unnamed: 0,movie_id,title,genres,year
399,403,two crimes,comedy crime drama,1995
861,872,aiqing wansui,drama,1994
1446,1474,jungle2jungle (a.k.a. jungle 2 jungle),children's comedy,1997
1587,1630,"lay of the land, the",comedy drama,1997
1650,1697,"big bang theory, the",crime,1994
1658,1706,harlem river drive,drama,1996
1704,1757,duoluo tianshi,drama,1995
1718,1774,mass transit,comedy drama,1998
1732,1792,u.s. marshalls,action thriller,1998
1758,1825,"player's club, the",action drama,1998


In [170]:
manually_fixing_df = pd.read_csv('https://raw.githubusercontent.com/jennyzhang0215/MovieLens-IMDB/master/movielens/statistics/manually_fixed_title_name', delimiter='|')

# remvove article from movielens 
def replace_manually(title): 
    fixed_title = title
    if (len(manually_fixing_df[manually_fixing_df['wrong_title_name'] == title] == 1)): 
        print("HI")
        fixed_title = manually_fixing_df[manually_fixing_df['wrong_title_name'] == title]['correct_title_name'].values[0]
    return fixed_title

movies_full_latest['title'] = movies_full_latest['title'].apply(lambda x: replace_manually(x))


HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI


In [171]:
ml_unfound_ids = []

for index, row in movies_full_latest.iterrows(): 
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    if (len(info) > 0):
        info = info[0] 
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd['tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

In [172]:
len(ml_unfound_ids)

13

In [173]:
movies_full_latest = movies.copy(deep=True)
movies_full_latest = movies_full_latest[movies_full_latest['movie_id'].isin(ml_unfound_ids)]
movies_full_latest

Unnamed: 0,movie_id,title,genres,year
1446,1474,Jungle2Jungle (a.k.a. Jungle 2 Jungle),children's comedy,1997
1587,1630,"Lay of the Land, The",comedy drama,1997
1650,1697,"Big Bang Theory, The",crime,1994
1718,1774,Mass Transit,comedy drama,1998
1758,1825,"Player's Club, The",action drama,1998
2086,2155,"Slums of Beverly Hills, The",comedy,1998
2854,2923,Citizen's Band (a.k.a. Handle with Care),comedy,1977
2926,2995,"House on Haunted Hill, The",horror,1999
3001,3070,Adventures of Buckaroo Bonzai Across the 8th D...,adventure comedy sci-fi,1984
3076,3145,"Cradle Will Rock, The",drama,1999


In [174]:
movies_full_latest.at[1446, 'title'] = 'jungle 2 jungle'
movies_full_latest.at[2854, 'title'] = 'Handle with Care'
movies_full_latest.at[3869, 'title'] = 'the slumber party massacre'
movies_full_latest.at[3870, 'title'] = 'the slumber party massacre'

In [175]:
movies_full_latest

Unnamed: 0,movie_id,title,genres,year
1446,1474,jungle 2 jungle,children's comedy,1997
1587,1630,"Lay of the Land, The",comedy drama,1997
1650,1697,"Big Bang Theory, The",crime,1994
1718,1774,Mass Transit,comedy drama,1998
1758,1825,"Player's Club, The",action drama,1998
2086,2155,"Slums of Beverly Hills, The",comedy,1998
2854,2923,Handle with Care,comedy,1977
2926,2995,"House on Haunted Hill, The",horror,1999
3001,3070,Adventures of Buckaroo Bonzai Across the 8th D...,adventure comedy sci-fi,1984
3076,3145,"Cradle Will Rock, The",drama,1999


In [176]:
ml_unfound_ids = []

for index, row in movies_full_latest.iterrows(): 
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    if (len(info) > 0):
        info = info[0] 
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd['tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

In [178]:
movies_full_latest = movies_full_latest[movies_full_latest['movie_id'].isin(ml_unfound_ids)]
movies_full_latest

Unnamed: 0,movie_id,title,genres,year
1587,1630,"Lay of the Land, The",comedy drama,1997
1650,1697,"Big Bang Theory, The",crime,1994
1718,1774,Mass Transit,comedy drama,1998
1758,1825,"Player's Club, The",action drama,1998
2086,2155,"Slums of Beverly Hills, The",comedy,1998
2926,2995,"House on Haunted Hill, The",horror,1999
3001,3070,Adventures of Buckaroo Bonzai Across the 8th D...,adventure comedy sci-fi,1984
3076,3145,"Cradle Will Rock, The",drama,1999
3860,3930,"Creature From the Black Lagoon, The",horror,1954


In [180]:
ml_unfound_ids = []

for index, row in movies_full_latest.iterrows():         
        ml_tmbd.at[index, 'tmdb_id'] = -1
        ml_tmbd.at[index, 'original_language'] = None
        ml_tmbd.at[index, 'budget'] = -1
        ml_tmbd.at[index, 'overview'] = ''
        ml_tmbd.at[index, 'popularity'] = -1
        ml_tmbd.at[index, 'revenue'] = -1
        ml_tmbd.at[index, 'tagline'] = ''
        ml_tmbd.at[index, 'tmdb_vote_avg'] = -1
        ml_tmbd.at[index, 'tmdb_vote_count'] = -1


In [183]:
ml_tmbd.head()

Unnamed: 0,movie_id,title,genres,year,original_language,budget,overview,popularity,revenue,tagline,tmdb_vote_avg,tmdb_vote_count,tmdb_id
0,1,toy story,animation children's comedy,1995,en,30000000,"Led by Woody, Andy's toys live happily in his ...",165,373554033,,8,15156,27475
1,2,jumanji,adventure children's fantasy,1995,en,125000000,As the gang return to Jumanji to rescue one of...,122,800059707,,7,6769,27475
2,3,grumpier old men,comedy romance,1995,en,25000000,A family wedding reignites the ancient feud be...,9,71500000,Still Yelling. Still Fighting. Still Ready for...,6,288,27475
3,4,waiting to exhale,comedy drama,1995,en,16000000,"Cheated on, mistreated and stepped on, the wom...",8,81452156,Friends are the people who let you be yourself...,6,120,27475
4,5,father of the bride part ii,comedy,1995,en,0,Just when George Banks has recovered from his ...,14,76594107,Just When His World Is Back To Normal... He's ...,6,561,27475


In [184]:
ml_tmbd.to_csv('matched_movies_ML_TMBD.csv', index=False)

# Match TMDB - MovieLens (fix)

In [7]:
import tmdbsimple as tmdb
tmdb.API_KEY = 'MY_API_KEY'

ml_tmbd = movies_full.copy(deep='True')
ml_tmbd = ml_tmbd.astype({'title':'string'})

search = tmdb.Search()
ml_unfound_ids = []


In [22]:
import string

def find_matching_movie(title, ml_tmbd, info): 
    if (len(info) == 0): 
        return None 
    
    ml_year = ml_tmbd[ml_tmbd['title'] == title]
    ml_year = int(ml_year['year'].values[0])

    closest_movie = None
    min_year_diff = 10000
    
    for m in info:
        if ('release_date' in m): 
            tmdb_year = m['release_date']
            tmdb_year = tmdb_year[:4]
        else: 
            # default to 0
            tmdb_year = '0'
            
        if (tmdb_year.isnumeric()):
            tmdb_year = int(tmdb_year)
            year_diff = abs(ml_year - tmdb_year)
            
            if (min_year_diff == 0):
                    # don't continue to look if we found an exact match
                    return closest_movie
                
            if (year_diff < min_year_diff): 
                closest_movie = m
                min_year_diff = year_diff
                
    return closest_movie


ml = pd.read_csv('./matched_movies_ML_IMDB.csv')

ml_tmbd = movies_full.copy(deep='True')
ml_tmbd = ml_tmbd.astype({'title':'string'})

search = tmdb.Search()
ml_unfound_ids = []

ml_tmbd['original_language'] = ''
ml_tmbd['budget'] = -1
ml_tmbd['overview'] = ''
ml_tmbd['popularity'] = -1
ml_tmbd['revenue'] = -1
ml_tmbd['tagline'] = ''
ml_tmbd['tmdb_vote_avg'] = -1
ml_tmbd['tmdb_vote_count'] = -1
ml_tmbd['tmdb_id'] = -1


for index, row in ml_tmbd.iterrows():  
    if(index % 100 == 0): 
        print(index)
              
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    info = find_matching_movie(title, ml_tmbd, info)
    if (info != None): 
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd.at[index, 'tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800


In [23]:
len(ml_unfound_ids)

783

In [25]:

movies_fix_articles = movies_full.copy(deep=True)
movies_fix_articles = movies_fix_articles[movies_fix_articles['movie_id'].isin(ml_unfound_ids)]
movies_fix_articles['title'] = movies_fix_articles['title'].apply(lambda x: reposition_movielens_article(x))
ml_unfound_ids = []

for index, row in movies_fix_articles.iterrows(): 
    if (index % 100 == 0): 
        print(index)
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    info = find_matching_movie(title, movies_fix_articles, info)
    if (info != None): 
        
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd.at[index, 'tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

300
1000
1100
1200
1500
1900
2000
2800
2900
3000
3100
3300
3500


In [27]:

movies_fix = movies_full.copy(deep=True)
movies_fix = movies_fix[movies_fix['movie_id'].isin(ml_unfound_ids)]
movies_fix['title'] = movies_fix['title'].apply(lambda x: extract_secondary(x))
ml_unfound_ids = []

for index, row in movies_fix.iterrows(): 
    if (index % 100 == 0): 
        print(index)
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    info = find_matching_movie(title, movies_fix, info)
    if (info != None): 
        
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd.at[index, 'tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

1100
1200
2800
2900
3300
3500


In [28]:
len(ml_unfound_ids)

80

In [29]:

movies_fix = movies_full.copy(deep=True)
movies_fix = movies_fix[movies_fix['movie_id'].isin(ml_unfound_ids)]
movies_fix['title'] = movies_fix['title'].apply(lambda x: extract_secondary(x))
movies_fix['title'] = movies_fix['title'].apply(lambda x: reposition_movielens_article(x))

ml_unfound_ids = []

for index, row in movies_fix.iterrows(): 
    if (index % 100 == 0): 
        print(index)
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    info = find_matching_movie(title, movies_fix, info)
    if (info != None): 
        
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd.at[index, 'tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

1200


In [30]:
len(ml_unfound_ids)

70

In [31]:

movies_fix = movies_full.copy(deep=True)
movies_fix = movies_fix[movies_fix['movie_id'].isin(ml_unfound_ids)]
movies_fix['title'] = movies_fix['title'].apply(lambda x: remove_secondary(x))
movies_fix['title'] = movies_fix['title'].apply(lambda x: reposition_movielens_article(x))

ml_unfound_ids = []

for index, row in movies_fix.iterrows(): 
    if (index % 100 == 0): 
        print(index)
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    info = find_matching_movie(title, movies_fix, info)
    if (info != None): 
        
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd.at[index, 'tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

1200


In [35]:

movies_fix = movies_full.copy(deep=True)
movies_fix = movies_fix[movies_fix['movie_id'].isin(ml_unfound_ids)]
movies_fix['title'] = movies_fix['title'].apply(lambda x: remove_secondary(x))
movies_fix['title'] = movies_fix['title'].apply(lambda x: reposition_movielens_article(x))

ml_unfound_ids = []

for index, row in movies_fix.iterrows(): 
    if (index % 100 == 0): 
        print(index)
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    info = find_matching_movie(title, movies_fix, info)
    if (info != None): 
        
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd.at[index, 'tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

Unnamed: 0,movie_id,title,genres,year
399,403,two crimes,comedy crime drama,1995
861,872,aiqing wansui,drama,1994
1446,1474,jungle2jungle (a.k.a. jungle 2 jungle),children's comedy,1997
1587,1630,"lay of the land, the",comedy drama,1997
1650,1697,"big bang theory, the",crime,1994
1658,1706,harlem river drive,drama,1996
1704,1757,duoluo tianshi,drama,1995
1718,1774,mass transit,comedy drama,1998
1732,1792,u.s. marshalls,action thriller,1998
1758,1825,"player's club, the",action drama,1998


In [None]:
manually_fixing_df = pd.read_csv('https://raw.githubusercontent.com/jennyzhang0215/MovieLens-IMDB/master/movielens/statistics/manually_fixed_title_name', delimiter='|')

# remvove article from movielens 
def replace_manually(title): 
    fixed_title = title
    if (len(manually_fixing_df[manually_fixing_df['wrong_title_name'] == title] == 1)): 
        print("HI")
        fixed_title = manually_fixing_df[manually_fixing_df['wrong_title_name'] == title]['correct_title_name'].values[0]
    return fixed_title

movies_fix = movies_full.copy(deep=True)
movies_fix = movies_fix[movies_fix['movie_id'].isin(ml_unfound_ids)]
movies_fix['title'] = movies_fix['title'].apply(lambda x: replace_manually(x))

ml_unfound_ids = []

for index, row in movies_fix.iterrows(): 
    if (index % 100 == 0): 
        print(index)
    title = row['title']
    response = search.movie(query=title)
    info = response.get('results')
    
    info = find_matching_movie(title, movies_fix, info)
    if (info != None): 
        
        id = int(info['id'])
        info = tmdb.Movies(id)
        info = info.info() 
        
        ml_tmbd.at[index, 'tmdb_id'] = id
        original_language = info['original_language']
        budget = info['budget']
        overview = info['overview']
        popularity = info['popularity']
        revenue = info['revenue']
        tagline = info['tagline']
        tmdb_vote_avg = info['vote_average']
        tmdb_vote_count = info['vote_count']
        
        ml_tmbd.at[index, 'original_language'] = original_language
        ml_tmbd.at[index, 'budget'] = budget
        ml_tmbd.at[index, 'overview'] = overview
        ml_tmbd.at[index, 'popularity'] = popularity
        ml_tmbd.at[index, 'revenue'] = revenue
        ml_tmbd.at[index, 'tagline'] = tagline
        ml_tmbd.at[index, 'tmdb_vote_avg'] = tmdb_vote_avg
        ml_tmbd.at[index, 'tmdb_vote_count'] = tmdb_vote_count
    else:
       ml_unfound_ids.append(row['movie_id']) 

In [37]:
len(ml_unfound_ids)

13

In [39]:
ml_tmbd[ml_tmbd['budget'] == ml_tmbd['budget'].max()]

Unnamed: 0,movie_id,title,genres,year,original_language,budget,overview,popularity,revenue,tagline,tmdb_vote_avg,tmdb_vote_count,tmdb_id
1672,1721,titanic,drama romance,1997,en,200000000,101-year-old Rose DeWitt Bukater tells the sto...,136,2187463944,Nothing on Earth could come between them.,7,20711,6521
3335,3404,titanic,action drama,1953,en,200000000,101-year-old Rose DeWitt Bukater tells the sto...,136,2187463944,Nothing on Earth could come between them.,7,20711,6521


In [40]:
ml_tmbd[ml_tmbd['revenue'] == ml_tmbd['revenue'].max()]

Unnamed: 0,movie_id,title,genres,year,original_language,budget,overview,popularity,revenue,tagline,tmdb_vote_avg,tmdb_vote_count,tmdb_id
1672,1721,titanic,drama romance,1997,en,200000000,101-year-old Rose DeWitt Bukater tells the sto...,136,2187463944,Nothing on Earth could come between them.,7,20711,6521
3335,3404,titanic,action drama,1953,en,200000000,101-year-old Rose DeWitt Bukater tells the sto...,136,2187463944,Nothing on Earth could come between them.,7,20711,6521


In [45]:
ml_tmbd[ml_tmbd['tmdb_vote_count'] == ml_tmbd['tmdb_vote_count'].max()]

Unnamed: 0,movie_id,title,genres,year,original_language,budget,overview,popularity,revenue,tagline,tmdb_vote_avg,tmdb_vote_count,tmdb_id
2890,2959,fight club,drama,1999,en,63000000,A ticking-time-bomb insomniac and a slippery s...,67,100853753,Mischief. Mayhem. Soap.,8,23683,6521


In [49]:
ml_tmbd.to_csv("matched_movies_ML_TMDB.csv", index=False)

# Movies main dataset

In [50]:
imdb_df = pd.read_csv('./matched_movies_ML_IMDB.csv')
tmdb_df = pd.read_csv('./matched_movies_ML_TMDB.csv')
movies_main = movies_full.copy(deep=True)

In [51]:
imdb_df = imdb_df.astype({'title': 'string', 'tconst': 'string'})
tmdb_df = tmdb_df.astype({'title': 'string'})

In [53]:
imdb_full = pd.read_csv('imdb_shortened.csv')
imdb_full = imdb_full.astype({'tconst': 'string'})

In [299]:
imdb_full.head()

Unnamed: 0,tconst,averageRating,numVotes,titleType,title,originalTitle,isAdult,year,runtimeMinutes,imdb_genres
0,tt0010418,7.0,1853,movie,male and female,Male and Female,0,1919,116,"Adventure,Drama"
1,tt0011652,6.1,1565,movie,the saphead,The Saphead,0,1920,77,Comedy
2,tt0012349,8.3,123451,movie,the kid,The Kid,0,1921,68,"Comedy,Drama,Family"
3,tt0013442,7.9,94382,movie,nosferatu,"Nosferatu, eine Symphonie des Grauens",0,1922,94,"Fantasy,Horror"
4,tt0013662,7.1,475,movie,tess of the storm country,Tess of the Storm Country,0,1922,137,"Drama,Romance"


In [300]:
movies_main

Unnamed: 0,movie_id,title,genres,year
0,1,toy story,animation children's comedy,1995
1,2,jumanji,adventure children's fantasy,1995
2,3,grumpier old men,comedy romance,1995
3,4,waiting to exhale,comedy drama,1995
4,5,father of the bride part ii,comedy,1995
...,...,...,...,...
3878,3948,meet the parents,comedy,2000
3879,3949,requiem for a dream,drama,2000
3880,3950,tigerland,drama,2000
3881,3951,two family house,drama,2000


In [54]:
imdb_df_ids = list(imdb_df['movie_id'])

movies_main['imdb_rating'] = -1
movies_main['imdb_num_votes'] = -1
movies_main['is_adult'] = -1
movies_main['runtime_minutes'] = -1
movies_main.reset_index()
 
count = 0
for index, row in movies_main.iterrows():
    movie_id = int(row['movie_id'])

    if movie_id in imdb_df_ids: 
        tconst = imdb_df[imdb_df['movie_id'] == movie_id]
        tconst = tconst['tconst']
        tconst = str(tconst.values[0])
        imdb_row = imdb_full[imdb_full['tconst'] == tconst]
        if (len(imdb_row['averageRating'].values) > 0):
            movies_main.at[index, 'imdb_rating'] = imdb_row['averageRating']
        if (len(imdb_row['numVotes'].values) > 0):
            movies_main.at[index,'imdb_num_votes'] = imdb_row['numVotes']
        if (len(imdb_row['isAdult'].values) > 0):
            movies_main.at[index,'is_adult'] = imdb_row['isAdult']
        if (len(imdb_row['runtimeMinutes'].values) > 0):
            movies_main.at[index,'runtime_minutes'] = imdb_row['runtimeMinutes']

In [55]:
movies_main['budget'] = -1
movies_main['original_language'] = ''
movies_main['overview'] = ''
movies_main['popularity'] = -1
movies_main['revenue'] = -1
movies_main['tagline'] = ''
movies_main['tmdb_vote_count'] = -1
movies_main['tmdb_vote_avg'] = -1

movies_main.reset_index()
 
count = 0
for index, row in movies_main.iterrows():
    movie_id = int(row['movie_id'])
    tmdb_row = tmdb_df[tmdb_df['movie_id'] == movie_id]
    
    movies_main.at[index, 'budget'] = tmdb_row['budget'].values[0]
    movies_main.at[index,'original_language'] = tmdb_row['original_language'].values[0]
    movies_main.at[index,'overview'] = tmdb_row['overview'].values[0]
    movies_main.at[index,'popularity'] = tmdb_row['popularity'].values[0]
    movies_main.at[index,'revenue'] = tmdb_row['revenue'].values[0]
    movies_main.at[index,'tagline'] = tmdb_row['tagline'].values[0]
    movies_main.at[index,'tmdb_vote_count'] = tmdb_row['tmdb_vote_count'].values[0]
    movies_main.at[index,'tmdb_vote_avg'] = tmdb_row['tmdb_vote_avg'].values[0]

In [317]:
movies_main.head()

Unnamed: 0,movie_id,title,genres,year,imdb_rating,imdb_num_votes,is_adult,runtime_minutes,budget,original_language,overview,popularity,revenue,tagline,tmdb_vote_count,tmdb_vote_avg
0,1,toy story,animation children's comedy,1995,8,953300,0,81,30000000,en,"Led by Woody, Andy's toys live happily in his ...",165,373554033,,15156,8
1,2,jumanji,adventure children's fantasy,1995,7,333517,0,104,125000000,en,As the gang return to Jumanji to rescue one of...,122,800059707,,6769,7
2,3,grumpier old men,comedy romance,1995,6,26811,0,101,25000000,en,A family wedding reignites the ancient feud be...,9,71500000,Still Yelling. Still Fighting. Still Ready for...,288,6
3,4,waiting to exhale,comedy drama,1995,5,10757,0,124,16000000,en,"Cheated on, mistreated and stepped on, the wom...",8,81452156,Friends are the people who let you be yourself...,120,6
4,5,father of the bride part ii,comedy,1995,6,37343,0,106,0,en,Just when George Banks has recovered from his ...,14,76594107,Just When His World Is Back To Normal... He's ...,561,6


In [56]:
movies_main[movies_main['budget'] == movies_main['budget'].max()]

Unnamed: 0,movie_id,title,genres,year,imdb_rating,imdb_num_votes,is_adult,runtime_minutes,budget,original_language,overview,popularity,revenue,tagline,tmdb_vote_count,tmdb_vote_avg
1672,1721,titanic,drama romance,1997,7,1123617,0,194,200000000,en,101-year-old Rose DeWitt Bukater tells the sto...,136,2187463944,Nothing on Earth could come between them.,20711,7
3335,3404,titanic,action drama,1953,7,6814,0,98,200000000,en,101-year-old Rose DeWitt Bukater tells the sto...,136,2187463944,Nothing on Earth could come between them.,20711,7


In [57]:
movies_main.to_csv('movies_main.csv', index=False)