# Imports

In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [2]:
movies = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/movies.csv')
tags = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/tags.csv')
ratings = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/ratings.csv')

movies = movies.iloc[: , 1:]
tags = tags.iloc[: , 1:]

In [5]:
movies.head()
tags.head()

Unnamed: 0,movie_id,tags,title,genres,year
0,1,pixar pixar pixar animation pixar animated fun...,Toy Story,animation children's comedy,1995
1,2,for children game animals joe johnston robin w...,Jumanji,adventure children's fantasy,1995
2,3,funniest movies comedinha de velhinhos engraã ...,Grumpier Old Men,comedy romance,1995
3,4,girl movie comedy drama comedy drama comedy dr...,Waiting to Exhale,comedy drama,1995
4,5,steve martin pregnancy remake steve martin fam...,Father of the Bride Part II,comedy,1995


In [6]:
movies.shape

(3883, 4)

In [3]:
imdb_ratings = pd.read_csv("title.ratings.tsv", sep='\t')
imdb_titles = pd.read_csv("title.basics.tsv", sep='\t')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
imdb_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1864
1,tt0000002,6.0,244
2,tt0000003,6.5,1632
3,tt0000004,5.8,158
4,tt0000005,6.2,2458


In [10]:
imdb_titles['titleType'].unique()

array(['short', 'movie', 'tvEpisode', 'tvSeries', 'tvShort', 'tvMovie',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)

In [4]:
# drop non movie titleTypes 

non_movie_titleTypes = ['tvEpisode', 'tvSeries', 'tvMiniSeries', 'video', 'videoGame', 'tvPilot']

for titleType in non_movie_titleTypes: 
    imdb_titles = imdb_titles[imdb_titles['titleType'] != titleType]

In [5]:
movies_full = movies.copy(deep=True)

In [6]:
movies_full['title'] = movies_full['title'].apply(lambda x: x.lower())

In [7]:
imdb_full = imdb_ratings.copy(deep=True)
imdb_full = pd.merge(imdb_full, imdb_titles, on='tconst')

In [8]:
imdb_full = imdb_full.rename(columns={'primaryTitle': 'title'})
imdb_full = imdb_full.rename(columns={'startYear': 'year'})
imdb_full = imdb_full.rename(columns={'genres': 'imdb_genres'})
imdb_full = imdb_full.drop(columns='endYear')
imdb_full['title'] = imdb_full['title'].apply(lambda x: x.lower())
imdb_full

Unnamed: 0,tconst,averageRating,numVotes,titleType,title,originalTitle,isAdult,year,runtimeMinutes,imdb_genres
0,tt0000001,5.70000,1864,short,carmencita,Carmencita,0,1894,1,"Documentary,Short"
1,tt0000002,6.00000,244,short,le clown et ses chiens,Le clown et ses chiens,0,1892,5,"Animation,Short"
2,tt0000003,6.50000,1632,short,pauvre pierrot,Pauvre Pierrot,0,1892,4,"Animation,Comedy,Romance"
3,tt0000004,5.80000,158,short,un bon bock,Un bon bock,0,1892,12,"Animation,Short"
4,tt0000005,6.20000,2458,short,blacksmith scene,Blacksmith Scene,0,1893,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...,...
476672,tt9916460,9.30000,17,tvMovie,pink taxi,Pink Taxi,0,2019,\N,Comedy
476673,tt9916538,8.30000,6,movie,kuambil lagi hatiku,Kuambil Lagi Hatiku,0,2019,123,Drama
476674,tt9916544,7.00000,51,short,my sweet prince,My Sweet Prince,0,2019,12,"Drama,Short"
476675,tt9916720,5.70000,194,short,the nun 2,The Nun 2,0,2019,10,"Comedy,Horror,Mystery"


# discovering & fixing mismatches

In [199]:
# another mismatch is that MovieLens names movies in the format: Contender, The 
# IMDB names them in the format: The Contender

def reformat_title(title): 
    new_title = title;
    if(len(title) >= 5): 
        if (title[-5:] == ', the'):
            new_title = 'the '
            new_title += title[:-5]
    return new_title

movies_full['title'] = movies_full['title'].map(lambda x: reformat_title(x))

In [200]:
movies_and_imdb = pd.merge(movies_full, imdb_full, on=['title', 'year'])
movies_and_imdb.shape

(3086, 12)

matched 3086 out of 3883 

In [203]:
unmatched = pd.merge(movies_full, imdb_full, on=['title', 'year'], how='left')
unmatched = unmatched.fillna(0)
unmatched = unmatched[unmatched['tconst'] == 0]
unmatched.head()

Unnamed: 0,movie_id,title,genres,year,tconst,averageRating,numVotes,titleType,originalTitle,isAdult,runtimeMinutes,imdb_genres
27,28,persuasion,romance,1995,0,0.0,0.0,0,0,0,0,0
29,30,shanghai triad (yao a yao yao dao waipo qiao),drama,1995,0,0.0,0.0,0,0,0,0,0
31,32,twelve monkeys,drama sci-fi,1995,0,0.0,0.0,0,0,0,0,0
46,47,seven (se7en),crime thriller,1995,0,0.0,0.0,0,0,0,0,0
51,51,guardian angel,action drama thriller,1994,0,0.0,0.0,0,0,0,0,0


now I need to manually understand what other title mismatches there are

In [231]:
movies_full[movies_full['title'] == 'the slumber party massacre ii']


Unnamed: 0,movie_id,title,genres,year
3869,3939,the slumber party massacre ii,horror,1987


In [204]:
# no match for the slumber party massacre ii and iii, try to change roman numberals into arabic, but it doesn't work
imdb_full[imdb_full['title'] == 'the slumber party massacre 2']


Unnamed: 0,tconst,averageRating,numVotes,titleType,title,originalTitle,isAdult,year,runtimeMinutes,imdb_genres


In [205]:
# removed 'the' from the title and found a match in IMDB
imdb_full[imdb_full['title'] == 'slumber party massacre ii']

Unnamed: 0,tconst,averageRating,numVotes,titleType,title,originalTitle,isAdult,year,runtimeMinutes,imdb_genres
63061,tt0093996,4.6,5816,movie,slumber party massacre ii,Slumber Party Massacre II,0,1987,77,"Comedy,Horror,Music"


In [258]:
imdb_to_match = imdb_full.copy(deep=True)
imdb_to_match = imdb_to_match.drop(columns=['averageRating', 'numVotes', 'titleType', 'originalTitle', 'isAdult', 'runtimeMinutes', 'imdb_genres'])
imdb_to_match = imdb_to_match.rename(columns={'title' : 'imdb_title'})
imdb_to_match = imdb_to_match.rename(columns={'year' : 'imdb_year'})
imdb_to_match.head()

Unnamed: 0,tconst,imdb_title,imdb_year
0,tt0000001,carmencita,1894
1,tt0000002,le clown et ses chiens,1892
2,tt0000003,pauvre pierrot,1892
3,tt0000004,un bon bock,1892
4,tt0000005,blacksmith scene,1893


In [290]:
import string 
from string import digits 
from string import punctuation



unmatched_experiments = unmatched.copy(deep='True')
unmatched_experiments['with_article'] = ''
unmatched_experiments['without_article'] = ''
unmatched_experiments['no_punctuation'] = ''
unmatched_experiments['no_numbers'] = ''

for index, row in unmatched_experiments.iterrows(): 
    # for those who have 'the' at the beginning of the title, remove it 
    title = row['title']
    if title[:3] == 'the': 
        unmatched_experiments.at[index, 'without_article'] = title[4:]
    # for those who don't have 'the' at the beginning, add it on 
    else: 
        unmatched_experiments.at[index, 'with_article'] = 'the ' + title 

    if (any(p in title for p in punctuation)):
        title_no_punctuation = title.translate(str.maketrans('', '', string.punctuation))
        unmatched_experiments.at[index, 'no_punctuation'] = title_no_punctuation

        
    if (any(p.isdigit() for p in title)):
        title_no_numbers = title.translate(digits)
        unmatched_experiments.at[index, 'no_numbers'] = title_no_punctuation
    

unmatched_experiments = unmatched_experiments.astype({'with_article': 'string',
                                                      'without_article': 'string',
                                                      'no_punctuation': 'string',
                                                      'no_numbers': 'string',})

unmatched_experiments = unmatched_experiments.drop(columns=['genres', 'tconst', 'averageRating', 'numVotes', 'titleType', 'originalTitle', 'isAdult', 'runtimeMinutes', 'imdb_genres'])

In [291]:
imdb_to_match = imdb_to_match.astype({'tconst': 'string'})
imdb_to_match.dtypes

tconst        string
imdb_title    string
imdb_year      int64
dtype: object

In [292]:
match_with_article = pd.merge(unmatched_experiments, imdb_to_match, 
                              left_on=['with_article', 'year'],
                              right_on=['imdb_title', 'imdb_year'],
                              how='left')

match_with_article = match_with_article[match_with_article['tconst'] != '<NA>']
print("Found matches for ")
print(len(match_with_article))
print("movies")
match_with_article

Found matches for 
7
movies


Unnamed: 0,movie_id,title,year,with_article,without_article,no_punctuation,no_numbers,tconst,imdb_title,imdb_year
294,1340,bride of frankenstein,1935,the bride of frankenstein,,,,tt0026138,the bride of frankenstein,1935.0
310,1430,underworld,1997,the underworld,,,,tt0301971,the underworld,1997.0
361,1771,night flier,1997,the night flier,,,,tt0119784,the night flier,1997.0
557,2782,pit and the pendulum,1961,the pit and the pendulum,,,,tt0055304,the pit and the pendulum,1961.0
590,2919,year of living dangerously,1982,the year of living dangerously,,,,tt0086617,the year of living dangerously,1982.0
639,3130,bonfire of the vanities,1990,the bonfire of the vanities,,,,tt0099165,the bonfire of the vanities,1990.0
811,3856,autumn heart,1999,the autumn heart,,,,tt0120593,the autumn heart,1999.0


In [293]:
match_without_article = pd.merge(unmatched_experiments, imdb_to_match, 
                              left_on=['without_article', 'year'],
                              right_on=['imdb_title', 'imdb_year'],
                              how='left')

match_without_article = match_without_article[match_without_article['tconst'] != '<NA>']
print("Found matches for ")
print(len(match_without_article))
print("movies")
match_without_article

Found matches for 
9
movies


Unnamed: 0,movie_id,title,year,with_article,without_article,no_punctuation,no_numbers,tconst,imdb_title,imdb_year
426,2155,the slums of beverly hills,1998,,slums of beverly hills,,,tt0120831,slums of beverly hills,1998.0
612,2995,the house on haunted hill,1999,,house on haunted hill,,,tt0185371,house on haunted hill,1999.0
642,3140,the three ages,1923,,three ages,,,tt0014538,three ages,1923.0
643,3145,the cradle will rock,1999,,cradle will rock,,,tt0150216,cradle will rock,1999.0
770,3670,the story of g.i. joe,1945,,story of g.i. joe,the story of gi joe,,tt0038120,story of g.i. joe,1945.0
828,3930,the creature from the black lagoon,1954,,creature from the black lagoon,,,tt0046876,creature from the black lagoon,1954.0
830,3936,the phantom of the opera,1943,,phantom of the opera,,,tt0036261,phantom of the opera,1943.0
831,3939,the slumber party massacre ii,1987,,slumber party massacre ii,,,tt0093996,slumber party massacre ii,1987.0
832,3940,the slumber party massacre iii,1990,,slumber party massacre iii,,,tt0100639,slumber party massacre iii,1990.0


In [294]:
match_without_punctuation = pd.merge(unmatched_experiments, imdb_to_match, 
                              left_on=['no_punctuation', 'year'],
                              right_on=['imdb_title', 'imdb_year'],
                              how='left')

match_without_punctuation = match_without_punctuation[match_without_punctuation['tconst'] != '<NA>']
print("Found matches for ")
print(len(match_without_punctuation))
print("movies")
match_without_punctuation

Found matches for 
23
movies


Unnamed: 0,movie_id,title,year,with_article,without_article,no_punctuation,no_numbers,tconst,imdb_title,imdb_year
14,119,"steal big, steal little",1995,"the steal big, steal little",,steal big steal little,,tt0114536,steal big steal little,1995.0
23,165,die hard: with a vengeance,1995,the die hard: with a vengeance,,die hard with a vengeance,,tt0112864,die hard with a vengeance,1995.0
80,468,"the englishman who went up a hill, but came do...",1995,,"englishman who went up a hill, but came down a...",the englishman who went up a hill but came dow...,,tt0112966,the englishman who went up a hill but came dow...,1995.0
107,603,"bye bye, love",1995,"the bye bye, love",,bye bye love,,tt0112606,bye bye love,1995.0
215,1066,shall we dance?,1937,the shall we dance?,,shall we dance,,tt0029546,shall we dance,1937.0
278,1294,m*a*s*h,1970,the m*a*s*h,,mash,,tt0066026,mash,1970.0
316,1460,that darn cat!,1997,the that darn cat!,,that darn cat,,tt0120317,that darn cat,1997.0
343,1684,mrs. dalloway,1997,the mrs. dalloway,,mrs dalloway,,tt0119723,mrs dalloway,1997.0
374,1825,the player's club,1998,,player's club,the players club,,tt0119905,the players club,1998.0
431,2171,"next stop, wonderland",1998,"the next stop, wonderland",,next stop wonderland,,tt0119778,next stop wonderland,1998.0


In [295]:
match_without_numbers = pd.merge(unmatched_experiments, imdb_to_match, 
                              left_on=['no_numbers', 'year'],
                              right_on=['imdb_title', 'imdb_year'],
                              how='left')

match_without_numbers = match_without_numbers[match_without_numbers['tconst'] != '<NA>']
print("Found matches for ")
print(len(match_without_numbers))
print("movies")
match_without_numbers

Found matches for 
0
movies


Unnamed: 0,movie_id,title,year,with_article,without_article,no_punctuation,no_numbers,tconst,imdb_title,imdb_year


In [300]:
unmatched_movie_ids = list(unmatched['movie_id'])
matched_with_article_movie_ids = list(match_with_article['movie_id'])
matched_without_article_movie_ids = list(match_without_article['movie_id'])
matched_without_punctuation_movie_ids = list(match_without_punctuation['movie_id'])
print(len(unmatched_movie_ids))
print(len(matched_with_article_movie_ids))
print(len(matched_without_article_movie_ids))
print(len(matched_without_punctuation_movie_ids))


833
7
9
23
left to match:
794


In [312]:
left_to_match = set(unmatched_movie_ids) - set(matched_with_article_movie_ids) - set(matched_without_article_movie_ids) - set(matched_without_punctuation_movie_ids)
print("left to match:")
print(len(left_to_match))

unmatched = unmatched[unmatched['movie_id'].isin(list(left_to_match))]
unmatched.shape

(794, 12)

In [315]:
unmatched

Unnamed: 0,movie_id,title,genres,year,tconst,averageRating,numVotes,titleType,originalTitle,isAdult,runtimeMinutes,imdb_genres
27,28,persuasion,romance,1995,0,0.00000,0.00000,0,0,0,0,0
29,30,shanghai triad (yao a yao yao dao waipo qiao),drama,1995,0,0.00000,0.00000,0,0,0,0,0
31,32,twelve monkeys,drama sci-fi,1995,0,0.00000,0.00000,0,0,0,0,0
46,47,seven (se7en),crime thriller,1995,0,0.00000,0.00000,0,0,0,0,0
51,51,guardian angel,action drama thriller,1994,0,0.00000,0.00000,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3870,3904,"uninvited guest, an",drama,2000,0,0.00000,0.00000,0,0,0,0,0
3873,3907,the prince of central park,drama,1999,0,0.00000,0.00000,0,0,0,0,0
3880,3914,the broken hearts club,drama,2000,0,0.00000,0.00000,0,0,0,0,0
3886,3920,"faraway, so close (in weiter ferne, so nah!)",drama fantasy,1993,0,0.00000,0.00000,0,0,0,0,0


In [326]:
# let's remove things in brackets 

unmatched_experiments['no_brackets'] = ''

for index, row in unmatched_experiments.iterrows(): 
    # for those who have 'the' at the beginning of the title, remove it 
    title = row['title']
    if ('(' in title): 
        no_brackets = title[: title.index('(')-1]
        unmatched_experiments.at[index, 'no_brackets'] = no_brackets
    
unmatched_experiments = unmatched_experiments.astype({'no_brackets': 'string'}) 


In [328]:
match_without_brackets = pd.merge(unmatched_experiments, imdb_to_match, 
                              left_on=['no_brackets', 'year'],
                              right_on=['imdb_title', 'imdb_year'],
                              how='left')

match_without_brackets = match_without_brackets[match_without_brackets['tconst'] != '<NA>']

print("Found matches for ")
print(len(match_without_brackets))
print("movies")
match_without_brackets

Found matches for 
109
movies


Unnamed: 0,movie_id,title,year,with_article,without_article,no_punctuation,no_numbers,no_brackets,tconst,imdb_title,imdb_year
1,30,shanghai triad (yao a yao yao dao waipo qiao),1995,the shanghai triad (yao a yao yao dao waipo qiao),,shanghai triad yao a yao yao dao waipo qiao,,shanghai triad,tt0115012,shanghai triad,1995.00000
7,68,french twist (gazon maudit),1995,the french twist (gazon maudit),,french twist gazon maudit,,french twist,tt0113149,french twist,1995.00000
10,82,antonia's line (antonia),1995,the antonia's line (antonia),,antonias line antonia,,antonia's line,tt0112379,antonia's line,1995.00000
13,106,nobody loves me (keiner liebt mich),1994,the nobody loves me (keiner liebt mich),,nobody loves me keiner liebt mich,,nobody loves me,tt0110251,nobody loves me,1994.00000
27,213,burnt by the sun (utomlyonnye solntsem),1994,the burnt by the sun (utomlyonnye solntsem),,burnt by the sun utomlyonnye solntsem,,burnt by the sun,tt0111579,burnt by the sun,1994.00000
...,...,...,...,...,...,...,...,...,...,...,...
796,3800,criminal lovers (les amants criminels),1999,the criminal lovers (les amants criminels),,criminal lovers les amants criminels,,criminal lovers,tt0205735,criminal lovers,1999.00000
803,3832,"black sabbath (tre volti della paura, i)",1963,"the black sabbath (tre volti della paura, i)",,black sabbath tre volti della paura i,,black sabbath,tt0057603,black sabbath,1963.00000
813,3867,all the rage (a.k.a. it's the rage),1999,the all the rage (a.k.a. it's the rage),,all the rage aka its the rage,,all the rage,tt0176426,all the rage,1999.00000
821,3892,anatomy (anatomie),2000,the anatomy (anatomie),,anatomy anatomie,,anatomy,tt0187696,anatomy,2000.00000


In [329]:
left_to_match = set(left_to_match) - set(match_without_brackets['movie_id'])

In [330]:
len(left_to_match)

685

In [354]:
unmatched = unmatched[unmatched['movie_id'].isin(list(left_to_match))]


In [345]:
# a lot of these don't match because of mismatched years 
imdb_full[imdb_full['title'] == 'a little princess']

Unnamed: 0,tconst,averageRating,numVotes,titleType,title,originalTitle,isAdult,year,runtimeMinutes,imdb_genres
2244,tt0008196,6.2,692,movie,a little princess,A Little Princess,0,1917,62,Drama
76572,tt0113670,7.6,34066,movie,a little princess,A Little Princess,0,1995,97,"Drama,Family,Fantasy"
241761,tt11414492,6.9,185,movie,a little princess,A Little Princess,0,2019,104,Drama


In [385]:
import string 
from string import digits 
from string import punctuation



unmatched_experiments = unmatched.copy(deep='True')
# unmatched_experiments['with_a'] = ''
unmatched_experiments['without_a'] = ''
unmatched_experiments['without_an'] = ''
# unmatched_experiments['with_an'] = ''
unmatched_experiments['inside_brackets'] = ''

for index, row in unmatched_experiments.iterrows(): 
    # for those who have 'a' at the beginning of the title, remove it 
    title = row['title']
    if title[-3:] == ', a': 
        unmatched_experiments.at[index, 'without_a'] = 'a ' + title[:-3]
    # for those who don't have 'a' at the beginning, add it on 
    # else: 
    #     unmatched_experiments.at[index, 'with_a'] = 'a ' + title[:-3]

    if title[-4:] == ', an': 
        unmatched_experiments.at[index, 'without_an'] = 'an ' + title[:-4]
    # for those who don't have 'an' at the beginning, add it on 
    # else: 
    #     unmatched_experiments.at[index, 'with_an'] = 'an ' + title[:-3]

    
    if ('(' in title and ')' in title):
        inside_brackets = title[title.index('(')+1: title.index(')')]
        unmatched_experiments.at[index, 'inside_brackets'] = inside_brackets

unmatched_experiments = unmatched_experiments.astype({'without_a': 'string',
                                                      'without_an': 'string',
                                                      'inside_brackets' : 'string'})

unmatched_experiments = unmatched_experiments.drop(columns=['genres', 'tconst', 'averageRating', 'numVotes', 'titleType', 'originalTitle', 'isAdult', 'runtimeMinutes', 'imdb_genres'])


In [387]:
match_without_a = pd.merge(unmatched_experiments, imdb_to_match, 
                              left_on=['without_a', 'year'],
                              right_on=['imdb_title', 'imdb_year'],
                              how='left')

match_without_a = match_without_a[match_without_a['tconst'] != '<NA>']
print("Found matches for ")
print(len(match_without_a))
print("movies")
match_without_a

Found matches for 
77
movies


Unnamed: 0,movie_id,title,year,without_a,without_an,inside_brackets,tconst,imdb_title,imdb_year
20,207,"walk in the clouds, a",1995,a walk in the clouds,,,tt0114887,a walk in the clouds,1995.00000
26,239,"goofy movie, a",1995,a goofy movie,,,tt0113198,a goofy movie,1995.00000
33,258,"kid in king arthur's court, a",1995,a kid in king arthur's court,,,tt0113538,a kid in king arthur's court,1995.00000
35,262,"little princess, a",1995,a little princess,,,tt0113670,a little princess,1995.00000
44,295,"pyromaniac's love story, a",1995,a pyromaniac's love story,,,tt0114210,a pyromaniac's love story,1995.00000
...,...,...,...,...,...,...,...,...,...
569,3350,"raisin in the sun, a",1961,a raisin in the sun,,,tt0055353,a raisin in the sun,1961.00000
581,3405,"night to remember, a",1958,a night to remember,,,tt0051994,a night to remember,1958.00000
594,3475,"place in the sun, a",1951,a place in the sun,,,tt0043924,a place in the sun,1951.00000
623,3640,"king in new york, a",1957,a king in new york,,,tt0050598,a king in new york,1957.00000


In [388]:
match_without_an = pd.merge(unmatched_experiments, imdb_to_match, 
                              left_on=['without_an', 'year'],
                              right_on=['imdb_title', 'imdb_year'],
                              how='left')

match_without_an = match_without_an[match_without_an['tconst'] != '<NA>']
print("Found matches for ")
print(len(match_without_an))
print("movies")
match_without_an

Found matches for 
10
movies


Unnamed: 0,movie_id,title,year,without_a,without_an,inside_brackets,tconst,imdb_title,imdb_year
16,148,"awfully big adventure, an",1995,,an awfully big adventure,,tt0112427,an awfully big adventure,1995.0
167,900,"american in paris, an",1951,,an american in paris,,tt0043278,an american in paris,1951.0
168,932,"affair to remember, an",1957,,an affair to remember,,tt0050105,an affair to remember,1957.0
248,1321,"american werewolf in london, an",1981,,an american werewolf in london,,tt0082010,an american werewolf in london,1981.0
320,1853,"alan smithee film: burn hollywood burn, an",1997,,an alan smithee film: burn hollywood burn,,tt0118577,an alan smithee film: burn hollywood burn,1997.0
361,2141,"american tail, an",1986,,an american tail,,tt0090633,an american tail,1986.0
362,2142,"american tail: fievel goes west, an",1991,,an american tail: fievel goes west,,tt0101329,an american tail: fievel goes west,1991.0
447,2690,"ideal husband, an",1999,,an ideal husband,,tt0122541,an ideal husband,1999.0
448,2690,"ideal husband, an",1999,,an ideal husband,,tt0160395,an ideal husband,1999.0
469,2793,"american werewolf in paris, an",1997,,an american werewolf in paris,,tt0118604,an american werewolf in paris,1997.0


In [389]:
match_inside_brackets = pd.merge(unmatched_experiments, imdb_to_match, 
                              left_on=['inside_brackets', 'year'],
                              right_on=['imdb_title', 'imdb_year'],
                              how='left')

match_inside_brackets = match_inside_brackets[match_inside_brackets['tconst'] != '<NA>']
print("Found matches for ")
print(len(match_inside_brackets))
print("movies")
match_inside_brackets


Found matches for 
20
movies


Unnamed: 0,movie_id,title,year,without_a,without_an,inside_brackets,tconst,imdb_title,imdb_year
2,47,seven (se7en),1995,,,se7en,tt0114369,se7en,1995.0
4,58,"postino, il (the postman)",1994,,,the postman,tt0110877,the postman,1994.0
38,269,my crazy life (mi vida loca),1993,,,mi vida loca,tt0107566,mi vida loca,1993.0
76,561,killer (bulletproof heart),1994,,,bulletproof heart,tt0110259,bulletproof heart,1994.0
85,582,metisse (café au lait),1993,,,café au lait,tt0107642,café au lait,1993.0
110,681,clean slate (coup de torchon),1981,,,coup de torchon,tt0082206,coup de torchon,1981.0
118,718,"visitors, the (les visiteurs)",1993,,,les visiteurs,tt0108500,les visiteurs,1993.0
124,735,cemetery man (dellamorte dellamore),1994,,,dellamorte dellamore,tt0109592,dellamorte dellamore,1994.0
134,773,touki bouki (journey of the hyena),1973,,,journey of the hyena,tt0070820,journey of the hyena,1973.0
139,793,my life and times with antonin artaud (en comp...,1993,,,en compagnie d'antonin artaud,tt0106810,en compagnie d'antonin artaud,1993.0


In [390]:
left_to_match = set(left_to_match) - set(match_without_a['movie_id']) - set(match_without_an['movie_id']) - set(match_inside_brackets['movie_id'])
print("left to match:")
print(len(left_to_match))

unmatched = unmatched[unmatched['movie_id'].isin(list(left_to_match))]

left to match:
580


In [393]:
# movielens confessional, the (le confessionnal)
imdb_full[imdb_full['title'] == 'the confessional']

Unnamed: 0,tconst,averageRating,numVotes,titleType,title,originalTitle,isAdult,year,runtimeMinutes,imdb_genres
75872,tt0112714,7.4,1588,movie,the confessional,Le confessionnal,0,1995,100,"Drama,Mystery,Thriller"
214159,tt0860837,6.6,110,movie,the confessional,The Confessional,0,2009,117,"Action,Comedy,Crime"
325496,tt2047872,5.6,5,short,the confessional,The Confessional,0,2011,5,"Horror,Short"


In [396]:
# movielens dumb & dumber
imdb_full[imdb_full['title'] == 'dumb and dumber']

Unnamed: 0,tconst,averageRating,numVotes,titleType,title,originalTitle,isAdult,year,runtimeMinutes,imdb_genres
73900,tt0109686,7.3,374205,movie,dumb and dumber,Dumb and Dumber,0,1994,107,Comedy


In [398]:
# movielens the boys of st. vincent but 1993
imdb_full[imdb_full['title'] == 'the boys of st. vincent']

Unnamed: 0,tconst,averageRating,numVotes,titleType,title,originalTitle,isAdult,year,runtimeMinutes,imdb_genres
71705,tt0106473,7.6,1736,tvMovie,the boys of st. vincent,The Boys of St. Vincent,0,1992,93,Drama


In [404]:
# movielens colonel chabert, le
imdb_full[imdb_full['title'] == 'le colonel chabert']

Unnamed: 0,tconst,averageRating,numVotes,titleType,title,originalTitle,isAdult,year,runtimeMinutes,imdb_genres
98580,tt0164006,7.1,113,movie,le colonel chabert,Le colonel Chabert,0,1943,102,Drama


In [None]:
# remove ALL POSSIBLE ARTICLES FROM ALL TITLES AND MATCH LIKE THAT 
# MATCH WITH + - 1 YEAR 



In [412]:
# movielens rent-a-cop	
imdb_full[imdb_full['title'] == 'the naked gun 2 1/2']

Unnamed: 0,tconst,averageRating,numVotes,titleType,title,originalTitle,isAdult,year,runtimeMinutes,imdb_genres


In [416]:
unmatched.to_csv('.unmatched.csv', index=False)

In [426]:
# match with year + 1 - 1 

imdb_to_match['year+1'] = imdb_to_match['imdb_year'] + 1
imdb_to_match['year-1'] = imdb_to_match['imdb_year'] - 1

In [427]:
imdb_to_match.head()

Unnamed: 0,tconst,imdb_title,imdb_year,year+1,year-1
0,tt0000001,carmencita,1894,1895,1893
1,tt0000002,le clown et ses chiens,1892,1893,1891
2,tt0000003,pauvre pierrot,1892,1893,1891
3,tt0000004,un bon bock,1892,1893,1891
4,tt0000005,blacksmith scene,1893,1894,1892


In [432]:
unmatched_experiments = unmatched.copy(deep=True)
unmatched_experiments = unmatched_experiments.astype({'title': 'string'})
unmatched_experiments = unmatched_experiments.drop(columns=['genres', 'tconst', 'averageRating', 'numVotes', 'titleType', 'originalTitle', 'isAdult', 'runtimeMinutes', 'imdb_genres'])

In [433]:
unmatched_experiments

Unnamed: 0,movie_id,title,year
27,28,persuasion,1995
31,32,twelve monkeys,1995
51,51,guardian angel,1994
59,59,"confessional, the (le confessionnal)",1995
73,73,"misérables, les",1995
...,...,...,...
3870,3904,"uninvited guest, an",2000
3873,3907,the prince of central park,1999
3880,3914,the broken hearts club,2000
3886,3920,"faraway, so close (in weiter ferne, so nah!)",1993


In [434]:
year_plus_one_match = pd.merge(unmatched_experiments, imdb_to_match, 
                              left_on=['title', 'year'],
                              right_on=['imdb_title', 'year+1'],
                              how='left')

year_plus_one_match = year_plus_one_match[year_plus_one_match['tconst'] != '<NA>']
print("Found matches for ")
print(len(year_plus_one_match))
print("movies")
year_plus_one_match

Found matches for 
110
movies


Unnamed: 0,movie_id,title,year,tconst,imdb_title,imdb_year,year+1,year-1
8,121,the boys of st. vincent,1993,tt0106473,the boys of st. vincent,1992.00000,1993.00000,1991.00000
18,224,don juan demarco,1995,tt0112883,don juan demarco,1994.00000,1995.00000,1993.00000
23,243,gordy,1995,tt0113199,gordy,1994.00000,1995.00000,1993.00000
36,298,pushing hands,1992,tt0105652,pushing hands,1991.00000,1992.00000,1990.00000
37,301,picture bride,1995,tt0114129,picture bride,1994.00000,1995.00000,1993.00000
...,...,...,...,...,...,...,...,...
549,3771,the golden voyage of sinbad,1974,tt0071569,the golden voyage of sinbad,1973.00000,1974.00000,1972.00000
553,3789,the pawnbroker,1965,tt0059575,the pawnbroker,1964.00000,1965.00000,1963.00000
555,3799,pokémon the movie 2000,2000,tt0210234,pokémon the movie 2000,1999.00000,2000.00000,1998.00000
560,3819,tampopo,1986,tt0092048,tampopo,1985.00000,1986.00000,1984.00000


In [435]:
year_minus_one_match = pd.merge(unmatched_experiments, imdb_to_match, 
                              left_on=['title', 'year'],
                              right_on=['imdb_title', 'year-1'],
                              how='left')

year_minus_one_match = year_minus_one_match[year_minus_one_match['tconst'] != '<NA>']
print("Found matches for ")
print(len(year_minus_one_match))
print("movies")
year_minus_one_match

Found matches for 
82
movies


Unnamed: 0,movie_id,title,year,tconst,imdb_title,imdb_year,year+1,year-1
11,128,jupiter's wife,1994,tt0110217,jupiter's wife,1995.00000,1996.00000,1994.00000
14,157,canadian bacon,1994,tt0109370,canadian bacon,1995.00000,1996.00000,1994.00000
15,183,mute witness,1994,tt0110604,mute witness,1995.00000,1996.00000,1994.00000
20,230,dolores claiborne,1994,tt0109642,dolores claiborne,1995.00000,1996.00000,1994.00000
24,248,houseguest,1994,tt0110066,houseguest,1995.00000,1996.00000,1994.00000
...,...,...,...,...,...,...,...,...
506,3561,stacy's knights,1982,tt0084723,stacy's knights,1983.00000,1984.00000,1982.00000
508,3567,bossa nova,1999,tt0180837,bossa nova,2000.00000,2001.00000,1999.00000
542,3721,trixie,1999,tt0162711,trixie,2000.00000,2001.00000,1999.00000
544,3728,one false move,1991,tt0102592,one false move,1992.00000,1993.00000,1991.00000


In [437]:
left_to_match = set(left_to_match) - set(year_minus_one_match['movie_id']) - set(year_plus_one_match['movie_id'])
print("left to match:")
print(len(left_to_match))

unmatched = unmatched[unmatched['movie_id'].isin(list(left_to_match))]

left to match:
393


In [441]:
unmatched['title'] = unmatched['title'].replace({'twelve': '12'})

In [442]:
unmatched.head()

Unnamed: 0,movie_id,title,genres,year,tconst,averageRating,numVotes,titleType,originalTitle,isAdult,runtimeMinutes,imdb_genres
27,28,persuasion,romance,1995,0,0.0,0.0,0,0,0,0,0
31,32,twelve monkeys,drama sci-fi,1995,0,0.0,0.0,0,0,0,0,0
51,51,guardian angel,action drama thriller,1994,0,0.0,0.0,0,0,0,0,0
59,59,"confessional, the (le confessionnal)",drama mystery,1995,0,0.0,0.0,0,0,0,0,0
73,73,"misérables, les",drama musical,1995,0,0.0,0.0,0,0,0,0,0


In [445]:
unmatched.at[31,'title']='12 monkeys'

In [1]:
unmatched.head(20)

NameError: name 'unmatched' is not defined

In [457]:
def replace_ampersand(title): 
    if ('&' in title):
        title = title.replace('&', 'and')
    return title

unmatched['title'] = unmatched['title'].apply(lambda x: replace_ampersand(x)) 

In [459]:
unmatched[unmatched['title'] == 'dumb and dumber']

Unnamed: 0,movie_id,title,genres,year,tconst,averageRating,numVotes,titleType,originalTitle,isAdult,runtimeMinutes,imdb_genres
231,231,dumb and dumber,comedy,1994,0,0.0,0.0,0,0,0,0,0


In [461]:
unmatched_experiments = unmatched.copy(deep=True)
unmatched_experiments = unmatched_experiments.astype({'title': 'string'})
unmatched_experiments = unmatched_experiments.drop(columns=['genres', 'tconst', 'averageRating', 'numVotes', 'titleType', 'originalTitle', 'isAdult', 'runtimeMinutes', 'imdb_genres'])

new_matches = pd.merge(unmatched_experiments, imdb_to_match, 
                              left_on=['title', 'year'],
                              right_on=['imdb_title', 'imdb_year'],
                              how='left')

new_matches = new_matches[new_matches['tconst'] != '<NA>']
print("Found matches for ")
print(len(new_matches))
print("movies")
new_matches

Found matches for 
5
movies


Unnamed: 0,movie_id,title,year,tconst,imdb_title,imdb_year,year+1,year-1
1,32,12 monkeys,1995,tt0114746,12 monkeys,1995.0,1996.0,1994.0
15,231,dumb and dumber,1994,tt0109686,dumb and dumber,1994.0,1995.0,1993.0
230,2542,"lock, stock and two smoking barrels",1998,tt0120735,"lock, stock and two smoking barrels",1998.0,1999.0,1997.0
259,2837,bedrooms and hallways,1998,tt0126810,bedrooms and hallways,1998.0,1999.0,1997.0
385,3876,jerry and tom,1998,tt0120867,jerry and tom,1998.0,1999.0,1997.0


In [462]:
left_to_match = set(left_to_match) - set(new_matches['movie_id'])
print("left to match:")
print(len(left_to_match))

unmatched = unmatched[unmatched['movie_id'].isin(list(left_to_match))]

left to match:
388


# Do everything again, but more efficiently

The discoveries I made are: 
* movielens puts articles at the end of the movie title, i.e. : contender, the
* imdb puts articles at the start of the title: i.e. the contender 
* some of the articles I found are: the, an, l' , le , la, dir
* movielens -> movieName (secondaryName) .... imdb -> movieName or secondaryName
* movielens -> dumb & dumber ... imdb -> dumb and dumber 
* mismatch with punctuation and numbers 
* mismatch with years, sometimes the years are 1 year apart 

In [715]:
# dataframes that I will be matching 

unmatched = movies_full.copy(deep=True)
imdb_to_match = imdb_full.copy(deep=True) 

unmatched = unmatched.astype({'title': 'string'})
imdb_to_match = imdb_to_match.astype({'tconst': 'string'})

# unmatched = unmatched.drop(columns=['genres', 'tconst', 'averageRating', 'numVotes', 'titleType', 'originalTitle', 'isAdult', 'runtimeMinutes', 'imdb_genres'])
imdb_to_match = imdb_to_match.drop(columns=['imdb_genres', 'averageRating', 'numVotes', 'titleType', 'originalTitle', 'isAdult', 'runtimeMinutes', 'imdb_genres'])

unmatched.shape

(3883, 4)

In [716]:
import string 
from string import punctuation

def remove_movielens_articles(title):
    new_title = title
    articles = {', the', ', a', ', an', ', le', ', la', ', l\'', ', el', ', dir', ', der'}
    for article in articles: 
        if article in title: 
            new_title = title[:title.index(article)]
    return new_title 

def remove_imdb_articles(title): 
    new_title = title
    articles = {'the ', 'a ', 'an ', 'le ', 'la ', 'l\' ', 'el ', 'dir', 'der'}
    for article in articles: 
        if article in title: 
            new_title = title[title.index(article) + len(article):] 
    return new_title

def remove_punctuation(title): 
    new_title = title
    if (any(p in title for p in punctuation)):
        new_title = title.translate(str.maketrans('', '', string.punctuation))
    return new_title 

def remove_numbers(title):
    no_digits = []
    for i in title:
        if not i.isdigit():
            no_digits.append(i)
    return ''.join(no_digits)

def remove_secondary(title):
    new_title = title
    if ('(' in title and ')' in title):
        new_title = title[: title.index('(')-1]
    
    return new_title 

def extract_secondary(title):
    new_title = title
    if ('(' in title and ')' in title):
        new_title = title[title.index('(') + 1: title.index(')')]
    return new_title  

In [717]:
unmatched['stripped_title'] = ''

for index, row in unmatched.iterrows(): 
    title = row['title']
    no_article = remove_movielens_articles(title)    
    no_punctuation = remove_punctuation('.'.join(no_article))
    no_numbers = remove_numbers(''.join(no_punctuation))
    unmatched.at[index, 'stripped_title'] = no_numbers
    
imdb_to_match['stripped_title'] = ''

for index, row in imdb_to_match.iterrows(): 
    title = row['title']
    no_article = remove_imdb_articles(''.join(title))    
    no_punctuation = remove_punctuation('.'.join(no_article))
    no_numbers = remove_numbers(''.join(no_punctuation))
    imdb_to_match.at[index, 'stripped_title'] = no_numbers

In [718]:
unmatched['no_secondary'] = ''
unmatched['only_secondary'] = ''

for index, row in unmatched.iterrows(): 
    title = row['title']
    no_article = remove_movielens_articles(title)
    no_secondary = remove_secondary(''.join(no_article))
    only_secondary = extract_secondary(''.join(no_article))
    unmatched.at[index, 'no_secondary'] = no_secondary
    unmatched.at[index, 'only_secondary'] = only_secondary
        

imdb_to_match['no_secondary'] = ''
imdb_to_match['only_secondary'] = ''

for index, row in imdb_to_match.iterrows(): 
    title = row['title']
    no_article = remove_imdb_articles(''.join(title))
    
    no_secondary = remove_secondary(''.join(no_article))
    only_secondary = extract_secondary(''.join(no_article))
    imdb_to_match.at[index, 'no_secondary'] = no_secondary
    imdb_to_match.at[index, 'only_secondary'] = only_secondary
    

In [719]:
imdb_to_match = imdb_to_match.astype({'stripped_title': 'string',
                                      'no_secondary': 'string',
                                      'only_secondary': 'string',})

unmatched = unmatched.astype({'stripped_title': 'string',
                                      'no_secondary': 'string',
                                      'only_secondary': 'string',})

unmatched = unmatched.drop(columns = 'genres')

In [720]:
match = pd.merge(unmatched, imdb_to_match, on=['stripped_title', 'year'])
unmatched_ids = set(unmatched['movie_id']) - set(match['movie_id'])
unmatched = unmatched[unmatched['movie_id'].isin(list(unmatched_ids))]
# unmatched = unmatched.drop(columns=['stripped_title'])
# unmatched = unmatched.drop(columns=['title'])
unmatched

Unnamed: 0,movie_id,title,year,stripped_title,no_secondary,only_secondary
4,5,father of the bride part ii,1995,father of the bride part ii,father of the bride part ii,father of the bride part ii
23,24,powder,1995,powder,powder,powder
27,28,persuasion,1995,persuasion,persuasion,persuasion
29,30,shanghai triad (yao a yao yao dao waipo qiao),1995,shanghai triad yao a yao yao dao waipo qiao,shanghai triad,yao a yao yao dao waipo qiao
31,32,twelve monkeys,1995,twelve monkeys,twelve monkeys,twelve monkeys
...,...,...,...,...,...,...
3865,3935,kronos,1973,kronos,kronos,kronos
3866,3936,"phantom of the opera, the",1943,phantom of the opera,phantom of the opera,phantom of the opera
3875,3945,digimon: the movie,2000,digimon the movie,digimon: the movie,digimon: the movie
3878,3948,meet the parents,2000,meet the parents,meet the parents,meet the parents


In [721]:
only_secondary_match = unmatched.copy(deep=True)
only_secondary_match = pd.merge(only_secondary_match, imdb_to_match, on=['only_secondary', 'year'])

only_secondary_match

Unnamed: 0,movie_id,title_x,year,stripped_title_x,no_secondary_x,only_secondary,tconst,title_y,stripped_title_y,no_secondary_y
0,47,seven (se7en),1995,seven seen,seven,se7en,tt0114369,se7en,seen,se7en
1,561,killer (bulletproof heart),1994,killer bulletproof heart,killer,bulletproof heart,tt0110259,bulletproof heart,bulletproof heart,bulletproof heart
2,582,metisse (café au lait),1993,metisse café au lait,metisse,café au lait,tt0107642,café au lait,café au lait,café au lait
3,681,clean slate (coup de torchon),1981,clean slate coup de torchon,clean slate,coup de torchon,tt0082206,coup de torchon,coup de torchon,coup de torchon
4,735,cemetery man (dellamorte dellamore),1994,cemetery man dellamorte dellamore,cemetery man,dellamorte dellamore,tt0109592,dellamorte dellamore,dellamorte dellamore,dellamorte dellamore
5,793,my life and times with antonin artaud (en comp...,1993,my life and times with antonin artaud en compa...,my life and times with antonin artaud,en compagnie d'antonin artaud,tt0106810,en compagnie d'antonin artaud,en compagnie dantonin artaud,en compagnie d'antonin artaud
6,989,schlafes bruder (brother of sleep),1995,schlafes bruder brother of sleep,schlafes bruder,brother of sleep,tt0114354,brother of sleep,brother of sleep,brother of sleep
7,2595,photographer (fotoamator),1998,photographer fotoamator,photographer,fotoamator,tt0188996,fotoamator,fotoamator,fotoamator


In [722]:
no_secondary_match = unmatched.copy(deep=True)
no_secondary_match = pd.merge(no_secondary_match, imdb_to_match, on=['no_secondary', 'year'])
no_secondary_match

Unnamed: 0,movie_id,title_x,year,stripped_title_x,no_secondary,only_secondary_x,tconst,title_y,stripped_title_y,only_secondary_y
0,30,shanghai triad (yao a yao yao dao waipo qiao),1995,shanghai triad yao a yao yao dao waipo qiao,shanghai triad,yao a yao yao dao waipo qiao,tt0115012,shanghai triad,shanghai triad,shanghai triad
1,68,french twist (gazon maudit),1995,french twist gazon maudit,french twist,gazon maudit,tt0113149,french twist,french twist,french twist
2,82,antonia's line (antonia),1995,antonias line antonia,antonia's line,antonia,tt0112379,antonia's line,antonias line,antonia's line
3,106,nobody loves me (keiner liebt mich),1994,nobody loves me keiner liebt mich,nobody loves me,keiner liebt mich,tt0110251,nobody loves me,nobody loves me,nobody loves me
4,142,shadows (cienie),1988,shadows cienie,shadows,cienie,tt0095804,out of the shadows,shadows,shadows
...,...,...,...,...,...,...,...,...,...,...
78,3761,"blood in, blood out (a.k.a. bound by honor)",1993,blood in blood out aka bound by honor,"blood in, blood out",a.k.a. bound by honor,tt0106469,"blood in, blood out",blood in blood out,"blood in, blood out"
79,3787,shower (xizhao),1999,shower xizhao,shower,xizhao,tt0215369,shower,shower,shower
80,3800,criminal lovers (les amants criminels),1999,criminal lovers les amants criminels,criminal lovers,les amants criminels,tt0205735,criminal lovers,criminal lovers,criminal lovers
81,3832,"black sabbath (tre volti della paura, i)",1963,black sabbath tre volti della paura i,black sabbath,"tre volti della paura, i",tt0057603,black sabbath,black sabbath,black sabbath


In [723]:
unmatched

Unnamed: 0,movie_id,title,year,stripped_title,no_secondary,only_secondary
4,5,father of the bride part ii,1995,father of the bride part ii,father of the bride part ii,father of the bride part ii
23,24,powder,1995,powder,powder,powder
27,28,persuasion,1995,persuasion,persuasion,persuasion
29,30,shanghai triad (yao a yao yao dao waipo qiao),1995,shanghai triad yao a yao yao dao waipo qiao,shanghai triad,yao a yao yao dao waipo qiao
31,32,twelve monkeys,1995,twelve monkeys,twelve monkeys,twelve monkeys
...,...,...,...,...,...,...
3865,3935,kronos,1973,kronos,kronos,kronos
3866,3936,"phantom of the opera, the",1943,phantom of the opera,phantom of the opera,phantom of the opera
3875,3945,digimon: the movie,2000,digimon the movie,digimon: the movie,digimon: the movie
3878,3948,meet the parents,2000,meet the parents,meet the parents,meet the parents


In [724]:
unmatched_ids = set(unmatched['movie_id']) - set(only_secondary_match['movie_id']) - set(no_secondary_match['movie_id'])
unmatched = unmatched[unmatched['movie_id'].isin(list(unmatched_ids))]

In [725]:
unmatched['year+1'] = unmatched['year'] + 1
unmatched['year-1'] = unmatched['year'] - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [727]:
minus_year_match = unmatched.copy(deep=True)
minus_year_match = pd.merge(minus_year_match, imdb_to_match, left_on=['title', 'year-1'], right_on=['title', 'year'])
minus_year_match

Unnamed: 0,movie_id,title,year_x,stripped_title_x,no_secondary_x,only_secondary_x,year+1,year-1,tconst,year_y,stripped_title_y,no_secondary_y,only_secondary_y
0,224,don juan demarco,1995,don juan demarco,don juan demarco,don juan demarco,1996,1994,tt0112883,1994,demarco,demarco,demarco
1,243,gordy,1995,gordy,gordy,gordy,1996,1994,tt0113199,1994,gordy,gordy,gordy
2,298,pushing hands,1992,pushing hands,pushing hands,pushing hands,1993,1991,tt0105652,1991,pushing hands,pushing hands,pushing hands
3,301,picture bride,1995,picture bride,picture bride,picture bride,1996,1994,tt0114129,1994,picture bride,picture bride,picture bride
4,322,swimming with sharks,1995,swimming with sharks,swimming with sharks,swimming with sharks,1996,1994,tt0114594,1994,swimming with sharks,swimming with sharks,swimming with sharks
...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,3687,light years,1988,light years,light years,light years,1989,1987,tt3351342,1987,light years,light years,light years
88,3799,pokémon the movie 2000,2000,pokémon the movie,pokémon the movie 2000,pokémon the movie 2000,2001,1999,tt0210234,1999,movie,movie 2000,movie 2000
89,3819,tampopo,1986,tampopo,tampopo,tampopo,1987,1985,tt0092048,1985,tampopo,tampopo,tampopo
90,3883,catfish in black bean sauce,2000,catfish in black bean sauce,catfish in black bean sauce,catfish in black bean sauce,2001,1999,tt0162903,1999,sauce,sauce,sauce


In [728]:
plus_year_match = unmatched.copy(deep=True)
plus_year_match = pd.merge(plus_year_match, imdb_to_match, left_on=['title', 'year+1'], right_on=['title', 'year'])
plus_year_match

Unnamed: 0,movie_id,title,year_x,stripped_title_x,no_secondary_x,only_secondary_x,year+1,year-1,tconst,year_y,stripped_title_y,no_secondary_y,only_secondary_y
0,128,jupiter's wife,1994,jupiters wife,jupiter's wife,jupiter's wife,1995,1993,tt0110217,1995,jupiters wife,jupiter's wife,jupiter's wife
1,157,canadian bacon,1994,canadian bacon,canadian bacon,canadian bacon,1995,1993,tt0109370,1995,bacon,bacon,bacon
2,183,mute witness,1994,mute witness,mute witness,mute witness,1995,1993,tt0110604,1995,mute witness,mute witness,mute witness
3,230,dolores claiborne,1994,dolores claiborne,dolores claiborne,dolores claiborne,1995,1993,tt0109642,1995,dolores claiborne,dolores claiborne,dolores claiborne
4,248,houseguest,1994,houseguest,houseguest,houseguest,1995,1993,tt0110066,1995,houseguest,houseguest,houseguest
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,3567,bossa nova,1999,bossa nova,bossa nova,bossa nova,2000,1998,tt0180837,2000,nova,nova,nova
64,3721,trixie,1999,trixie,trixie,trixie,2000,1998,tt0162711,2000,trixie,trixie,trixie
65,3728,one false move,1991,one false move,one false move,one false move,1992,1990,tt0102592,1992,one false move,one false move,one false move
66,3777,nekromantik,1987,nekromantik,nekromantik,nekromantik,1988,1986,tt0093608,1988,nekromantik,nekromantik,nekromantik


In [730]:
unmatched_ids = set(unmatched['movie_id']) - set(plus_year_match['movie_id']) - set(minus_year_match['movie_id'])
unmatched = unmatched[unmatched['movie_id'].isin(list(unmatched_ids))]
unmatched.head(40)

Unnamed: 0,movie_id,title,year,stripped_title,no_secondary,only_secondary,year+1,year-1
4,5,father of the bride part ii,1995,father of the bride part ii,father of the bride part ii,father of the bride part ii,1996,1994
23,24,powder,1995,powder,powder,powder,1996,1994
27,28,persuasion,1995,persuasion,persuasion,persuasion,1996,1994
31,32,twelve monkeys,1995,twelve monkeys,twelve monkeys,twelve monkeys,1996,1994
35,36,dead man walking,1995,dead man walking,dead man walking,dead man walking,1996,1994
36,37,across the sea of time,1995,across the sea of time,across the sea of time,across the sea of time,1996,1994
45,46,how to make an american quilt,1995,how to make an american quilt,how to make an american quilt,how to make an american quilt,1996,1994
50,51,guardian angel,1994,guardian angel,guardian angel,guardian angel,1995,1993
55,56,kids of the round table,1995,kids of the round table,kids of the round table,kids of the round table,1996,1994
56,57,home for the holidays,1995,home for the holidays,home for the holidays,home for the holidays,1996,1994


In [733]:
imdb_to_match[imdb_to_match['stripped_title'] == 'umbrellas of cherbourg']

Unnamed: 0,tconst,title,year,stripped_title,no_secondary,only_secondary
36173,tt0058450,the umbrellas of cherbourg,1964,umbrellas of cherbourg,umbrellas of cherbourg,umbrellas of cherbourg


In [737]:
unmatched_new = unmatched.copy(deep=True)

def remove_articles(title): 
    articles = {' the ', ' a ', ' an ', ' le ', ' la ', ' l\' ', ' el ', ' dir', ' der'}
    new_title = ''.join(title)
    for article in articles: 
        if (article in new_title): 
            new_title = new_title[:new_title.index(article)-1]
    return new_title 

unmatched_new['stripped_title'] = unmatched_new['stripped_title'].apply(lambda x: remove_articles(x))

# Final attempt

In [None]:
movies_full = movies_full.astype({'title':'string'})
imdb_full = imdb_full.replace('\\N', None)
imdb_full = imdb_full.astype({'title':'string', 
                              'originalTitle': 'string',
                              'year':'int64'})

In [33]:
movies_full.shape

(3883, 4)

In [62]:
ml_unmatched = movies_full.copy(deep='True')
ml_unmatched = ml_unmatched.drop(columns = ['genres'])

imdb = imdb_full.copy(deep='True')
imdb = imdb.drop(columns = ['averageRating','numVotes','titleType','isAdult','runtimeMinutes', 'imdb_genres'])

unmatched_movie_ids = ml_unmatched['movie_id']

In [63]:
# match on title + year 
match = pd.merge(ml_unmatched, imdb, on=['title', 'year'])
matched_df_full = match.copy(deep='True')

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (2455, 5)
all matches: (2455, 5)
left to match: (1460, 3)


In [64]:
# match on original title + year 
imdb['originalTitle'] = imdb['originalTitle'].apply(lambda x: x.lower())

match = pd.merge(ml_unmatched, imdb, 
                 left_on=['title', 'year'],
                 right_on=['originalTitle', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (36, 6)
all matches: (2491, 7)
left to match: (1424, 3)


In [65]:
# match on original title + (year + 1)
ml_unmatched['year+1'] = ml_unmatched['year'] + 1
ml_unmatched['year-1'] = ml_unmatched['year'] - 1


match = pd.merge(ml_unmatched, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['originalTitle', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# match on title + (year+1)

ml_unmatched['year+1'] = ml_unmatched['year'] + 1
ml_unmatched['year-1'] = ml_unmatched['year'] - 1


match = pd.merge(ml_unmatched, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (65, 9)
all matches: (2556, 11)
left to match: (1361, 5)
new matches: (1, 8)
all matches: (2557, 11)
left to match: (1360, 5)


In [66]:
# match on a year that is 1 off 
# match on original title 
match = pd.merge(ml_unmatched, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['originalTitle', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# match on a year that is 1 off 
# match on title 
match = pd.merge(ml_unmatched, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (82, 9)
all matches: (2639, 11)
left to match: (1278, 5)
new matches: (8, 8)
all matches: (2647, 11)
left to match: (1270, 5)


In [67]:
# reset df 
ml_unmatched = ml_unmatched.drop(columns=['year+1', 'year-1'])

In [80]:
import string 
from string import punctuation

def reposition_movielens_article(title): 
    articles = {', the', ', a', ', an', ', le', ', la', ', l\'', ', el', ', dir', ', der'}
    for article in articles: 
        if(title[-len(article):] == article):
            title = article[2:] + ' ' + title[:title.index(article)]
    return title 

def remove_movielens_articles(title):
    new_title = title
    articles = {', the', ', a', ', an', ', le', ', la', ', l\'', ', el', ', dir', ', der'}
    for article in articles: 
        if article in title: 
            new_title = title[:title.index(article)]
    return new_title 

def remove_imdb_articles(title): 
    new_title = title
    articles = {'the ', 'a ', 'an ', 'le ', 'la ', 'l\' ', 'el ', 'dir', 'der'}
    for article in articles: 
        if article in title: 
            new_title = title[title.index(article) + len(article):] 
    return new_title

def remove_punctuation(title): 
    new_title = title
    if (any(p in title for p in punctuation)):
        new_title = title.translate(str.maketrans('', '', string.punctuation))
    return new_title 

def remove_numbers(title):
    no_digits = []
    for i in title:
        if not i.isdigit():
            no_digits.append(i)
    return ''.join(no_digits)

def remove_secondary(title):
    new_title = title
    if ('(' in title and ')' in title):
        new_title = title[: title.index('(')-1]
    
    return new_title 

def extract_secondary(title):
    new_title = title
    if ('(' in title and ')' in title):
        new_title = title[title.index('(') + 1: title.index(')')]
    return new_title  

In [68]:
# fix the article notation movielens has
# The notation is i.e. Contender, The 

ml_unmatched_articles = ml_unmatched.copy(deep='True')

ml_unmatched_articles['title'] = ml_unmatched_articles['title'].apply(lambda x: reposition_movielens_articles_from_end(x))

match = pd.merge(ml_unmatched_articles, imdb, 
                 left_on=['title', 'year'],
                 right_on=['originalTitle', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

ml_unmatched_articles = ml_unmatched_articles[ml_unmatched_articles['movie_id'].isin(unmatched_movie_ids)]

# article notation change but match with title instead of originalTitle 

match = pd.merge(ml_unmatched_articles, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)


matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (715, 6)
all matches: (3362, 11)
left to match: (560, 3)
new matches: (19, 5)
all matches: (3381, 11)
left to match: (541, 3)


In [69]:
# let's try the +- year again 

ml_unmatched_articles = ml_unmatched_articles[ml_unmatched_articles['movie_id'].isin(unmatched_movie_ids)]

ml_unmatched_articles['year+1'] = ml_unmatched_articles['year']+1
ml_unmatched_articles['year-1'] = ml_unmatched_articles['year']-1

match = pd.merge(ml_unmatched_articles, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['originalTitle', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

ml_unmatched_articles = ml_unmatched_articles[ml_unmatched_articles['movie_id'].isin(unmatched_movie_ids)]

# article notation change but match with title instead of originalTitle 

match = pd.merge(ml_unmatched_articles, imdb, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (17, 9)
all matches: (3398, 11)
left to match: (525, 3)
new matches: (1, 8)
all matches: (3399, 11)
left to match: (524, 3)


In [70]:
# let's try the +- year again 

ml_unmatched_articles = ml_unmatched_articles[ml_unmatched_articles['movie_id'].isin(unmatched_movie_ids)]

ml_unmatched_articles['year+1'] = ml_unmatched_articles['year']+1
ml_unmatched_articles['year-1'] = ml_unmatched_articles['year']-1

match = pd.merge(ml_unmatched_articles, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['originalTitle', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

ml_unmatched_articles = ml_unmatched_articles[ml_unmatched_articles['movie_id'].isin(unmatched_movie_ids)]

# article notation change but match with title instead of originalTitle 

match = pd.merge(ml_unmatched_articles, imdb, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (20, 9)
all matches: (3419, 11)
left to match: (504, 3)
new matches: (1, 8)
all matches: (3420, 11)
left to match: (503, 3)


In [76]:
# no numbers + year

ml_unmatched_numbers = ml_unmatched.copy(deep='True')
ml_unmatched_numbers['year+1'] = ml_unmatched_numbers['year']+1
ml_unmatched_numbers['year+1'] = ml_unmatched_numbers['year']+1

imdb_numbers = imdb.copy(deep='True')

ml_unmatched_numbers['title'] = ml_unmatched_numbers['title'].apply(lambda x: remove_punctuation(x))
imdb_numbers['title'] = imdb_numbers['title'].apply(lambda x: remove_punctuation(x))

match = pd.merge(ml_unmatched_numbers, imdb_numbers, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_unmatched_numbers = ml_unmatched.copy(deep='True')
ml_unmatched_numbers['year+1'] = ml_unmatched_numbers['year']+1

imdb_numbers = imdb.copy(deep='True')

ml_unmatched_numbers['title'] = ml_unmatched_numbers['title'].apply(lambda x: remove_punctuation(x))
imdb_numbers['title'] = imdb_numbers['title'].apply(lambda x: remove_punctuation(x))

match = pd.merge(ml_unmatched_numbers, imdb_numbers, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_unmatched_numbers = ml_unmatched.copy(deep='True')
ml_unmatched_numbers['year-1'] = ml_unmatched_numbers['year']-1

imdb_numbers = imdb.copy(deep='True')

ml_unmatched_numbers['title'] = ml_unmatched_numbers['title'].apply(lambda x: remove_punctuation(x))
imdb_numbers['title'] = imdb_numbers['title'].apply(lambda x: remove_punctuation(x))

match = pd.merge(ml_unmatched_numbers, imdb_numbers, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (0, 6)
all matches: (3455, 11)
left to match: (471, 3)
new matches: (0, 7)
all matches: (3455, 11)
left to match: (471, 3)
new matches: (0, 7)
all matches: (3455, 11)
left to match: (471, 3)


In [75]:
# no punctuation + year

ml_unmatched_punctuation = ml_unmatched.copy(deep='True')
ml_unmatched_punctuation['year+1'] = ml_unmatched_punctuation['year']+1
ml_unmatched_punctuation['year+1'] = ml_unmatched_punctuation['year']+1

imdb_punctuation = imdb.copy(deep='True')

ml_unmatched_punctuation['title'] = ml_unmatched_punctuation['title'].apply(lambda x: remove_punctuation(x))
imdb_punctuation['title'] = imdb_punctuation['title'].apply(lambda x: remove_punctuation(x))

match = pd.merge(ml_unmatched_punctuation, imdb_punctuation, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_unmatched_punctuation = ml_unmatched.copy(deep='True')
ml_unmatched_punctuation['year+1'] = ml_unmatched_punctuation['year']+1

imdb_punctuation = imdb.copy(deep='True')

ml_unmatched_punctuation['title'] = ml_unmatched_punctuation['title'].apply(lambda x: remove_punctuation(x))
imdb_punctuation['title'] = imdb_punctuation['title'].apply(lambda x: remove_punctuation(x))

match = pd.merge(ml_unmatched_punctuation, imdb_punctuation, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_unmatched_punctuation = ml_unmatched.copy(deep='True')
ml_unmatched_punctuation['year-1'] = ml_unmatched_punctuation['year']-1

imdb_punctuation = imdb.copy(deep='True')

ml_unmatched_punctuation['title'] = ml_unmatched_punctuation['title'].apply(lambda x: remove_punctuation(x))
imdb_punctuation['title'] = imdb_punctuation['title'].apply(lambda x: remove_punctuation(x))

match = pd.merge(ml_unmatched_punctuation, imdb_punctuation, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (0, 6)
all matches: (3454, 11)
left to match: (472, 3)
new matches: (0, 7)
all matches: (3454, 11)
left to match: (472, 3)
new matches: (1, 7)
all matches: (3455, 11)
left to match: (471, 3)


In [77]:
# secondary title

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_unmatched_punctuation['year']+1
ml_secondary_title['year+1'] = ml_unmatched_punctuation['year']+1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: extract_secondary(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: extract_secondary(x))

match = pd.merge(ml_secondary_title, imdb_punctuation, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year-1'] = ml_secondary_title['year']-1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: extract_secondary(x))

match = pd.merge(ml_unmatched_punctuation, imdb_punctuation, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (20, 6)
all matches: (3475, 11)
left to match: (451, 3)
new matches: (1, 7)
all matches: (3476, 11)
left to match: (450, 3)
new matches: (1, 7)
all matches: (3477, 11)
left to match: (450, 3)


In [78]:
# secondary title

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_unmatched_punctuation['year']+1
ml_secondary_title['year+1'] = ml_unmatched_punctuation['year']+1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_secondary(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_secondary(x))

match = pd.merge(ml_secondary_title, imdb_punctuation, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year-1'] = ml_secondary_title['year']-1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_secondary(x))

match = pd.merge(ml_unmatched_punctuation, imdb_punctuation, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (109, 6)
all matches: (3586, 11)
left to match: (341, 3)
new matches: (9, 7)
all matches: (3595, 11)
left to match: (332, 3)
new matches: (1, 7)
all matches: (3596, 11)
left to match: (332, 3)


In [81]:
# secondary title + article reposition

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_unmatched_punctuation['year']+1
ml_secondary_title['year+1'] = ml_unmatched_punctuation['year']+1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_secondary(x))
ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: reposition_movielens_article(x))


match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_secondary(x))
ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: reposition_movielens_article(x))

match = pd.merge(ml_secondary_title, imdb_punctuation, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year-1'] = ml_secondary_title['year']-1


ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_secondary(x))
ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: reposition_movielens_article(x))

match = pd.merge(ml_unmatched_punctuation, imdb_punctuation, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (60, 6)
all matches: (3656, 11)
left to match: (274, 3)
new matches: (4, 7)
all matches: (3660, 11)
left to match: (270, 3)
new matches: (1, 7)
all matches: (3661, 11)
left to match: (270, 3)


In [84]:
# replace & with and 

def replace_ampersand(title): 
    if ('&' in title): 
        title = title.replace('&', 'and')
# secondary title + article reposition

ml_secondary_title = ml_unmatched.copy(deep='True')

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: replace_ampersand(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: replace_ampersand(x))

match = pd.merge(ml_secondary_title, imdb_punctuation, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year-1'] = ml_secondary_title['year']-1

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: replace_ampersand(x))

match = pd.merge(ml_unmatched_punctuation, imdb_punctuation, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (0, 5)
all matches: (3662, 11)
left to match: (270, 3)
new matches: (0, 7)
all matches: (3662, 11)
left to match: (270, 3)
new matches: (1, 7)
all matches: (3663, 11)
left to match: (270, 3)


In [110]:
# remvove article from movielens 
# i don't think this works
def replace_ampersand(title): 
    if ('&' in title): 
        print(title)
        title = title.replace('&', 'and')
    return title
# secondary title + article reposition

ml_secondary_title = ml_unmatched.copy(deep='True')

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_movielens_articles(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_movielens_articles(x))

match = pd.merge(ml_secondary_title, imdb_punctuation, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year-1'] = ml_secondary_title['year']-1

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_movielens_articles(x))

match = pd.merge(ml_unmatched_punctuation, imdb_punctuation, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (0, 5)
all matches: (3683, 11)
left to match: (252, 3)
new matches: (0, 7)
all matches: (3683, 11)
left to match: (252, 3)
new matches: (1, 7)
all matches: (3684, 11)
left to match: (252, 3)


In [109]:
replace_ampersand('dumb & dumber')

'dumb and dumber'

In [113]:
ml_unmatched.tail(50)

Unnamed: 0,movie_id,title,year
3394,3463,last resort,1994
3395,3464,solar crisis,1993
3403,3472,horror hotel (a.k.a. the city of the dead),1960
3453,3522,sacco and vanzetti (sacco e vanzetti),1971
3463,3532,freedom for us (à nous la liberté ),1931
3485,3554,love and basketball,2000
3491,3560,phantom love (ai no borei),1978
3499,3568,smiling fish and goat on fire,1999
3502,3571,time code,2000
3508,3577,two moon juction,1988


In [103]:
imdb[imdb['title'] == 'les miserables']

Unnamed: 0,tconst,title,originalTitle,year
25325,tt0044907,les miserables,les miserables,1952
51163,tt0077936,les miserables,les miserables,1978


In [107]:
imdb[imdb['title'] == 'la haine']

Unnamed: 0,tconst,title,originalTitle,year
76275,tt0113247,la haine,la haine,1995
192196,tt0448513,la haine,la haine,1910


In [112]:
imdb[imdb['title'] == 'das versprechen']

Unnamed: 0,tconst,title,originalTitle,year
75339,tt0111613,das versprechen,das versprechen,1994


In [122]:
# puppet master ii
imdb[imdb['title'] == 'the puppet master ii']

Unnamed: 0,tconst,title,originalTitle,year


In [130]:
manually_fixing_df = pd.read_csv('https://raw.githubusercontent.com/jennyzhang0215/MovieLens-IMDB/master/movielens/statistics/manually_fixed_title_name', delimiter='|')

In [131]:
manually_fixing_df.head(20)

Unnamed: 0,wrong_title_name,correct_title_name
0,"to wong foo, thanks for everything! julie newmar","to wong foo thanks for everything, julie newmar"
1,faster pussycat! kill! kill!,"faster, pussycat! kill! kill!"
2,star trek: the wrath of khan,star trek ii: the wrath of khan
3,jungle2jungle,jungle 2 jungle
4,when the cats away,when the cat's away
5,mrs. brown,mrs brown
6,tales from the crypt presents: demon knight,tales from the crypt: demon knight
7,tales from the crypt presents: bordello of blood,bordello of blood
8,jackie chan's first strike,police story 4: first strike
9,die hard: with a vengeance,die hard with a vengeance


In [135]:
# remvove article from movielens 
# i don't think this works
def replace_manually(title): 
    fixed_title = title
    if (len(manually_fixing_df[manually_fixing_df['wrong_title_name'] == title] == 1)): 
        fixed_title = manually_fixing_df[manually_fixing_df['wrong_title_name'] == title]['correct_title_name'].values[0]
    return fixed_title

# secondary title + article reposition

ml_secondary_title = ml_unmatched.copy(deep='True')

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: replace_manually(x))

match = pd.merge(ml_secondary_title, imdb, 
                 left_on=['title', 'year'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year+1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year+1'] = ml_secondary_title['year']+1

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_movielens_articles(x))

match = pd.merge(ml_secondary_title, imdb_punctuation, 
                 left_on=['title', 'year+1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

# no punctuation + (year-1)

ml_secondary_title = ml_unmatched.copy(deep='True')
ml_secondary_title['year-1'] = ml_secondary_title['year']-1

ml_secondary_title['title'] = ml_secondary_title['title'].apply(lambda x: remove_movielens_articles(x))

match = pd.merge(ml_unmatched_punctuation, imdb_punctuation, 
                 left_on=['title', 'year-1'],
                 right_on=['title', 'year'])

matched_df_full = matched_df_full.append(match, ignore_index=True)

matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("new matches: " + str(match.shape))
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

new matches: (50, 5)
all matches: (3734, 11)
left to match: (202, 3)
new matches: (0, 7)
all matches: (3734, 11)
left to match: (202, 3)
new matches: (1, 7)
all matches: (3735, 11)
left to match: (202, 3)


In [137]:
ml_unmatched.tail()

Unnamed: 0,movie_id,title,year
3830,3900,crime and punishment in suburbia,2000
3834,3904,"uninvited guest, an",2000
3844,3914,"broken hearts club, the",2000
3850,3920,"faraway, so close (in weiter ferne, so nah!)",1993
3865,3935,kronos,1973


In [None]:
# I will run through the unmatched rows and see if there are higher year mismatches for any of them 
count = 0;
for index, row in ml_unmatched.iterrows():
    title = row['title']
    ml_year = row['year'] 
    
    find_in_imdb = imdb[imdb['title'] == title]
    if (len(find_in_imdb) >= 1): 
        print("__________________________________")
        print(title + " ML year: " + str(ml_year))
        print("Found match in imdb. ")
        print(find_in_imdb)
        count = count+1; 
        
print("total " + str(count))

In [149]:
manual_matching_dict = {'dream man': 'tt0101770', 
               'new york cop': 'tt0368893',
               'true crime': 'tt0139668',
               'costa brava': 'tt0109489',
               'victor/victoria': 'tt0265987',
               'drunks': 'tt0112907',
               'farmer & chase': 'tt0113031',
               'kids of survival': 'tt0107314',
               'blood & wine': 'tt0859643',
               'hearts and minds': 'tt0071604',
               'alien escape': 'tt0112318',
               'other voices, other rooms': 'tt0119845',
               'time tracers': 'tt0128755',
               'follow the bitch': 'tt0119139',
               '101 dalmatians': 'tt011543',
               'steamboat willie': 'tt0019422',
               'henry: portrait of a serial killer': 'tt0099763',
               'attack of the killer tomatoes!': 'tt0080391',
               'little nemo: adventures in slumberland': 'tt0104740',
               'ten benny': 'tt0114008',
               'daddy long legs': 'tt0021775',
               'train ride to hollywood': 'tt0078412',
               'santitos': 'tt0126651',
               'it happened here': 'tt0055024',
               'last resort': 'tt0091387',
               'solar crisis': 'tt0100649',
               'kronos': 'tt0050610'} 

for index, row in ml_unmatched.iterrows():
    title = row['title']
    if(manual_matching_dict.get(title) != None): 
        row['tconst'] = manual_matching_dict.get(title)
        matched_df_full = matched_df_full.append(row)   

In [151]:
matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

all matches: (3762, 11)
left to match: (175, 3)


In [279]:
imdb[imdb['title'] == 'mr. death']

Unnamed: 0,tconst,title,originalTitle,year
109869,tt0192335,mr. death,mr. death: the rise and fall of fred a. leucht...,1999
444844,tt7086072,mr. death,mr. death,2016


In [280]:
manual_matching_dict = {'misérables, les': 'tt0077936', 
               'shadows (cienie)': 'tt0245718',
               'castle freak': 'tt10701458',
               'dumb & dumber': 'tt0109686',
               'farinelli: il castrato': 'tt0109771',
               'interview with the vampire': 'tt1860252',
               'enfer l\'': 'tt13124824',
               'robert a. heinlein\'s the puppet masters': 'tt0111003',
               'harlem': 'tt0034950',
               'wedding gift, the': 'tt0847585',
               'ciao, professore! (io speriamo che me la cavo )': 'tt0107225',
               'dear diary (caro diario)': 'tt0109382',
               'superweib, das': 'tt0117788',
               'promise, the (versprechen, das)': 'tt0111613',
               'under the domin tree (etz hadomim tafus)': 'tt0109751',
               'two friends': '1986',
               'rendezvous in paris (rendez-vous de paris, les)': 'tt0176090',
               'crude oasis, the': 'tt0112746',
               'godzilla 2000 (gojira ni-sen mireniamu)': 'tt0120685',
               'broken hearts club, the': 'tt1194103',
               'crime and punishment in suburbia': 'tt0096056',
               'mad max 2 (a.k.a. the road warrior)': 'tt0079501',
               'toxic avenger, part ii, the': 'tt0090190',
               'spring fever usa (a.k.a. lauderdale)': 'tt0097717',
               'i am cuba (soy cuba/ya kuba)': 'tt0058604',
               'wisdom of crocodiles, the (a.k.a. immortality)': 'tt0120894',
               'mr. death: the rise and fall of fred a. leuchter, jr.': 'tt0192335'} 


for index, row in ml_unmatched.iterrows():
    title = row['title']
    if(manual_matching_dict.get(title) != None): 
        row['tconst'] = manual_matching_dict.get(title)
        matched_df_full = matched_df_full.append(row)   
        
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))

all matches: (3788, 11)
left to match: (175, 3)


In [282]:
matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]

In [294]:
len(ml_unmatched)

149

In [303]:
ml_unmatched.to_csv('unmatched.csv')

In [347]:
imdb[imdb['title'] == 'cool dry place']

Unnamed: 0,tconst,title,originalTitle,year


In [349]:
manual_matching_dict = {'pink floyd - the wall': 'tt0084503', 
               'nosferatu a venezia': 'tt0091651',
               'good, the bad and the ugly, the': 'tt5083572',
               'two women (la ciociara)': 'tt0054749',
               'robert a. heinlein\'s the puppet masters': 'tt0111003',
                'the players club':'tt0119905',
                'big bang theory, the': 'tt1147717',
'jungle2jungle (a.k.a. jungle 2 jungle)' : 'tt0119432',
'boys, les' : 'tt0118764',
'prophecy ii, the' : 'tt0114194',
'machine, the':  'tt0933079',
'friday the 13th part 3: 3d' : 'tt0080761',
'karate kid, part ii, the' : 'tt0426060',
'empty mirror, the':  'tt0116192',
'citizen\'s band (a.k.a. handle with care)':  'tt0359987',
'hard 8 (a.k.a. sydney, a.k.a. hard eight)' : 'tt0119256' ,
'poison ivy: new seduction' : 'tt0105156' ,
'hard-boiled (lashou shentan)' : 'tt0104684',
'trial, the (le procés)':  'tt0057427',
'horror hotel (a.k.a. the city of the dead)' : 'tt0053719',
'two or three things i know about her' : 'tt0060304',
'vacation' : 'tt0015452',
'slaughterhouse 2':  'tt0093990'	,
'meatballs iii':  'tt0079540',
'children of the corn iii':  'tt0087050',
'seven beauties (pasqualino settebellezze)': 'tt0075040',
'lodger, the':  'tt0037024',
'vie est belle, la (life is rosey)' : 'tt0161066',
'communion (a.k.a. alice, sweet alice/holy terror)' : 'tt0188223',}


In [351]:
for index, row in ml_unmatched.iterrows():
    title = row['title']
    if(manual_matching_dict.get(title) != None): 
        row['tconst'] = manual_matching_dict.get(title)
        matched_df_full = matched_df_full.append(row)
        
matched_movie_ids = list(matched_df_full['movie_id'])
unmatched_movie_ids = set(unmatched_movie_ids) - set(matched_movie_ids)

ml_unmatched = ml_unmatched[ml_unmatched['movie_id'].isin(list(unmatched_movie_ids))]   
        
print("all matches: " + str(matched_df_full.shape))
print("left to match: " + str(ml_unmatched.shape))


all matches: (3840, 11)
left to match: (123, 3)
