In [2]:
# Importing urlib
import urllib
import os

# Creating the data folder
if not os.path.exists('./data'):
	os.makedirs('./data')

# Obtaining the dataset using the url that hosts it
kaggle_url = 'https://github.com/sundeepblue/movie_rating_prediction/raw/master/movie_metadata.csv'
if not os.path.exists('./data/kaggle_dataset.csv'):     # avoid downloading if the file exists
	response = urllib.request.urlretrieve(kaggle_url, './data/kaggle_dataset.csv')

In [5]:
import gzip

# Obtaining IMDB's text files
imdb_url_prefix = 'ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/'
imdb_files_list = ['genres.list', 'ratings.list']
for name in imdb_files_list:
	if not os.path.exists('./data/' + name):
		response = urllib.request.urlretrieve(imdb_url_prefix + name, './data/' + name)
		urllib.urlcleanup()   # urllib fails to download two files from a ftp source. This fixes the bug!
		with gzip.open('./data/' + name) as comp_file, open('./data/' + name[:-3], 'w') as reg_file:
			file_content = comp_file.read()
			reg_file.write(file_content)

In [9]:
imdb_url = 'https://anaconda.org/BigGorilla/datasets/1/download/imdb_dataset.csv'
if not os.path.exists('./data/imdb_dataset.csv'):     # avoid downloading if the file exists
	response = urllib.request.urlretrieve(kaggle_url, './data/imdb_dataset.csv')

In [11]:
with open("./data/ratings.list/ratings.list") as myfile:
	head = [next(myfile) for x in range(38)]
print (''.join(head[28:38]))   # skipping the first 28 lines as they are descriptive headers

      0000000125  1888533   9.2  The Shawshank Redemption (1994)
      0000000125  1289428   9.2  The Godfather (1972)
      0000000124  889607   9.0  The Godfather: Part II (1974)
      0000000124  1864164   9.0  The Dark Knight (2008)
      0000000133  518449   8.9  12 Angry Men (1957)
      0000000133  971107   8.9  Schindler's List (1993)
      0000000123  1477112   8.9  Pulp Fiction (1994)
      0000000124  1349449   8.9  The Lord of the Rings: The Return of the King (2003)
      0000000123  559468   8.8  Il buono, il brutto, il cattivo (1966)
      0000000133  1513600   8.8  Fight Club (1999)



In [12]:
with open("./data/genres.list/genres.list") as myfile:
	head = [next(myfile) for x in range(392)]
print (''.join(head[382:392]))   # skipping the first 382 lines as they are descriptive header


"!Next?" (1994)						Documentary
"#1 Single" (2006)					Reality-TV
"#15SecondScare" (2015)					Horror
"#15SecondScare" (2015)					Short
"#15SecondScare" (2015)					Thriller
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Drama
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Horror
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Short



In [20]:
import re
import pandas as pd

with open("./data/genres.list/genres.list") as genres_file:
	raw_content = genres_file.readlines()
	genres_list = []
	content = raw_content[384:]
	for line in content:
		m = re.match(r'"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\).*\s((?:\w|-)+)', line.strip())
		genres_list.append([m.group(1), m.group(2), m.group(3)])
	genres_data = pd.DataFrame(genres_list, columns=['movie', 'year', 'genre'])

In [21]:
with open("./data/ratings.list/ratings.list") as ratings_file:
	raw_content = ratings_file.readlines()
	ratings_list = []
	content = raw_content[28:]
	for line in content:
		m = re.match(r'(?:\d|\.|\*){10}\s+\d+\s+(1?\d\.\d)\s"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\)', line.strip())
		if m is None: continue
		ratings_list.append([m.group(2), m.group(3), m.group(1)])
	ratings_data = pd.DataFrame(ratings_list, columns=['movie', 'year', 'rating'])

In [22]:
import pandas as pd

# Loading the Kaggle dataset from the .csv file (kaggle_dataset.csv)
kaggle_data = pd.read_csv('./data/kaggle_dataset.csv')

In [23]:
print ('Number of movies in kaggle_data: {}'.format(kaggle_data.shape[0]))
print ('Number of movies in genres_data: {}'.format(genres_data.shape[0]))
print ('Number of movies in ratings_data: {}'.format(ratings_data.shape[0]))

Number of movies in kaggle_data: 5043
Number of movies in genres_data: 2658941
Number of movies in ratings_data: 789415


In [24]:
print ('Number of duplicates in kaggle_data: {}'.format(
	sum(kaggle_data.duplicated(subset=['movie_title', 'title_year'], keep=False))))
print ('Number of duplicates in genres_data: {}'.format(
	sum(genres_data.duplicated(subset=['movie', 'year'], keep=False))))
print ('Number of duplicates in ratings_data: {}'.format(
	sum(ratings_data.duplicated(subset=['movie', 'year'], keep=False))))

Number of duplicates in kaggle_data: 241
Number of duplicates in genres_data: 2031322
Number of duplicates in ratings_data: 342815


In [25]:
kaggle_data = kaggle_data.drop_duplicates(subset=['movie_title', 'title_year'], keep='first').copy()
genres_data = genres_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy()
ratings_data = ratings_data.drop_duplicates(subset=['movie', 'year'], keep='first').copy()

In [27]:
def preprocess_title(title):
	title = title.lower()
	title = title.replace(',', ' ')
	title = title.replace("'", '')    
	title = title.replace('&', 'and')
	title = title.replace('?', '')
# 	title = title.decode('utf-8', 'ignore')
	return title.strip()

kaggle_data['norm_movie_title'] = kaggle_data['movie_title'].map(preprocess_title)
genres_data['norm_movie'] = genres_data['movie'].map(preprocess_title)
ratings_data['norm_movie'] = ratings_data['movie'].map(preprocess_title)

In [28]:

def preprocess_year(year):
	if pd.isnull(year):
		return '?'
	else:
		return str(int(year))

kaggle_data['norm_title_year'] = kaggle_data['title_year'].map(preprocess_year)
kaggle_data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,norm_movie_title,norm_title_year
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,avatar,2009
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,pirates of the caribbean: at worlds end,2007
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,spectre,2015
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,the dark knight rises,2012
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,star wars: episode vii - the force awakens,?


In [29]:

brief_imdb_data = pd.merge(ratings_data, genres_data, how='inner', on=['norm_movie', 'year'])
brief_imdb_data.head()

Unnamed: 0,movie_x,year,rating,norm_movie,movie_y,genre
0,The Shawshank Redemption,1994,9.2,the shawshank redemption,The Shawshank Redemption,Crime
1,The Godfather,1972,9.2,the godfather,The Godfather,Crime
2,The Godfather: Part II,1974,9.0,the godfather: part ii,The Godfather: Part II,Crime
3,The Dark Knight,2008,9.0,the dark knight,The Dark Knight,Action
4,12 Angry Men,1957,8.9,12 angry men,12 Angry Men,Crime


In [30]:

# reading the new IMDB dataset
imdb_data = pd.read_csv('./data/imdb_dataset.csv')
# let's normlize the title as we did in Part 3 of the tutorial
imdb_data['norm_title'] = imdb_data['title'].map(preprocess_title)
imdb_data['norm_year'] = imdb_data['year'].map(preprocess_year)
imdb_data = imdb_data.drop_duplicates(subset=['norm_title', 'norm_year'], keep='first').copy()
imdb_data.shape

(869178, 27)

In [31]:

data_attempt1 = pd.merge(imdb_data, kaggle_data, how='inner', left_on=['norm_title', 'norm_year'],
						 right_on=['norm_movie_title', 'norm_title_year'])
data_attempt1.shape

(4248, 57)

In [33]:
import py_stringsimjoin as ssj
import py_stringmatching as sm

imdb_data['id'] = range(imdb_data.shape[0])
kaggle_data['id'] = range(kaggle_data.shape[0])
similar_titles = ssj.edit_distance_join(imdb_data, kaggle_data, 'id', 'id', 'norm_title',
										'norm_movie_title', l_out_attrs=['norm_title', 'norm_year'],
										 r_out_attrs=['norm_movie_title', 'norm_title_year'], threshold=1)
# selecting the entries that have the same production year
data_attempt2 = similar_titles[similar_titles.r_norm_title_year == similar_titles.l_norm_year]
data_attempt2.shape

  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:40


(4679, 8)

In [34]:
data_attempt2[data_attempt2.l_norm_title != data_attempt2.r_norm_movie_title].head()

Unnamed: 0,_id,l_id,r_id,l_norm_title,l_norm_year,r_norm_movie_title,r_norm_title_year,_sim_score
145,145,852736,46,world war v,2013,world war z,2013,1.0
175,175,281649,56,grave,2012,brave,2012,1.0
243,243,816188,67,upe,2009,up,2009,1.0
250,250,817366,67,ut,2009,up,2009,1.0
265,265,316745,70,hug,2011,hugo,2011,1.0


In [35]:
# transforming the "budget" column into string and creating a new **mixture** column
ssj.utils.converter.dataframe_column_to_str(imdb_data, 'budget', inplace=True)
imdb_data['mixture'] = imdb_data['norm_title'] + ' ' + imdb_data['norm_year'] + ' ' + imdb_data['budget']

# repeating the same thing for the Kaggle dataset
ssj.utils.converter.dataframe_column_to_str(kaggle_data, 'budget', inplace=True)
kaggle_data['mixture'] = kaggle_data['norm_movie_title'] + ' ' + kaggle_data['norm_title_year'] + \
						 ' ' + kaggle_data['budget']

In [36]:
C = ssj.overlap_coefficient_join(kaggle_data, imdb_data, 'id', 'id', 'mixture', 'mixture', sm.WhitespaceTokenizer(), 
								 l_out_attrs=['norm_movie_title', 'norm_title_year', 'duration',
											  'budget', 'content_rating'],
								 r_out_attrs=['norm_title', 'norm_year', 'length', 'budget', 'mpaa'],
								 threshold=0.65)
C.shape

  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:08


(18317, 14)

In [38]:
import py_entitymatching as em
em.set_key(kaggle_data, 'id')   # specifying the key column in the kaggle dataset
em.set_key(imdb_data, 'id')     # specifying the key column in the imdb dataset
em.set_key(C, '_id')            # specifying the key in the candidate set
em.set_ltable(C, kaggle_data)   # specifying the left table 
em.set_rtable(C, imdb_data)     # specifying the right table
em.set_fk_rtable(C, 'r_id')     # specifying the column that matches the key in the right table 
em.set_fk_ltable(C, 'l_id')     # specifying the column that matches the key in the left table 

True

In [39]:
C[['l_norm_movie_title', 'r_norm_title', 'l_norm_title_year', 'r_norm_year',
   'l_budget', 'r_budget', 'l_content_rating', 'r_mpaa']].head()

Unnamed: 0,l_norm_movie_title,r_norm_title,l_norm_title_year,r_norm_year,l_budget,r_budget,l_content_rating,r_mpaa
0,dude wheres my dog!,#hacked,2014,2014,20000,20000,PG,
1,road hard,#horror,2015,2015,1500000,1500000,,
2,me you and five bucks,#horror,2015,2015,1500000,1500000,,
3,checkmate,#horror,2015,2015,1500000,1500000,,
4,#horror,#horror,2015,2015,1500000,1500000,Not Rated,


In [40]:

# Sampling 500 pairs and writing this sample into a .csv file
sampled = C.sample(500, random_state=0)
sampled.to_csv('./data/sampled.csv', encoding='utf-8')

In [44]:
# If you would like to avoid labeling the pairs for now, you can download the labled.csv file from
# BigGorilla using the following command (if you prefer to do it yourself, command the next line)
# response = urllib.request.urlretrieve('https://anaconda.org/BigGorilla/datasets/1/download/labeled.csv',
# 							  './data/labeled.csv')
labeled = em.read_csv_metadata('data/labeled.csv', ltable=kaggle_data, rtable=imdb_data,
							   fk_ltable='l_id', fk_rtable='r_id', key='_id')
labeled.head()

Metadata file is not present in the given path; proceeding to read the csv file.


Unnamed: 0.1,Unnamed: 0,_id,l_id,r_id,l_norm_movie_title,l_norm_title_year,l_duration,l_budget,l_content_rating,r_norm_title,r_norm_year,r_length,r_budget,r_mpaa,_sim_score,label
0,4771,4771,2639,235925,eye of the beholder,1999,109.0,15000000,R,eye of the beholder,1999,109.0,35000000,R,0.833333,1
1,11478,11478,2001,600301,rocky balboa,2006,139.0,24000000,PG,rocky balboa,2006,139.0,24000000,PG,1.0,1
2,13630,13630,4160,691766,from russia with love,1963,115.0,2000000,Approved,the aeolians: from russia with love,2012,,20000,,0.666667,0
3,1972,1972,1248,101029,sex tape,2014,94.0,40000000,R,blended,2014,117.0,40000000,PG-13,0.666667,0
4,15903,15903,722,758133,the scorch trials,2015,132.0,61000000,PG-13,the scorch trials,2015,132.0,61000000,PG-13,1.0,1


In [45]:
split = em.split_train_test(labeled, train_proportion=0.5, random_state=0)
train_data = split['train']
test_data = split['test']

dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

In [46]:
attr_corres = em.get_attr_corres(kaggle_data, imdb_data)
attr_corres['corres'] = [('norm_movie_title', 'norm_title'), 
						 ('norm_title_year', 'norm_year'),
						('content_rating', 'mpaa'),
						 ('budget', 'budget'),
]

l_attr_types = em.get_attr_types(kaggle_data)
r_attr_types = em.get_attr_types(imdb_data)

tok = em.get_tokenizers_for_matching()
sim = em.get_sim_funs_for_matching()

F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim)

In [47]:
train_features = em.extract_feature_vecs(train_data, feature_table=F, attrs_after='label', show_progress=False) 
train_features = em.impute_table(train_features,  exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean')

ValueError: 'X' and 'missing_values' types are expected to be both numerical. Got X.dtype=float64 and  type(missing_values)=<class 'str'>.

In [48]:

result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=train_features, 
						   exclude_attrs=['_id', 'l_id', 'r_id', 'label'], k=5,
						   target_attr='label', metric='f1', random_state=0)
result['cv_stats']

TypeError: select_matcher() got an unexpected keyword argument 'metric'

In [49]:
best_model = result['selected_matcher']
best_model.fit(table=train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], target_attr='label')

test_features = em.extract_feature_vecs(test_data, feature_table=F, attrs_after='label', show_progress=False)
test_features = em.impute_table(test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean')

# Predict on the test data
predictions = best_model.predict(table=test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], 
								 append=True, target_attr='predicted', inplace=False)

# Evaluate the predictions
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)

NameError: name 'result' is not defined

In [50]:
candset_features = em.extract_feature_vecs(C, feature_table=F, show_progress=True)
candset_features = em.impute_table(candset_features, exclude_attrs=['_id', 'l_id', 'r_id'], strategy='mean')
predictions = best_model.predict(table=candset_features, exclude_attrs=['_id', 'l_id', 'r_id'],
								 append=True, target_attr='predicted', inplace=False)
matches = predictions[predictions.predicted == 1]

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:13


ValueError: 'X' and 'missing_values' types are expected to be both numerical. Got X.dtype=float64 and  type(missing_values)=<class 'str'>.

In [51]:
from py_entitymatching.catalog import catalog_manager as cm
matches = matches[['_id', 'l_id', 'r_id', 'predicted']]
matches.reset_index(drop=True, inplace=True)
cm.set_candset_properties(matches, '_id', 'l_id', 'r_id', kaggle_data, imdb_data)
matches = em.add_output_attributes(matches, l_output_attrs=['norm_movie_title', 'norm_title_year', 'budget', 'content_rating'],
								   r_output_attrs=['norm_title', 'norm_year', 'budget', 'mpaa'],
								   l_output_prefix='l_', r_output_prefix='r_',
								   delete_from_catalog=False)
matches.drop('predicted', axis=1, inplace=True)
matches.head()

NameError: name 'matches' is not defined

In [None]:
#======================================================================
