In [6]:
# importing packages
import py_entitymatching as em
import pandas as pd
import numpy as np
import os

### Reading Dataset

Here we read two datasets. One contains merged tuples from songs.csv and tracks.csv. The other contains movies data.

In [2]:
# setting up the directory path 
path_A = os.path.join('dataset','merged_data.csv')
path_B = os.path.join('..','dataset','datasets','movies.csv')

# Read the CSV files
A = em.read_csv_metadata(path_A,low_memory=False) # setting the parameter low_memory to False  to speed up loading.
B = em.read_csv_metadata(path_B,low_memory=False)

In [3]:
#creating a 'id' column in merged dataset for further analysis
# Note: year attribute in this dataset has both of the years where years from songs and tracks dataset doesn't match
A['id'] = A.index
A.head()

Unnamed: 0,movie_title,year,episode,song_title,artists,song_id,track_id,id
0,the pledge,"[2001, 1996]",,poor twisted me,james hetfield+lars ulrich+metallica+arrangement with warner special products,511255.0,678831.0,0
1,william s. burroughs: commissioner of sewers,"[1991, 1990]",,batman br�t fische,fm einheit,150981.0,724999.0,1
2,the warriors,"[2005, 1979]",,love is a fire,genya ravan+johnny vastano+vini poncia,328251.0,690267.0,2
3,t in the park 2010,2010,muse/calvin harris (#1.3),map of the problematique [live from wembley stadium],matthew bellamy+muse,227686.0,231063.0,3
4,dolly parton: live & well,"[2004, 2002]",,dagger through the heart,dolly parton,531984.0,418267.0,4


In [4]:
B.rename(columns = {'title':'movie_title'}, inplace = True)
B.head()

Unnamed: 0,id,movie_title,year,length,budget,rating,votes,r1,r2,r3,...,r9,r10,mpaa,Action,Animation,Comedy,Drama,Documentary,Romance,Short
0,0,#,2012,15.0,10000.0,,,,,,...,,,,0,0,1,0,0,0,1
1,1,#,2014,,0.0,,,,,,...,,,,0,0,1,0,0,0,1
2,2,#1,2005,30.0,5000.0,6.8,11.0,14.5,0.0,0.0,...,4.5,14.5,,0,0,0,1,0,0,1
3,3,#1,2009,4.0,,,,,,,...,,,,0,1,0,0,0,0,1
4,4,#1,2010,12.0,0.0,,,,,,...,,,,0,0,1,0,0,0,1


In [None]:
#Down sample the tables
sample_A, sample_B = em.down_sample(A, B, size=100000, y_param=1, show_progress=False)

# Set 'id' as the keys to the input tables
em.set_key(sample_A,'id')
em.set_key(sample_B,'id')

sample_A = sample_A.apply(lambda x: x.astype(str).str.lower())
sample_B = sample_B.apply(lambda x: x.astype(str).str.lower())

#Print lengths of the sampled tables
print(len(sample_A))
print(len(sample_B))

In [None]:
sample_A.to_csv('merged_sample.csv', index = False, sep = ',')
sample_B.to_csv('movies_sample.csv', index = False, sep = ',')

#Get headers of sampled tables
headers_A = list(A.columns)
headers_B = list(B.columns)

In [7]:
sample_A = em.read_csv_metadata('merged_sample.csv',low_memory=False)
sample_B = em.read_csv_metadata('movies_sample.csv',low_memory=False)

# Set 'id' as the keys to the input tables
em.set_key(sample_A,'id')
em.set_key(sample_B,'id')

True

In [8]:
#generating features for blocking
block_f = em.get_features_for_blocking(sample_A, sample_B)
block_f

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,movie_title_movie_title_jac_qgm_3_qgm_3,movie_title,movie_title,qgm_3,qgm_3,jaccard,<function movie_title_movie_title_jac_qgm_3_qgm_3 at 0x000000001C1FE438>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,movie_title_movie_title_cos_dlm_dc0_dlm_dc0,movie_title,movie_title,dlm_dc0,dlm_dc0,cosine,<function movie_title_movie_title_cos_dlm_dc0_dlm_dc0 at 0x000000001C1FE518>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,movie_title_movie_title_jac_dlm_dc0_dlm_dc0,movie_title,movie_title,dlm_dc0,dlm_dc0,jaccard,<function movie_title_movie_title_jac_dlm_dc0_dlm_dc0 at 0x000000001C1FE3C8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,movie_title_movie_title_mel,movie_title,movie_title,,,monge_elkan,<function movie_title_movie_title_mel at 0x000000001C1FE588>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
4,movie_title_movie_title_lev_dist,movie_title,movie_title,,,lev_dist,<function movie_title_movie_title_lev_dist at 0x000000001C1FE5F8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
5,movie_title_movie_title_lev_sim,movie_title,movie_title,,,lev_sim,<function movie_title_movie_title_lev_sim at 0x000000001C1FE668>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
6,movie_title_movie_title_nmw,movie_title,movie_title,,,needleman_wunsch,<function movie_title_movie_title_nmw at 0x000000001C1FE6D8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
7,movie_title_movie_title_sw,movie_title,movie_title,,,smith_waterman,<function movie_title_movie_title_sw at 0x000000001C1FE748>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
8,id_id_exm,id,id,,,exact_match,<function id_id_exm at 0x000000001C1FE7B8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
9,id_id_anm,id,id,,,abs_norm,<function id_id_anm at 0x000000001C1FE828>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


In [9]:
#helper function for filtering out some of the tuples based on movile title
def title_function(x, y):

    x_title = str(x['movie_title'])
    y_title = str(y['movie_title'])
    
    x_split = x_title.split()
    y_split = y_title.split()

    intersection = len(set(x_split) & set(y_split))
    union = len(set(x_split) | set(y_split))
    
    frac = float (float(intersection) / float(union))
    
    if( frac < 0.5):
        return True
    
    else:
        x_year = x['year']
        y_year = y['year']
        
        if x_year[0] == '[':
            x_year = x_year[1:-1]
            x_year = x_year.split(',')
        else:
            x_year = [x_year]
        
        for x in x_year:
            if abs(int(x) - float(y_year)) <= 5:
                return False
        
        return True

In [10]:
rb = em.RuleBasedBlocker()
ob = em.OverlapBlocker()
bb = em.BlackBoxBlocker()

# remove pairs that don't share similar movie titles
rule1 = ['movie_title_movie_title_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.5']
rb.add_rule(rule1, block_f)

C1 = rb.block_tables(sample_A, sample_B, l_output_attrs=['movie_title','year'], r_output_attrs=['movie_title','year'], show_progress=False)


0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:01:06
0%  100%
[]

Finding pairs with missing value...


In [11]:
len(C1)

4705

In [12]:
bb.set_black_box_function(title_function)
C2 = bb.block_candset(C1)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:02


In [13]:
print len(C2) 
C2.head(50)

1244


Unnamed: 0,_id,ltable_id,rtable_id,ltable_movie_title,ltable_year,rtable_movie_title,rtable_year
32,32,592.0,571847.0,prison break,"[2005, 1978]",prison break,2008.0
33,33,3476.0,571847.0,prison break,2005,prison break,2008.0
34,34,2365.0,828790.0,las vegas,"[2003, 1963]",viva las vegas,1964.0
37,37,1450.0,828790.0,las vegas,"[2003, 1963]",viva las vegas,1964.0
40,40,307.0,53603.0,an american tail: fievel goes west,1991,an american tail: fievel goes west,1991.0
42,42,2634.0,527545.0,once upon a wheel,1971,once upon a scoundrel,1974.0
48,48,5010.0,827242.0,vinyl,"[2016, 1970]",vinyl,1965.0
58,58,4949.0,722638.0,the great gatsby,"[2013, 2006]",the great gabble,2003.0
59,59,3283.0,260836.0,from hell,2001,from hell,2001.0
66,66,3251.0,711622.0,the doors,1991,the doors,1991.0


In [14]:
#Debugging step: debugging the helper function
status = bb.block_tuples(sample_A.ix[3276], sample_B.ix[sample_B[sample_B['id']== 776498].index.item()])
title_function(sample_A.ix[3276], sample_B.ix[sample_B[sample_B['id']== 776498].index.item()])

False

In [15]:
#outputting the debugger from py_entitymatching after performing blocking
dbg = em.debug_blocker(C2, sample_A, sample_B, output_size=50)
dbg

Unnamed: 0,_id,similarity,ltable_id,rtable_id,ltable_movie_title,ltable_year,rtable_movie_title,rtable_year
0,0,0.8,6314,675906,sweeney todd: the demon barber of fleet street,2007,sweeney todd: the demon barber of fleet street,1936
1,1,0.75,5336,713432,the living end,1992,the end,1992
2,2,0.75,6892,95485,big brother,2000,big brother trouble,2000
3,3,0.714286,5601,128144,catch me if you can,2002,catch me if you can,1996
4,4,0.714286,2582,733493,the last of the mohicans,1992,the last of the mohicans,1936
5,5,0.666667,725,688705,terror firmer,1999,terror,1999
6,6,0.666667,320,427285,thats life,2000,life,2000
7,7,0.666667,2975,823966,veronica mars,2004,veronica,2004
8,8,0.666667,589,266343,gangs of new york,2002,gangs of new york,1938
9,9,0.666667,4853,680720,take,2007,take out,2007


In [18]:
#writing the movie dataset after blocking
C2 = C2.rename(columns = {'_id':'id'})
C2.to_csv('movie_dataset_after_blocking.csv',sep = ',',index = False)

In [72]:
#Sample the result set C2
#we sample 400 tuples after blocking
S= em.sample_table(C2,400)
G = em.label_table(S, label_column_name='gold_labels')

In [81]:
#labeling the sampled data
S['gold_label'] = G['gold_labels']
S.to_csv('./dataset/labeled_movie_data.csv', index = False, sep = ',')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
