In [1]:
import py_entitymatching as em
import pandas as pd
import numpy as np
import os

In [2]:
temp_G = em.read_csv_metadata(os.path.join('dataset','labeled_movie_data.csv'))
temp_G = temp_G.rename(columns = {'_id':'id'})
temp_G.to_csv('labeled_movies.csv',index=False, sep=',')

In [33]:
A = em.read_csv_metadata('merged_sample.csv',key='id', low_memory=False) # setting the parameter low_memory to False  to speed up loading.
B = em.read_csv_metadata('movies_sample.csv',key='id',low_memory=False)
G = em.read_csv_metadata('labeled_movies.csv',key='id',low_memory=False,ltable=A, rtable=B, fk_ltable='ltable_id', fk_rtable='rtable_id')

In [34]:
G.head()

Unnamed: 0,id,ltable_id,rtable_id,ltable_movie_title,ltable_year,rtable_movie_title,rtable_year,gold_label
0,33,3476,571847,prison break,2005,prison break,2008,1
1,34,2365,828790,las vegas,"[2003, 1963]",viva las vegas,1964,0
2,40,307,53603,an american tail: fievel goes west,1991,an american tail: fievel goes west,1991,1
3,42,2634,527545,once upon a wheel,1971,once upon a scoundrel,1974,0
4,73,4723,700054,the boys are back,2009,the boys are back,2009,1


In [5]:
temp_G = G
#temp_G = temp_G.rename(columns = {'ltable_year':'ltable_year_old'})
temp_G.head()

Unnamed: 0,id,ltable_id,rtable_id,ltable_movie_title,ltable_year,rtable_movie_title,rtable_year,gold_label
0,33,3476,571847,prison break,2005,prison break,2008,1
1,34,2365,828790,las vegas,"[2003, 1963]",viva las vegas,1964,0
2,40,307,53603,an american tail: fievel goes west,1991,an american tail: fievel goes west,1991,1
3,42,2634,527545,once upon a wheel,1971,once upon a scoundrel,1974,0
4,73,4723,700054,the boys are back,2009,the boys are back,2009,1


In [6]:
for index, row in temp_G.iterrows():
    
    if row['ltable_year'][0] == '[':
        year = row['ltable_year'][1:-1]
        year = year.split(',')
        a = abs(int(year[0])-int(row['rtable_year']))
        b = abs(int(year[1])-int(row['rtable_year'])) 
        if(a<b):
            temp_G.loc[index,'ltable_year'] = year[0]
        else:
            temp_G.loc[index,'ltable_year'] = year[1]
    row['rtable_year'] = int(row['rtable_year'])
    #print row['rtable_year']

In [7]:
temp_G[['rtable_year']] = temp_G[['rtable_year']].astype(int)
temp_G.head()
#temp_G.to_csv('labeled_movie_ltable_fix.csv',sep='\t',index = False)

Unnamed: 0,id,ltable_id,rtable_id,ltable_movie_title,ltable_year,rtable_movie_title,rtable_year,gold_label
0,33,3476,571847,prison break,2005,prison break,2008,1
1,34,2365,828790,las vegas,1963,viva las vegas,1964,0
2,40,307,53603,an american tail: fievel goes west,1991,an american tail: fievel goes west,1991,1
3,42,2634,527545,once upon a wheel,1971,once upon a scoundrel,1974,0
4,73,4723,700054,the boys are back,2009,the boys are back,2009,1


In [8]:
# Split G into I an J
train_test = em.split_train_test(temp_G, train_proportion=0.75,random_state=0)
I = train_test['train']
J = train_test['test']
I.to_csv(os.path.join('dataset','movies_train_0.75.csv'))
J.to_csv(os.path.join('dataset','movies_test_0.75.csv'))

In [9]:
len(I),len(J)

(300, 100)

In [26]:
# Generate a set of features
F = em.get_features_for_matching(A, B)
print(F.feature_name)
# Remove all features on id parameters
F = F[0:8]
print(F.feature_name)

0         movie_title_movie_title_jac_qgm_3_qgm_3
1     movie_title_movie_title_cos_dlm_dc0_dlm_dc0
2     movie_title_movie_title_jac_dlm_dc0_dlm_dc0
3                     movie_title_movie_title_mel
4                movie_title_movie_title_lev_dist
5                 movie_title_movie_title_lev_sim
6                     movie_title_movie_title_nmw
7                      movie_title_movie_title_sw
8                                       id_id_exm
9                                       id_id_anm
10                                 id_id_lev_dist
11                                  id_id_lev_sim
Name: feature_name, dtype: object
0        movie_title_movie_title_jac_qgm_3_qgm_3
1    movie_title_movie_title_cos_dlm_dc0_dlm_dc0
2    movie_title_movie_title_jac_dlm_dc0_dlm_dc0
3                    movie_title_movie_title_mel
4               movie_title_movie_title_lev_dist
5                movie_title_movie_title_lev_sim
6                    movie_title_movie_title_nmw
7                     m

In [27]:
#F = F[0:5]

In [28]:
# Convert I into a set of feature vectors using F
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='gold_label',
                            show_progress=False) 

In [12]:
# Check if the feature vectors contain missing values
# A return value of True means that there are missing values
any(pd.notnull(H))

True

In [13]:
# Impute feature vectors with 0.
H.fillna(value=0, inplace=True)

In [14]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0,max_depth=5)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

In [15]:
# Compute precision and select the best ML matcher using CV
result_precision = em.select_matcher([dt, rf, svm, ln, lg, nb], table=H, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_label'],
        k=5,
        target_attr='gold_label', metric='precision', random_state=0)

result_precision['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000000000C0BDFD0>,5,0.761905,0.842105,0.809524,0.764706,0.8,0.795648
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000000000C0BDEB8>,5,0.888889,0.842105,0.85,0.75,0.863636,0.838926
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000000000C0BDEF0>,5,0.789474,0.75,0.8,0.733333,0.941176,0.802797
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000000000C0BDE48>,5,0.782609,0.708333,0.692308,0.722222,0.76,0.733094
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000000000C084FD0>,5,0.789474,0.761905,0.791667,0.764706,0.863636,0.794277
5,NaiveBayes,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x000000000C0BD898>,5,0.782609,0.708333,0.692308,0.764706,0.76,0.741591


In [16]:
result_recall = em.select_matcher([dt, rf, svm, ln, lg, nb], table=H, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_label'],
        k=5,
        target_attr='gold_label', metric='recall', random_state=0)
result_recall['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000000000C0BDFD0>,5,0.888889,0.888889,0.85,0.866667,0.952381,0.889365
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000000000C0BDEB8>,5,0.888889,0.888889,0.85,0.8,0.904762,0.866508
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000000000C0BDEF0>,5,0.833333,0.833333,0.8,0.733333,0.761905,0.792381
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000000000C0BDE48>,5,1.0,0.944444,0.9,0.866667,0.904762,0.923175
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000000000C084FD0>,5,0.833333,0.888889,0.95,0.866667,0.904762,0.88873
5,NaiveBayes,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x000000000C0BD898>,5,1.0,0.944444,0.9,0.866667,0.904762,0.923175


In [17]:
result_f1 = em.select_matcher([dt, rf, svm, ln, lg, nb], table=H, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_label'],
        k=5,
        target_attr='gold_label', metric='f1', random_state=0)
result_f1['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000000000C0BDFD0>,5,0.820513,0.864865,0.829268,0.8125,0.869565,0.839342
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000000000C0BDEB8>,5,0.888889,0.864865,0.85,0.774194,0.883721,0.852334
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000000000C0BDEF0>,5,0.810811,0.789474,0.8,0.733333,0.842105,0.795145
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000000000C0BDE48>,5,0.878049,0.809524,0.782609,0.787879,0.826087,0.816829
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000000000C084FD0>,5,0.810811,0.820513,0.863636,0.8125,0.883721,0.838236
5,NaiveBayes,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x000000000C0BD898>,5,0.878049,0.809524,0.782609,0.8125,0.826087,0.821754


In [18]:
#Debug Random Forest Matcher X
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']

In [19]:
# Debug X using GUI
em.vis_debug_rf(rf, P, Q, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_label'],
        target_attr='gold_label')

In [24]:
print(F.feature_name)


0        movie_title_movie_title_jac_qgm_3_qgm_3
1    movie_title_movie_title_cos_dlm_dc0_dlm_dc0
2    movie_title_movie_title_jac_dlm_dc0_dlm_dc0
3                    movie_title_movie_title_mel
4               movie_title_movie_title_lev_dist
5                movie_title_movie_title_lev_sim
6                    movie_title_movie_title_nmw
7                     movie_title_movie_title_sw
Name: feature_name, dtype: object


In [37]:
# Evaluate matching output
# Convert movies dataset after blocking  into a set of feature vectors using feature table
K = em.read_csv_metadata('movie_dataset_after_blocking.csv',key='id', low_memory=False, ltable = A, rtable = B, fk_ltable = 'ltable_id', fk_rtable = 'rtable_id')
K['gold_label'] = 0
K.head()

Unnamed: 0,id,ltable_id,rtable_id,ltable_movie_title,ltable_year,rtable_movie_title,rtable_year,gold_label
0,32,592.0,571847.0,prison break,"[2005, 1978]",prison break,2008.0,0
1,33,3476.0,571847.0,prison break,2005,prison break,2008.0,0
2,34,2365.0,828790.0,las vegas,"[2003, 1963]",viva las vegas,1964.0,0
3,37,1450.0,828790.0,las vegas,"[2003, 1963]",viva las vegas,1964.0,0
4,40,307.0,53603.0,an american tail: fievel goes west,1991,an american tail: fievel goes west,1991.0,0


In [38]:
L = em.extract_feature_vecs(K, feature_table=F, attrs_after='gold_label', show_progress=False)

In [40]:
# Train using feature vectors from I using random forest
rf.fit(table=H, 
       exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_label'], 
       target_attr='gold_label')
# Predict on L 
predictions_rf = rf.predict(table=L, exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_label'], 
                         append=True,target_attr='predicted_rf_labels')
# Evaluate the predictions
eval_result = em.eval_matches(predictions_rf, 'gold_label', 'predicted_rf_labels')
em.print_eval_summary(eval_result)

Precision : 0.0% (0/430)
Recall : 0.0% (0/0)
F1 : 0.0%
False positives : 430 (out of 430 positive predictions)
False negatives : 0 (out of 814 negative predictions)


In [41]:
predictions_rf.head()

Unnamed: 0,id,ltable_id,rtable_id,movie_title_movie_title_jac_qgm_3_qgm_3,movie_title_movie_title_cos_dlm_dc0_dlm_dc0,movie_title_movie_title_jac_dlm_dc0_dlm_dc0,movie_title_movie_title_mel,movie_title_movie_title_lev_dist,movie_title_movie_title_lev_sim,movie_title_movie_title_nmw,movie_title_movie_title_sw,gold_label,predicted_rf_labels
0,32,592.0,571847.0,1.0,1.0,1.0,1.0,0.0,1.0,12.0,12.0,0,1
1,33,3476.0,571847.0,1.0,1.0,1.0,1.0,0.0,1.0,12.0,12.0,0,1
2,34,2365.0,828790.0,0.5,0.816497,0.666667,0.769841,5.0,0.642857,4.0,9.0,0,0
3,37,1450.0,828790.0,0.5,0.816497,0.666667,0.769841,5.0,0.642857,4.0,9.0,0,0
4,40,307.0,53603.0,1.0,1.0,1.0,1.0,0.0,1.0,34.0,34.0,0,1


In [44]:
predictions_rf = predictions_rf[predictions_rf['predicted_rf_labels']==1]
predicted_matches = predictions_rf[['id','ltable_id','rtable_id']]
predicted_matches.to_csv('predicted_matches_for_movies_dataset.csv',index=False,sep = ',')

In [46]:
# filtering the matched tuples from both dataset 
matchedSongsTracks = A[A['id'].isin(list(predicted_matches['ltable_id']))]
matchedMovies = B[B['id'].isin(list(predicted_matches['rtable_id']))]

#assert(len(matchedTracks)==len(matchedSongs))
len(matchedSongsTracks), len(matchedMovies)

(414, 296)

In [48]:
len(set(predicted_matches['ltable_id'])), len(set(predicted_matches['rtable_id'])), len(predicted_matches)

(414, 296, 430)

In [49]:
matchedSongsTracks.head()

Unnamed: 0,movie_title,year,episode,song_title,artists,song_id,track_id,id
6,you cant hurry love,1988,,you can't hurry love (lp version),eddie holland+lamont dozier+brian holland+phil collins+arrangement with warner special products,36942.0,729825.0,6
11,the good shepherd,"[2006, 1998]",,the whiffenpoof song (baa! baa! baa!),george s. pomeroy+meade minnigerode+tod b. galloway+rev. by rudy vallee,712029.0,662614.0,11
17,the bleeding,"[2011, 2008]",,and it's alright,peter broderick,892960.0,649958.0,17
35,perkele! kuvia suomesta,1971,,laulu nesteest�,m.a. numminen,532624.0,574812.0,35
39,the 40 year old virgin,"[2005, 1988]",,push it,hurby luv bug azor+ray davies+salt-n-pepa,733866.0,644175.0,39


In [50]:
matchedMovies.head()

Unnamed: 0,id,movie_title,year,length,budget,rating,votes,r1,r2,r3,...,r9,r10,mpaa,Action,Animation,Comedy,Drama,Documentary,Romance,Short
460,571847,prison break,2008,,,8.4,103.0,4.5,4.5,4.5,...,24.5,34.5,,0,0,1,0,0,0,1
557,53603,an american tail: fievel goes west,1991,75.0,,6.4,17349.0,4.5,4.5,4.5,...,4.5,4.5,,0,1,0,0,0,0,0
1154,700054,the boys are back,2009,104.0,,6.8,8859.0,4.5,4.5,4.5,...,4.5,4.5,pg-13,0,0,0,1,0,0,0
1432,407456,lantana,2001,121.0,,7.4,16548.0,4.5,4.5,4.5,...,14.5,14.5,r,0,0,0,1,0,1,0
2409,833034,wake,2003,90.0,,6.9,348.0,4.5,4.5,4.5,...,4.5,44.5,,0,0,0,1,0,0,0


In [51]:
predicted_matches.head()

Unnamed: 0,id,ltable_id,rtable_id
0,32,592.0,571847.0
1,33,3476.0,571847.0
4,40,307.0,53603.0
14,73,4723.0,700054.0
15,86,2281.0,407456.0


In [134]:
column_names = list(pd.merge(matchedMovies, matchedSongsTracks, on='id',how ='outer').columns)
column_names.remove('movie_title_y')
column_names.remove('year_y')
column_names.remove('id')
column_names[0] = 'movie_title'
column_names[1] = 'year'
column_names

['movie_title',
 'year',
 'length',
 'budget',
 'rating',
 'votes',
 'r1',
 'r2',
 'r3',
 'r4',
 'r5',
 'r6',
 'r7',
 'r8',
 'r9',
 'r10',
 'mpaa',
 'Action',
 'Animation',
 'Comedy',
 'Drama',
 'Documentary',
 'Romance',
 'Short',
 'episode',
 'song_title',
 'artists',
 'song_id',
 'track_id']

In [148]:
#Schema of the merged table
E = pd.DataFrame(columns = column_names)
len(E.columns)

29

In [150]:
import math
for index, row in predicted_matches.iterrows(): 
    
    entry = list()
    
    left_entry = matchedSongsTracks[matchedSongsTracks['id']==row['ltable_id']]
    right_entry = matchedMovies[matchedMovies['id']==row['rtable_id']]
    
    assert(len(left_entry)==1)
    assert(len(right_entry)==1)
    
    left_entry['id'] = 1
    right_entry['id'] = 1
    
    left_entry = left_entry.drop(['movie_title','year'], axis=1)
    merged_row = pd.merge(right_entry, left_entry,on='id',how='inner')
    merged_row = merged_row.drop('id',axis = 1)
    
    assert (len(merged_row.columns)==len(E.columns))
    
    #appending the merged value to table E
    E = E.append(merged_row, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [147]:
E.columns


Index([u'Action', u'Animation', u'Comedy', u'Documentary', u'Drama',
       u'Romance', u'Short', u'artists', u'budget', u'episode', u'length',
       u'movie_title', u'mpaa', u'r1', u'r10', u'r2', u'r3', u'r4', u'r5',
       u'r6', u'r7', u'r8', u'r9', u'rating', u'song_id', u'song_title',
       u'track_id', u'votes', u'year'],
      dtype='object')

In [151]:
E.to_csv('Final_merged_table.csv',sep=',',index=False)