In [None]:
import sys
sys.path.append('../')
from utils import *
from graphUtils import *

# DeepMatcher

In [None]:
!pip install deepmatcher 

## Training

In [None]:
import pickle
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file.

    NOTE: unpickling can execute arbitrary code; only load files that were
    produced locally / come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pickled inputs of the IMDb experiment.
# - ground_truth: indexed by movie title; each value is iterated as review keys.
# - review_ids:   indexed by review key; values are used as the review text.
# - df, row_ids:  loaded here but not used by the cells visible in this section.
df = _load_pickle('../../data/imdb/imdb_reviews_1000film.df')
ground_truth = _load_pickle('../../data/imdb/imdb_GT.pkl')
review_ids = _load_pickle('../../data/imdb/imdb_reviewIDs.pkl')
row_ids = _load_pickle('../../data/imdb/imdb_movieIDs.pkl')

In [None]:
import datetime
import csv
# Map each movie title (column 12, underscores replaced by spaces) to the list
# of attribute records found for that title in the CSV.
movies_dic = {}
# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly.
with open('../../data/imdb/imdb_movielens.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # skip the header row (tolerates an empty file)
    for row in csv_reader:
        title = row[12].replace('_', ' ')

        # The first ten columns are kept as free-text attributes.
        record = [r.replace('_', ' ') for r in row[0:10]]

        # Column 10: the first four characters are taken as the year and the
        # remainder as a month number (looks like YYYYMM - TODO confirm).
        # The month number is turned into its name via strftime('%B').
        month, year = '', ''
        if len(row[10]) > 0:
            month = datetime.date(1900, int(row[10][4:]), 1).strftime('%B')
            year = row[10][0:4]

        record.append(month.lower() + ' ' + year)
        # Column 14 is numeric, possibly written as a float; stored as an int.
        record.append(int(float(row[14])))

        movies_dic.setdefault(title, []).append(record)
        

In [None]:
import numpy as np
import pandas as pd

def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=None):
    """Randomly partition the rows of ``df`` into train/validation/test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Any index is accepted (rows are selected by position).
    train_percent : float
        Fraction of rows placed in the training split.
    validate_percent : float
        Fraction of rows placed in the validation split; the remaining rows
        form the test split.
    seed : int or None
        Passed to ``np.random.seed`` before drawing the permutation, so a
        fixed seed yields a reproducible split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)`` - in that order.
    """
    np.random.seed(seed)
    m = len(df.index)
    # Permute row *positions*, not index labels: the result is fed to .iloc,
    # which is positional. Permuting df.index only worked when the labels
    # happened to be a permutation of 0..m-1.
    perm = np.random.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test


In [None]:
import random
# Build the labelled (movie text, review text) pairs used for training.
# Each row is [left_movie, right_movie, label] with label '1' for a review
# listed in ground_truth for that movie and '0' for every other review.
header = ['left_movie','right_movie','label']
all_table = []

# Training uses the first 60% of the ground-truth movies.
train_movies = list(ground_truth.keys())[0:int(0.6*len(ground_truth))]

# random.sample() needs a real sequence: passing a dict view was deprecated in
# Python 3.9 and raises TypeError from 3.11 on.
review_keys = list(review_ids)

for movie in train_movies:
    if movie not in movies_dic: continue
    # Left-hand text: the title followed by the attributes of the movie's
    # first record, skipping empty / 'nan' values.
    text = movie + ' ' + ' '.join(str(m).strip() for m in movies_dic[movie][0] if m not in ['', 'nan'])

    positives = ground_truth[movie]
    for r in positives:
        all_table.append([text,review_ids[r],'1'])

    # Set membership keeps the negative scan linear in the number of reviews.
    positive_set = set(positives)
    for r in random.sample(review_keys,len(review_keys)):
        if r not in positive_set:
            all_table.append([text,review_ids[r],'0'])


In [None]:
# Sanity check: number of labelled pairs currently held in all_table.
len(all_table)

In [None]:
from sklearn.utils import shuffle

# Turn the labelled pairs into a DataFrame (this rebinds `df`), shuffle the
# rows and split them.
df = pd.DataFrame(all_table,columns=header)
df = shuffle(df)
# train_validate_test_split returns (train, validate, test) in that order;
# unpacking it as (train, test, val) swapped the validation and test splits.
train, val, test = train_validate_test_split(df)

In [None]:
# Write each split to its CSV file in the working directory, without the index.
for split_frame, csv_name in ((train, 'train.csv'), (test, 'test.csv'), (val, 'validate.csv')):
    split_frame.to_csv(csv_name, index=False)

In [None]:
import deepmatcher as dm
# Let DeepMatcher read the three CSV files; with path='' the
# file names are used as given (relative to the working directory).
# Note: this rebinds the names `train` and `test` to whatever
# dm.data.process returns for the corresponding file.
train, validation, test = dm.data.process(
    path='',
    train='train.csv',
    validation='validate.csv',
    test='test.csv')


In [None]:
# Train a DeepMatcher model with its default configuration for two epochs.
# NOTE(review): best_save_path is an absolute path at the filesystem root
# ('/deepMatcher_imdb.pth'); every other path in this notebook is relative.
# Writing there needs root-level permissions - confirm this is intended.
model = dm.MatchingModel()
model.run_train(train, validation, best_save_path='/deepMatcher_imdb.pth',epochs=2)

## Testing

In [None]:
import random
from sklearn.utils import shuffle
import pandas as pd
from tqdm import tqdm

# For every held-out movie, pair it with every review, score the pairs with
# the trained model and keep the reviews ranked by match score.
# deep_imdb maps movie title -> {review key: score}, highest score first.
header = ['review','left_movie','right_movie','label']
deep_imdb = {}

# random.sample() needs a real sequence: passing a dict view was deprecated in
# Python 3.9 and raises TypeError from 3.11 on.
review_keys = list(review_ids)

# The training cell uses the first 60% of the ground-truth movies, so the
# evaluation starts at 60% as well; starting at 50% put a tenth of the
# training movies into the test set.
test_movies = list(ground_truth.keys())[int(0.6*len(ground_truth)):]

for movie in tqdm(test_movies):
    if movie not in movies_dic: continue
    # NOTE(review): the training cell builds this text as
    # `movie + ' ' + <attributes>`; here only the attributes are used.
    # Confirm whether the title should be prepended at test time too.
    text = ' '.join(str(m).strip() for m in movies_dic[movie][0] if m not in ['', 'nan'])

    positives = ground_truth[movie]
    positive_set = set(positives)

    # Rows are [review key, movie text, review text, label].
    all_table = [[r,text,review_ids[r],'1'] for r in positives]
    all_table.extend(
        [r,text,review_ids[r],'0']
        for r in random.sample(review_keys,len(review_keys))
        if r not in positive_set
    )

    dff = pd.DataFrame(all_table,columns=header)
    dff = shuffle(dff)
    dff["id"] = dff.index

    dff.to_csv('new_test.csv',index=False)
    # id -> review key, to translate prediction ids back to reviews.
    rev_index = pd.Series(dff.review.values,index=dff.index).to_dict()

    unlabeled = dm.data.process_unlabeled(path='new_test.csv', trained_model=model,ignore_columns=('label','review'))
    preds = model.run_prediction(unlabeled)

    # run_prediction returns a frame indexed by pair id with a 'match_score'
    # column; read the scalar scores directly instead of calling float() on a
    # one-element row Series.
    scores = {
        rev_index[int(pair_id)]: float(score)
        for pair_id, score in preds['match_score'].items()
    }
    # Sort once per movie (previously re-sorted after every single row).
    if scores:
        deep_imdb[movie] = dict(sorted(scores.items(), key=lambda x: x[1],reverse=True))


In [None]:
# Report ranking metrics at several cut-offs K, averaged over the scored movies.
# MAP_K, MRRR and HAS_POSITIVE are not defined in this notebook section
# (presumably provided by the star-imported utils module - TODO confirm).

# The ranking of a movie does not depend on K, so sort each one only once.
ranked_reviews = {
    movie: [f for (f,j) in sorted(deep_imdb[movie].items(), key=lambda x: x[1],reverse=True)]
    for movie in deep_imdb
    if movie in ground_truth
}

for KK in [1,5,20,500]:
    i = 0
    MAP, MR, hasP = 0,0,0

    for movie, ranking in ranked_reviews.items():
        i+=1
        preds = ranking[0:KK]
        golds = [f for f in ground_truth[movie]]

        MAP += MAP_K(golds,preds)
        MR += MRRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    if i == 0:
        # Nothing was scored: avoid a ZeroDivisionError when averaging.
        print('No movies to evaluate.')
        continue
    print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)
