## Blocking

In [4]:
import rltk
import csv
from datetime import datetime
from dateutil import parser
import re
# rltk's CrfTokenizer instance, kept available in case some data needs to be
# manipulated with it.
# NOTE(review): none of the cells shown here reference `tokenizer`; title
# tokens are produced by `my_tokenizer` instead - confirm it is still needed.
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [26]:
def my_tokenizer(doc):
    """Split ``doc`` into its distinct word tokens.

    Args:
        doc: Text to tokenize.

    Returns:
        A set holding every maximal run of regex word characters found in
        ``doc``. Case is preserved and duplicates collapse into one entry.
    """
    word_pattern = re.compile(r"\w+")
    return {found.group(0) for found in word_pattern.finditer(doc)}

In [35]:
def jaccard_score(a, b):
    """Return the Jaccard similarity of two sets.

    Args:
        a: First set.
        b: Second set.

    Returns:
        ``len(a & b) / len(a | b)`` as a float in ``[0, 1]``. When both sets
        are empty the ratio is undefined; ``0.0`` is returned so that two
        empty inputs are never treated as similar.
    """
    union_size = len(a.union(b))
    if union_size == 0:
        # Both inputs are empty: avoid ZeroDivisionError.
        return 0.0
    return len(a.intersection(b)) / union_size

In [57]:
import swalign

In [72]:
def sw_score(a, b):
    """Score two values by the match ratio of their swalign local alignment.

    Both arguments are converted with ``str`` and aligned using a scoring
    matrix of +2 for a match and -1 for a mismatch.

    Args:
        a: First value to align.
        b: Second value to align.

    Returns:
        ``matches / (matches + mismatches)`` for the alignment, or ``0`` when
        the alignment reports no matches and no mismatches.
    """
    match_score = 2
    mismatch_score = -1
    scoring = swalign.NucleotideScoringMatrix(match_score, mismatch_score)
    aligner = swalign.LocalAlignment(scoring)
    alignment = aligner.align(str(a), str(b))

    compared = alignment.matches + alignment.mismatches
    if compared == 0:
        # Nothing was aligned; checked explicitly instead of hiding every
        # possible error behind a bare ``except``.
        return 0
    return alignment.matches / compared

In [71]:
class IMDB(rltk.Record):
    """rltk record wrapping one row of the IMDB CSV.

    Exposes the ``ID``, ``Title``, ``Year`` and ``Director`` fields of
    ``raw_object`` as properties.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Record identifier, read from the ``ID`` field."""
        return self.raw_object['ID']

    @rltk.cached_property
    def title(self):
        """Raw ``Title`` field."""
        return self.raw_object['Title']

    @rltk.cached_property
    def title_tokens(self):
        """Set of word tokens extracted from ``Title`` by ``my_tokenizer``."""
        return my_tokenizer(self.raw_object['Title'])

    @rltk.cached_property
    def date(self):
        """``Year`` field parsed with dateutil and rendered as ``YYYY-MM-DD``.

        If the field is missing or cannot be parsed, a message that embeds
        this record's id is returned instead of raising.

        NOTE(review): dateutil fills components absent from the input (for
        example month and day of a bare year) from the current date unless a
        ``default`` is supplied - confirm the source values are full dates.
        """
        try:
            return str(parser.parse(self.raw_object['Year']).date())
        except Exception:
            # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
            # and SystemExit still propagate.
            return f"IMDB Item {self.id} has no valid publish year record"

    @rltk.cached_property
    def director(self):
        """Raw ``Director`` field."""
        return self.raw_object['Director']

class Rotten_TMT(rltk.Record):
    """rltk record wrapping one row of the Rotten Tomatoes CSV.

    Exposes the ``ID``, ``Title``, ``Year`` and ``Director`` fields of
    ``raw_object`` as properties.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Record identifier, read from the ``ID`` field."""
        return self.raw_object['ID']

    @rltk.cached_property
    def title(self):
        """Raw ``Title`` field."""
        return self.raw_object['Title']

    @rltk.cached_property
    def title_tokens(self):
        """Set of word tokens extracted from ``Title`` by ``my_tokenizer``."""
        return my_tokenizer(self.raw_object['Title'])

    @rltk.cached_property
    def date(self):
        """``Year`` field parsed with dateutil and rendered as ``YYYY-MM-DD``.

        If the field is missing or cannot be parsed, a message that embeds
        this record's id is returned instead of raising.

        NOTE(review): dateutil fills components absent from the input (for
        example month and day of a bare year) from the current date unless a
        ``default`` is supplied - confirm the source values are full dates.
        """
        try:
            return str(parser.parse(self.raw_object['Year']).date())
        except Exception:
            # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
            # and SystemExit still propagate.
            return f"RottenTMT Item {self.id} has no valid publish year record"

    @rltk.cached_property
    def director(self):
        """Raw ``Director`` field."""
        return self.raw_object['Director']

In [72]:
# Load both raw CSV files into rltk datasets, wrapping each row in the
# matching record class (IMDB / Rotten_TMT).
# NOTE(review): the file handles opened here are never closed explicitly;
# whether rltk has finished reading by the time Dataset() returns is not
# visible from this notebook - confirm before wrapping them in `with`.
imdb_ds = rltk.Dataset(rltk.CSVReader(open("raw_data/imdb.csv", encoding="UTF8")),record_class=IMDB)
rt_tmt_ds = rltk.Dataset(rltk.CSVReader(open("raw_data/rotten_tomatoes.csv", encoding="UTF8")),record_class=Rotten_TMT)

In [73]:
# Single rltk HashBlockGenerator instance used by the blocking cells to build
# every block.
bg = rltk.HashBlockGenerator()

In [74]:
# Three independent blocks over the two datasets, each keyed on one attribute.
# The trailing number pairs are measurements recorded by the author; their
# exact meaning is not shown here (presumably blocking-quality metrics -
# TODO confirm). They predate the sorted title key and may need re-measuring.

# Title block: key is the comma-joined title tokens. The tokens are sorted
# because `title_tokens` is a set, and two equal sets are not guaranteed to
# iterate in the same order; without sorting, identical token sets could
# produce different keys and miss each other.
name_tokens_block = bg.generate(
    bg.block(imdb_ds, function_=lambda r: ",".join(sorted(r.title_tokens))),
    bg.block(rt_tmt_ds, function_=lambda r: ",".join(sorted(r.title_tokens)))
) #0.50,0.97

# Date block: key is the record's `date` property.
date_block = bg.generate(
    bg.block(imdb_ds, property_='date'),
    bg.block(rt_tmt_ds, property_='date')
) #0.23,0.80

# Director block: key is the record's `director` property.
author_block = bg.generate(
    bg.block(imdb_ds, property_='director'),
    bg.block(rt_tmt_ds, property_='director')
) #0.69,0.79

In [75]:
# Union of the record pairs produced by the title, director and date blocks,
# in that order, keeping the first occurrence of each pair.
# Duplicates are detected through a set of (imdb id, rotten tomatoes id)
# tuples, so each membership test is O(1) instead of a scan of the whole list.
candidate_pairs = []
seen_id_pairs = set()
for block in (name_tokens_block, author_block, date_block):
    for pair in rltk.get_record_pairs(imdb_ds, rt_tmt_ds, block=block):
        left, right = pair
        id_pair = (left.id, right.id)
        if id_pair not in seen_id_pairs:
            seen_id_pairs.add(id_pair)
            candidate_pairs.append(pair)

In [76]:
# Number of candidate pairs collected from the three blocks.
len(candidate_pairs)

12602

In [89]:
# IDs of every record (from either dataset) that takes part in at least one
# candidate pair.
ci = {
    record_id
    for left, right in candidate_pairs
    for record_id in (left.id, right.id)
}

In [None]:
# Candidate pairs reduced to (first record id, second record id) tuples, in
# the same order as `candidate_pairs`.
cdi = []
for left, right in candidate_pairs:
    cdi.append((left.id, right.id))

## Output

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the two raw movie tables and the labeled pairs as pandas DataFrames.
# These are read independently of the rltk datasets built from the same
# imdb / rotten_tomatoes CSV files.
imdb = pd.read_csv("raw_data/imdb.csv")
rotten_tmt = pd.read_csv("raw_data/rotten_tomatoes.csv")
labeled_dt = pd.read_csv("raw_data/labeled_data.csv")

In [8]:
# (ID, Title) of the IMDB rows where both values are present, written as a
# headerless, unquoted, tab-separated file.
imdb_title = imdb[["ID", "Title"]].dropna()
imdb_title.to_csv(
    "data/imdb_title.txt",
    sep="\t",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)

In [9]:
# (ID, Director) of the IMDB rows where both values are present, written as a
# headerless, unquoted, tab-separated file.
imdb_director = imdb[["ID", "Director"]].dropna()
imdb_director.to_csv(
    "data/imdb_director.txt",
    sep="\t",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)

In [102]:
# (ID, Year) of the IMDB rows where both values are present. The Year text is
# parsed with dateutil and replaced by the resulting date object before the
# table is written as a headerless, unquoted, tab-separated file.
imdb_date = imdb[["ID", "Year"]].dropna()
imdb_date["Year"] = imdb_date["Year"].apply(
    lambda raw_year: parser.parse(raw_year).date()
)
imdb_date.to_csv(
    "data/imdb_date.txt",
    sep="\t",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)

In [103]:
# Year-only table for IMDB, restricted to records that appear in a candidate
# pair (`ci`).
# `assign` builds a new frame, so `imdb_date` keeps its date objects and this
# cell can be re-run; the previous alias mutated `imdb_date` in place.
imdb_year = imdb_date.assign(
    Year=imdb_date["Year"].apply(lambda x: str(x.year))
)
# Written to its own file. The original target was 'data/imdb_date.txt',
# which overwrote the full-date export and did not match the Rotten Tomatoes
# counterpart ('data/rotten_tomatoes_year.txt').
with open('data/imdb_year.txt', 'w', newline='', encoding='UTF8') as file:
    for record_id, year in imdb_year.values:
        if record_id in ci:
            file.write("\t".join([record_id, year]) + "\n")

In [11]:
# (ID, Title) of the Rotten Tomatoes rows where both values are present,
# written as a headerless, unquoted, tab-separated file.
rotten_tomatoes_title = rotten_tmt[["ID", "Title"]].dropna()
rotten_tomatoes_title.to_csv(
    "data/rotten_tomatoes_title.txt",
    sep="\t",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)

In [12]:
# (ID, Director) of the Rotten Tomatoes rows where both values are present,
# written as a headerless, unquoted, tab-separated file.
rotten_tomatoes_director = rotten_tmt[["ID", "Director"]].dropna()
rotten_tomatoes_director.to_csv(
    "data/rotten_tomatoes_director.txt",
    sep="\t",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)

In [97]:
# (ID, Year) of the Rotten Tomatoes rows where both values are present. The
# Year text is parsed with dateutil and replaced by the resulting date object
# before the table is written as a headerless, unquoted, tab-separated file.
rotten_tomatoes_date = rotten_tmt[["ID", "Year"]].dropna()
rotten_tomatoes_date["Year"] = rotten_tomatoes_date["Year"].apply(
    lambda raw_year: parser.parse(raw_year).date()
)
rotten_tomatoes_date.to_csv(
    "data/rotten_tomatoes_date.txt",
    sep="\t",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)

In [98]:
# Year-only table for Rotten Tomatoes, restricted to records that appear in a
# candidate pair (`ci`).
# `assign` builds a new frame, so `rotten_tomatoes_date` keeps its date
# objects and this cell can be re-run; the previous alias mutated
# `rotten_tomatoes_date` in place, making a second run fail on `.year`.
rotten_tomatoes_year = rotten_tomatoes_date.assign(
    Year=rotten_tomatoes_date["Year"].apply(lambda x: x.year)
)
with open('data/rotten_tomatoes_year.txt', 'w', newline='', encoding='UTF8') as file:
    for record_id, year in rotten_tomatoes_year.values:
        if record_id in ci:
            # The year is stored as an int here, so convert it for the join.
            file.write("\t".join([record_id, str(year)]) + "\n")

In [None]:
# One line per candidate pair: the two record ids separated by a tab.
with open('data/same_movie_target.txt', 'w', newline='', encoding='UTF8') as file:
    file.writelines(
        "\t".join([left.id, right.id]) + "\n"
        for left, right in candidate_pairs
    )

In [None]:
# Labeled pairs with complete rows only. The left ids get an "a-" prefix and
# the right ids a "b-" prefix before the table is written as a headerless,
# unquoted, tab-separated file.
# NOTE(review): presumably these prefixes match the ID format used in the raw
# movie CSVs - confirm.
same_movie_truth = labeled_dt[["ltable.ID", "rtable.ID", "gold"]].dropna()
for id_column, id_prefix in (("ltable.ID", "a-"), ("rtable.ID", "b-")):
    same_movie_truth[id_column] = same_movie_truth[id_column].apply(
        lambda value, id_prefix=id_prefix: id_prefix + str(value)
    )
same_movie_truth.to_csv(
    "data/same_movie_truth.txt",
    sep="\t",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)

In [64]:
# Add a Title_Tokens column (set of word tokens of Title) to each title table,
# then pair every IMDB title with every Rotten Tomatoes title.
# `assign` returns new frames, so `imdb_title` / `rotten_tomatoes_title` are
# no longer modified through an alias. The title is read by column label
# instead of by position (`x[1]`), which pandas has deprecated for
# label-indexed rows.
imdb_title_token = imdb_title.assign(
    Title_Tokens=imdb_title["Title"].map(my_tokenizer)
)
rotten_tmt_title_token = rotten_tomatoes_title.assign(
    Title_Tokens=rotten_tomatoes_title["Title"].map(my_tokenizer)
)
# Cross join: shared column names receive the _x (IMDB) and _y (Rotten
# Tomatoes) suffixes.
similar_title = imdb_title_token.merge(rotten_tmt_title_token, how='cross')

In [66]:
# Jaccard similarity between the two token sets of every title pair.
# The token columns are addressed by label rather than by position
# (`x[2]`, `x[5]`): positional access on a label-indexed row is deprecated in
# pandas and silently breaks if the column order changes.
similar_title["similar"] = similar_title.apply(
    lambda row: jaccard_score(row["Title_Tokens_x"], row["Title_Tokens_y"]),
    axis=1,
)

In [67]:
# Keep only the title pairs whose Jaccard similarity is at least 0.5.
is_similar_enough = similar_title["similar"] >= 0.5
similar_title = similar_title.loc[is_similar_enough]

In [69]:
# Reduce the similar-title pairs to their two id columns: ID_x comes from the
# IMDB side of the cross join and ID_y from the Rotten Tomatoes side.
similar_title_id = similar_title[["ID_x","ID_y"]]

In [86]:
# Number of title pairs at or above the similarity threshold.
len(similar_title_id)

3393

In [84]:
# Write the similar-title id pairs that are also candidate pairs, one
# tab-separated pair per line.
# `cdi` is a list, so it is converted to a set once up front; each membership
# test is then O(1) instead of a scan of every candidate pair per row.
candidate_id_pairs = set(cdi)
with open('data/similar_title_id.txt', 'w', newline='', encoding='UTF8') as file:
    for left_id, right_id in similar_title_id.values:
        if (left_id, right_id) in candidate_id_pairs:
            file.write("\t".join([left_id, right_id]) + "\n")