**This is a sample of how to use the URL_Matching class**

## URL Matching based Method
- Feature extraction: extract the article URL and the image URL from the provided file
- Remove manually defined stop words from the URLs
- URL tokenization
- URL comparison: an image URL and an article URL are considered a match if they share at least one token (ignoring stop words and single-character tokens).
- Sort the list of potentially matching images by the number of shared tokens
- Evaluate the performance of this method using MR100 on both the training dataset and the evaluation dataset

Define stop words

In [49]:
# Manually chosen tokens that are ignored when counting the tokens shared by
# an article URL and an image URL (see match_url).
stop_words = [
    "in", "der", "die", "und", "im", "auf", "mit", "fuer", "von", "den",
    "an", "fc", "das", "am", "vor", "aus", "dem", "anfang", "sich", "bei",
    "ein", "des", "zu", "sind", "eine", "ueber", "gegen", "nach", "ist", "zum",
    "beim", "wird", "nrw", "nicht", "als", "mehr", "ab", "zur", "werden", "hat",
    "s", "wie", "einem", "auch", "e", "unter", "wieder", "vom", "so", "um",
    "noch", "will", "afd", "war", "strasse",
]

# Input files of the test batch, resolved relative to data_folder.
test_img_file = "MediaEvalNewsImagesBatch04images.tsv"
test_article_file = "MediaEvalNewsImagesBatch04articles.tsv"

# Zero-based column indices used with the training / evaluation TSV files.
TR_A_ID_IDX = 1     # article ID
TR_I_ID_IDX = 4     # image ID
TR_IMG_URL_IDX = 3  # image URL
TR_TITLE_IDX = 2    # passed as ar_name_idx to extract_article_token

# Zero-based column indices used with the test-batch TSV files.
TEST_A_ID_IDX = 0     # article ID (articles file)
TEST_I_ID_IDX = 1     # image ID (images file)
TEST_IMG_URL_IDX = 0  # image URL (images file)
TEST_TITLE_IDX = 2    # passed as ar_name_idx to extract_article_token

# Directory that holds the input TSV files.
data_folder = r"../data"

Extract the ground truth: the URL pairs in the training/evaluation data

In [8]:
def extract_gt(gt_file):
    """Read the ground-truth article-to-image mapping from a TSV file.

    The first line is treated as a header and skipped. For every remaining
    row, the article ID (column ``TR_A_ID_IDX``) is mapped to the image ID
    (column ``TR_I_ID_IDX``). If an article ID occurs more than once, the
    last row wins.

    Rows that are blank or too short to contain both columns are skipped.

    Args:
        gt_file: Path to a UTF-8 encoded, tab-separated file.

    Returns:
        dict mapping article ID (str) to image ID (str); empty if the file
        has no usable rows.
    """
    # A row is usable only if both columns read below exist. The floor of 3
    # keeps the minimum row length that was required before.
    min_fields = max(3, TR_A_ID_IDX + 1, TR_I_ID_IDX + 1)
    ground_truth = {}
    with open(gt_file, encoding='utf-8') as file:
        next(file, None)  # skip the header; tolerate an empty file
        for line in file:
            segs = line.rstrip().split("\t")
            if len(segs) < min_fields:
                # Skip the row instead of raising IndexError or discarding
                # every row that follows it.
                continue
            ground_truth[segs[TR_A_ID_IDX]] = segs[TR_I_ID_IDX]
    return ground_truth

Extract image URL tokens from the given file

In [9]:
def extract_img_url_token(img_url_file, id_idx, img_url_idx):
    """Tokenise the image file names found in a TSV file.

    The first line is treated as a header and skipped. For each remaining
    row, the last ``/``-separated component of the image URL is taken, cut at
    its first ``.``, and split on ``-``. Only purely alphabetic tokens other
    than ``"null"`` are kept.

    Rows that are blank or too short to contain the requested columns are
    skipped.

    Args:
        img_url_file: Path to a UTF-8 encoded, tab-separated file.
        id_idx: Zero-based column index of the image ID.
        img_url_idx: Zero-based column index of the image URL.

    Returns:
        dict mapping image ID (str) to its list of tokens (list of str).
        If an image ID occurs more than once, the last row wins.
    """
    # A row is usable only if both requested columns exist. The floor of 3
    # keeps the minimum row length that was required before.
    min_fields = max(3, id_idx + 1, img_url_idx + 1)
    img_id_name_dict = {}
    with open(img_url_file, encoding='utf-8') as file:
        next(file, None)  # skip the header; tolerate an empty file
        for line in file:
            segs = line.rstrip().split("\t")
            if len(segs) < min_fields:
                # Skip the row instead of raising IndexError or discarding
                # every row that follows it.
                continue
            # ".../foo-bar-123.jpg" -> "foo-bar-123"
            img_name = segs[img_url_idx].split("/")[-1].split(".")[0]
            img_id_name_dict[segs[id_idx]] = [
                token for token in img_name.split("-")
                if token.isalpha() and token != "null"
            ]
    return img_id_name_dict

Extract article URL tokens from the given file

In [10]:
def extract_article_token(article_file, a_id_idx, ar_name_idx):
    """Tokenise the article names found in a TSV file.

    The first line is treated as a header and skipped. For each remaining
    row, the last ``/``-separated component of the article name column is
    taken, cut at its first ``.``, and split on ``-``. Only purely
    alphabetic tokens other than ``"null"`` are kept.

    Rows that are blank or too short to contain the requested columns are
    skipped.

    Args:
        article_file: Path to a UTF-8 encoded, tab-separated file.
        a_id_idx: Zero-based column index of the article ID.
        ar_name_idx: Zero-based column index of the article URL / name.

    Returns:
        dict mapping article ID (str) to its list of tokens (list of str).
        If an article ID occurs more than once, the last row wins.
    """
    # A row is usable only if both requested columns exist. The floor of 3
    # keeps the minimum row length that was required before.
    min_fields = max(3, a_id_idx + 1, ar_name_idx + 1)
    art_id_name_dict = {}
    with open(article_file, encoding="utf8") as file:
        next(file, None)  # skip the header; tolerate an empty file
        for line in file:
            segs = line.rstrip().split("\t")
            if len(segs) < min_fields:
                # Skip the row instead of raising IndexError or discarding
                # every row that follows it.
                continue
            # ".../koeln-gewinnt-1234.html" -> "koeln-gewinnt-1234"
            ar_name = segs[ar_name_idx].split("/")[-1].split(".")[0]
            art_id_name_dict[segs[a_id_idx]] = [
                token for token in ar_name.split("-")
                if token.isalpha() and token != "null"
            ]
    return art_id_name_dict

Find the matched URL pairs

In [11]:
def match_url(art_id_name_dict, img_id_name_dict):
    """Rank, for every article, the images whose URL tokens overlap with it.

    An article token counts towards an image when it also occurs among that
    image's tokens, is longer than one character, and is not in the
    module-level ``stop_words``. A token repeated in the article is counted
    once per occurrence. Images with a count of zero are not listed.

    Prints ``"matching url"`` followed by the number of matched articles
    (twice, as before).

    Args:
        art_id_name_dict: dict mapping article ID to its list of tokens.
        img_id_name_dict: dict mapping image ID to its list of tokens.

    Returns:
        dict mapping article ID to a list of image IDs, ordered by descending
        number of shared tokens. Articles without any matching image are
        absent from the dict.
    """
    print("matching url")
    ignored = frozenset(stop_words)
    # Build each image's token set once instead of scanning its token list
    # for every article token.
    img_token_sets = [(img_id, frozenset(tokens))
                      for img_id, tokens in img_id_name_dict.items()]
    result = {}
    for art_id, art_tokens in art_id_name_dict.items():
        # The stop-word and length filters depend on the article only, so
        # they are applied once per article rather than once per image.
        useful = [tok for tok in art_tokens
                  if len(tok) > 1 and tok not in ignored]
        scored = []
        for img_id, img_tokens in img_token_sets:
            shared = sum(1 for tok in useful if tok in img_tokens)
            if shared:
                scored.append((img_id, shared))
        if scored:
            # list.sort is stable, so images with equal counts keep the
            # iteration order of img_id_name_dict.
            scored.sort(key=lambda pair: pair[1], reverse=True)
            result[art_id] = [img_id for img_id, _ in scored]
    # Both lines report the number of matched articles; two lines are kept
    # so the printed output is the same as before.
    print(len(result))
    print(len(result))
    return result

Write url matching results into files

In [12]:
def write_url_sim(result_file, result):
    """Append the ranked matches to a tab-separated file, one article per line.

    Each line has the form ``article_id<TAB>image_id_1<TAB>image_id_2...``
    and ends with a newline. An article with an empty image list is written
    as a line containing only its ID.

    Args:
        result_file: Path of the output file. It is opened in append mode,
            so existing content is kept and new rows are added after it.
        result: dict mapping article ID (str) to a list of image IDs (str).
    """
    # UTF-8 matches the encoding the input TSV files are read with.
    with open(result_file, 'a', encoding='utf-8') as the_file:
        for art_id, image_list in result.items():
            # Write each article exactly once, terminated by a newline.
            the_file.write("\t".join([art_id, *image_list]) + "\n")

### Prediction

In [52]:
import os  # needed for os.path.join below; harmless if already imported

# Rank candidate images for every article of the test batch and append the
# ranking to result/test_url_matching.tsv.
test_article_file = "MediaEvalNewsImagesBatch04articles.tsv"
# The images file is named by test_img_file.
img_id_name_dict = extract_img_url_token(os.path.join(data_folder, test_img_file),
                                         TEST_I_ID_IDX,
                                         TEST_IMG_URL_IDX)
article_id_name_dict = extract_article_token(os.path.join(data_folder, test_article_file),
                                             TEST_A_ID_IDX,
                                             TEST_TITLE_IDX)
result = match_url(article_id_name_dict, img_id_name_dict)
write_url_sim("result/test_url_matching.tsv", result)

matching url
1772
1772


### Evaluation

In [53]:
def evaluate(result, ground_truth, top_k=100):
    """Return the share of articles whose true image is ranked in the top ``top_k``.

    Every article in ``ground_truth`` counts towards the denominator,
    including articles that have no entry in ``result``.

    Args:
        result: dict mapping article ID to a ranked list of image IDs.
        ground_truth: dict mapping article ID to the correct image ID.
        top_k: Number of leading candidates inspected per article
            (default 100).

    Returns:
        float in [0, 1]; 0.0 when ``ground_truth`` is empty.
    """
    if not ground_truth:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0
    hits = sum(
        1 for ar_id, img_id in ground_truth.items()
        if img_id in result.get(ar_id, ())[:top_k]
    )
    return hits / len(ground_truth)

In [54]:
# Evaluate URL matching on the evaluation file: the same TSV supplies the
# ground truth, the image URLs and the article names.
# NOTE(review): relies on `os` being imported already; no import is visible
# in this part of the file -- confirm.
evaluation_file='content2019-03-v3.tsv'
tr_file=os.path.join(data_folder, evaluation_file)
ground_truth = extract_gt(tr_file)
img_id_name_dict = extract_img_url_token(tr_file, TR_I_ID_IDX, TR_IMG_URL_IDX)
article_id_name_dict = extract_article_token(tr_file, TR_A_ID_IDX, TR_TITLE_IDX)
# Number of distinct article IDs read from the file.
print(len(article_id_name_dict))
result = match_url(article_id_name_dict, img_id_name_dict)
# Appends to the result file; it is not overwritten.
write_url_sim("result/eval_url_matching.tsv",result)
# Share of ground-truth articles whose image is among the first 100 candidates.
evaluation_result = evaluate(result, ground_truth)
print("MR100 in evaluation dataset is ", evaluation_result)

2387
matching url
2246
2246
MR100 in evaluation dataset is  0.4377880184331797


In [57]:
# Evaluate URL matching on the training file: the same TSV supplies the
# ground truth, the image URLs and the article names.
# NOTE(review): unlike the other input paths, this one is not built from
# data_folder -- confirm that is intended.
tr_file=r"processed_data/data/train.tsv"
ground_truth = extract_gt(tr_file)
img_id_name_dict = extract_img_url_token(tr_file, TR_I_ID_IDX, TR_IMG_URL_IDX)
article_id_name_dict = extract_article_token(tr_file, TR_A_ID_IDX, TR_TITLE_IDX)
# Number of distinct article IDs read from the file.
print(len(article_id_name_dict))
result = match_url(article_id_name_dict, img_id_name_dict)
# Appends to the result file; it is not overwritten.
write_url_sim("result/tr_url_matching.tsv",result)
# Share of ground-truth articles whose image is among the first 100 candidates.
tr_result = evaluate(result, ground_truth)
print("MR100 in training dataset is ", tr_result)

5143
matching url
4958
4958
MR100 in training dataset is  0.40773867392572427
