In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, "../models/")
from baseline import *
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import wordnet 
from nltk.corpus import stopwords
# nltk.download('wordnet')
# nltk.download('stopwords')
import string
import matplotlib.pyplot as plt
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tqdm import tqdm

In [13]:
# utility functions

# display image from thumbnails folder given list of ids
def show_images(imgids):
    for img in imgids:
        plt.figure()
        try:
            img=mpimg.imread('../data/image/'+img+'.jpg')
            imgplot = plt.imshow(img)
        except:
            pass
    plt.show()

In [3]:
# similarity metrics

# returns number of exact tag overlap
def baseline_score(t0,t1):
    return len(set(t0) & set(t1))

# returns {exact matches} + eta*{synonym matches}
def syn_score(t0, t1, eta=0.5):
    score = len(set(t0) & set(t1))
    for tag in t0:
        for syn in wordnet.synsets(tag):
            for name in syn.lemma_names():
                if name in t1:
                    score += eta
    return score

# returns sum({exact match}*{tfidf val})
def tfidf_score(tfidf_df, t0, test):
    ref = tfidf_df.loc[test,:]
    score = 0
    for t in t0:
        for i in test:
            if t == i:
                score += ref[i]
    return score

In [5]:
# KNN model 
class KNN():
    # @param k: number of neighbors to return
    def __init__(self, k):
        self.k = k
    
    # @param train: training set of articles 
    # @article_to_image: map of images associated with each article
    def fit(self, train, article_to_image):
        self.train = train
        self.article_to_image = article_to_image
    
    # @param sim: function to return similarity score 
    # @param test: article to predict in form (id, tags)
    # TODO: implement sep functions for text train and image train
    def predict(self, sim, test):
        test_id, test_tags = test
        self.ranks = {}
        for train_id, train_tags in tqdm(self.train):
            s = sim(train_tags, test_tags)
            if len(self.ranks) < self.k:
                self.ranks[len(self.ranks)] = (train_id, s)
            elif s > min(self.ranks.values(), key=lambda x:x[1])[1]:
                key = min(self.ranks.keys(), key=lambda x:self.ranks[x][1])
                self.ranks[key] = (train_id, s)
        self.ranks = sorted(self.ranks.values(), key = lambda x:x[1], reverse=True)
        # map to predicted images
        self.pred = []
        for train_id, s in self.ranks:
            self.pred += self.article_to_image[train_id]
        return self.pred
    
    def score(self):
        pass

In [6]:
# extract tag data and format as dictionary

data_dir = "../data/csv_outputs/"
article_feats = {}
image_feats = {}
tag_ref = {'event':'event_tag',
           'org':'org_tag',
           'org_industry':'org_industry_tag',
           'person':'person_tag',
           'person_team':'person_team_tag',
           'person_type':'person_type',
           'place':'place_tag',
           'subject':'subject_tag',
           'summary':'headline_extended'
          }
for csv in tqdm(os.listdir(data_dir)):
    if 'ap_category' not in csv: # skip ap category for now
        df = pd.read_csv(data_dir+csv)
        if 'article' in csv:
            feat = csv[8:-4]
            g = df.groupby("id")[tag_ref[feat]]
            article_feats[feat] = g.apply(lambda x: list(x.astype(str).str.lower()))
        elif 'image' in csv:
            feat = csv[6:-4]
            g = df.groupby("id")[tag_ref[feat]]
            image_feats[feat] = g.apply(lambda x: list(x.astype(str).str.lower()))

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [04:18<00:00,  9.87s/it]


In [10]:
# format accessible image and article corpus
# TODO: some ids don't overlap

images = []
for imgid, tags in tqdm(image_feats['subject'].iteritems()):
    images.append({'imgid':imgid, 'tags': tags})

# images associated with an article
df = pd.read_csv('../data/csv_outputs/image_subject.csv')
g = df.groupby("article_idx")['id']
article_images = g.apply(list).to_dict()

articles = []
for articleid, tags in tqdm(article_feats['subject'].iteritems()):
    if articleid in article_images.keys():
        articles.append((articleid,tags))

11087it [00:00, 553650.92it/s]
30098it [00:00, 519859.83it/s]


In [16]:
# knn example use case
# predict for first article

test = articles[0]
train = articles[1:]

# test article
print(article_feats['summary'][test[0]])

model = KNN(3)
model.fit(train, article_images)
preds = model.predict(baseline_score, test)
show_images(preds)

["utah authorities say a man killed after leading police on a chase down a busy street is 37-year-old man from the salt lake city suburb of west valley city. salt lake city police said in a news release monday afternoon that the deceased is harold vincent robinson. authorities wouldn't say if the suspect was killed by police gunfire or due to injuries from his truck crashing into a building."]


100%|██████████████████████████████████████████████████████████████████████████| 7237/7237 [00:00<00:00, 112968.90it/s]


<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>