In [30]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, "../models/")
# from baseline import *
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import wordnet 
from nltk.corpus import stopwords
# nltk.download('wordnet')
# nltk.download('stopwords')
import string
import matplotlib.pyplot as plt
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tqdm import tqdm_notebook

In [4]:
# utility functions

# display image from thumbnails folder given list of ids
def show_images(imgids):
    for img in imgids:
        try:
            plt.figure();
            img=mpimg.imread('../data/thumbnail/'+img+'.jpg')
            imgplot = plt.imshow(img);
        except:
            plt.close()
            print(img)
    plt.show()

In [5]:
# similarity metrics

# returns number of exact tag overlap
def baseline_score(train,test):
    return len(set(train) & set(test))/len(train)

# returns {exact matches} + eta*{synonym matches}
def syn_score(train, test, eta=0.5):
    score = len(set(train) & set(test))
    for tag in train:
        for syn in wordnet.synsets(tag):
            for name in syn.lemma_names():
                if name in test:
                    score += eta
                    test.remove(name)
    return score / len(train)

# returns sum({exact match}*{tfidf val})
def tfidf_score(tfidf_df, t0, test):
    ref = tfidf_df.loc[test,:]
    score = 0
    for t in t0:
        for i in test:
            if t == i:
                score += ref[i]
    return score

In [6]:
# KNN model 
class KNN():
    # @param k: number of neighbors to return
    def __init__(self, k=3):
        self.k = k
    
    # @param train: training set of articles 
    # @article_to_image: map of images associated with each article
    def fit(self, train, article_to_image):
        self.train = train
        self.article_to_image = article_to_image
    
    # @param sim: function to return similarity score 
    # @param test_tags: article tags to predict 
    # TODO: tags -> text, use tagging api
    # TODO: implement sep functions for text train and image train
    def predict(self, sim, test_tags):
        self.ranks = {}
        for train_id, train_tags in tqdm(self.train):
            s = sim(train_tags, test_tags)
            if len(self.ranks) < self.k:
                self.ranks[len(self.ranks)] = (train_id, s)
            elif s > min(self.ranks.values(), key=lambda x:x[1])[1]:
                key = min(self.ranks.keys(), key=lambda x:self.ranks[x][1])
                self.ranks[key] = (train_id, s)
        self.ranks = sorted(self.ranks.values(), key = lambda x:x[1], reverse=True)
        print(self.ranks)
        # map to predicted images
        self.pred = []
        for train_id, s in self.ranks:
            print(article_feats['summary'][train_id])
            print(self.article_to_image[train_id])
            self.pred += self.article_to_image[train_id]
        return self.pred
    
    def score(self):
        pass

In [59]:
datadir = '../data/clean_data/'

tag_ref = {'ap_category':'category_code',
           'event':'event_tag',
           'org':'org_tag',
           'org_industry':'org_industry_tag',
           'person':'person_tag',
           'person_team':'person_team_tag',
           'person_type':'person_type',
           'place':'place_tag',
           'subject':'subject_tag',
           'summary':'headline_extended'
          }

train = pd.Series([])
for csv_file in tqdm_notebook(os.listdir(datadir)):
    if 'article' in csv_file: 
        print(csv_file)
        df = pd.read_csv(datadir+csv_file)
        feat = csv_file[8:-4]
        g = df.groupby("id")[tag_ref[feat]]
        if train.empty:
            train = g.apply(lambda x: list(x.astype(str).str.lower()))
        else:
            g = g.apply(lambda x: list(x.astype(str).str.lower()))
            train = train.combine(g, lambda x1, x2: list(set(x1+x2)), fill_value=[])

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

article_ap_category.csv
article_event.csv
id
00c39bce7f61468e8b7b11fbd9be6ed1                              [nan]
00c482e2c51942de93495eb98a8bf2e5                              [nan]
00c6682106da42f299ab9955de385aa5    [police pursuit salt lake city]
00d713a2b6cb44c88fbd2fd3f10228f3                              [nan]
00d74d155440484f95cd61fa77aa2fb3                              [nan]
00da8194fa4c4bfa97aed5cc708a747d                              [nan]
00dc191fec464d2da9b5a9ceed7a8589                              [nan]
00dfb4bde3ec4258b02ea69eed1dce5c                              [nan]
00e39295e7e744149942f67e430d946f                  [costco shooting]
00e41d56d9dc40b0b8560f38e49b3d75                              [nan]
00ea08633bd74705bd38228a6afbf305                  [europe migrants]
00ea7bbd48a54475915edf9074d5e50d                              [nan]
00ecd42e683b4e8497fd96025aedae2b                              [nan]
00ef063629e24afd911bc84615fc81d4                              [nan]
00f

Name: org_tag, Length: 22602, dtype: object
article_org_industry.csv
id
00c39bce7f61468e8b7b11fbd9be6ed1                                                [nan]
00c482e2c51942de93495eb98a8bf2e5                                 [nan, nan, nan, nan]
00c6682106da42f299ab9955de385aa5                                                [nan]
00d713a2b6cb44c88fbd2fd3f10228f3                                           [nan, nan]
00d74d155440484f95cd61fa77aa2fb3                                                [nan]
00da8194fa4c4bfa97aed5cc708a747d                                                [nan]
00dc191fec464d2da9b5a9ceed7a8589                                                [nan]
00dfb4bde3ec4258b02ea69eed1dce5c                                      [nan, nan, nan]
00e39295e7e744149942f67e430d946f                                                [nan]
00e41d56d9dc40b0b8560f38e49b3d75                                                [nan]
00ea08633bd74705bd38228a6afbf305                                    

Name: person_tag, Length: 22602, dtype: object
article_person_team.csv
id
00c482e2c51942de93495eb98a8bf2e5    [golden state warriors, los angeles lakers, un...
00c6682106da42f299ab9955de385aa5                                                [nan]
00d713a2b6cb44c88fbd2fd3f10228f3                                                [nan]
00d74d155440484f95cd61fa77aa2fb3                                                [nan]
00dc191fec464d2da9b5a9ceed7a8589                                                [nan]
00e39295e7e744149942f67e430d946f                                                [nan]
00e41d56d9dc40b0b8560f38e49b3d75                                                [nan]
00ea7bbd48a54475915edf9074d5e50d                                                [nan]
00ecd42e683b4e8497fd96025aedae2b                                                [nan]
00f10c10d7bc4676abdef08344085a41                                                [nan]
0103f13096ef4180ba342d8b826d07ce                                  

Name: person_type, Length: 22539, dtype: object
article_place.csv
id
00c39bce7f61468e8b7b11fbd9be6ed1    [manhattan, new york city, new york, united st...
00c482e2c51942de93495eb98a8bf2e5           [california, united states, north america]
00c6682106da42f299ab9955de385aa5    [utah, united states, north america, salt lake...
00d713a2b6cb44c88fbd2fd3f10228f3           [california, united states, north america]
00d74d155440484f95cd61fa77aa2fb3    [california, united states, north america, chico]
00da8194fa4c4bfa97aed5cc708a747d                [dhaka, bangladesh, south asia, asia]
00dc191fec464d2da9b5a9ceed7a8589                                                [nan]
00dfb4bde3ec4258b02ea69eed1dce5c    [iran, middle east, tehran, united kingdom, we...
00e39295e7e744149942f67e430d946f    [los angeles, california, united states, north...
00e41d56d9dc40b0b8560f38e49b3d75                     [thailand, southeast asia, asia]
00ea08633bd74705bd38228a6afbf305    [europe, italy, western europe, gre

Name: subject_tag, Length: 22602, dtype: object
article_summary.csv
id
00c39bce7f61468e8b7b11fbd9be6ed1    [harvey weinstein's lawyers are arguing agains...
00c482e2c51942de93495eb98a8bf2e5    [stephen curry is using a foundation of faith ...
00c6682106da42f299ab9955de385aa5    [authorities say a robbery suspect has died mo...
00d713a2b6cb44c88fbd2fd3f10228f3    [the trump administration has said it will not...
00d74d155440484f95cd61fa77aa2fb3    [california had its slowest recorded growth ra...
00da8194fa4c4bfa97aed5cc708a747d    [bangladesh is facing its worst-ever dengue fe...
00dc191fec464d2da9b5a9ceed7a8589    [music review: drew holcomb and the neighbors ...
00dfb4bde3ec4258b02ea69eed1dce5c    [oman's top diplomat in tehran for talks as te...
00e39295e7e744149942f67e430d946f    [the lawyer for an off-duty officer who shot a...
00e41d56d9dc40b0b8560f38e49b3d75    [thailand's health ministry has received its f...
00ea08633bd74705bd38228a6afbf305    [police in north macedonia say th

In [60]:
train

id
00c39bce7f61468e8b7b11fbd9be6ed1    [law and order, smuggling, person, general new...
00c482e2c51942de93495eb98a8bf2e5    [general news, n, national basketball associat...
00c6682106da42f299ab9955de385aa5    [police, theft, general news, police pursuit s...
00d713a2b6cb44c88fbd2fd3f10228f3    [u.s. department of transportation, the trump ...
00d74d155440484f95cd61fa77aa2fb3    [birth rates, general news, n, nan, demographi...
00da8194fa4c4bfa97aed5cc708a747d    [south asia, disease outbreaks, dengue fever, ...
00dc191fec464d2da9b5a9ceed7a8589    [music review: drew holcomb and the neighbors ...
00dfb4bde3ec4258b02ea69eed1dce5c    [diplomacy, boris johnson, middle east, person...
00e39295e7e744149942f67e430d946f    [f, law and order, costco shooting, general ne...
00e41d56d9dc40b0b8560f38e49b3d75    [thailand, medical marijuana, general news, ma...
00ea08633bd74705bd38228a6afbf305    [western europe, europe, serbia, i, politician...
00ea7bbd48a54475915edf9074d5e50d    [f, welfare of 

In [30]:
# extract tag data and format as dictionary

data_dir = "../data/csv_outputs/"
article_feats = {}
image_feats = {}
tag_ref = {'ap_category':'category_code',
           'event':'event_tag',
           'org':'org_tag',
           'org_industry':'org_industry_tag',
           'person':'person_tag',
           'person_team':'person_team_tag',
           'person_type':'person_type',
           'place':'place_tag',
           'subject':'subject_tag',
           'summary':'headline_extended'
          }
for csv in tqdm(os.listdir(data_dir)):
    if 'ap_category' not in csv: # skip ap category for now
        df = pd.read_csv(data_dir+csv)
        if 'article' in csv:
            feat = csv[8:-4]
            g = df.groupby("id")[tag_ref[feat]]
            article_feats[feat] = g.apply(lambda x: list(x.astype(str).str.lower()))
        elif 'image' in csv:
            feat = csv[6:-4]
            g = df.groupby("id")[tag_ref[feat]]
            image_feats[feat] = g.apply(lambda x: list(x.astype(str).str.lower()))


  0%|                                                                                           | 0/20 [00:00<?, ?it/s]
 10%|████████▎                                                                          | 2/20 [00:06<00:58,  3.22s/it]
 15%|████████████▍                                                                      | 3/20 [00:13<01:12,  4.25s/it]
 20%|████████████████▌                                                                  | 4/20 [00:20<01:20,  5.05s/it]
 25%|████████████████████▊                                                              | 5/20 [00:26<01:20,  5.39s/it]
 30%|████████████████████████▉                                                          | 6/20 [00:28<01:02,  4.44s/it]
 35%|█████████████████████████████                                                      | 7/20 [00:34<01:05,  5.03s/it]
 40%|█████████████████████████████████▏                                                 | 8/20 [00:41<01:05,  5.47s/it]
 45%|██████████████████████████████████

In [35]:
# format accessible image and article corpus
# TODO: some ids don't overlap

images = []
for imgid, tags in tqdm(image_feats['subject'].iteritems()):
    images.append({'imgid':imgid, 'tags': tags})

# images associated with an article
df = pd.read_csv('../data/csv_outputs/image_subject.csv')
g = df.groupby("article_idx")['id']
article_images = g.apply(list).to_dict()

articles = []
# displayable
for articleid, tags in tqdm(article_feats['subject'].iteritems()):
    if articleid in article_images.keys():
        articles.append((articleid,tags))

55145it [00:00, 767959.78it/s]
22602it [00:00, 810828.70it/s]


In [67]:
# KNN model
class KNN():
    # @param k: number of neighbors to return
    def __init__(self, k, article_to_image):
        self.k = k
        self.article_to_image = article_to_image

    # @param train: training set of articles
    # @article_to_image: map of images associated with each article
    def fit(self, train):
        self.train = train

    # returns number of normalized exact tag overlap
    def baseline_score(self,t0,t1):
        return len(set(t0) & set(t1))/len(t0)

    # @param sim: function to return similarity score
    # @param test: article to predict in form (id, tags)
    # TODO: implement sep functions for text train and image train
    def predict(self, test_tags):
        ranks = {}
        train = self.train.copy()
        train_ids, train_tags_all = train.index, train.values

        # go through ids
        for ind in range(len(train_ids)):
            train_id, train_tags = train_ids[ind], train_tags_all[ind]
            s = self.baseline_score(train_tags, test_tags)
            if len(ranks) < self.k:
                ranks[len(ranks)] = (train_id, s)
            elif s > min(ranks.values(), key=lambda x:x[1])[1]:
                key = min(ranks.keys(), key=lambda x:ranks[x][1])
                ranks[key] = (train_id, s)
        ranks = sorted(ranks.values(), key = lambda x:x[1], reverse=True)

        # map to predicted images
        train_ids = []
        pred = {}
        for train_id, s in ranks:
            img_ids = self.article_to_image[str(train_id)]
            for img_id in img_ids:
                pred[img_id] = s
            train_ids.append(train_id)
        return train_ids, pred

    def score(self):
        pass

In [64]:
# images associated with an article
df = pd.read_csv('../data/clean_data/image_summary.csv')
g = df.groupby("article_idx")['id']
article_images = g.apply(set).to_dict()

# articles = []
# displayable
'''for articleid, tags in tqdm(article_feats['subject'].iteritems()):
    if articleid in article_images.keys():
        displayable = True
        for img in article_images[articleid]:
            if not os.path.isfile('../data/thumbnail/'+img+'.jpg'):
                displayable = False 
        if displayable:
            articles.append((articleid,tags))'''

"for articleid, tags in tqdm(article_feats['subject'].iteritems()):\n    if articleid in article_images.keys():\n        displayable = True\n        for img in article_images[articleid]:\n            if not os.path.isfile('../data/thumbnail/'+img+'.jpg'):\n                displayable = False \n        if displayable:\n            articles.append((articleid,tags))"

In [65]:
article_images

{'00c39bce7f61468e8b7b11fbd9be6ed1': {'30d4efb9f17a4aa786f2018cf718756a'},
 '00c482e2c51942de93495eb98a8bf2e5': {'4dab89d356a045058908a8fe6815a414',
  '76762a71203d478c803f0a607e69176a'},
 '00c6682106da42f299ab9955de385aa5': {'0df7da0c208b491ca555a34f05b5482f',
  '4bdaf6f1e9fe4cbc9e89828164776bf2',
  '77faf53964cb41329e03c0cb317fd600'},
 '00d713a2b6cb44c88fbd2fd3f10228f3': {'017d112292f64525a02ca28d24f744d0',
  'a188285abdc44c8cb6998df1af7e937f'},
 '00d74d155440484f95cd61fa77aa2fb3': {'02effd5924854ec7a255a66b48682c8d',
  '363c6fdf48554fa48842d99fea0932e7',
  '44949ba7530b403c891c1a5d6044f21a',
  '642e95705eef4ea5a14bb43dea5a225f'},
 '00da8194fa4c4bfa97aed5cc708a747d': {'1f0584c9928945c9a381de5b29cbdd27',
  '3613f5fb0a984bd7a39d61243793bde7',
  '6065551f73b04c48bf7b62bf872e2688',
  '730891d7f04b46d291d4323c2fa41cdb',
  '77b881de490e403995470cc5397ef57b',
  '7aae5e48f25749a286604c9d4d356d22',
  'a23283a2d7d14305ae06950a66a36ad5',
  'b2278f8f84e842c19b3a8bfdfe434ad2',
  'b323f6c58ed748d1

In [77]:
# knn example use case
# predict for first article

# test article
test = ['obama']
print(test)

model = KNN(3, article_images)
model.fit(train)
preds = model.predict(test)
print(preds)
show_images(preds)

['obama']
(['00c39bce7f61468e8b7b11fbd9be6ed1', '00c482e2c51942de93495eb98a8bf2e5', '00c6682106da42f299ab9955de385aa5'], {'30d4efb9f17a4aa786f2018cf718756a': 0.0, '4dab89d356a045058908a8fe6815a414': 0.0, '76762a71203d478c803f0a607e69176a': 0.0, '77faf53964cb41329e03c0cb317fd600': 0.0, '4bdaf6f1e9fe4cbc9e89828164776bf2': 0.0, '0df7da0c208b491ca555a34f05b5482f': 0.0})
['00c39bce7f61468e8b7b11fbd9be6ed1', '00c482e2c51942de93495eb98a8bf2e5', '00c6682106da42f299ab9955de385aa5']
{'30d4efb9f17a4aa786f2018cf718756a': 0.0, '4dab89d356a045058908a8fe6815a414': 0.0, '76762a71203d478c803f0a607e69176a': 0.0, '77faf53964cb41329e03c0cb317fd600': 0.0, '4bdaf6f1e9fe4cbc9e89828164776bf2': 0.0, '0df7da0c208b491ca555a34f05b5482f': 0.0}


In [72]:
syn_preds = model.predict(syn_score, test)
print(syn_preds)
show_images(syn_preds)

TypeError: predict() takes 2 positional arguments but 3 were given

In [12]:
exact_match = baseline_model(test[1], images, 10, baseline_score)
syn_match = baseline_model(test[1], images, 10, lambda x,y: syn_score(x,y,eta=0.5))
print(syn_match)

[('0002c8b6322446a8a9b8f8abaccb4430', 0), ('0006323cab994ddd9c0824d14c2146fd', 0), ('001329af20914404b6609d9b05def407', 0)]
