# Environment
Class providing access to MEC datasets using MongoDB.

In [1]:
import pymongo
import numpy as np
from sklearn.decomposition import PCA


class MEC():
    """In-memory tag index over one MongoDB collection of a MEC database.

    On construction every document of the collection is read once and
    three structures are built:

    * ``docs``  -- ``{doc_index: {'d': _id, 'tags': [...], 'image': ...}}``
      where ``doc_index`` is the position of the document when the
      collection is sorted by ``_id``.
    * ``tf``    -- ``{tag: {doc_index: {'raw_tf', 'prob', 'norm_p', 'freq'}}}``
    * ``df``    -- ``{tag: {'df': occurrences, 'idf': log(1 + N / df)}}``

    ``dictionary`` maps every tag to its position in the sorted tag list
    and is the default column layout used by ``doc_vector``.
    """

    def __init__(self, dataset, db_name='mec'):
        """Connect to the local MongoDB server and index ``dataset``.

        dataset -- name of the collection to index.
        db_name -- name of the database holding the collection.
        """
        self.db = pymongo.MongoClient()[db_name]
        self.dataset = dataset
        self._indexing()
        self.dictionary = {tag: pos for pos, tag in enumerate(sorted(self.df))}

    def _indexing(self):
        """Read the whole collection and (re)build ``docs``, ``tf`` and ``df``.

        Documents lacking any of ``clarifai_tags``, ``clarifai_probs`` or
        ``image`` are still registered in ``docs`` (with an empty tag list)
        but contribute nothing to ``tf`` / ``df``.
        """
        self.df, self.tf = {}, {}
        self.docs = {}
        cursor = self.db[self.dataset].find(no_cursor_timeout=True).sort('_id', pymongo.ASCENDING)
        try:
            for i, doc in enumerate(cursor):
                entry = {'d': doc['_id'], 'tags': []}
                self.docs[i] = entry
                # Fetch every required field before touching the index so a
                # partially filled document cannot leave tags without postings.
                try:
                    tags = doc['clarifai_tags']
                    probs = doc['clarifai_probs']
                    image = doc['image']
                except KeyError:
                    continue
                entry['tags'] = tags
                entry['image'] = image
                if not tags:
                    continue
                top_prob = max(probs)  # loop-invariant normaliser
                freq = 1.0 / len(tags)
                for j, tag in enumerate(tags):
                    self.tf.setdefault(tag, {})[i] = {
                        'raw_tf': 1,
                        'prob': probs[j],
                        'norm_p': probs[j] / top_prob,
                        'freq': freq,
                    }
                    # Counted once per occurrence, so a tag repeated inside a
                    # single document is counted more than once.
                    stats = self.df.setdefault(tag, {'df': 0.0})
                    stats['df'] += 1
        finally:
            # A no_cursor_timeout cursor is never reaped by the server, so it
            # must be closed even when indexing fails half-way.
            cursor.close()
        # The number of documents iterated replaces Cursor.count(), which no
        # longer exists in PyMongo 4; IDF only depends on the final counts.
        size = float(len(self.docs))
        for stats in self.df.values():
            stats['idf'] = np.log(1 + size / stats['df'])

    def get_tf(self, tag, doc, metric='norm_p'):
        """Return the ``metric`` weight of ``tag`` in ``doc``; 0.0 if unknown."""
        try:
            return self.tf[tag][doc][metric]
        except KeyError:
            return 0.0

    def get_idf(self, tag, metric='idf'):
        """Return the collection-level ``metric`` of ``tag``; 0.0 if unknown."""
        try:
            return self.df[tag][metric]
        except KeyError:
            return 0.0

    def bag(self, doc, tfm='norm_p', idfm='idf'):
        """Return ``(tags, weights)`` for document index ``doc``.

        ``weights`` is a NumPy array aligned with ``tags``; each weight is
        the ``tfm`` value of the tag, multiplied by its ``idfm`` value
        unless ``idfm`` is None. An unknown document yields ``([], array([]))``.
        """
        try:
            tgs = self.docs[doc]['tags']
        except KeyError:
            return [], np.array([])
        if idfm is None:
            weights = [self.get_tf(t, doc, metric=tfm) for t in tgs]
        else:
            weights = [self.get_tf(t, doc, metric=tfm) * self.get_idf(t, metric=idfm)
                       for t in tgs]
        return tgs, np.array(weights)

    def get_posting(self, tag, tfm='norm_p', idfm='idf'):
        """Return ``[(doc_index, weight), ...]`` for ``tag``.

        The list is ordered by descending document index. An unknown tag
        (or an unknown ``tfm`` metric) yields an empty list.
        """
        scale = 1.0 if idfm is None else self.get_idf(tag, metric=idfm)
        posting = []
        try:
            for d, metrics in self.tf[tag].items():
                if idfm is None:
                    posting.append((d, metrics[tfm]))
                else:
                    posting.append((d, metrics[tfm] * scale))
        except KeyError:
            pass
        return sorted(posting, key=lambda x: -x[0])

    def cosine(self, d1, d2):
        """Return the cosine similarity of two documents' weighted tag bags.

        Returns 0.0 when either document has no tags or a zero norm.
        """
        tags1, w1 = self.bag(d1)
        tags2, w2 = self.bag(d2)
        # Position of the first occurrence of each tag in the second bag;
        # same match list.index() would give, without the linear scan.
        first_pos = {}
        for j, t in enumerate(tags2):
            first_pos.setdefault(t, j)
        dot = 0.0
        for i, t in enumerate(tags1):
            j = first_pos.get(t)
            if j is not None:
                dot += w1[i] * w2[j]
        norm = np.sqrt(np.dot(w1, w1)) * np.sqrt(np.dot(w2, w2))
        if norm > 0:
            return dot / norm
        return 0.0

    def tag_cosine(self, t1, t2):
        """Return the cosine similarity of two tags' posting lists.

        Each tag is seen as a vector over document indexes; returns 0.0
        when either tag has no posting or a zero norm.
        """
        p1 = dict(self.get_posting(t1))
        p2 = dict(self.get_posting(t2))
        sq1 = sum(np.power(x, 2) for x in p1.values())
        sq2 = sum(np.power(x, 2) for x in p2.values())
        norm = np.sqrt(sq1) * np.sqrt(sq2)
        if not norm > 0:
            return 0.0
        dot = 0.0
        for d, weight in p1.items():
            other = p2.get(d)
            if other is not None:
                dot += weight * other
        return dot / norm

    def naive_matching(self, d1):
        """Rank every other document by cosine similarity to ``d1``.

        Returns ``[(doc_index, similarity), ...]``, most similar first.
        """
        mappings = [(d, self.cosine(d1, d)) for d in self.docs if d != d1]
        return sorted(mappings, key=lambda x: -x[1])

    def naive_tag_matching(self, t1):
        """Rank every other tag by posting-list cosine similarity to ``t1``.

        Returns ``[(tag, similarity), ...]``, most similar first.
        """
        mappings = [(t2, self.tag_cosine(t1, t2)) for t2 in self.df if t1 != t2]
        return sorted(mappings, key=lambda x: -x[1])

    def doc_vector(self, d, size, dictionary=None, tfm='norm_p', idfm='idf'):
        """Return document ``d`` as a dense vector of length ``size``.

        ``dictionary`` maps tag -> column (defaults to ``self.dictionary``).
        A tag missing from the dictionary raises KeyError.
        """
        if dictionary is None:
            dictionary = self.dictionary
        v = np.zeros(size)
        tags, wts = self.bag(d, tfm=tfm, idfm=idfm)
        for i, t in enumerate(tags):
            pos = dictionary[t]
            if v[pos] == 0:
                v[pos] = wts[i]
            else:
                # Column already filled (repeated tag): average with it.
                v[pos] = (v[pos] + wts[i]) / 2
        return v

    def dataset_matrix(self, size, dictionary=None, tfm='norm_p', idfm='idf'):
        """Stack ``doc_vector`` of every document, in ascending index order."""
        return np.array([
            self.doc_vector(d, size=size, dictionary=dictionary, tfm=tfm, idfm=idfm)
            for d in sorted(self.docs)
        ])

    def visualize(self, matrix, axes, labels=None, p1=None, p2=None):
        """Scatter two principal components of ``matrix`` onto ``axes``.

        matrix -- 2-D array, one row per sample.
        axes   -- object exposing ``scatter`` (presumably a matplotlib Axes
                  -- confirm with callers).
        labels -- optional per-row colour values passed as ``c``.
        p1, p2 -- component indexes for the x and y axis (default 0 and 1).
        """
        if p1 is None:
            p1 = 0
        if p2 is None:
            p2 = 1
        # Fit enough components for whichever requested index is larger.
        pca = PCA(n_components=max(p1, p2) + 1)
        pca.fit(matrix)
        m = pca.transform(matrix)
        if labels is None:
            axes.scatter(m[:, [p1]], m[:, [p2]], alpha=0.4)
        else:
            axes.scatter(m[:, [p1]], m[:, [p2]], alpha=0.4, c=labels)
                
        
        
