## 1.3 Introduction to Information Retrieval

Here we work with an eBay item data set.  The data contains 9895 item titles and descriptions.

First we load the data - this is easiest with a `csv.reader`:

In [160]:
import csv
import re
from collections import Counter

with open("data/bike-items.txt") as f:
    r = csv.reader(f, delimiter=',', quotechar='"')
    rgx = re.compile(r'\b[a-zA-Z]+\b') 
    docs = [ (' '.join(re.findall(rgx, x[0])).lower(), ' '.join(re.findall(rgx, x[1])).lower())  for i,x in enumerate(r) if i > 1 ]

print('We have a list of (item title, description) tuple :\n + %s\n + %s' % (docs[0][0],docs[0][1]))

items_t = [ d[0] for d in docs ] # item titles
items_d = [ d[1] for d in docs ] # item descriptions
items_i = range(0, len(items_t)) # item id


We have a list of (item title, description) tuple :
 + cycling bicycle mtb bike fixie gloss carbon fiber riser bar handlebar
 + description feature easy to use made of high quality carbon fiber with the special design can save for a long time the carbon fiber handlebar is made of high quality carbon fiber so that you can use it relieved this quick disassembling carbon fiber handlebar is easy to use and one of the best gifts to your friends specification material carbon fiber color black handlebar clamp diameter mm length package included x cycling carbon fiber rise


9893

## Wordcount across all documents

In [150]:
# Without counters
wc = {}
for d in items_d:
    for w in d.split(' '):
        if w in wc:
            wc[w] += 1
        else:
            wc[w] = 1

print(wc['bike'])
print(wc['carbon'])

# With counters
wc = Counter()
for d in items_d:
    for w in d.split(' '):
        wc[w] += 1
        
print(wc['bike'])
print(wc['carbon'])

5803
1068
5803
1068


## Term frequency matrix

We can start by creating a document by document word count:

In [153]:
tf = {}
for i, d in enumerate(items_d):
    dtf = Counter()
    for w in d.split(' '):
            dtf[w] += 1
    tf[i] = dtf
        
print(tf[1])

Counter({'the': 5, 'to': 4, 'paypal': 3, 'only': 2, 'please': 2, 'destinations': 2, 'ship': 2, 'shipping': 2, 'address': 2, 'in': 2, 'your': 2, 'set': 1, 'checkout': 1, 'wheel': 1, 'payment': 1, 'rates': 1, 'listed': 1, 'item': 1, 'is': 1, 'clyde': 1, 'us': 1, 'time': 1, 'quote': 1, 'of': 1, 'account': 1, 'united': 1, 'before': 1, 'states': 1, 'are': 1, 'threw': 1, 'other': 1, 'speed': 1, 'a': 1, 'message': 1, 'within': 1, 'hub': 1, 'orders': 1, 'commercial': 1, 'verify': 1, 'or': 1, 'we': 1, 'most': 1, 'residential': 1, 'red': 1, 'send': 1, 'cycles': 1, 'at': 1, 'correct': 1, 'verified': 1, 'that': 1, 'after': 1, 'james': 1, 'continental': 1, 'sent': 1, 'receiving': 1, 'x': 1, 'for': 1, 'internal': 1, 'days': 1, 'payme': 1, 'making': 1})


Ideally our term-frequency is a matrix not a ragged array!  Each document should be a vector that has an element for each document in the corpus.  

In [187]:
def get_lexicon(corpus):
    lexicon = set()
    for doc in corpus:
        lexicon.update([word for word in doc.split()])
    return lexicon

corpus = ['the quick brown fox','mary had a little lamb','the owl and the pussycat']
lexicon = get_lexicon(corpus)

tfm =[]
for doc in corpus:
    for term in doc.split():
        tfv = [doc.split().count(word) for word in lexicon]
    tfm.append(tfv)
        
print(tfm)  

[[0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1], [1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0], [0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 1, 0]]


As number of terms increases this method becomes inefficient.  Here is a faster implementation:

In [188]:
def get_lexicon(corpus):
    lexicon = set()
    for doc in corpus:
        lexicon.update([word for word in doc.split()])
    return list(lexicon)

corpus = ['the quick brown fox','mary had a little lamb','the owl and the pussycat']
lexicon = get_lexicon(corpus)

tfm =[]
for doc in corpus:
    tfv = [0]*len(lexicon)
    for term in doc.split():
        tfv[lexicon.index(term)] += 1
    tfm.append(tfv)
        
print(tfm)  

[[0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1], [1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0], [0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 1, 0]]


Lets compare the time of each method

In [190]:
def tfm1(corpus):
    
    def get_lexicon(corpus):
        lexicon = set()
        for doc in corpus:
            lexicon.update([word for word in doc.split()])
        return lexicon
    
    lexicon = get_lexicon(corpus)

    tfm =[]
    for doc in corpus:
        for term in doc.split():
            tfv = [doc.split().count(word) for word in lexicon]
        tfm.append(tfv)
    
    return tfm

def tfm2(corpus):
    
    def get_lexicon(corpus):
        lexicon = set()
        for doc in corpus:
            lexicon.update([word for word in doc.split()])
        return list(lexicon)

    lexicon = get_lexicon(corpus)

    tfm =[]
    for doc in corpus:
        tfv = [0]*len(lexicon)
        for term in doc.split():
            tfv[lexicon.index(term)] += 1
        tfm.append(tfv)
    
    return tfm

corpus = ['the quick brown fox','mary had a little lamb','the owl and the pussycat']

%timeit tfm1(corpus)
%timeit tfm2(corpus)


10000 loops, best of 3: 83.2 µs per loop
100000 loops, best of 3: 11.1 µs per loop


## Term frequency matrix

In [191]:
tfm = tfm2(items_d)

In [200]:
print(len(tfm), len(tfm[0]))
tfm[0].count(0)

9893 15841


15791

We have high percentage of zero elements - we should really find better way to store the tf matrix.  How about numpy 