In [1]:
import os
import nltk

import numpy as np

from collections import OrderedDict, defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Load every caption file into a single space-joined string per lecture,
# keyed by the caption file name.
captions_dir = '../../../data/Introduction_to_Databases_Captions'
documents = OrderedDict()

# os.listdir() returns entries in arbitrary order; sorting makes the
# insertion order of `documents` reproducible across machines and runs.
for name in sorted(os.listdir(captions_dir)):
    with open(os.path.join(captions_dir, name)) as f:
        # Strip surrounding '|' characters, spaces and newlines from each line.
        documents[name] = ' '.join(line.strip('| \n') for line in f)
        
# Keywords per lecture, read from the CSV files in data/keywords.
# Each key is normalised before being stored: hyphens become spaces, every
# token is Porter-stemmed, and the result is lower-cased.
stemmer = nltk.stem.PorterStemmer()
keywords = defaultdict(set)

keywords_dir = '../../../data/keywords/'
for name in os.listdir(keywords_dir):
    with open(keywords_dir + name) as f:
        for line in f:
            # Four comma-separated fields per row; the third one is ignored.
            lecture_name, key, _, rank = line.strip().split(',')
            # Not used afterwards, but still raises on a non-integer rank.
            rank = int(rank)
            key_tokens = nltk.word_tokenize(key.replace('-', ' '))
            key = ' '.join(stemmer.stem(token) for token in key_tokens).lower()
            keywords[lecture_name].add(key)

In [17]:
# Stemmed stop words read from smart.txt. Kept as a list because it is also
# passed to TfidfVectorizer(stop_words=...) further down the notebook; the set
# copy below is only used for fast membership tests in this cell.
stopwords = []
with open('../../../data/smart.txt') as f:
    for line in f:
        tokens = nltk.word_tokenize(line.strip())
        stopwords.append(' '.join(stemmer.stem(token) for token in tokens))
stopword_set = set(stopwords)

# Accumulate externally computed scores per stemmed multi-word phrase.
# Each line of tfidf_log.out has the form "<phrase>:<score>".
external_tfidfs = Counter()
with open('tfidf_log.out') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line would otherwise fail the two-field unpacking below.
            continue
        # Split on the LAST colon so a phrase that itself contains ':' is kept whole.
        word, tfidf = line.rsplit(':', 1)
        tokens = nltk.word_tokenize(word)
        # Length checks come first: indexing [0] / [-1] on an empty token
        # list would raise IndexError.
        if len(tokens) < 2:
            continue
        if len(word) < 4:
            continue
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        # Drop phrases that begin or end with a stop word.
        if stemmed_tokens[0] in stopword_set or stemmed_tokens[-1] in stopword_set:
            continue
        external_tfidfs[' '.join(stemmed_tokens)] += float(tfidf)

# Highest score first, ties broken by phrase (descending). Indexing the pair
# instead of using a tuple-parameter lambda keeps this valid on Python 3 too.
sorted(external_tfidfs.items(), key=lambda item: (item[1], item[0]), reverse=True)

[(u'multi valu depend', 1083615.0372300001),
 (u'insert command', 707193.69361),
 (u'multivalu depend', 689619.0859930001),
 (u'project oper', 590944.77306),
 (u'tupl base constraint', 541807.5186150001),
 (u'modif command', 541807.5186150001),
 (u'book sub element', 541807.5186150001),
 (u'author element', 541807.5186150001),
 (u'transit rule', 534199.35301),
 (u'intersect oper', 522150.877535),
 (u'relat algebra express', 514545.96228800004),
 (u'isol level', 490936.59172300005),
 (u'function depend', 473030.468914),
 (u'cs applic', 465570.303777),
 (u'close tag', 465103.240337),
 (u'aggreg function', 463055.75694500003),
 (u'student id', 456515.445573),
 (u'special attribut', 450994.003364),
 (u'express languag', 435175.875601),
 (u'databas design', 423099.10559700005),
 (u'xslt specif', 361205.01241),
 (u'xml schema descriptor', 361205.01241),
 (u'student tupl', 361205.01241),
 (u'standford student', 361205.01241),
 (u'sql video', 361205.01241),
 (u'specif tupl', 361205.01241),
 (u

In [18]:
# Normalise every lecture text the same way as the keywords: hyphens become
# spaces, each token is Porter-stemmed, and the result is lower-cased.
stemmed_documents = OrderedDict()

for name, document in documents.items():
    doc_tokens = nltk.word_tokenize(document.replace('-', ' '))
    stemmed_text = ' '.join([stemmer.stem(token) for token in doc_tokens])
    stemmed_documents[name] = stemmed_text.lower()

# tf-idf over 1- to 4-grams of the stemmed texts, using the stemmed stop words.
extractor = TfidfVectorizer(ngram_range=(1,4), stop_words=stopwords)
X = extractor.fit_transform(stemmed_documents.values())

# Rescale each row by its own sum.
row_totals = X.sum(axis=1)
X = X / row_totals

# Reverse lookup: column index -> n-gram.
inverse_voc = dict((column, term) for term, column in extractor.vocabulary_.items())

In [19]:
# Pair each external phrase that exists in the vectorizer vocabulary with its
# column index, walking the phrases in sorted order so indices and values
# stay aligned.
in_vocabulary = [(extractor.vocabulary_[word], value)
                 for word, value in sorted(external_tfidfs.items())
                 if word in extractor.vocabulary_]
external_tfidf_idxs = [column for column, value in in_vocabulary]
external_tfidf_vals = np.array([value for column, value in in_vocabulary])

# Scale the external scores so they sum to one, then add them to the matching
# columns of every document row.
# NOTE: X is modified in place, so re-running this cell adds the boost again.
external_tfidf_vals /= external_tfidf_vals.sum()
X[:, external_tfidf_idxs] += external_tfidf_vals

In [20]:
# For every lecture, list all vocabulary n-grams ordered by ascending score,
# so the strongest candidates sit at the END of each list.
extracted = OrderedDict()
for doc_idx, name in enumerate(stemmed_documents.keys()):
    row_scores = np.array(X[doc_idx,:])
    ascending_columns = np.argsort(row_scores).flatten()
    extracted[name] = [inverse_voc[idx] for idx in ascending_columns]

In [21]:
# Compare each lecture's keyword set against its top-k extracted n-grams,
# where k is the number of keywords for that lecture.
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    lecture = name.replace('.keys', '')
    # The highest-scoring n-grams are the last entries of the list.
    extracted_words = extracted[name][-len(keys):]
    common = set(keys).intersection(extracted_words)
    combined = set(keys).union(set(extracted_words))
    matches[lecture] = list(common)
    jaccard[lecture] = float(len(common)) / len(combined)

print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

Average jaccard: 0.12790217467


In [14]:
# Same evaluation as the top-k cell, but with twice as many extracted
# n-grams per lecture (2 * number of keywords).
# This rebinds `matches` and `jaccard`.
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    lecture = name.replace('.keys', '')
    # The highest-scoring n-grams are the last entries of the list.
    extracted_words = extracted[name][-2*len(keys):]
    common = set(keys).intersection(extracted_words)
    combined = set(keys).union(set(extracted_words))
    matches[lecture] = list(common)
    jaccard[lecture] = float(len(common)) / len(combined)

print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

Average jaccard: 0.104138252084


In [None]:
# Write the ten best n-grams of every lecture as "document,key,rank" rows.
# The first five get ranks 1..5; the remaining five are written with rank 0.
with open('tfidf_combined_keys.csv', 'w') as f:
    for document, keys in sorted(extracted.items()):
        # Lists are stored worst-to-best, so reverse before taking the top ten.
        best_first = keys[::-1][:10]
        for position, key in enumerate(best_first, 1):
            rank = position if position <= 5 else 0
            f.write('{},{},{}\n'.format(document, key, rank))

In [None]:
# 0-based position of the phrase when this lecture's n-grams are read
# best-first (the stored list is in ascending score order).
extracted['03_01_Well_Formed_XML_Part_2.txt'][::-1].index('extens style sheet languag')

In [None]:
# 0-based position of a phrase in the external score list, highest score first.
# Avoids the tuple-parameter lambda and subscripting zip(), both of which
# only work on Python 2.
ranked_phrases = [phrase for phrase, score in
                  sorted(external_tfidfs.items(), key=lambda item: (item[1], item[0]), reverse=True)]
ranked_phrases.index('document type descriptor')

In [None]:
# Number of keywords loaded for this lecture. `keywords` is a defaultdict, so
# indexing an unknown name would silently insert an empty set that the
# evaluation loops over keywords.items() would then pick up; .get() only reads.
len(keywords.get('03_02_DTDs_IDs_and_IDREFs.txt', ()))

In [None]:
# How many times the stemmed phrase occurs in the stemmed lecture text.
isolation_text = stemmed_documents['11_03_Isolation_Levels_Part_3.txt']
isolation_text.count('repeat read isol level')

In [24]:
# All n-grams for this lecture, best-scoring first (stored order is ascending).
extracted['03_02_DTDs_IDs_and_IDREFs.txt'][::-1]

[u'dtd',
 u'book',
 u'element',
 u'document',
 u'xml',
 u'author',
 u'valid',
 u'id',
 u'id ref',
 u'remark',
 u'attribut',
 u'author element',
 u'string',
 u'refer',
 u'book refer',
 u'ref',
 u'book element',
 u'type',
 u'data',
 u'titl',
 u'document type descriptor',
 u'pc data',
 u'structur',
 u'xsd',
 u'magazin',
 u'edit',
 u'pc',
 u'xml schema',
 u'close tag',
 u'subel',
 u'id ref attribut',
 u'insert command',
 u'id attribut',
 u'hg',
 u'valid document',
 u'multivalu depend',
 u'special attribut',
 u'project oper',
 u'pointer',
 u'regular express',
 u'valid xml',
 u'ju',
 u'element call',
 u'specif',
 u'bookstor',
 u'document document',
 u'isbn number',
 u'id rep',
 u'rep attribut',
 u'll',
 u'modif command',
 u'tupl base constraint',
 u'type descriptor',
 u'descriptor',
 u'basic structur',
 u'transit rule',
 u'specif specif',
 u'author author',
 u'intersect oper',
 u'isbn',
 u'requir',
 u'error',
 u'element element',
 u'relat algebra express',
 u'special type',
 u'document type'

In [22]:
# Display the matched keywords per lecture, as left in `matches` by whichever
# evaluation cell ran last (two cells in this notebook assign it).
matches

OrderedDict([('01_01_Introduction.txt',
              [u'applic',
               u'databas administr',
               u'databas',
               u'data model',
               u'databas design',
               u'databas manag system',
               u'middl ware',
               u'user',
               u'databas system',
               u'data']),
             ('02_01_The_Relational_Model.txt',
              [u'relat model',
               u'relat',
               u'queri',
               u'tabl',
               u'key',
               u'tupl',
               u'null',
               u'attribut']),
             ('02_02_Querying_Relational_Databases.txt',
              [u'specif queri',
               u'relat databas',
               u'queri',
               u'relat',
               u'queri languag',
               u'sql',
               u'relat algebra']),
             ('03_01_Well_Formed_XML_Part_1.txt',
              [u'open tag',
               u'xml',
               u'remark',
        

In [None]:
# `reverse` is not defined in the visible notebook (NameError); the built-in
# is `reversed`, wrapped in list() as in the other inspection cells.
# TODO: fill in the lecture file name -- the empty key is a placeholder.
list(reversed(extracted['']))