In [19]:
import os
import nltk

import numpy as np

from collections import OrderedDict, defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [20]:
# Read every caption file into a single space-joined string, keyed by the
# file name, in whatever order os.listdir yields the files.
documents = OrderedDict()

for name in os.listdir('../../../data/Introduction_to_Databases_Captions'):
    with open('../../../data/Introduction_to_Databases_Captions/' + name) as caption_file:
        # Pipes, spaces and newlines are trimmed from both ends of each line.
        documents[name] = ' '.join([raw.strip('| \n') for raw in caption_file])

# Reference key phrases per lecture, normalized (hyphens -> spaces, stemmed,
# lowercased) and stored as sets so repeated phrases collapse.
keywords = defaultdict(set)
stemmer = nltk.stem.PorterStemmer()

for name in os.listdir('../../../data/keywords/'):
    with open('../../../data/keywords/' + name) as keyword_file:
        for record in keyword_file:
            # Each record has exactly four comma-separated fields; the third is ignored.
            lecture_name, phrase, _, rank = record.strip().split(',')
            # The rank is not stored anywhere, but the conversion still
            # rejects records whose last field is not an integer.
            rank = int(rank)
            stems = [stemmer.stem(token) for token in nltk.word_tokenize(phrase.replace('-', ' '))]
            keywords[lecture_name].add(' '.join(stems).lower())

In [28]:
# Stop words, one per line of the stop-word file, tokenized and stemmed so they
# are comparable with the stemmed tokens produced below. `stopwords` stays a
# list because it is also handed to TfidfVectorizer(stop_words=...) later on.
stopwords = []
with open('../../../data/smart.txt') as f:
    for line in f:
        line = line.strip()
        tokens = nltk.word_tokenize(line)
        stopwords.append(' '.join([stemmer.stem(token) for token in tokens]))

# Set copy used only for membership tests, so each lookup in the loop below
# does not scan the whole list.
stopword_set = set(stopwords)

# Accumulated external score per stemmed term; terms that stem to the same
# string have their scores summed.
external_tfidfs = Counter()
with open('tfidf_log.out') as f:
    for line in f:
        # Each line is expected to look like "<term>:<score>".
        word, tfidf = line.strip().split(':')
        # Length filter first: it is cheap, and it keeps an empty term from
        # reaching the token indexing below (which would raise IndexError).
        if len(word) < 3: continue
        tokens = nltk.word_tokenize(word)
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        # Guard against a term that tokenizes to nothing.
        if not stemmed_tokens: continue
        # Drop terms that start or end with a stop word.
        if stemmed_tokens[0] in stopword_set or stemmed_tokens[-1] in stopword_set: continue
        stemmed = ' '.join(stemmed_tokens)
        external_tfidfs[stemmed] += float(tfidf)

# Highest score first, ties broken by term in reverse alphabetical order.
# The key indexes the (term, score) pair instead of unpacking it in the lambda
# signature, which is a syntax error on Python 3.
sorted(external_tfidfs.items(), key=lambda item: (item[1], item[0]), reverse=True)

[(u'multi valu depend', 1083615.0372300001),
 (u'edit', 900412.789572),
 (u'author', 884180.892396),
 (u'design', 847849.7994469999),
 (u'oper', 755971.6199159999),
 (u'insert command', 707193.69361),
 (u'multivalu depend', 689619.0859930001),
 (u'project', 662592.363077),
 (u'applic', 653034.821672),
 (u'limit', 651003.077309),
 (u'program', 611817.210849),
 (u'project oper', 590944.77306),
 (u'test', 576645.1851369999),
 (u'set', 574611.404104),
 (u'manag', 560077.9220189999),
 (u'tupl base constraint', 541807.5186150001),
 (u'modif command', 541807.5186150001),
 (u'book sub element', 541807.5186150001),
 (u'author element', 541807.5186150001),
 (u'group', 537313.70686),
 (u'transit rule', 534199.35301),
 (u'roll', 533554.743148),
 (u'construct', 530772.516812),
 (u'relat', 529830.267845),
 (u'function', 525760.996594),
 (u'condit', 523150.552487),
 (u'type', 522992.594945),
 (u'intersect oper', 522150.877535),
 (u'count', 520795.613291),
 (u'relat algebra express', 514545.9622880000

In [29]:
# Normalize each transcript: hyphens become spaces, every token is stemmed,
# and the re-joined text is lowercased.
stemmed_documents = OrderedDict()

for name, document in documents.items():
    tokens = nltk.word_tokenize(document.replace('-', ' '))
    stems = [stemmer.stem(token) for token in tokens]
    stemmed_documents[name] = ' '.join(stems).lower()

# TF-IDF over 1- to 4-grams of the stemmed transcripts, with the stemmed stop
# words excluded.
extractor = TfidfVectorizer(ngram_range=(1,4), stop_words=stopwords)
X = extractor.fit_transform(stemmed_documents.values())
# Divide every row by its own sum (assumes no document row sums to zero).
X = X / X.sum(axis=1)
# Reverse lookup: column index -> vocabulary term.
inverse_voc = dict((index, term) for term, index in extractor.vocabulary_.items())

In [30]:
# Walk the externally scored terms once, in alphabetical order, keeping only
# those present in the vectorizer vocabulary, paired with their column index.
matched_terms = [
    (extractor.vocabulary_[word], value)
    for word, value in sorted(external_tfidfs.items())
    if word in extractor.vocabulary_
]
external_tfidf_idxs = [pair[0] for pair in matched_terms]
external_tfidf_vals = np.array([pair[1] for pair in matched_terms])
# Rescale the external scores so they sum to one.
external_tfidf_vals /= external_tfidf_vals.sum()
# Add the rescaled scores to the matching columns of every document row.
# NOTE: this modifies X in place, so re-running the cell adds the scores again.
X[:, external_tfidf_idxs] += external_tfidf_vals

In [31]:
# For every document, list the whole vocabulary ordered by ascending weight in
# that document's row of X, so the highest-weighted terms sit at the end.
extracted = OrderedDict()
for row, name in enumerate(stemmed_documents):
    ascending = np.argsort(np.array(X[row, :])).flatten()
    extracted[name] = [inverse_voc[column] for column in ascending]

In [32]:
# Compare the reference key phrases of each lecture with as many extracted
# terms as there are reference phrases, and score the overlap.
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    lecture = name.replace('.keys', '')
    reference = set(keys)
    # Last len(keys) entries of the lecture's extracted term list.
    candidates = extracted[name][-len(keys):]
    shared = reference.intersection(candidates)
    combined = reference.union(set(candidates))
    matches[lecture] = list(shared)
    # Jaccard index: |intersection| / |union|.
    jaccard[lecture] = float(len(shared)) / len(combined)

print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

Average jaccard: 0.118874887844


In [33]:
# Same comparison as the top-N evaluation, but against twice as many extracted
# terms as there are reference phrases for the lecture.
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    lecture = name.replace('.keys', '')
    reference = set(keys)
    # Last 2 * len(keys) entries of the lecture's extracted term list.
    candidates = extracted[name][-2*len(keys):]
    shared = reference.intersection(candidates)
    combined = reference.union(set(candidates))
    matches[lecture] = list(shared)
    # Jaccard index: |intersection| / |union|.
    jaccard[lecture] = float(len(shared)) / len(combined)

print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

Average jaccard: 0.117225482021


In [34]:
# Write "<document>,<term>,<rank>" lines for the last ten entries of each
# document's extracted list, taken from the end backwards. The first five
# written terms get ranks 1-5; the remaining five get rank 0.
with open('tfidf_combined_keys.csv', 'w') as out:
    for document, ranked in sorted(extracted.items()):
        top_terms = ranked[-10:][::-1]
        for position, term in enumerate(top_terms, 1):
            rank = position if position <= 5 else 0
            out.write('{},{},{}\n'.format(document, term, rank))