In [2]:
import os
import nltk

import numpy as np

from collections import OrderedDict, defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [9]:
documents = OrderedDict()

for name in os.listdir('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat'):
    with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat/' + name) as f:
        lines = []
        for line in f:
            lines.append(line.strip('| \n'))
        documents[name] = (' '.join(lines))
        
keywords = defaultdict(set)
stemmer = nltk.stem.PorterStemmer()

for name in os.listdir('/Users/andrewlamb/Google_Drive/Stanford/CS199/studyguides/data/keywords'):
    with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/studyguides/data/keywords/' + name) as f:
        for line in f:
            lecture_name, key, _, rank = line.strip().split(',')
            rank = int(rank)
            if rank > -1:
                key = ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(key))
                key = key.lower()
                key = key.replace('-', ' ').replace('`', '').replace("'", '').strip()
                keywords[lecture_name].add(key)

In [12]:
stopwords = []
with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/RAKE-tutorial/SmartStoplistAdditional.txt') as f:
    for line in f:
        stopwords.append(line.strip())
        
external_tfidfs = Counter()
with open('tfidf_log.out') as f:
    for line in f:
        word, tfidf = line.strip().split(':')
        tokens = nltk.word_tokenize(word)
        if tokens[0] in stopwords or tokens[-1] in stopwords: continue
        if len(tokens) < 2: continue
        if len(word) < 4: continue
        stemmed = ' '.join(stemmer.stem(token) for token in tokens)
        external_tfidfs[stemmed] += float(tfidf)

sorted(external_tfidfs.items(), key=lambda (k, v): (v, k), reverse=True)

[(u'multi valu depend', 1083615.0372300001),
 (u'insert command', 707193.69361),
 (u'multivalu depend', 689619.0859930001),
 (u'project oper', 590944.77306),
 (u'tupl base constraint', 541807.5186150001),
 (u'modif command', 541807.5186150001),
 (u'book sub element', 541807.5186150001),
 (u'author element', 541807.5186150001),
 (u'transit rule', 534199.35301),
 (u'intersect oper', 522150.877535),
 (u'relat algebra express', 514545.96228800004),
 (u'isol level', 490936.59172300005),
 (u'function depend', 473030.468914),
 (u'associ class', 472423.62180900003),
 (u'cs applic', 465570.303777),
 (u'close tag', 465103.240337),
 (u'aggreg function', 463055.75694500003),
 (u'student id', 456515.445573),
 (u'special attribut', 450994.003364),
 (u'express languag', 435175.875601),
 (u'databas design', 423099.10559700005),
 (u'xslt specif', 361205.01241),
 (u'xml schema descriptor', 361205.01241),
 (u'student tupl', 361205.01241),
 (u'standford student', 361205.01241),
 (u'sql video', 361205.0124

In [35]:
stemmed_documents = OrderedDict()

for name, document in documents.items():
    document = document.replace('-', ' ')
    document = ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(document)])
    stemmed_documents[name] = document.lower()

extractor = TfidfVectorizer(ngram_range=(1,4), stop_words=stopwords)
X = extractor.fit_transform(stemmed_documents.values())
X = X / X.sum(axis=1)
inverse_voc = {v: k for k, v in extractor.vocabulary_.items()}

In [24]:
external_tfidf_idxs = [extractor.vocabulary_[word] for word in sorted(external_tfidfs.keys()) if word in extractor.vocabulary_]
external_tfidf_vals = np.array([value for word, value in sorted(external_tfidfs.items()) if word in extractor.vocabulary_])
external_tfidf_vals /= external_tfidf_vals.sum()
# X[:, external_tfidf_idxs] += external_tfidf_vals

In [36]:
extracted = OrderedDict()
for doc_idx, name in enumerate(stemmed_documents.keys()):
    extracted[name] = [inverse_voc[idx] for idx in np.argsort(np.array(X[doc_idx,:])).flatten()]

In [37]:
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    extracted_words = extracted[name][-len(keys):]
    matches[name.replace('.keys', '')] = list(set(keys).intersection(extracted_words))
    jaccard[name.replace('.keys', '')] = float(len(set(keys).intersection(extracted_words))) / len(set(keys).union(set(extracted_words)))
    
print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

Average jaccard: 0.125600212567


In [38]:
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    extracted_words = extracted[name][-2*len(keys):]
    matches[name.replace('.keys', '')] = list(set(keys).intersection(extracted_words))
    jaccard[name.replace('.keys', '')] = float(len(set(keys).intersection(extracted_words))) / len(set(keys).union(set(extracted_words)))
    
print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

Average jaccard: 0.111878588632


In [39]:
with open('tfidf_keys.csv', 'w') as f:
    for document, keys in sorted(extracted.items()):
        keys = list(reversed(keys))
        for i, key in enumerate(keys[:10]):
            if i < 5:
                f.write('{},{},{}\n'.format(document, key, i + 1))
            else:
                f.write('{},{},{}\n'.format(document, key, 0))