In [1]:
import os
import nltk

import numpy as np

from collections import OrderedDict, defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Read every file in the `Flat` directory into `documents`, keyed by file
# name, with each file collapsed into a single space-joined string.
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory.
FLAT_DIR = '/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat'

documents = OrderedDict()

# os.listdir returns entries in arbitrary order; sorting makes the insertion
# order of `documents` (and any later positional lookup) reproducible.
for name in sorted(os.listdir(FLAT_DIR)):
    with open(os.path.join(FLAT_DIR, name)) as f:
        # Trim pipes, spaces and newlines from both ends of each line.
        documents[name] = ' '.join(line.strip('| \n') for line in f)
        
# Build `keywords`: a set of normalised key phrases per lecture name.
# Each input line is split into four comma-separated fields; the first is
# the lecture name, the second the phrase, the third is ignored and the
# fourth is an integer rank. Lines with a negative rank are skipped.
stemmer = nltk.stem.PorterStemmer()
keywords = defaultdict(set)

for key_file in os.listdir('/Users/andrewlamb/Google_Drive/Stanford/CS199/studyguides/data/keywords'):
    with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/studyguides/data/keywords/' + key_file) as handle:
        for row in handle:
            lecture_name, phrase, _, rank = row.strip().split(',')
            if int(rank) < 0:
                continue
            # Normalise: hyphens -> spaces, stem each token, then lower-case.
            tokens = nltk.word_tokenize(phrase.replace('-', ' '))
            stemmed_phrase = ' '.join(stemmer.stem(token) for token in tokens)
            keywords[lecture_name].add(stemmed_phrase.lower())

In [22]:
# Load the stop list, one entry per line. `stopwords` stays a list because
# it is also handed to TfidfVectorizer(stop_words=...) later on.
stopwords = []
with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/RAKE-tutorial/SmartStoplistAdditional.txt') as f:
    for line in f:
        stopwords.append(line.strip())

# Set copy used only for the per-line membership tests below.
stopword_lookup = set(stopwords)

# Accumulate externally computed scores per stemmed multi-word term.
# Each line of the input is "<term>:<score>".
external_tfidfs = Counter()
with open('tfidf_tf_log.out') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines instead of failing to unpack
        # Split on the last colon only, so a colon inside the term does not
        # break the unpacking; the score is always the final field.
        word, tfidf = line.rsplit(':', 1)
        tokens = nltk.word_tokenize(word)
        # Check the length first: an empty token list would otherwise raise
        # IndexError in the stop-word test.
        if len(tokens) < 2: continue
        if len(word) < 4: continue
        # Drop terms that begin or end with a stop word.
        if tokens[0] in stopword_lookup or tokens[-1] in stopword_lookup: continue
        stemmed = ' '.join(stemmer.stem(token) for token in tokens)
        external_tfidfs[stemmed] += float(tfidf)

# Highest score first, ties broken by term in descending order. The key
# avoids tuple-parameter unpacking, which is Python-2-only syntax.
sorted(external_tfidfs.items(), key=lambda item: (item[1], item[0]), reverse=True)

[(u'multi valu depend', 331549013952.0),
 (u'insert command', 165777991514.2),
 (u'tupl base constraint', 165774506976.0),
 (u'modif command', 165774506976.0),
 (u'book sub element', 165774506976.0),
 (u'author element', 165774506976.0),
 (u'transit rule', 138147164749.1),
 (u'multivalu depend', 135393059867.7),
 (u'xslt specif', 110516337984.0),
 (u'xml schema descriptor', 110516337984.0),
 (u'student tupl', 110516337984.0),
 (u'standford student', 110516337984.0),
 (u'sql video', 110516337984.0),
 (u'specif tupl', 110516337984.0),
 (u'select privileg', 110516337984.0),
 (u'select express', 110516337984.0),
 (u'relat s1', 110516337984.0),
 (u'relat algebra video', 110516337984.0),
 (u'referenti integr violat', 110516337984.0),
 (u'queri n time', 110516337984.0),
 (u'post risk', 110516337984.0),
 (u'physic design advisor', 110516337984.0),
 (u'natur function depend', 110516337984.0),
 (u'modif statement', 110516337984.0),
 (u'mega relat', 110516337984.0),
 (u'label valu pair', 11051633

In [23]:
# Align the external scores with the vectorizer's vocabulary: for every
# external term the vectorizer also knows (visited in sorted term order),
# record its column index and its score, then rescale the scores to sum to 1.
# NOTE(review): `extractor` is only assigned further down, in the cell that
# fits the TfidfVectorizer, so on a fresh kernel this cell raises NameError
# when the notebook is run top to bottom.
known_pairs = [
    (extractor.vocabulary_[term], score)
    for term, score in sorted(external_tfidfs.items())
    if term in extractor.vocabulary_
]
external_tfidf_idxs = [pair[0] for pair in known_pairs]
external_tfidf_vals = np.array([pair[1] for pair in known_pairs])
external_tfidf_vals /= external_tfidf_vals.sum()

In [24]:
# Normalise every document: hyphens become spaces, each token is
# Porter-stemmed, and the result is lower-cased.
stemmed_documents = OrderedDict()

for name, document in documents.items():
    document = document.replace('-', ' ')
    document = ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(document)])
    stemmed_documents[name] = document.lower()

# TF-IDF over 1- to 4-grams of the stemmed text.
extractor = TfidfVectorizer(ngram_range=(1,4), stop_words=stopwords)

# Work on an explicit dense ndarray: the result type of dividing a sparse
# matrix by a dense one is not stable across SciPy releases, and the in-place
# column update below needs plain array semantics.
# NOTE(review): this allocates n_documents x vocabulary_size floats.
X = extractor.fit_transform(stemmed_documents.values()).toarray()

# Rescale each row to sum to 1. Rows without any term keep their zeros
# instead of turning into NaN through 0/0.
row_totals = X.sum(axis=1, keepdims=True)
row_totals[row_totals == 0] = 1.0
X = X / row_totals

# Map the external scores onto the vocabulary right after fitting, so the
# column indices always belong to the `extractor` fitted above and never to
# an earlier fit.
vocabulary = extractor.vocabulary_
shared_terms = sorted(term for term in external_tfidfs if term in vocabulary)
external_tfidf_idxs = [vocabulary[term] for term in shared_terms]
external_tfidf_vals = np.array([external_tfidfs[term] for term in shared_terms], dtype=float)
external_total = external_tfidf_vals.sum()
if external_total:
    external_tfidf_vals /= external_total

# Add the same normalised external weight to those columns in every row.
X[:, external_tfidf_idxs] += external_tfidf_vals
inverse_voc = {v: k for k, v in extractor.vocabulary_.items()}

# For each document, list the whole vocabulary ordered from lowest to highest
# combined weight, so the strongest candidates sit at the end of each list.
extracted = OrderedDict()
for doc_idx, name in enumerate(stemmed_documents.keys()):
    extracted[name] = [inverse_voc[idx] for idx in np.argsort(X[doc_idx])]

In [25]:
# Compare, per entry of `keywords`, the reference key set with the tail of
# `extracted[name]` cut to the same length as the reference set. Stores the
# overlapping terms in `matches` and the Jaccard index (|A & B| / |A | B|)
# in `jaccard`, both keyed by the name with any '.keys' removed.
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    reference = set(keys)
    candidates = extracted[name][-len(keys):]
    overlap = reference.intersection(candidates)
    label = name.replace('.keys', '')
    matches[label] = list(overlap)
    jaccard[label] = float(len(overlap)) / len(reference.union(set(candidates)))

print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

Average jaccard: 0.120458904278


In [None]:
def score_extraction(keywords, extracted, size_factor=1):
    """Compare reference key sets with the tail of each extracted term list.

    For every ``(name, keys)`` pair in ``keywords`` (visited in sorted
    order) the last ``size_factor * len(keys)`` entries of
    ``extracted[name]`` are taken as candidates.

    Args:
        keywords: mapping of name -> collection of reference key phrases.
        extracted: mapping of name -> list of terms; the candidates are
            taken from the end of the list.
        size_factor: how many candidates to take per reference key.

    Returns:
        ``(matches, jaccard)``: two OrderedDicts keyed by the name with any
        ``'.keys'`` removed. ``matches`` holds the list of overlapping
        terms, ``jaccard`` the Jaccard index of the reference set and the
        candidates (0.0 when both are empty).

    Raises:
        KeyError: if a name in ``keywords`` is missing from ``extracted``.
    """
    matches = OrderedDict()
    jaccard = OrderedDict()
    for name, keys in sorted(keywords.items()):
        reference = set(keys)
        cutoff = size_factor * len(reference)
        # A plain [-0:] slice would return the whole list, so a zero cutoff
        # has to be handled explicitly.
        candidates = extracted[name][-cutoff:] if cutoff > 0 else []
        overlap = reference.intersection(candidates)
        union = reference.union(candidates)
        label = name.replace('.keys', '')
        matches[label] = list(overlap)
        jaccard[label] = float(len(overlap)) / len(union) if union else 0.0
    return matches, jaccard


# Same evaluation as with one candidate per reference key, but with twice as
# many candidates taken from each extracted list.
matches, jaccard = score_extraction(keywords, extracted, size_factor=2)

print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

In [None]:
# Inspect the entry at position 6 of `matches` (insertion order).
# NOTE(review): `sorted` is applied to the (name, matched-terms) pair itself,
# not to the list of matched terms -- confirm that this is the intent.
sorted(list(matches.items())[6])

In [None]:
# Count non-overlapping occurrences of the substring 'xml schema' in the
# stemmed document at position 5 (insertion order).
list(stemmed_documents.values())[5].count('xml schema')

In [None]:
# Zero-based position of 'xml schema' counted from the end of this
# document's extracted term list; raises ValueError if the term is absent.
extracted['03_02_DTDs_IDs_and_IDREFs.txt'][::-1].index('xml schema')

In [None]:
# Write the last ten terms of each document's extracted list, last term
# first, to tfidf_keys.csv as "document,term,rank" lines. The first five
# terms written get ranks 1-5; the remaining five get rank 0.
with open('tfidf_keys.csv', 'w') as out:
    for document, ranked_terms in sorted(extracted.items()):
        top_ten = ranked_terms[::-1][:10]
        for position, term in enumerate(top_ten, 1):
            rank = position if position <= 5 else 0
            out.write('{},{},{}\n'.format(document, term, rank))