In [1]:
import os
import nltk
import wikipedia
import re

import numpy as np

from collections import OrderedDict, defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from IPython.display import display
from ipywidgets import IntProgress

In [2]:
# Input directories. Each path is named once so that listing a directory and
# opening its files can never drift apart.
FLAT_DIR = '/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat'
KEYWORDS_DIR = '/Users/andrewlamb/Google_Drive/Stanford/CS199/studyguides/data/keywords'

# File name -> full text of that file, in directory-listing order.
documents = OrderedDict()

for name in os.listdir(FLAT_DIR):
    with open(os.path.join(FLAT_DIR, name)) as f:
        # Trim pipes, spaces and newlines from both ends of every line and
        # join the lines into a single space-separated string.
        documents[name] = ' '.join(line.strip('| \n') for line in f)

# Lecture name -> set of stemmed, lower-cased keywords read from the keyword files.
keywords = defaultdict(set)
stemmer = nltk.stem.PorterStemmer()

for name in os.listdir(KEYWORDS_DIR):
    with open(os.path.join(KEYWORDS_DIR, name)) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline at the end of the
                # file) would otherwise break the 4-field unpacking below.
                continue
            lecture_name, key, _, rank = line.split(',')
            # Only rows whose rank is 0 or greater are kept.
            if int(rank) > -1:
                # Same normalisation as the documents get later: hyphens
                # become spaces, every token is stemmed, result is lower-cased.
                key = key.replace('-', ' ')
                key = ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(key))
                keywords[lecture_name].add(key.lower())

In [3]:
# Look up one Wikipedia page per lecture, using a search query derived from
# the lecture's file name.
wikipedia_documents = {}
# Lectures for which no usable page was found; they get no entry in
# `wikipedia_documents` and must be assigned a page by hand.
missing_pages = []

prog = IntProgress(min=0, max=len(documents))
display(prog)

for name in documents.keys():
    # Turn e.g. '09_02_UML_to_Relations_Part_1.txt' into 'uml to relations'.
    search = re.sub(r'\d+_\d+', '', name)
    search = re.sub(r'Part_\d+', '', search)
    search = search.replace('Demo', '')
    search = search.replace('.txt', '')
    search = search.replace('_', ' ')
    search = search.strip()
    search = search.lower()

    results = wikipedia.search(search)

    # Reset for every lecture. Without this, a lecture whose search yields no
    # usable result would silently reuse the previous lecture's page (or raise
    # NameError if it happens to be the first lecture).
    page = None
    for result in results:
        try:
            page = wikipedia.page(result)
            break
        except (wikipedia.DisambiguationError, wikipedia.PageError):
            # Ambiguous or unresolvable title: fall through to the next hit.
            continue

    if page is None:
        missing_pages.append(name)
    else:
        wikipedia_documents[name] = page

    prog.value += 1

if missing_pages:
    print('No Wikipedia page found for: {}'.format(', '.join(missing_pages)))

In [None]:
# Ad-hoc probe: show what wikipedia.search returns for the lower-cased query 'JSON Data'.
wikipedia.search('JSON Data'.lower())

In [None]:
# Display the whole lecture-name -> Wikipedia page mapping for manual inspection.
wikipedia_documents

In [None]:
# Fix up some of the pages with bad results.
# Each row is (lecture file name, Wikipedia title to use instead).
# NOTE(review): 'sql (null)' reverses the 'topic (sql)' pattern used by the
# other titles in this table -- confirm it resolves to the intended article.
page_overrides = [
    ('01_01_Introduction.txt', 'database'),
    ('06_02_Basic_SELECT_Statement.txt', 'select (sql)'),
    ('06_03_Table_Variables_and_Set_Operators.txt', 'sql'),
    ('06_05_Aggregation.txt', 'aggregate function'),
    ('06_06_NULL_Values.txt', 'sql (null)'),
    ('06_08_The_JOIN_Family_of_Operators.txt', 'join (sql)'),
    ('09_01_UML_Data_Modeling.txt', 'unified modeling language'),
    ('09_02_UML_to_Relations_Part_1.txt', 'unified modeling language'),
    ('09_02_UML_to_Relations_Part_2.txt', 'unified modeling language'),
    ('09_02_UML_to_Relations_Part_3.txt', 'unified modeling language'),
    ('10_01_Indexes_Part_1.txt', 'database index'),
    ('10_01_Indexes_Part_2.txt', 'database index'),
    ('11_01_Introduction_to_Transactions.txt', 'database transaction'),
    ('11_02_Transactions_Properties_Part_1.txt', 'database transaction'),
    ('11_02_Transactions_Properties_Part_2.txt', 'database transaction'),
    ('11_02_Transactions_Properties_Part_3.txt', 'database transaction'),
    ('12_01_Motivation_and_Overview.txt', 'database trigger'),
    ('12_02_Constraints_of_Several_Types_Part_1.txt', 'database'),
    ('12_02_Constraints_of_Several_Types_Part_2.txt', 'database'),
    ('12_04_Triggers_Introduction.txt', 'database trigger'),
    ('12_05_Triggers_Demo_Part_1.txt', 'database trigger'),
    ('12_05_Triggers_Demo_Part_2.txt', 'database trigger'),
    ('13_01_Defining_and_Using_Views.txt', 'view (sql)'),
    ('13_02_View_Modifications_Introduction_Part_1.txt', 'view (sql)'),
    ('13_02_View_Modifications_Introduction_Part_2.txt', 'view (sql)'),
    ('13_04_Automatic_View_Modifications.txt', 'view (sql)'),
    ('15_03_Nonlinear_and_Mutual_Recursion_Part_1.txt', 'recursion (computer science)'),
    ('15_03_Nonlinear_and_Mutual_Recursion_Part_2.txt', 'recursion (computer science)'),
    ('16_01_Introduction_to_OLAP_Part_1.txt', 'online analytical processing'),
    ('16_01_Introduction_to_OLAP_Part_2.txt', 'online analytical processing'),
    ('17_01_NoSQLMotivation.txt', 'nosql'),
    ('17_01_NoSQLOverview.txt', 'nosql'),
]

# Fetch every distinct title once; many lectures share the same article, so
# this avoids repeating the same network request.
fetched_pages = {}
for lecture_name, title in page_overrides:
    if title not in fetched_pages:
        fetched_pages[title] = wikipedia.page(title)
    wikipedia_documents[lecture_name] = fetched_pages[title]

In [4]:
# Lecture name -> lecture text followed by the text of its Wikipedia page,
# separated by a single space and kept in the same order as `documents`.
combined_documents = OrderedDict(
    (lecture_name, ' '.join([lecture_text, wikipedia_documents[lecture_name].content]))
    for lecture_name, lecture_text in documents.items()
)

In [5]:
def _stem_text(text):
    """Return `text` with hyphens turned into spaces, every token stemmed, and lower-cased.

    Tokens come from nltk.word_tokenize and are stemmed with the notebook's
    shared `stemmer`; the stems are re-joined with single spaces.
    """
    tokens = nltk.word_tokenize(text.replace('-', ' '))
    return ' '.join(stemmer.stem(token) for token in tokens).lower()


# Stemmed lecture texts, in the same order as `documents`.
stemmed_documents = OrderedDict(
    (name, _stem_text(document)) for name, document in documents.items()
)

# Stemmed lecture + Wikipedia texts, in the same order as `combined_documents`.
stemmed_combined_documents = OrderedDict(
    (name, _stem_text(document)) for name, document in combined_documents.items()
)

In [7]:
# Read the stop list: one entry per line, surrounding whitespace removed.
with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/RAKE-tutorial/SmartStoplistAdditional.txt') as f:
    stopwords = [line.strip() for line in f]

# First pass: fit on the lecture texts only. Its sole use below is to supply
# the vocabulary (1- to 4-grams) for the second vectorizer.
lecture_extractor = TfidfVectorizer(
    stop_words=stopwords,
    ngram_range=(1,4),
    norm=None,
    smooth_idf=False,
)
lecture_extractor.fit(stemmed_documents.values())

# Second pass: TF-IDF over the combined (lecture + Wikipedia) texts, with the
# vocabulary fixed to the one learned from the lectures.
extractor = TfidfVectorizer(
    stop_words=stopwords,
    ngram_range=(1,4),
    vocabulary=lecture_extractor.vocabulary_,
)
X = extractor.fit_transform(stemmed_combined_documents.values())

# Column index -> term, i.e. the inverse of the vectorizer's vocabulary.
inverse_voc = dict((column, term) for term, column in extractor.vocabulary_.items())

# For each lecture, every vocabulary term ordered by ascending score in that
# lecture's row of X, so the highest-scoring terms sit at the end of the list.
extracted = {}
for doc_idx, name in enumerate(stemmed_documents):
    row_scores = X[doc_idx, :].toarray().ravel()
    extracted[name] = [inverse_voc[idx] for idx in row_scores.argsort()]

In [8]:
# Compare each lecture's reference keywords with its top-N extracted terms,
# where N is the number of reference keywords for that lecture.
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    gold = set(keys)
    if not gold:
        # `keywords` is a defaultdict, so merely indexing it for an unknown
        # lecture leaves an empty set behind. With zero keywords the slice
        # below would be [-0:], i.e. the whole term list, so skip the entry.
        continue

    lecture = name.replace('.keys', '')
    # `extracted[name]` is sorted ascending by score: the tail holds the best terms.
    extracted_words = extracted[name][-len(gold):]
    overlap = gold.intersection(extracted_words)

    matches[lecture] = list(overlap)
    jaccard[lecture] = float(len(overlap)) / len(gold.union(extracted_words))

if jaccard:
    print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))
else:
    print('Average jaccard: undefined (no lecture has keywords)')

Average jaccard: 0.12340186835


In [None]:
# Ad-hoc peek at the entry at index 6 of `matches`.
# NOTE(review): indexing the result of .items() only works where it returns a
# list (Python 2); on Python 3 this would need list(matches.items())[6] --
# confirm which interpreter this notebook targets.
sorted(matches.items()[6])

In [None]:
# Case-insensitive count of the phrase 'xml schema' in this lecture's own text.
documents['03_02_DTDs_IDs_and_IDREFs.txt'].lower().count('xml schema')

In [None]:
# Case-insensitive count of the phrase 'xml schema' in the Wikipedia page paired with this lecture.
wikipedia_documents['03_02_DTDs_IDs_and_IDREFs.txt'].content.lower().count('xml schema')

In [None]:
# Show the reference keywords for this lecture. `keywords` is a defaultdict:
# plain indexing would insert an empty set if the lecture has no keywords, and
# that stray entry would then be picked up by the evaluation loops. .get()
# displays the same value without modifying `keywords`.
keywords.get('03_02_DTDs_IDs_and_IDREFs.txt', set())

In [9]:
# Same comparison as the top-N evaluation, but against the top 2*N extracted
# terms, where N is the number of reference keywords for the lecture.
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    gold = set(keys)
    if not gold:
        # `keywords` is a defaultdict, so merely indexing it for an unknown
        # lecture leaves an empty set behind. With zero keywords the slice
        # below would be [-0:], i.e. the whole term list, so skip the entry.
        continue

    lecture = name.replace('.keys', '')
    # `extracted[name]` is sorted ascending by score: the tail holds the best terms.
    extracted_words = extracted[name][-2*len(gold):]
    overlap = gold.intersection(extracted_words)

    matches[lecture] = list(overlap)
    jaccard[lecture] = float(len(overlap)) / len(gold.union(extracted_words))

if jaccard:
    print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))
else:
    print('Average jaccard: undefined (no lecture has keywords)')

Average jaccard: 0.107637982346


In [10]:
# Write the ten highest-scoring terms of every document to wiki_keys.csv as
# "document,term,rank" rows: the best five get ranks 1-5, the next five rank 0.
with open('wiki_keys.csv', 'w') as out:
    for document, keys in sorted(extracted.items()):
        # `keys` is ordered ascending by score, so reverse it before taking the head.
        best_first = keys[::-1][:10]
        for position, key in enumerate(best_first, 1):
            rank = position if position <= 5 else 0
            out.write('{},{},{}\n'.format(document, key, rank))