In [1]:
import os
import nltk
import wikipedia
import re

import numpy as np

from collections import OrderedDict
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display
from ipywidgets import IntProgress

In [2]:
documents = OrderedDict()

for name in os.listdir('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat'):
    with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Flat/' + name) as f:
        lines = []
        for line in f:
            lines.append(line.strip('| \n'))
        documents[name] = (' '.join(lines))
        
keywords = {}
stemmer = nltk.stem.PorterStemmer()

for name in os.listdir('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Keys'):
    with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/DB_SelfPaced/Keys/' + name) as f:
        keys = []
        for line in f:
            key = line.strip()
            key = key.replace('-', ' ')
            key = ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(key))
            key = key.lower()

            keys.append(key)
            
        keywords[name] = keys

In [3]:
wikipedia_documents = {}

prog = IntProgress(min=0, max=len(documents))
display(prog)

for name in documents.keys():
    search = re.sub('\d+_\d+', '', name)
    search = re.sub('Part_\d+', '', search)
    search = search.replace('Demo', '')
    search = search.replace('.txt', '')
    search = search.replace('_', ' ')
    search = search.strip()
    search = search.lower()
    
    results = wikipedia.search(search)
    
    for result in results:
        try:
            page = wikipedia.page(result)
            break
        except wikipedia.DisambiguationError:
            pass
        
    wikipedia_documents[name] = page
    
    prog.value += 1

In [4]:
# Fix up some of the pages with bad results.
wikipedia_documents['01_01_Introduction.txt'] = wikipedia.page('database')
wikipedia_documents['06_02_Basic_SELECT_Statement.txt'] = wikipedia.page('select (sql)')
wikipedia_documents['06_03_Table_Variables_and_Set_Operators.txt'] = wikipedia.page('sql')
wikipedia_documents['06_05_Aggregation.txt'] = wikipedia.page('aggregate function')
wikipedia_documents['06_06_NULL_Values.txt'] = wikipedia.page('sql (null)')
wikipedia_documents['06_08_The_JOIN_Family_of_Operators.txt'] = wikipedia.page('join (sql)')
wikipedia_documents['09_01_UML_Data_Modeling.txt'] = wikipedia.page('unified modeling language')
wikipedia_documents['09_02_UML_to_Relations_Part_1.txt'] = wikipedia.page('unified modeling language')
wikipedia_documents['09_02_UML_to_Relations_Part_2.txt'] = wikipedia.page('unified modeling language')
wikipedia_documents['09_02_UML_to_Relations_Part_3.txt'] = wikipedia.page('unified modeling language')
wikipedia_documents['10_01_Indexes_Part_1.txt'] = wikipedia.page('database index')
wikipedia_documents['10_01_Indexes_Part_2.txt'] = wikipedia.page('database index')
wikipedia_documents['11_01_Introduction_to_Transactions.txt'] = wikipedia.page('database transaction')
wikipedia_documents['11_02_Transactions_Properties_Part_1.txt'] = wikipedia.page('database transaction')
wikipedia_documents['11_02_Transactions_Properties_Part_2.txt'] = wikipedia.page('database transaction')
wikipedia_documents['11_02_Transactions_Properties_Part_3.txt'] = wikipedia.page('database transaction')
wikipedia_documents['12_01_Motivation_and_Overview.txt'] = wikipedia.page('database trigger')
wikipedia_documents['12_02_Constraints_of_Several_Types_Part_1.txt'] = wikipedia.page('database')
wikipedia_documents['12_02_Constraints_of_Several_Types_Part_2.txt'] = wikipedia.page('database')
wikipedia_documents['12_04_Triggers_Introduction.txt'] = wikipedia.page('database trigger')
wikipedia_documents['12_05_Triggers_Demo_Part_1.txt'] = wikipedia.page('database trigger')
wikipedia_documents['12_05_Triggers_Demo_Part_2.txt'] = wikipedia.page('database trigger')
wikipedia_documents['13_01_Defining_and_Using_Views.txt'] = wikipedia.page('view (sql)')
wikipedia_documents['13_02_View_Modifications_Introduction_Part_1.txt'] = wikipedia.page('view (sql)')
wikipedia_documents['13_02_View_Modifications_Introduction_Part_2.txt'] = wikipedia.page('view (sql)')
wikipedia_documents['13_04_Automatic_View_Modifications.txt'] = wikipedia.page('view (sql)')
wikipedia_documents['15_03_Nonlinear_and_Mutual_Recursion_Part_1.txt'] = wikipedia.page('recursion (computer science)')
wikipedia_documents['15_03_Nonlinear_and_Mutual_Recursion_Part_2.txt'] = wikipedia.page('recursion (computer science)')
wikipedia_documents['16_01_Introduction_to_OLAP_Part_1.txt'] = wikipedia.page('online analytical processing')
wikipedia_documents['16_01_Introduction_to_OLAP_Part_2.txt'] = wikipedia.page('online analytical processing')
wikipedia_documents['17_01_NoSQLMotivation.txt'] = wikipedia.page('nosql')
wikipedia_documents['17_01_NoSQLOverview.txt'] = wikipedia.page('nosql')

In [9]:
stemmed_wikipedia_documents = {}

for key, document in wikipedia_documents.items():
    stemmed_document = document.content.replace('-', ' ')
    stemmed_document = ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(stemmed_document)])
    stemmed_wikipedia_documents[key] = stemmed_document.lower()
    
stemmed_documents = OrderedDict()

for name, document in documents.items():
    stemmed_document = document.replace('-', ' ')
    stemmed_document = ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(document)])
    stemmed_documents[name] = stemmed_document.lower()

In [5]:
tagged_documents = OrderedDict()
prog = IntProgress(min=0, max=len(documents))
display(prog)

os.environ['CLASSPATH'] = '/Users/andrewlamb/Downloads/stanford-postagger-2015-12-09'
os.environ['STANFORD_MODELS'] = '/Users/andrewlamb/Downloads/stanford-postagger-2015-12-09'
tagger = nltk.tag.StanfordPOSTagger('models/english-bidirectional-distsim.tagger')

for name, doc in documents.items():
    sents = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(doc)]
    tagged_documents[name] = tagger.tag_sents(sents)
    prog.value += 1

In [6]:
def extract_tags(tree):
    results = []
    
    if type(tree) == nltk.tree.Tree:
        if tree.label() == 'NP':
            phrase = ' '.join([word for word, _pos in tree])
            phrase = phrase.lower()
            phrase = re.sub("[().,']", '', phrase)
            phrase = re.sub('-', ' ', phrase)
            results.append(phrase)

        for child in tree:
            results.extend(extract_tags(child))
    
    return results

grammar = '''
NP: {(<JJ>|<JJS>|<JJR>)*(<NN>|<NNS>|<NNP>|<NNPS>)+} 
'''

cp = nltk.RegexpParser(grammar)
candidates = []

chunked_docs = []

for doc in tagged_documents.values():
    chunked_doc = []
    for sent in doc:
        result = cp.parse(sent)
        chunked_doc.extend(extract_tags(result))
        candidates.extend(extract_tags(result))
    chunked_docs.append(chunked_doc)
    
candidates = list(set(candidates))

In [7]:
for i, candidate in enumerate(candidates):
    candidates[i] = ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(candidate)])
    
candidates = list(set(candidates))

In [11]:
stopwords = []
with open('/Users/andrewlamb/Google_Drive/Stanford/CS199/RAKE-tutorial/SmartStoplistAdditional.txt') as f:
    for line in f:
        stopwords.append(stemmer.stem(line.strip()))

wikipedia_extractor = CountVectorizer(stop_words=stopwords, ngram_range=(1,4), vocabulary=candidates)
wikipedia_X = wikipedia_extractor.fit_transform(stemmed_wikipedia_documents.values())
inverse_wikipedia_voc = {v: k for k, v in wikipedia_extractor.vocabulary_.items()}

wikipedia_extracted = {}
for doc_idx, (name, keys) in enumerate(documents.items()):
    n_keywords = len(keys)
    wikipedia_extracted[name] = [
        inverse_wikipedia_voc[idx] 
        for idx in np.argsort(wikipedia_X[doc_idx,:].toarray().flatten())
    ]

In [12]:
extractor = CountVectorizer(stop_words=stopwords, ngram_range=(1,4), vocabulary=candidates)
X = extractor.fit_transform(stemmed_documents.values())
inverse_voc = {v: k for k, v in extractor.vocabulary_.items()}

extracted = {}
for doc_idx, (name, keys) in enumerate(keywords.items()):
    n_keywords = len(keys)
    extracted[name] = [inverse_voc[idx] for idx in np.argsort(X[doc_idx,:].toarray().flatten())]

In [13]:
combined_extracted = {}

prog = IntProgress(min=0, max=X.shape[0])
display(prog)

combined_X = X.copy()
for j in range(X.shape[0]):
    for i in range(X.shape[1]):
        word = inverse_voc[i]
        if word in wikipedia_extractor.vocabulary_:
            combined_X[j, i] += wikipedia_X[j, wikipedia_extractor.vocabulary_[word]]
        
    combined_extracted[documents.keys()[j]] = [inverse_voc[idx] for idx in np.argsort(combined_X[j,:].toarray().flatten())]
    
    prog.value += 1



In [14]:
matches = {}
jaccard = {}
partials = {}
partial_jaccard = {}


for name, keys in keywords.items():
    extracted_words = combined_extracted[name.replace('.keys', '.txt')][-len(keys):]
    matches[name.replace('.keys', '')] = list(set(keys).intersection(set(extracted_words)))
    jaccard[name.replace('.keys', '')] = float(len(set(keys).intersection(set(extracted_words)))) / len(set(keys).union(set(extracted_words)))
    
#     inner_partials = []
#     for manual_key in extracted_words:
#         manual_tokens = nltk.word_tokenize(manual_key)
        
        
#         for key in keys:
#             tokens = nltk.word_tokenize(key)
            
#             if len(set(manual_tokens).intersection(set(tokens))) > 0:
#                 inner_partials.append((manual_key, key)) 
#                 break
    
#     partials[name.replace('.keys', '')] = inner_partials
#     partial_jaccard[name.replace('.keys', '')] = float(len(inner_partials)) / (len(keys) + len(extracted_words) - len(inner_partials))
    
print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))
# print('Average partial jaccard: {}'.format(sum(partial_jaccard.values()) / len(partial_jaccard)))

Average jaccard (: 0.112764428554


In [15]:
matches = {}
jaccard = {}
partials = {}
partial_jaccard = {}


for name, keys in keywords.items():
    extracted_words = combined_extracted[name.replace('.keys', '.txt')][-2*len(keys):]
    matches[name.replace('.keys', '')] = list(set(keys).intersection(set(extracted_words)))
    jaccard[name.replace('.keys', '')] = float(len(set(keys).intersection(set(extracted_words)))) / len(set(keys).union(set(extracted_words)))
    
#     inner_partials = []
#     for manual_key in extracted_words:
#         manual_tokens = nltk.word_tokenize(manual_key)
        
        
#         for key in keys:
#             tokens = nltk.word_tokenize(key)
            
#             if len(set(manual_tokens).intersection(set(tokens))) > 0:
#                 inner_partials.append((manual_key, key)) 
#                 break
    
#     partials[name.replace('.keys', '')] = inner_partials
#     partial_jaccard[name.replace('.keys', '')] = float(len(inner_partials)) / (len(keys) + len(extracted_words) - len(inner_partials))
    
print('Average jaccard (2x): {}'.format(sum(jaccard.values()) / len(jaccard)))
# print('Average partial jaccard: {}'.format(sum(partial_jaccard.values()) / len(partial_jaccard)))

Average jaccard (2x): 0.11510620915


In [17]:
keywords.keys()

['01_01_Introduction.keys',
 '04_01_Introduction_to_JSON_Data_Part_1.keys',
 '02_02_Querying_Relational_Databases.keys',
 '04_02_JSON_Demo.keys',
 '02_01_The_Relational_Model.keys']

In [25]:
combined_extracted['10_01_Indexes_Part_1.txt'][-20:]

[u'find',
 u'gpa',
 u'order',
 u'tree',
 u'appli',
 u'referenti integr',
 u'integr',
 u'equal',
 u'case',
 u'databas',
 u'attribut',
 u'relat',
 u'key',
 u'condit',
 u'queri',
 u'column',
 u'tabl',
 u'tupl',
 u'student',
 u'index']