In [1]:
import os
import nltk

import numpy as np

from collections import OrderedDict, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
documents = OrderedDict()

for name in os.listdir('../../../data/Introduction_to_Databases_Captions'):
    with open('../../../data/Introduction_to_Databases_Captions/' + name) as f:
        lines = []
        for line in f:
            lines.append(line.strip('| \n'))
        documents[name] = ' '.join(lines)
        
keywords = defaultdict(set)
stemmer = nltk.stem.PorterStemmer()

for name in os.listdir('../../../data/keywords/'):
    with open('../../../data/keywords/' + name) as f:
        for line in f:
            lecture_name, key, _, rank = line.strip().split(',')
            rank = int(rank)
            key = key.replace('-', ' ')
            key = ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(key))
            key = key.lower()
            keywords[lecture_name].add(key)

In [3]:
stopwords = []
with open('../../../data/smart.txt') as f:
    for line in f:
        line = line.strip()
        tokens = nltk.word_tokenize(line)
        stopwords.append(' '.join([stemmer.stem(token) for token in tokens]))
    
stemmed_documents = OrderedDict()

for name, document in documents.items():
    document = document.replace('-', ' ')
    document = ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(document)])
    stemmed_documents[name] = document.lower()

extractor = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,4))
X = extractor.fit_transform(stemmed_documents.values())
inverse_voc = {v: k for k, v in extractor.vocabulary_.items()}

extracted = OrderedDict()
for doc_idx, name in enumerate(stemmed_documents.keys()):
    extracted[name] = [inverse_voc[idx] for idx in np.argsort(X[doc_idx,:].toarray().flatten())]

In [7]:
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    extracted_words = extracted[name][-len(keys):]
    matches[name.replace('.keys', '')] = list(set(keys).intersection(extracted_words))
    jaccard[name.replace('.keys', '')] = float(len(set(keys).intersection(extracted_words))) / len(set(keys).union(set(extracted_words)))
    
print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

Average jaccard: 0.116370291814


In [6]:
matches = OrderedDict()
jaccard = OrderedDict()

for name, keys in sorted(keywords.items()):
    extracted_words = extracted[name][-2*len(keys):]
    matches[name.replace('.keys', '')] = list(set(keys).intersection(extracted_words))
    jaccard[name.replace('.keys', '')] = float(len(set(keys).intersection(extracted_words))) / len(set(keys).union(set(extracted_words)))
    
print('Average jaccard: {}'.format(sum(jaccard.values()) / len(jaccard)))

Average jaccard: 0.105646126594


In [None]:
with open('tfidf_keys.csv', 'w') as f:
    for document, keys in sorted(extracted.items()):
        keys = list(reversed(keys))
        for i, key in enumerate(keys[:10]):
            if i < 5:
                f.write('{},{},{}\n'.format(document, key, i + 1))
            else:
                f.write('{},{},{}\n'.format(document, key, 0))

In [5]:
matches

OrderedDict([('01_01_Introduction.txt',
              [u'applic',
               u'databas manag system',
               u'databas',
               u'data model',
               u'framework',
               u'design',
               u'user',
               u'databas system',
               u'data']),
             ('02_01_The_Relational_Model.txt',
              [u'relat model',
               u'relat',
               u'queri',
               u'tabl',
               u'key',
               u'tupl',
               u'null',
               u'attribut']),
             ('02_02_Querying_Relational_Databases.txt',
              [u'relat databas',
               u'queri',
               u'relat',
               u'queri languag',
               u'sql',
               u'relat algebra']),
             ('03_01_Well_Formed_XML_Part_1.txt',
              [u'open tag',
               u'xml',
               u'remark',
               u'close tag',
               u'relat model',
               u'structur'

In [8]:
list(reversed(extracted['17_01_NoSQLOverview.txt']))

[u'reduc',
 u'function',
 u'record',
 u'mapreduc',
 u'map',
 u'node',
 u'key',
 u'reduc function',
 u'domain',
 u'map function',
 u'framework',
 u'mapreduc framework',
 u'key store',
 u'system',
 u'store',
 u'output',
 u'data',
 u'record key',
 u'user',
 u'file',
 u'graph',
 u'hadoop',
 u'pig',
 u'document',
 u'edg',
 u'count',
 u'graph databas',
 u'produc output',
 u'mapper',
 u'hive',
 u'input',
 u'nosql',
 u'job',
 u'oper',
 u'produc',
 u'url',
 u'process',
 u'output record',
 u'novemb 2011',
 u'toler',
 u'reader',
 u'graph databas system',
 u'access domain',
 u'novemb',
 u'input record',
 u'document store',
 u'scalabl',
 u'extract',
 u'model',
 u'score',
 u'massiv',
 u'addit inform',
 u'2011',
 u'function reduc function',
 u'map reduc',
 u'fault toler',
 u'function reduc',
 u'db',
 u'hive pig',
 u'rdf',
 u'friend',
 u'access',
 u'languag',
 u'fetch',
 u'key pair',
 u'googl',
 u'writer',
 u'fault',
 u'databas',
 u'll',
 u'problem',
 u'list',
 u'combin',
 u'grade',
 u'nosql system',


In [9]:
stemmed_documents['07_04_Multivalued_Dependencies_and_4th_Normal_Form_Part_1.txt'].count('multivalu depend')

3

In [None]:
sum(1 for doc in stemmed_documents.values() if 'enumer domain' in doc)

In [None]:
matches['11_03_Isolation_Levels_Part_1.txt']

In [None]:
list(reversed(extracted['11_03_Isolation_Levels_Part_1.txt']))[:16]

In [None]:
matches['10_01_Indexes_Part_1.txt']

In [None]:
matches