In [1]:
import nltk
import os

from collections import defaultdict

import numpy as np

In [11]:
# Stem all the keywords 
stemmer = nltk.PorterStemmer()
# Collect all n-grams that raters extract.
# We count a phrase if it is a 1-gram or it is a n-gram and any rater tagged it
ngrams = defaultdict(set)
# Each rater is a dict (filename, phrase) => ranking. 
# Ranking is 0 if it is not in the top-5.
raters = []

for name in os.listdir('../../../data/keywords/'):
    with open('../../../data/keywords/' + name) as f:
        rater = {}
        for line in f:
            file, term, _, rank = line.strip().split(',')
            rank = int(rank)
            tokens = term.lower().split(' ')
            stemmed = ' '.join(stemmer.stem(token) for token in tokens)
            rater[file, stemmed] = rank
            
            if len(tokens) > 1:
                ngrams[file].add(stemmed)
                
        raters.append(rater)
    

# The algorithm's rakings. Dict (filename, phrase) => ranking
alg_rankings = {}

with open('./tfidf_combined_2grams_keys.csv') as f:
    for line in f:
        doc, term, rank = line.strip().split(',')
        rank = int(rank)
        tokens = term.lower().split(' ')
        stemmed = ' '.join(stemmer.stem(token) for token in tokens)
        alg_rankings[doc, stemmed] = rank
        
        if len(tokens) > 1:
            ngrams[file].add(stemmed)

# Dict (filename, phrase) => ranking
# Ranking is the best ranking any human rater gave.
combined_raters = {}
# Take the union over all raters' keys.
combined_keys = reduce(lambda x, y: x.union(y), [set(rater.keys()) for rater in raters])
for key in combined_keys:
    for rater in raters:
        if key in rater:
            
            if key not in combined_raters:
                combined_raters[key] = rater[key]
            else:
                combined_raters[key] = min(combined_raters[key], rater[key])

all_keys = set()

for name in os.listdir('../../../data/Introduction_to_Databases_Captions'):
    with open('../../../data/Introduction_to_Databases_Captions/' + name) as f:
        # Open up each transcript file and stem it.
        lines = []
        for line in f:
            line = line.strip('.| \n')

            stemmed = ' '.join([stemmer.stem(token) for token in line.split(' ')])
            lines.append(stemmed)
        document = ' '.join(lines)
        document = document.lower()
        
        # Remove all n-grams from the document and add them as keys.
        for ngram in ngrams[name]:
            document = document.replace(ngram, '')
            all_keys.add((name, ngram))
            
        # Add all the remaining 1-grams as keys.
        for token in document.split(' '):
            all_keys.add((name, token))

data = []
# Data is a list with tuples (rater num, (file, phrase), category)
for key in all_keys:
    for i, rater in enumerate([combined_raters, alg_rankings]):
        if key in rater:
            data.append((i, key, 'keyword'))
        else:
            data.append((i, key, 'not keyword'))
            
# Filter the dataset to files that all raters have tagged.
# In practice at least one rater has tagged every file, so this shouldn't filter
# anything.
intersection_files = reduce(
    lambda x, y: set(x).intersection(y),
    [set(zip(*rater.keys())[0]) for rater in [combined_raters, alg_rankings]]
)

data = [item for item in data if item[1][0] in intersection_files]       

In [3]:
# Print a few random data points to spot check
for i in np.random.choice(range(len(data)), size=10, replace=False):
    print(data[i])
    
print('#'*80)

# Print a few that are definitely included in the index
cat_1 = [item for item in data if item[2] == 'keyword']
for i in np.random.choice(range(len(cat_1)), size=10, replace=False):
    print(cat_1[i])
print('#'*80)
    
# Print a few in category 2
cat_2 = [item for item in data if item[2] == 'not keyword']
for i in np.random.choice(range(len(cat_2)), size=10, replace=False):
    print(cat_2[i])

(1, ('07_03_Boyce_Codd_Normal_Form_Part_1.txt', u'figur'), 'not keyword')
(0, ('12_03_Referential_Integrity_Part_3.txt', u'1,'), 'not keyword')
(0, ('07_03_Boyce_Codd_Normal_Form_Part_2.txt', u'then,in'), 'not keyword')
(0, ('17_01_NoSQLMotivation.txt', u'basi'), 'not keyword')
(1, ('03_01_Well_Formed_XML_Part_1.txt', u"what'"), 'not keyword')
(1, ('07_04_Multivalued_Dependencies_and_4th_Normal_Form_Part_4.txt', u'fourth normal form'), 'not keyword')
(0, ('02_02_Querying_Relational_Databases.txt', u'system'), 'not keyword')
(0, ('06_04_Subqueries_in_FROM_and_SELECT.txt', u'join'), 'not keyword')
(1, ('17_01_NoSQLOverview.txt', u'place'), 'not keyword')
(1, ('03_02_XML_Schema.txt', u'between'), 'not keyword')
################################################################################
(0, ('13_05_Materialized_Views_Part_1.txt', u'modif'), 'keyword')
(1, ('06_05_Aggregation.txt', u'colleg'), 'keyword')
(1, ('03_02_XML_Schema.txt', u'element'), 'keyword')
(0, ('17_01_NoSQLOverview.txt

In [12]:
t = nltk.AnnotationTask(data=data)
t.kappa()

0.17673157389130523

In [13]:
t.S()

0.9024518082814046