In [1]:
import io
import os
import re
import string
from collections import defaultdict

import nltk
import numpy as np

In [2]:
# Stem all the keywords 
stemmer = nltk.PorterStemmer()
# Collect all n-grams that raters extract.
# We count a phrase if it is a 1-gram or it is a n-gram and any rater tagged it
ngrams = defaultdict(set)
# Each rater is a dict (filename, phrase) => ranking. 
# Ranking is 0 if it is not in the top-5.
raters = []

for name in os.listdir('../../../data/keywords/'):
    with open('../../../data/keywords/' + name) as f:
        rater = {}
        for line in f:
            file, term, _, rank = line.strip().split(',')
            rank = int(rank)
            tokens = term.lower().split(' ')
            stemmed = ' '.join(stemmer.stem(token) for token in tokens)
            rater[file, stemmed] = rank
            
            if len(tokens) > 1:
                ngrams[file].add(stemmed)
                
        raters.append(rater)
            
all_keys = set()

for name in os.listdir('../../../data/Introduction_to_Databases_Captions'):
    with open('../../../data/Introduction_to_Databases_Captions/' + name) as f:
        # Open up each transcript file and stem it.
        lines = []
        for line in f:
            line = unicode(line.strip('.| \n'), 'utf-8')

            stemmed = ' '.join([stemmer.stem(token) for token in line.split(' ')])
            lines.append(stemmed)
        document = ' '.join(lines)
        document = document.lower()
        
        # Remove all n-grams from the document and add them as keys.
        for ngram in ngrams[name]:
            document = document.replace(ngram, '')
            all_keys.add((name, ngram))
            
        # Add all the remaining 1-grams as keys.
        for token in document.split(' '):
            all_keys.add((name, token))

data = []
# Data is a list with tuples (rater num, (file, phrase), category)
for key in all_keys:
    for i, rater in enumerate(raters):
        if key in rater:
            data.append((i, key, 'keyword'))
        else:
            data.append((i, key, 'not keyword'))
        
# Filter the dataset to files that all raters have tagged.
intersection_files = reduce(
    lambda x, y: set(x).intersection(y),
    [set(zip(*rater.keys())[0]) for rater in raters]
)

data = [item for item in data if item[1][0] in intersection_files]        

In [3]:
# Print a few random data points to spot check
for i in np.random.choice(range(len(data)), size=10, replace=False):
    print(data[i])
    
print('#'*80)

# Print a few that are included in the index
cat_1 = [item for item in data if item[2] == 'keyword']
for i in np.random.choice(range(len(cat_1)), size=10, replace=False):
    print(cat_1[i])
print('#'*80)
    
# Print a few that aren't included in the index
cat_2 = [item for item in data if item[2] == 'not keyword']
for i in np.random.choice(range(len(cat_2)), size=10, replace=False):
    print(cat_2[i])

(2, ('04_02_JSON_Demo.txt', u'properties,'), 'not keyword')
(0, ('06_01_Introduction_to_SQL.txt', u'way,'), 'not keyword')
(0, ('04_01_Introduction_to_JSON_Data_Part_2.txt', u'mean'), 'not keyword')
(1, ('06_01_Introduction_to_SQL.txt', u'familiar'), 'not keyword')
(2, ('06_06_NULL_Values.txt', u'evalu'), 'not keyword')
(2, ('05_01_Select_Project_Join_Part_3.txt', u'look'), 'not keyword')
(2, ('05_01_Select_Project_Join_Part_1.txt', u'3.7--and'), 'not keyword')
(2, ('02_02_Querying_Relational_Databases.txt', u'execut'), 'not keyword')
(2, ('04_01_Introduction_to_JSON_Data_Part_2.txt', u'xslt'), 'not keyword')
(1, ('03_02_DTDs_IDs_and_IDREFs.txt', u'plural'), 'not keyword')
################################################################################
(0, ('02_01_The_Relational_Model.txt', u'relat'), 'keyword')
(1, ('05_01_Select_Project_Join_Part_2.txt', u'cross product'), 'keyword')
(0, ('03_02_DTDs_IDs_and_IDREFs.txt', u'kleen'), 'keyword')
(1, ('06_05_Aggregation.txt', u'order by'

In [4]:
# Get the kappa.
t = nltk.AnnotationTask(data=data)
t.multi_kappa()

0.327447316762549

In [5]:
t.kappa_pairwise(0, 1)

0.3714989389464814

In [6]:
t.kappa_pairwise(0, 2)

0.27181561700679585

In [7]:
t.kappa_pairwise(1, 2)

0.31090735838258493