In [29]:
import nltk
import os
    
from collections import defaultdict

import numpy as np

In [30]:
# Stem all the keywords 
stemmer = nltk.PorterStemmer()
# Collect all n-grams that raters extract.
# We count a phrase if it is a 1-gram or it is a n-gram and any rater tagged it
ngrams = defaultdict(set)
# Each rater is a dict (filename, phrase) => ranking. 
# Ranking is 0 if it is not in the top-5.
raters = []

for name in os.listdir('../../../data/keywords/'):
    with open('../../../data/keywords/' + name) as f:
        rater = {}
        for line in f:
            file, term, _, rank = line.strip().split(',')
            rank = int(rank)
            tokens = term.lower().split(' ')
            stemmed = ' '.join(stemmer.stem(token) for token in tokens)
            rater[file, stemmed] = rank
            
            if len(tokens) > 1:
                ngrams[file].add(stemmed)
                
        raters.append(rater)
            
all_keys = set()

for name in os.listdir('../../../data/Introduction_to_Databases_Captions'):
    with open('../../../data/Introduction_to_Databases_Captions/' + name) as f:
        # Open up each transcript file and stem it.
        lines = []
        for line in f:
            line = line.strip('.| \n')

            stemmed = ' '.join([stemmer.stem(token) for token in line.split(' ')])
            lines.append(stemmed)
        document = ' '.join(lines)
        document = document.lower()
        
        # Remove all n-grams from the document and add them as keys.
        for ngram in ngrams[name]:
            document = document.replace(ngram, '')
            all_keys.add((name, ngram))
            
        # Add all the remaining 1-grams as keys.
        for token in document.split(' '):
            all_keys.add((name, token))

data = []
# Data is a list with tuples (rater num, (file, phrase), category)
for key in all_keys:
    for i, rater in enumerate(raters):
        if key in rater:
            data.append((i, key, 'keyword'))
        else:
            data.append((i, key, 'not keyword'))
        
# Filter the dataset to files that all raters have tagged.
intersection_files = reduce(
    lambda x, y: set(x).intersection(y),
    [set(zip(*rater.keys())[0]) for rater in raters]
)

data = [item for item in data if item[1][0] in intersection_files]        

In [18]:
data

[(0, ('06_06_NULL_Values.txt', u'someth'), 'not keyword'),
 (1, ('06_06_NULL_Values.txt', u'someth'), 'not keyword'),
 (0, ('06_07_Data_Modification_Statements.txt', u'id'), 'not keyword'),
 (1, ('06_07_Data_Modification_Statements.txt', u'id'), 'not keyword'),
 (0, ('04_02_JSON_Demo.txt', u'parallel'), 'not keyword'),
 (1, ('04_02_JSON_Demo.txt', u'parallel'), 'not keyword'),
 (0, ('04_02_JSON_Demo.txt', u'taken'), 'not keyword'),
 (1, ('04_02_JSON_Demo.txt', u'taken'), 'not keyword'),
 (0, ('05_01_Select_Project_Join_Part_3.txt', u'automat'), 'not keyword'),
 (1, ('05_01_Select_Project_Join_Part_3.txt', u'automat'), 'not keyword'),
 (0, ('06_08_The_JOIN_Family_of_Operators.txt', u'e,'), 'not keyword'),
 (1, ('06_08_The_JOIN_Family_of_Operators.txt', u'e,'), 'not keyword'),
 (0, ('06_06_NULL_Values.txt', u'logic'), 'not keyword'),
 (1, ('06_06_NULL_Values.txt', u'logic'), 'not keyword'),
 (0,
  ('05_02_Set_Operators_Renaming_Notation_Part_3.txt', u'abbrevi'),
  'not keyword'),
 (1,
  

In [22]:
rater_0 = {}
rater_1 = {}

for item in data:
    if item[0] == 0:
        rater_0[item[1]] = item[2]
    elif item[0] == 1:
        rater_1[item[1]] = item[2]

In [42]:
len([key for key in rater_0 if rater_0[key] == 'not keyword' and rater_1[key] == 'not keyword'])

9041

In [None]:
# Print a few random data points to spot check
for i in np.random.choice(range(len(data)), size=10, replace=False):
    print(data[i])
    
print('#'*80)

# Print a few that are included in the index
cat_1 = [item for item in data if item[2] == 'keyword']
for i in np.random.choice(range(len(cat_1)), size=10, replace=False):
    print(cat_1[i])
print('#'*80)
    
# Print a few that aren't included in the index
cat_2 = [item for item in data if item[2] == 'not keyword']
for i in np.random.choice(range(len(cat_2)), size=10, replace=False):
    print(cat_2[i])

In [31]:
# Get the kappa.
t = nltk.AnnotationTask(data=data)
t.multi_kappa()

0.327447316762549

In [32]:
t.kappa_pairwise(0, 1)

0.3714989389464814

In [33]:
t.kappa_pairwise(0, 2)

0.27181561700679585

In [34]:
t.kappa_pairwise(1, 2)

0.31090735838258493

In [38]:
t.S()

0.9459306440198159