In [1]:
import nltk, os, itertools, re

from tfidf_extractor import TfidfExtractor

from ipywidgets import IntProgress
from IPython.display import display
from tabulate import tabulate
import numpy as np

Read the caption (.srt) files and convert each one into a single text string.

In [2]:
def srt_to_strs(f):
    '''Convert an open .srt subtitle file to a single string.

    Each subtitle entry is read as a sequence-number line, a timing line and
    then caption lines up to the next blank line.  The sequence number and
    timing are discarded; the caption lines of all entries are cleaned up and
    joined with single spaces.

    Args:
        f: file-like object providing ``readline()``, positioned at the start
            of the subtitle data.

    Returns:
        All caption lines joined by ``' '``; an empty string when the input
        contains no captions.
    '''
    text_lines = []
    while True:
        seq_line = f.readline()
        # readline() yields an empty value only at end of file; a blank line
        # still carries its newline.  Checking the raw value keeps a stray
        # extra blank line between entries from being mistaken for the end
        # of the input (which used to drop every following caption).
        if not seq_line: break
        if not seq_line.strip(): continue
        # seq_line holds the entry's sequence number and the next line its
        # timing; neither is needed.
        _time_str = f.readline()

        while True:
            text_line = f.readline().strip()
            # A blank line (or end of file) ends this entry's captions.
            if not text_line: break
            # Undo the HTML escaping of apostrophes, drop escaped '>' marks
            # and transcription placeholders, and expand "gonna".
            text_line = text_line.replace('&#39;', "'")
            text_line = text_line.replace('&gt;', '')
            text_line = text_line.replace('[inaudible]', '')
            text_line = text_line.replace('gonna', 'going to')
            text_lines.append(text_line)

    return ' '.join(text_lines)

# Locations of the caption data, relative to the notebook's working directory.
DATA_DIR = '../../data/'
CAPTIONS_DIR = DATA_DIR + 'compilers_captions/'
# os.listdir() returns names in arbitrary order.  Sorting keeps the order of
# `documents` (and any positional indexing into it) reproducible between runs
# and machines.
SRT_FILE_NAMES = [
    CAPTIONS_DIR + file_name
    for file_name in sorted(os.listdir(CAPTIONS_DIR))
    if file_name.endswith('.srt')
]

# One unicode string per caption file, in SRT_FILE_NAMES order.
documents = []

for name in SRT_FILE_NAMES:
    with open(name) as f:
        document = srt_to_strs(f)
    # The file handle is no longer needed once the text has been extracted;
    # decode the UTF-8 bytes after it is closed.
    documents.append(unicode(document, 'utf-8'))

Part-of-speech tag every sentence of every document with the Stanford POS tagger.

In [3]:
# Tagged sentences from all documents, accumulated in document order.
tags = []
prog = IntProgress(min=0, max=len(documents))
display(prog)

# Directory of the unpacked Stanford POS tagger distribution.  The default is
# the original author's location; set the STANFORD_POSTAGGER_DIR environment
# variable to run the notebook on another machine without editing this cell.
STANFORD_POSTAGGER_DIR = os.environ.get(
    'STANFORD_POSTAGGER_DIR',
    '/Users/andrewlamb/Downloads/stanford-postagger-2015-12-09'
)
# Both variables point at the same directory; the model path given to the
# tagger below is relative to it.
os.environ['CLASSPATH'] = STANFORD_POSTAGGER_DIR
os.environ['STANFORD_MODELS'] = STANFORD_POSTAGGER_DIR
tagger = nltk.tag.StanfordPOSTagger('models/english-bidirectional-distsim.tagger')

for doc in documents:
    # Split the document into sentences, then each sentence into tokens.
    sents = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(doc)]
    # Tag all sentences of one document in a single tagger call.
    tags.extend(tagger.tag_sents(sents))
    prog.value += 1

Form a set of candidate keywords by chunking noun phrases out of the tagged sentences.

In [4]:
def extract_tags(tree):
    '''Collect the normalised text of every NP node in a chunk tree.

    Walks `tree` depth-first.  For each node labelled 'NP' the words of its
    direct children (each unpacked as a (word, tag) pair) are joined with
    spaces, lower-cased, stripped of the characters ( ) . , ' and have
    hyphens replaced by spaces.

    Args:
        tree: an nltk Tree, or any other object (e.g. a (word, tag) leaf).

    Returns:
        List of phrase strings in traversal order; empty when `tree` is not
        exactly an nltk.tree.Tree.
    '''
    # Leaves and anything that is not precisely an nltk Tree contribute nothing.
    if type(tree) != nltk.tree.Tree:
        return []

    found = []

    if tree.label() == 'NP':
        tokens = [token for token, _tag in tree]
        text = ' '.join(tokens).lower()
        text = re.sub("[().,']", '', text)
        found.append(re.sub('-', ' ', text))

    # Descend into every child, whether or not this node was an NP.
    for subtree in tree:
        found += extract_tags(subtree)

    return found

# Chunk grammar: an NP is zero or more adjectives (JJ/JJS/JJR) followed by one
# or more nouns (NN/NNS/NNP/NNPS).
grammar = '''
NP: {(<JJ>|<JJS>|<JJR>)*(<NN>|<NNS>|<NNP>|<NNPS>)+} 
'''

cp = nltk.RegexpParser(grammar)
candidates = set()

prog = IntProgress(min=0, max=len(tags))
display(prog)

for sent in tags:
    result = cp.parse(sent)

    # Grow the set in place: rebuilding it with union() for every sentence
    # copies the whole set each time round the loop.
    candidates.update(extract_tags(result))
    prog.value += 1

    # NOTE(review): presumably a sanity check that contractions such as
    # "we've" never reach the candidates as one token (extract_tags strips
    # apostrophes, which would turn it into "weve") -- confirm.
    assert 'weve' not in candidates

# Remove stopwords.  The default is the original author's stoplist location;
# set the STOPLIST_PATH environment variable to use a different file.
STOPLIST_PATH = os.environ.get(
    'STOPLIST_PATH',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/RAKE-tutorial/SmartStoplistAdditional.txt'
)
stopwords = set()
with open(STOPLIST_PATH) as f:
    for line in f:
        stopwords.add(line.strip())

candidates.difference_update(stopwords)

# Remove candidates that are two characters long or shorter
candidates = set([tag for tag in candidates if len(tag) > 2])

Lemmatize the candidate keywords. The documents themselves are not lemmatized.

In [17]:
# WordNetLemmatizer lemmatizes rather than stems, so bind it to an accurate
# name.  `stemmer` is kept as an alias for code that uses the old name.
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = lemmatizer

# Lemmatize every candidate word by word; phrases that end up with the same
# lemmatized form are merged by the set.
candidates = set(
    ' '.join(lemmatizer.lemmatize(word) for word in nltk.word_tokenize(candidate))
    for candidate in candidates
)

# Lemmatizing the documents is currently disabled:
# for i, document in enumerate(documents):
#     documents[i] = ' '.join([stemmer.lemmatize(word) for word in nltk.word_tokenize(document)])

In [None]:
# Number of leading terms from the extractor's result to tabulate.
index_size = 100

tfidf = TfidfExtractor(ngram_range=range(1,5), vocabulary=candidates)
# Keep the first `index_size` (term, score) pairs and split them into columns.
# NOTE(review): presumably extract_documents() returns them best-first -- confirm.
top_terms = tfidf.extract_documents(documents)[:index_size]
words, scores = zip(*top_terms)
# Look up each term's statistics in the extractor's underscore-prefixed tables.
tfs, dfs, idfs = [
    [table[word] for word in words]
    for table in (tfidf._tfs, tfidf._dfs, tfidf._idfs)
]

# One row per term, prefixed with its zero-based rank.
rows = [
    (rank,) + columns
    for rank, columns in enumerate(zip(words, scores, tfs, dfs, idfs))
]
print(tabulate(rows, headers=['rank', 'word', 'tfidf', 'tf', 'df', 'idf']))

In [None]:
# Re-run the extraction on smaller pseudo-documents: each document is cut
# into consecutive groups of `split_len` sentences.
split_len = 15
split_documents = []

for document in documents:
    sents = nltk.sent_tokenize(document)
    chunk_starts = range(0, len(sents), split_len)
    split_documents.extend(
        ' '.join(sents[start:start + split_len]) for start in chunk_starts
    )

# Number of leading terms from the extractor's result to tabulate.
index_size = 100

tfidf = TfidfExtractor(ngram_range=range(1,5), vocabulary=candidates)
# Keep the first `index_size` (term, score) pairs and split them into columns.
top_terms = tfidf.extract_documents(split_documents)[:index_size]
words, scores = zip(*top_terms)
# Look up each term's statistics in the extractor's underscore-prefixed tables.
tfs, dfs, idfs = [
    [table[word] for word in words]
    for table in (tfidf._tfs, tfidf._dfs, tfidf._idfs)
]

# One row per term, prefixed with its zero-based rank.
rows = [
    (rank,) + columns
    for rank, columns in enumerate(zip(words, scores, tfs, dfs, idfs))
]
print(tabulate(rows, headers=['rank', 'word', 'tfidf', 'tf', 'df', 'idf']))

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Upper n-gram size: the number of whitespace-separated words in the longest
# candidate.  The lower bound of 2 leaves single words out of the vocabulary.
longest_candidate = max(len(word.split()) for word in candidates)
tfidf = TfidfVectorizer(ngram_range=(2, longest_candidate))
X = tfidf.fit_transform(documents)
# Column totals: each n-gram's weight summed over all documents.
sums = X.sum(axis=0)
sorted_inds = sums.argsort()
# Map a column index back to the n-gram it stands for.
inverse_voc = dict((ind, term) for term, ind in tfidf.vocabulary_.items())
# argsort is ascending, so walk it backwards to list the heaviest n-grams
# first, keeping only those that are candidate phrases.
words = [
    inverse_voc[ind]
    for ind in reversed(sorted_inds.tolist()[0])
    if inverse_voc[ind] in candidates
]

In [20]:
# Show the first 100 ranked phrases.  `words` is sliced before it is paired
# with ranks: subscripting the result of zip() only works on Python 2, where
# zip() returns a list, and it also paired up the whole list before slicing.
print(tabulate(
        list(enumerate(words[:100])),
        headers=['rank', 'word']
))

  rank  word
------  -----------------------
     0  self type
     1  hand side
     2  parse tree
     3  activation record
     4  right hand
     5  non terminal
     6  start symbol
     7  right hand side
     8  regular expression
     9  type check
    10  open paren
    11  type checking
    12  sub type
    13  start state
    14  final state
    15  frame pointer
    16  no type
    17  little bit
    18  dynamic type
    19  run time
    20  control flow
    21  abstract syntax
    22  first thing
    23  programming language
    24  abstract syntax tree
    25  lexical analysis
    26  code generation
    27  next input
    28  garbage collection
    29  white space
    30  control flow graph
    31  reference counting
    32  recursive descent
    33  look ahead
    34  intermediate code
    35  least upper
    36  type system
    37  reference count
    38  left hand side
    39  basic block
    40  semantic analysis
    41  id id
    42  static type
    43  type self ty

In [16]:
# Build the lemmatizer here: no visible cell binds the name `lemmatizer`, so
# on a fresh kernel this cell would otherwise stop with a NameError.
lemmatizer = nltk.stem.WordNetLemmatizer()
# The first 50 ranked phrases that occur in the third document (substring
# match), lemmatized, de-duplicated and sorted.
sorted(set(lemmatizer.lemmatize(word) for word in words[:50] if word in documents[2]))

[u'little bit', u'programming language', u'programming languages']

In [None]:
# Union, over all documents, of the first 20 ranked phrases found in each one.
final = set()
for doc in documents:
    # Phrases occurring in this document (substring match), in `words` order.
    matches = [word for word in words if word in doc]
    final |= set(matches[:20])

In [18]:
# Number of whitespace-separated words in the longest candidate phrase.
len(max((candidate.split() for candidate in candidates), key=len))

6