In [1]:
import os.path
import datetime
import nltk

import numpy as np
import matplotlib.pyplot as plt

from pprint import pprint
from collections import OrderedDict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline

In [2]:
FMT = '%H:%M:%S,%f'

def srt_to_strs(f, interval_length):
    '''Convert a .srt file to an ordered dict of transcript strings.

    Each cue in the file is read as: a sequence-number line (ignored), a
    ``start --> end`` timing line, then text lines up to the next blank line.
    Consecutive cues are merged into one entry; an entry is closed at the
    first cue whose end time is more than ``interval_length`` after the
    entry's start, and at the final cue.

    Args:
        f: open text file object for the .srt file. It must support
            ``readline()`` and expose ``name`` (the lecture name is the
            file's base name without its extension).
        interval_length: ``datetime.timedelta`` giving the minimum time span
            an entry covers before it is closed.

    Returns:
        OrderedDict mapping ``(lecture_name, start_time, end_time)`` to the
        space-joined text of the merged cues, in file order. The times are
        ``datetime.datetime`` values parsed with ``FMT``. The dict is empty
        when the file contains no cues.

    Raises:
        ValueError: if a timing line is not ``start --> end`` with both
            times in ``FMT``.
    '''

    # [(text, start_time, end_time)]
    intervals = []
    while True:
        seq_line = f.readline()
        if seq_line == '':
            # readline() returns '' only at end of file.
            break
        if seq_line.strip() == '':
            # Tolerate extra blank lines between cues instead of treating
            # them as the end of the file.
            continue
        start_str, end_str = f.readline().strip().split(' --> ')
        start_time = datetime.datetime.strptime(start_str, FMT)
        end_time = datetime.datetime.strptime(end_str, FMT)

        text_lines = []
        while True:
            text_line = f.readline().strip()
            if text_line == '':
                # A blank line (or end of file) terminates the cue's text.
                break
            # Strip the HTML entities for the apostrophe and for '>'.
            text_line = text_line.replace('&#39;', '').replace('&gt;', '')
            text_lines.append(text_line)

        intervals.append((' '.join(text_lines), start_time, end_time))

    result = OrderedDict()
    if not intervals:
        # No cues at all: nothing to merge.
        return result

    lecture_name = os.path.splitext(os.path.basename(f.name))[0]
    interval_start_time = intervals[0][1]
    interval_lines = []
    last_idx = len(intervals) - 1
    for idx, (text, _start_time, end_time) in enumerate(intervals):
        interval_lines.append(text)

        if idx == last_idx or end_time - interval_start_time > interval_length:
            result[(lecture_name, interval_start_time, end_time)] = ' '.join(interval_lines)
            # The next entry starts where this one ended.
            interval_start_time = end_time
            interval_lines = []

    return result

In [3]:
# Directory holding the lecture subtitle (.srt) files. Defaults to the original
# location; set the COMPILERS_SRT_DIR environment variable to point elsewhere.
SRT_DIR = os.environ.get(
    'COMPILERS_SRT_DIR',
    "/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1"
)

# Subtitle file names inside SRT_DIR, in the order they are loaded.
SRT_BASE_NAMES = [
    "01-01-introduction-redo-correction.srt",
    "01-02-structure-of-a-compiler-final.srt",
    "01-03-economy-of-Programming-Languages_19m51s_.srt",
    "02-01-cool-overview-final.srt",
    "02-02-cool-example-ii-final.srt",
    "02-03-cool-example-iii-final-correction.srt",
    "03-01-Lexical-Analysis-Part-1.srt",
    "03-02-lexical-analysis-examples-final.srt",
    "03-03-A+Regular+Languages.srt",
    "03-04-formal-languages.srt",
    "03-05-lexical-specifications-final-quizupdate.srt",
    "04+02+finite+automata+part+1.srt",
    "04-01-lexical-specification.srt",
    "04-03-regular-expressions-to-nfas-final-quizupdate-correction.srt",
    "04-04-nfa-to-dfa-quizupdate.srt",
    "04-05-implementing-finite-automata-correction.srt",
    "05-01-introduction-to-parsing.srt",
    "05-02-A+Context+Free+Grammars.srt",
    "05-03-Derivations-Part-1.srt",
    "05-04-A+Ambiguity.srt",
    "06-01-error-handling.srt",
    "06-02-abstract-syntax-trees.srt",
    "06-03-recursive-descent-parsing.srt",
    "06-04-1-recursive-descent-limitations-04-1.srt",
    "06-04-recursive-descent-algorithm.srt",
    "06-05-A+Left+Recursion.srt",
    "07-01-Predictive-Parsing-Part-1.srt",
    "07-02-first-sets.srt",
    "07-03-follow-sets.srt",
    "07-04-ll1-parsing-tables.srt",
    "07-05-Bottom-Up-Parsing-Part-1.srt",
    "07-06-Shift-Reduce-Parsing-Part-1.srt",
    "08-01-Handles-Part-1.srt",
    "08-02-recognizing-handles.srt",
    "08-03-recognizing-viable-prefixes.srt",
    "08-04-valid-items.srt",
    "08-05-slr-parsing.srt",
    "08-06-slr-parsing-example.srt",
    "08-07-slr-improvements.srt",
    "08-08-slr-examples-correction.srt",
    "09-01-introduction-to-semantic-analysis.srt",
    "09-02-scope.srt",
    "09-03-symbol-tables.srt",
    "09-04-types.srt",
    "09-05-A+Type+Checking.srt",
    "09-06-A+Type+Environments.srt",
    "09-07-A+Subtyping.srt",
    "09-08-A+Typing+Methods.srt",
    "09-09-implementing-type-checking.srt",
    "10-01-A+Static+vs.+Dynamic+Typing.srt",
    "10-02-self-type.srt",
    "10-03-A+Self+Type+Operations.srt",
    "10-04-self-type-usage.srt",
    "10-05-A+Self+Type+Checking.srt",
    "10-06-error-recovery.srt",
    "11-01-runtime-organization.srt",
    "11-02-A+Activations.srt",
    "11-03-activation-records.srt",
    "11-04-globals-and-heap.srt",
    "11-05-alignment.srt",
    "11-06-stack-machines.srt",
    "12-01-introduction-to-code-generation.srt",
    "12-02-A+Code+Generation+I.srt",
    "12-03-A+Code+Generation+II.srt",
    "12-04-code-generation-example.srt",
    "12-05-A+Temporaries.srt",
    "12-06-A+Object+Layout.srt",
    "13-01-semantics-overview.srt",
    "13-02-operational-semantics.srt",
    "13-03-cool-semantics-i.srt",
    "13-04-A+Cool+Semantics+II.srt",
    "14-01-intermediate-code.srt",
    "14-02-optimization-overview.srt",
    "14-03-local-optimization.srt",
    "14-04-peephole-optimization.srt",
    "15-02-constant-propagation.srt",
    "15-03-analysis-of-loops.srt",
    "15-04-orderings.srt",
    "15-05-A+Liveness+Analysis.srt",
    "16-01-register-allocation.srt",
    "16-02-A+Graph+Coloring.srt",
    "16-03-A+Spilling.srt",
    "16-04-managing-caches.srt",
    "17-01-automatic-memory-management.srt",
    "17-02-A+Mark+and+Sweep.srt",
    "17-03-A+Stop+and+Copy.srt",
    "17-04-conservative-collection.srt",
    "17-05-A+Reference+Counting.srt",
    "18-01-java.srt",
    "18-02-java-arrays.srt",
    "18-03-java-exceptions.srt",
    "18-04-java-interfaces.srt",
    "18-05-java-coercions.srt",
    "18-06-java-threads.srt",
    "18-07-other-topics.srt",
]

# Full paths, in the same order as SRT_BASE_NAMES.
SRT_FILE_NAMES = [os.path.join(SRT_DIR, base_name) for base_name in SRT_BASE_NAMES]

# Parse every lecture and collect its transcript chunks (each spanning roughly
# one minute) into a single ordered mapping, preserving lecture order.
documents = OrderedDict()
for name in SRT_FILE_NAMES:
    with open(name) as f:
        documents.update(srt_to_strs(f, datetime.timedelta(minutes=1)))

In [40]:
# Treat all transcript chunks as one text: split it into sentences, then split
# each sentence into word tokens.
corpus_text = ' '.join(documents.values())
sents = [nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(corpus_text)]

# Part-of-speech tag every sentence. Sentences whose tagging raises
# UnicodeDecodeError are skipped.
tags = []
for sent in sents:
    try:
        tagged_sent = nltk.pos_tag(sent)
    except UnicodeDecodeError:
        continue
    tags.append(tagged_sent)

# Map each POS tag to the set of lower-cased tokens that received that tag.
pos_to_tokens = {}
for tagged_sent in tags:
    for word, tag in tagged_sent:
        pos_to_tokens.setdefault(tag, set()).add(word.lower())

In [70]:
# http://www.ranks.nl/stopwords
# One stop word per line; apostrophes are removed from each entry.
with open('ranks_nl_stop_words_long.txt') as f:
    stop_words = [line.strip().replace("'", '') for line in f]
# A few added stop words
stop_words.extend([
        'thing',
        'things',
        'hello', 
        'going', 
        'uh', 
        'gonna', 
        'jack', 
        'will', 
        'alright',
        'cuz',
        'a0',
        'a1',
        'e0',
        'e1',
        'e2',
        'forgot',
        'graduate',
        'hope',
        'r1',
        's1',
        's2',
        't0',
        't1',
        't2',
        'x1',
        'a2i',
        'en',
        'bs',
        'idea',
        'keep',
        'sa'
    ])
# A few removed stop words
# Filtering (rather than list.remove) drops every occurrence of the word and
# does not raise ValueError when the stop-word file does not contain it.
stop_words = [word for word in stop_words if word != 'first']

# Set copy for constant-time membership tests in the loop below; stop_words
# itself stays a list because it is also passed to TfidfVectorizer.
stop_word_set = set(stop_words)

# Lemmatize every chunk, dropping stop words, and keep the original keys/order.
lemmatizer = nltk.WordNetLemmatizer()
lemmatized_docs = OrderedDict()
for key, doc in documents.items():
    tokens = nltk.word_tokenize(doc)
    lemmatized_tokens = []
    for token in tokens:
        # NOTE: the comparison is case-sensitive, so capitalised tokens are not
        # removed here.
        if token in stop_word_set:
            continue
        try:
            lemmatized_tokens.append(lemmatizer.lemmatize(token))
        except Exception:
            # Best effort: skip a token the lemmatizer cannot handle, but let
            # KeyboardInterrupt / SystemExit propagate (a bare except would
            # swallow those too).
            continue

    lemmatized_docs[key] = ' '.join(lemmatized_tokens) 

In [72]:
# TF-IDF settings: drop the stop words, use 1- to 4-word n-grams, and cap the
# vocabulary at 1000 features. max_df is an integer, which scikit-learn reads
# as an absolute document count: terms in more than 30 chunks are ignored.
vectorizer_options = {
    'stop_words': stop_words,
    'ngram_range': (1, 4),
    'max_features': 1000,
    'max_df': 30,
}
tfidf = TfidfVectorizer(**vectorizer_options)

# One row per lemmatized chunk, in the iteration order of lemmatized_docs.
X = tfidf.fit_transform(lemmatized_docs.values())

Filter the TF-IDF vocabulary down to about 300 terms.

In [78]:
# POS tags whose tokens make a vocabulary term eligible.
VALID_TAGS = [
    'NNP',
    'NNPS',
]


def filter_vocabulary(vocabulary, valid_tokens):
    '''Select the vocabulary terms to keep.

    A term is kept when it
      1. contains at least one word from ``valid_tokens``,
      2. does not repeat any word, and
      3. is not a proper substring of another term that passes 1 and 2.

    The selection runs in two passes so that rule 3 compares against the
    candidates of this call rather than against a previously computed result.

    Args:
        vocabulary: dict mapping a term (space-separated words) to its
            column index.
        valid_tokens: set of words that make a term eligible.

    Returns:
        dict with the kept ``term -> column index`` pairs.
    '''
    # Pass 1: rules 1 and 2.
    candidates = {}
    for term, column in vocabulary.items():
        words = term.split()
        if valid_tokens.intersection(words) and len(set(words)) == len(words):
            candidates[term] = column

    # Pass 2: rule 3 (plain substring test on the term text).
    return {
        term: column for term, column in candidates.items()
        if not any(term != other and term in other for other in candidates)
    }


valid_tokens = set()
for pos in VALID_TAGS:
    # .get(): a tag that never occurred in the corpus contributes no tokens
    # instead of raising KeyError.
    valid_tokens |= pos_to_tokens.get(pos, set())

filtered_vocabulary = filter_vocabulary(tfidf.vocabulary_, valid_tokens)

pprint(filtered_vocabulary)

{u'abstract syntax tree': 4,
 u'allocate object': 23,
 u'allocating object': 26,
 u'allocation pointer': 28,
 u'alpha beta': 33,
 u'argument type': 50,
 u'assembly code': 57,
 u'assembly language': 58,
 u'attribute class': 68,
 u'automata': 69,
 u'automatic memory': 71,
 u'basic block': 78,
 u'boolean': 90,
 u'bottom parsing': 91,
 u'call function': 102,
 u'call method': 103,
 u'catch block': 109,
 u'check type': 111,
 u'choose': 116,
 u'class attribute': 117,
 u'class defined': 119,
 u'class definition': 120,
 u'class implement': 121,
 u'class method': 122,
 u'class object': 123,
 u'class type': 124,
 u'code first': 130,
 u'code program': 131,
 u'compile time': 146,
 u'computer': 155,
 u'concrete': 161,
 u'consistent': 168,
 u'constant folding': 170,
 u'constant propagation': 171,
 u'context free grammar': 175,
 u'control flow graph': 178,
 u'cool program': 183,
 u'cool type': 184,
 u'copy object': 187,
 u'correct program': 188,
 u'data structure': 200,
 u'declared type': 209,
 u'depe

Look up the minutes where "abstract syntax tree" and "recursive descent" appear.

In [61]:
# Document keys in insertion order. Rows of X are looked up by position in
# this list; building it once avoids re-creating the key list for every lookup
# and also works where dict keys are not indexable.
doc_keys = list(documents.keys())

# Map each kept term to the sorted (lecture_name, start_time, end_time) keys of
# the chunks in which the term has a positive TF-IDF weight.
token_to_docs = OrderedDict()
for key, column in filtered_vocabulary.items():
    weights = X[:, column].toarray().flatten()
    document_indices = [idx for idx, count in enumerate(weights) if count > 0]
    token_to_docs[key] = sorted(set(doc_keys[idx] for idx in document_indices))
pprint([
        '{}: {} --> {}'.format(
            lecture_name, start_time.strftime(FMT), end_time.strftime(FMT)
        )
        for lecture_name, start_time, end_time in token_to_docs['abstract syntax tree']
    ])

['06-02-abstract-syntax-trees: 00:00:04,799000 --> 00:01:08,590000',
 '06-02-abstract-syntax-trees: 00:02:13,450000 --> 00:03:13,690000',
 '06-02-abstract-syntax-trees: 00:03:13,690000 --> 00:03:48,260000',
 '09-03-symbol-tables: 00:00:03,530000 --> 00:01:05,950000',
 '09-03-symbol-tables: 00:01:05,950000 --> 00:02:06,859000',
 '09-03-symbol-tables: 00:02:06,859000 --> 00:03:07,750000',
 '09-03-symbol-tables: 00:03:07,750000 --> 00:04:10,299000',
 '09-03-symbol-tables: 00:04:10,299000 --> 00:05:14,180000',
 '09-04-types: 00:10:29,520000 --> 00:11:20,920000',
 '09-09-implementing-type-checking: 00:00:05,150000 --> 00:01:09,830000',
 '10-06-error-recovery: 00:01:06,900000 --> 00:02:08,060000',
 '10-06-error-recovery: 00:02:08,060000 --> 00:03:13,680000',
 '10-06-error-recovery: 00:03:13,680000 --> 00:04:15,739000',
 '10-06-error-recovery: 00:04:15,739000 --> 00:05:20,139000',
 '10-06-error-recovery: 00:05:20,139000 --> 00:06:25,000000',
 '12-02-A+Code+Generation+I: 00:12:28,710000 --> 00

In [62]:
# token_to_docs is built in the previous cell from the same filtered_vocabulary
# and X, so it is reused here instead of being recomputed.
pprint([
        '{}: {} --> {}'.format(
            lecture_name, start_time.strftime(FMT), end_time.strftime(FMT)
        )
        for lecture_name, start_time, end_time in token_to_docs['recursive descent']
    ])

['06-03-recursive-descent-parsing: 00:00:03,949000 --> 00:01:04,720000',
 '06-03-recursive-descent-parsing: 00:01:04,720000 --> 00:02:08,110000',
 '06-04-1-recursive-descent-limitations-04-1: 00:00:00,560000 --> 00:01:05,609000',
 '06-04-1-recursive-descent-limitations-04-1: 00:04:09,370000 --> 00:05:13,000000',
 '06-04-1-recursive-descent-limitations-04-1: 00:05:13,000000 --> 00:06:17,830000',
 '06-04-recursive-descent-algorithm: 00:00:04,150000 --> 00:01:06,760000',
 '06-04-recursive-descent-algorithm: 00:08:32,039000 --> 00:09:36,120000',
 '06-04-recursive-descent-algorithm: 00:09:36,120000 --> 00:10:36,890000',
 '06-05-A+Left+Recursion: 00:00:03,570000 --> 00:01:07,810000',
 '06-05-A+Left+Recursion: 00:02:08,530000 --> 00:03:11,370000',
 '06-05-A+Left+Recursion: 00:04:12,380000 --> 00:05:14,970000',
 '07-01-Predictive-Parsing-Part-1: 00:00:03,780000 --> 00:01:04,059000',
 '07-01-Predictive-Parsing-Part-1: 00:02:06,979000 --> 00:03:12,010000',
 '07-05-Bottom-Up-Parsing-Part-1: 00:00