In [1]:
import os.path
import datetime

import numpy as np

from pprint import pprint
from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
FMT = '%H:%M:%S,%f'

def srt_to_strs(f, interval_length):
    '''Convert a .srt file to a dict of strings'''
    
    # [(text, start_time, end_time)]
    intervals = []
    while True:
        _seq_no = f.readline().strip()
        if _seq_no == '': break
        start_str, end_str = f.readline().strip().split(' --> ')
        start_time = datetime.datetime.strptime(start_str, FMT)
        end_time = datetime.datetime.strptime(end_str, FMT)
        
        text_lines = []
        while True:
            text_line = f.readline().strip()
            if text_line == '': break
            text_line = text_line.replace('&#39;', "'")
            text_lines.append(text_line)
            
        text = ' '.join(text_lines)
        intervals.append((text, start_time, end_time))
        
    _text, interval_start_time, _end_time = intervals[0]
    
    result = {}
    lecture_name = os.path.basename(f.name)[:-4]
    interval_lines = []
    for idx, (text, start_time, end_time) in enumerate(intervals):
        
        interval_lines.append(text)
        
        if idx == len(intervals) - 1 or end_time - interval_start_time > interval_length:
            result[(lecture_name, interval_start_time, end_time)] = ' '.join(interval_lines)
            interval_start_time = end_time
            
    return result

In [3]:
SRT_FILE_NAMES = [
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/05-01-introduction-to-parsing.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/05-02-A+Context+Free+Grammars.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/05-03-Derivations-Part-1.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/05-04-A+Ambiguity.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/06-01-error-handling.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/06-02-abstract-syntax-trees.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/06-03-recursive-descent-parsing.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/06-04-1-recursive-descent-limitations-04-1.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/06-04-recursive-descent-algorithm.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/06-05-A+Left+Recursion.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/07-01-Predictive-Parsing-Part-1.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/07-02-first-sets.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/07-03-follow-sets.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/07-04-ll1-parsing-tables.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/07-05-Bottom-Up-Parsing-Part-1.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/CompilersSelfPacedCS1/07-06-Shift-Reduce-Parsing-Part-1.srt',
]

documents = OrderedDict()
for name in SRT_FILE_NAMES:
    with open(name) as f:
        documents.update(srt_to_strs(f, datetime.timedelta(minutes=2)))

In [4]:
# http://www.ranks.nl/stopwords default list
with open('ranks_nl_stop_words_long.txt') as f:
    stop_words = [line.strip() for line in f]
# A few added stop words
stop_words.extend(["alright", "going"])
tfidf = TfidfVectorizer(stop_words=stop_words)
X = tfidf.fit_transform(documents.values())

In [5]:
inverse_voc = {v: k for k, v in tfidf.vocabulary_.items()}
most_common_terms = {}

for idx, key in enumerate(documents.keys()):
    row = X[idx,:].toarray().flatten()
    max_indices = row.argsort()[-5:]
    most_common_terms[key] = [inverse_voc[voc_idx] for voc_idx in max_indices]

for key, terms in sorted(
    most_common_terms.items(), 
    key=lambda ((lecture_name, start_time, _end_time), term): (lecture_name, start_time)
):
    lecture_name, start_time, end_time = key
    print '{}, {} --> {}'.format(
        lecture_name, start_time.strftime(FMT), end_time.strftime(FMT)
    )
    pprint(terms)
    print

05-01-introduction-to-parsing, 00:00:03,830000 --> 00:02:05,860000
[u'finite', u'nested', u'parens', u'languages', u'regular']

05-01-introduction-to-parsing, 00:02:05,860000 --> 00:04:05,940000
[u'count', u'languages', u'machine', u'state', u'regular']

05-01-introduction-to-parsing, 00:04:05,940000 --> 00:05:28,279000
[u'count', u'languages', u'machine', u'state', u'regular']

05-02-A+Context+Free+Grammars, 00:00:03,879000 --> 00:02:08,539000
[u'free', u'invalid', u'valid', u'strings', u'expression']

05-02-A+Context+Free+Grammars, 00:02:08,539000 --> 00:04:09,850000
[u'strings', u'parentheses', u'balanced', u'context', u'free']

05-02-A+Context+Free+Grammars, 00:04:09,850000 --> 00:06:10,580000
[u'string', u'context', u'hand', u'free', u'side']

05-02-A+Context+Free+Grammars, 00:06:10,580000 --> 00:08:14,509000
[u'strings', u'alpha', u'string', u'context', u'free']

05-02-A+Context+Free+Grammars, 00:08:14,509000 --> 00:10:17,540000
[u'hand', u'strings', u'string', u'context', u'free

In [8]:
kmeans = KMeans(n_clusters=10)

clusters = kmeans.fit_predict(X)

In [9]:
cluster_assignments = {}
for idx, cluster_assignment in enumerate(clusters):
    if cluster_assignment not in cluster_assignments:
        cluster_assignments[cluster_assignment] = [documents.keys()[idx]]
    else:
        cluster_assignments[cluster_assignment].append(documents.keys()[idx])

new_assignments = {}
        
for idx, keys in cluster_assignments.items():
    new_keys = sorted(keys, key=lambda (lecture_name, start_time, _end_time): (lecture_name, start_time))
    new_keys = [
        '{}, {} --> {}'.format(lecture_name, start_time.strftime(FMT), end_time.strftime(FMT)) 
        for lecture_name, start_time, end_time in new_keys
    ]
    new_assignments[idx] = new_keys

for cluster in new_assignments.values():
    pprint(cluster)
    print

['06-04-1-recursive-descent-limitations-04-1, 00:00:00,560000 --> 00:02:05,630000',
 '06-04-1-recursive-descent-limitations-04-1, 00:02:05,630000 --> 00:04:09,370000',
 '06-04-1-recursive-descent-limitations-04-1, 00:04:09,370000 --> 00:06:12,750000',
 '06-04-1-recursive-descent-limitations-04-1, 00:06:12,750000 --> 00:06:55,809000',
 '06-04-recursive-descent-algorithm, 00:00:04,150000 --> 00:02:06,549000',
 '06-04-recursive-descent-algorithm, 00:02:06,549000 --> 00:04:07,969000',
 '06-04-recursive-descent-algorithm, 00:04:07,969000 --> 00:06:14,180000',
 '06-04-recursive-descent-algorithm, 00:06:14,180000 --> 00:08:19,490000',
 '06-04-recursive-descent-algorithm, 00:08:19,490000 --> 00:10:20,750000',
 '06-04-recursive-descent-algorithm, 00:10:20,750000 --> 00:12:22,760000',
 '06-04-recursive-descent-algorithm, 00:12:22,760000 --> 00:13:24,930000',
 '06-05-A+Left+Recursion, 00:00:03,570000 --> 00:02:08,530000',
 '06-05-A+Left+Recursion, 00:02:08,530000 --> 00:04:12,380000']

['07-03-fo