In [1]:
import os.path

import numpy as np

from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
def srt_to_str(f):
    '''Convert a .srt file to a flat string (loses time info)'''
    
    text_lines = []
    while True:
        _seq_no = f.readline().strip()
        if _seq_no == '': break
        _interval = f.readline()
        
        while True:
            text_line = f.readline().strip()
            if text_line == '': break
            text_lines.append(text_line)
            
    return ' '.join(text_lines)

In [3]:
SRT_FILE_NAMES = [
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Convex Sets.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 06   Approximation and fitting v2.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 11 - Equality constrained minimization.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/EE364A LEC10 130207 v3.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 06   Approximation and fitting ver 1.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 12   Interior point methods v1.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/EE364A lec11 130212 v3 071613.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 07   Statistical estimation v1.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 12 - Interior point methods.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 01 - Introduction.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 08   Geometric problems v1.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 13   Conclusions v1.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 03 - Convex functions.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 09 Numerical linear algebra background.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/ee364a lecture 12 ver3 072613.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 04   Convex optimization problems v1.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 10   Unconstrained minimization v1.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 05   Duality.srt',
    '/Users/andrewlamb/Google_Drive/Stanford/CS199/stanford-online-ee364/Lecture 11   Equality constrained minimization.srt',
]

lecture_names = [os.path.basename(name)[:-4] for name in SRT_FILE_NAMES]

documents = []
for name in SRT_FILE_NAMES:
    with open(name) as f:
        documents.append(srt_to_str(f))

In [4]:
# http://www.ranks.nl/stopwords default list
stop_words = ["a","about","above","after","again","against","all","am","an","and","any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't","cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from","further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers","herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its","itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought","our","ours	ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such","than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're","they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were","weren't","what","what's","when","when's","where","where's","which","while","who","who's","whom","why","why's","with","won't","would","wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves"]
# Add a few words
stop_words.extend(["okay", "like", "know", "something", "can", "says", "right", "one", "just", "actually", "things"])
tfidf = TfidfVectorizer(stop_words=stop_words)
X = tfidf.fit_transform(documents)

In [5]:
inverse_voc = {v: k for k, v in tfidf.vocabulary_.items()}
most_common_terms = []

for idx in range(len(documents)):
    row = X[idx,:].toarray().flatten()
    max_indices = row.argsort()[-5:]
    most_common_terms.append([inverse_voc[voc_idx] for voc_idx in max_indices])

for name, terms in zip(lecture_names, most_common_terms):
    print 'Most common terms for {}: {}'.format(name, terms)

Most common terms for Convex Sets: [u'points', u'theta', u'convex', u'cone', u'set']
Most common terms for Lecture 06   Approximation and fitting v2: [u'll', u'mean', u'going', u'norm', u're']
Most common terms for Lecture 11 - Equality constrained minimization: [u'way', u'method', u'going', u're', u'newton']
Most common terms for EE364A LEC10 130207 v3: [u'going', u'mean', u'problem', u'norm', u're']
Most common terms for Lecture 06   Approximation and fitting ver 1: [u'now', u'mean', u'going', u'norm', u're']
Most common terms for Lecture 12   Interior point methods v1: [u'now', u'going', u'mu', u're', u'newton']
Most common terms for EE364A lec11 130212 v3 071613: [u'experiment', u'going', u'likelihood', u'say', u'distribution']
Most common terms for Lecture 07   Statistical estimation v1: [u'experiment', u'going', u'likelihood', u'say', u'distribution']
Most common terms for Lecture 12 - Interior point methods: [u'now', u'going', u're', u'mu', u'newton']
Most common terms for Lectu

In [6]:
kmeans = KMeans(n_clusters=4)

clusters = kmeans.fit_predict(X)

In [7]:
cluster_assignments = {}
for idx, cluster_assignment in enumerate(clusters):
    if cluster_assignment not in cluster_assignments:
        cluster_assignments[cluster_assignment] = [lecture_names[idx]]
    else:
        cluster_assignments[cluster_assignment].append(lecture_names[idx])

pprint(cluster_assignments.values())

[['Lecture 06   Approximation and fitting v2',
  'EE364A LEC10 130207 v3',
  'Lecture 06   Approximation and fitting ver 1',
  'Lecture 09 Numerical linear algebra background'],
 ['Lecture 11 - Equality constrained minimization',
  'Lecture 12   Interior point methods v1',
  'Lecture 12 - Interior point methods',
  'Lecture 10   Unconstrained minimization v1',
  'Lecture 11   Equality constrained minimization'],
 ['EE364A lec11 130212 v3 071613', 'Lecture 07   Statistical estimation v1'],
 ['Convex Sets',
  'Lecture 01 - Introduction',
  'Lecture 08   Geometric problems v1',
  'Lecture 13   Conclusions v1',
  'Lecture 03 - Convex functions',
  'ee364a lecture 12 ver3 072613',
  'Lecture 04   Convex optimization problems v1',
  'Lecture 05   Duality']]
