## TFIDF with Python and Scikit-Learn
From http://www.markhneedham.com/blog/2015/02/15/pythonscikit-learn-calculating-tfidf-on-how-i-met-your-mother-transcripts/

Imports

In [9]:
from collections import defaultdict
import csv
from sklearn.feature_extraction.text import TfidfVectorizer

Process text and collect the text from one episode into one corpus entry

In [7]:
episodes = defaultdict(list)
with open('../data/sentences.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader)
    for row in reader:
        episodes[row[1]].append(row[4])
        
for episode_id, text in episodes.items():
    episodes[episode_id] = ''.join(text)
    
corpus = []
for _id, episode in sorted(episodes.items(), key=lambda t: int(t[0])):
    corpus.append(episode)

In [14]:
%%time
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3),
                     min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(corpus)
feature_names = tf.get_feature_names()
print(feature_names[50:70])

['00 does sound', '00 don', '00 don buy', '00 dressed', '00 dressed blond', '00 drunkenly', '00 drunkenly slurred', '00 fair', '00 fair tonight', '00 fall', '00 fall foliage', '00 far', '00 far impossible', '00 fart', '00 fart sure', '00 friends', '00 friends singing', '00 getting', '00 getting guys', '00 god']
CPU times: user 4.49 s, sys: 121 ms, total: 4.61 s
Wall time: 4.62 s


In [27]:
# Convert to dense format to explore more
dense = tfidf_matrix.todense()

# One row (first episode) with tfidf scores for every phrase in corpus
episode = dense[0].tolist()[0]
print(len(episode))

# Let's filter out phrases that don't occur in this episode
phrase_scores = [pair for pair in enumerate(episode) if pair[1] > 0]
print(len(phrase_scores))

# Sort phrases by descending score
sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)

# Do a lookup
for phrase, score in [(feature_names[word_id], score) for (word_id, score) in
                      sorted_phrase_scores][:20]:
    print('{0: <20} {1}'.format(phrase, score))
                      

498254
4823
ted                  0.2625177493269755
olives               0.19571419072701732
marshall             0.15551468983363487
yasmine              0.15227880637176266
robin                0.1304175242341549
barney               0.12441175186690791
lily                 0.12292497785945679
signal               0.1037932464656365
goanna               0.09813798750091524
scene                0.09534236041231685
cut                  0.09173366535740156
narrator             0.08646229819848741
flashback            0.07829592155397117
flashback date       0.07028252601773662
ranjit               0.06939276915589167
flashback date robin 0.05856877168144719
ted yasmine          0.05856877168144719
carl                 0.058210117288760355
eye patch            0.05436505297972703
lebanese             0.05436505297972703
