Permalink
Find file
Fetching contributors…
Cannot retrieve contributors at this time
37 lines (30 sloc) 1.19 KB
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import csv
# Build one text document per episode from the sentences CSV, compute TF-IDF
# scores for its 1- to 3-word phrases, and write every non-zero
# (episode, phrase, score) triple to data/import/tfidf_scikit.csv.

# Group sentence texts by episode.
# NOTE(review): column 1 is used as the episode id and column 4 as the
# sentence text -- confirm against the header of sentences.csv.
episodes = defaultdict(list)
with open("data/import/sentences.csv", "r") as sentences_file:
    reader = csv.reader(sentences_file, delimiter=',')
    next(reader, None)  # skip the header row (also tolerates an empty file)
    for row in reader:
        episodes[row[1]].append(row[4])

# Join with a space so the last word of one sentence cannot fuse with the
# first word of the next one into a single bogus token.
for episode_id in episodes:
    episodes[episode_id] = " ".join(episodes[episode_id])

# Fix the document order once (numeric episode id) and remember it, so each
# matrix row can be mapped back to the episode it was built from.
episode_ids = sorted(episodes, key=int)
corpus = [episodes[episode_id] for episode_id in episode_ids]

# An integer max_df is an absolute document count: phrases that occur in more
# than 20 episodes are dropped from the vocabulary.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, max_df=20, stop_words='english')
tfidf_matrix = tf.fit_transform(corpus)

# Newer scikit-learn releases expose get_feature_names_out(); older ones only
# provide get_feature_names(). Use whichever the installed version has.
if hasattr(tf, "get_feature_names_out"):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()

with open("data/import/tfidf_scikit.csv", "w") as output_file:
    writer = csv.writer(output_file, delimiter=",")
    writer.writerow(["EpisodeId", "Phrase", "Score"])

    for doc_index, episode_id in enumerate(episode_ids):
        print("Document %d" % doc_index)

        # Walk only the stored entries of this sparse row instead of
        # densifying the whole matrix; sorting keeps the ascending
        # feature-index order in which rows were written before.
        sparse_row = tfidf_matrix.getrow(doc_index)
        entries = sorted(zip(sparse_row.indices.tolist(), sparse_row.data.tolist()))

        for word_id, score in entries:
            if score > 0:
                word = feature_names[word_id]
                # Python 2's csv module needs byte strings; on Python 3 the
                # phrase is already a native str and must not be encoded.
                if not isinstance(word, str):
                    word = word.encode("utf-8")
                # Write the real episode id rather than the row position, so
                # gaps or a non-1-based numbering cannot mislabel rows.
                writer.writerow([int(episode_id), word, score])