In [1]:
from pprint import pprint

### 0 - Load text file

In [50]:
podcast_list = ["TED_TALK_Daily", "Snack_Daily", "Joe_Rogan"]
paths = ["./TED_Talk_Daily.txt", "./snack_daily.txt", "./Joe_Rogan.txt"]
text_corpus = []
episode = []
for path in paths:
    with open(path) as f:
        line = f.readline()
        episode.append(line)
        while line:
            line = f.readline()
            episode.append(line)
    text_corpus.append(''.join(episode))
    episode = []
# pprint (text_corpus)

In [57]:
def split_and_append(bag_of_words, line):
    word_list = line.split(" ")
    for word in word_list:
        raw_word = '%r'%word
        if "\\" in raw_word:
            idx = raw_word.find("\\")
            word = raw_word[:idx]
        if len(word) == 0:
            continue
        bag_of_words.append(word)
    return bag_of_words

### 1 - Exclude all stopwords from the text file

In [20]:
from nltk.corpus import stopwords
# nltk.download('stopwords')
print(stopwords.words('english'))
stoplist = set(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### 2 - Top Frequent words

In [4]:
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]

In [5]:
# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

In [6]:
# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint(processed_corpus)

[['us',
  'black',
  'people',
  'work',
  'us',
  'future',
  'future',
  'work',
  'shares',
  'us',
  'shares',
  'black'],
 ['big', 'big', '2nd', 'home', '2nd'],
 ['company',
  'one',
  'call',
  'company',
  'workouts',
  'one',
  'home',
  'playstation',
  '4',
  'workouts',
  'playstation',
  '4',
  "i've",
  "i've",
  'work',
  'people',
  'call']]


In [9]:
from gensim import corpora

dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary(16 unique tokens: ['black', 'future', 'people', 'shares', 'us']...)


In [48]:
pprint(dictionary.token2id)

{'2nd': 6,
 '4': 9,
 'big': 7,
 'black': 0,
 'call': 10,
 'company': 11,
 'future': 1,
 'home': 8,
 "i've": 12,
 'one': 13,
 'people': 2,
 'playstation': 14,
 'shares': 3,
 'us': 4,
 'work': 5,
 'workouts': 15}


### 3 - TF-IDF modeling

In [10]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint(bow_corpus)

[[(0, 2), (1, 2), (2, 1), (3, 2), (4, 3), (5, 2)],
 [(6, 2), (7, 2), (8, 1)],
 [(2, 1),
  (5, 1),
  (8, 1),
  (9, 2),
  (10, 2),
  (11, 2),
  (12, 2),
  (13, 2),
  (14, 2),
  (15, 2)]]


In [13]:
from gensim import models

# train the model
tfidf = models.TfidfModel(bow_corpus)

# transform the "black lives matter" string
words = "black lives matter".lower().split()
print(tfidf[dictionary.doc2bow(words)])

[(0, 1.0)]


In [22]:
from gensim import similarities

index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary.token2id))

### 4 - Categorize by inverted indexing

In [38]:
import json

with open('../json/keywords.json') as f:
    mapping = json.load(f)

In [66]:
genre_score_map = {}
genre_keyword = mapping['bucket_to_keyword']
for genre in genre_keyword:    
    query_bow = dictionary.doc2bow(genre_keyword[genre])
    sims = index[tfidf[query_bow]]
    # print(genre, ": ", list(enumerate(sims)))
    genre_score_map[genre] = list(sims)

In [54]:
import pandas as pd

In [69]:
df

Unnamed: 0,disease,people,sports,comedy,cars,school,work,news
0,0.0,0.0,0.0,0.0,0.0,0.0,0.054888,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.376001,0.0


In [73]:
df = pd.DataFrame(data = genre_score_map)
print (df.idxmax(axis=1))

0       work
1    disease
2       work
dtype: object


In [46]:
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)

0 0.0
1 0.0
2 0.0
