In [None]:
!ls

Install additional libraries

In [None]:
!pip install -q tqdm gensim swifter 

# Imports

In [1]:
import pandas as pd
import swifter
import pprint
import glob
import json
from tqdm import tqdm
import re
from datetime import datetime
import gensim
import logging
import os
import shutil
import pprint
import numpy as np
import scipy
from ipywidgets import interact
from swifter import swiftapply


In [2]:
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

# Load Data

In [3]:
df = pd.read_csv("talks.csv.gzip", compression="gzip")
df.head(10)

Look at one record in our data frame

In [4]:
pprint.pprint(df.iloc[0].to_dict())

{'average_rating': 4.72946882248,
 'categories': "['Nonprofits & Activism']",
 'description': 'http://www.ted.com When German psychologist Inge Missmahl '
                'went to Afghanistan, she saw universal wounds of the human '
                'heart -- despair and trauma. She tackled this widespread '
                'depression with psychosocial counseling, and in return '
                'witnessed remarkable individual and social healing and new '
                'hope for families and communities.\n'
                '\n'
                'TEDTalks is a daily video podcast of the best talks and '
                "performances from the TED Conference, where the world's "
                'leading thinkers and doers give the talk of their lives in 18 '
                'minutes. Featured speakers have included Al Gore on climate '
                'change, Philippe Starck on design, Jill Bolte Taylor on '
                'observing her own stroke, Nicholas Negroponte on One Laptop '

Look at just the description

In [5]:
print(df.iloc[0].to_dict()['description'])

http://www.ted.com When German psychologist Inge Missmahl went to Afghanistan, she saw universal wounds of the human heart -- despair and trauma. She tackled this widespread depression with psychosocial counseling, and in return witnessed remarkable individual and social healing and new hope for families and communities.

TEDTalks is a daily video podcast of the best talks and performances from the TED Conference, where the world's leading thinkers and doers give the talk of their lives in 18 minutes. Featured speakers have included Al Gore on climate change, Philippe Starck on design, Jill Bolte Taylor on observing her own stroke, Nicholas Negroponte on One Laptop per Child, Jane Goodall on chimpanzees, Bill Gates on malaria and mosquitoes, Pattie Maes on the "Sixth Sense" wearable tech, and "Lost" producer JJ Abrams on the allure of mystery. TED stands for Technology, Entertainment, Design, and TEDTalks cover these topics as well as science, business, development and the arts. Closed

Look at the transcript

In [6]:
print(df.iloc[0].to_dict()['transcript'])

So I want to tell you a story -- an encouraging story --
about addressing
desperation, depression and despair in Afghanistan,
and what we have learned from it,
and how to help people
to overcome traumatic experiences
and how to help them to regain some confidence
in the time ahead -- in the future --
and how to participate again in everyday life.
So, I am a Jungian psychoanalyst,
and I went to Afghanistan in January 2004, by chance,
on an assignment for Medica Mondiale.
Jung in Afghanistan --
you get the picture.
Afghanistan is one of the poorest countries in the world,
and 70 percent of the people are illiterate.
War and malnutrition kills people
together with hope.
You may know this from the media,
but what you may not know
is that the average age of the Afghan people is 17 years old,
which means they grow up in such an environment
and -- I repeat myself --
in 30 years of war.
So this translates
into ongoing violence,
foreign interests, bribery,
drugs, ethnic conflicts,
bad health, s

# Preprocessing in Natural Language Processing

In [7]:
# Spacy NLP

In [8]:
import spacy #popular nlp library

Load a pretrained English language model.

In [9]:
# Load the model by its package name: the "en" shortcut link only exists in
# spaCy 2.x and was removed in spaCy 3, while the package name works in both.
# NOTE(review): assumes the small English model (en_core_web_sm) is the one installed - confirm.
nlp = spacy.load("en_core_web_sm")

**Parts of Speech**

In [10]:
sample_sentence = 'Apple is looking at buying U.K. startup for $1 billion.'

In [11]:
doc = nlp(sample_sentence)

# One output column per token attribute we want to inspect side by side.
columns = ['original', 'lemma', 'part_of_speech', 'syntantic_dependency', 'alpha_character', 'is_stop_word']
# One record per token, in the same order as `columns`.
data = [
    [tok.text, tok.lemma_, tok.pos_, tok.dep_, tok.is_alpha, tok.is_stop]
    for tok in doc
]
pd.DataFrame.from_records(data=data, columns=columns)

- Which tokens/words do you keep?
- Are "look" and "looking" the same thing?
- Do we care about punctuation or adverbs?
- Do we care about stop words, i.e. words that occur very frequently?

In [12]:
def prepare(docs):
    """
    Lazily clean documents with the spaCy pipeline.

    Every document in ``docs`` is streamed through ``nlp.pipe`` (wrapped in a
    tqdm progress bar). For each parsed document:
    - stop words are dropped
    - every remaining token is replaced by its lemma

    Yields one string per input document: the kept lemmas joined by single spaces.
    """
    for parsed in nlp.pipe(tqdm(docs)):
        yield " ".join(tok.lemma_ for tok in parsed if not tok.is_stop)

In [13]:
next(prepare([sample_sentence]))

100%|██████████| 1/1 [00:00<00:00, 2206.37it/s]


'apple look buy u.k. startup $ 1 billion .'

**What text do you take?**    
A TED Talk definitely has a title.        
It most likely has a description.    
Does it always have a transcript?   

Create a new column with the fields you think might be important.  

In [14]:
#TEXT = df['title']  # take only title 
#TEXT = df['title'].str.cat(df['description'] , sep='\n')  # take title  and description
# Take all text fields, joined by newlines. A missing field is treated as an
# empty string (na_rep=""); with plain `+` a single missing description or
# transcript would turn the whole concatenated text of that talk into NaN.
TEXT = df['title'].str.cat([df['description'], df['transcript']], sep="\n", na_rep="")

df["all_text"] = TEXT # save concatenated field as new column

In [15]:
df.columns

Index(['average_rating', 'categories', 'description', 'dislike_count',
       'duration', 'id', 'like_count', 'thumbnail', 'title', 'upload_date',
       'view_count', 'webpage_url', 'transcript', 'all_text'],
      dtype='object')

# Algorithm One: Term Frequency - Inverse Document Frequency (tf-idf)

This is a common algorithm.     
In fact, there are search engines and databases that implement this algorithm under the hood (Elasticsearch/Solr/Lucene). 

## Theory

In [21]:
import sklearn # very popular machine learning library
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df["all_text"])

Here are the default parameters (shown in the output below).   
Some frequently tweaked parameters:     
- min_df: the minimum number of documents a word must appear in to be included. If a word occurs in only one document, is it a misspelling?
- max_df: the maximum fraction of documents a word may appear in. If a word is in every document, does it add any value?

In [24]:
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [None]:
len(tfidf.vocabulary_)

In [None]:
tfidf.vocabulary_

In [None]:
### Get Similarity Scores using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
type(tfidf)

In [None]:
def get_similar_articles_tfidf(text:str, tfidf:sklearn.feature_extraction.text.TfidfVectorizer
                         , corpus:scipy.sparse.csr_matrix
                         , df:pd.DataFrame, num_results=5 ):
    """
    Rank the talks in ``df`` by tf-idf cosine similarity to a free-text query.

    Parameters
    ----------
    text : the query string.
    tfidf : the fitted vectorizer; it is only used to ``transform`` the query.
    corpus : matrix the query is compared against, one row per talk.
        Assumed to be the output of this same ``tfidf`` on the rows of ``df``,
        in the same order - TODO confirm at the call site.
    df : DataFrame with a ``title`` column.
    num_results : how many of the best matches to return.

    Returns
    -------
    DataFrame with columns ``title`` and ``scores``, best match first.
    """
    # Imported locally so the function does not depend on another cell having
    # already loaded the sklearn.metrics.pairwise submodule.
    from sklearn.metrics.pairwise import cosine_similarity

    # apply the same transformations used by tf-idf; note: returns a 1-row matrix
    query_vector = tfidf.transform([text])

    # similarity of the query with every row of the corpus; [0] flattens the
    # 1 x n_docs result into a plain vector
    sim = cosine_similarity(query_vector, corpus)[0]

    # row positions of the highest scores, in descending order, cut to the first n
    top_n_indices = sim.argsort()[::-1][:num_results]

    # argsort() yields row *positions*, so select with .iloc; .loc would look
    # them up as index *labels* and return wrong rows (or raise) whenever the
    # frame's index is not the default 0..n-1 range.
    titles = df['title'].iloc[top_n_indices]
    scores = sim[top_n_indices]
    return pd.DataFrame({'title': titles, 'scores': scores})

In [None]:
[5,6][::-1]

In [None]:
# Compare against tfidf_matrix (built from df["all_text"] with this vectorizer).
# The name `matrix` used here originally is only created in a later cell, so
# the cell failed on a fresh kernel.
tfidf_sims = cosine_similarity(tfidf.transform(["North Korea"]), tfidf_matrix)

#df['title'].loc[sims.argsort()[-5:-1]]

In [None]:
tfidf_sims

In [None]:
tfidf_sims.argsort()[-5:]

In [None]:
tfidf_sims.argsort()[0][-5:]

In [None]:
get_similar_articles_tfidf(text="brain", tfidf=tfidf, corpus=tfidf_matrix, df=df, num_results=5)

In [None]:
from ipywidgets import interact, interact_manual

In [None]:
@interact_manual(text="north korea")
def get_similar_articles_helper_tfidf(text:str):
    """Widget callback: look up talks similar to ``text`` with the tf-idf model (default number of results)."""
    results = get_similar_articles_tfidf(text=text, tfidf=tfidf, corpus=tfidf_matrix, df=df)
    return results

# Word2vec

**Notes**:     
[Facebook's Embedding](https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md)

In [None]:
import  logging
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

In [None]:
import json
data_list = api.info()
print(json.dumps(data_list, indent=4))

In [None]:
word2vec_model = api.load("glove-wiki-gigaword-300")
#model = api.load("fasttext-wiki-news-subwords-300")
word2vec_model.most_similar("glass")

In [None]:
# queen = (king - man) + woman

In [None]:
# king - man + woman: "woman" is added and "man" is subtracted. Having "man"
# on both sides (as before) cancels out and just returns neighbours of "king".
word2vec_model.most_similar(positive=["king","woman"] , negative=['man'])

In [None]:
word2vec_model.most_similar("democrats")

In [None]:
word2vec_model.most_similar(positive=["barack","obama", "republicans"] ,negative=["democrats"] )

In [None]:
word2vec_model["obama"].shape

In [None]:
def document_vector(word2vec_model, doc:str):
    """
    Represent a document as the mean of the vectors of its in-vocabulary words.

    Parameters
    ----------
    word2vec_model : word-vector model. It must support ``word in model``,
        ``model[list_of_words]`` and expose ``vector_size``.
    doc : the document text. An already-tokenized sequence of words is also accepted.

    Returns
    -------
    1-D array of length ``word2vec_model.vector_size``. If no word of the
    document is known to the model, a message is printed and an all-zero
    vector is returned.
    """
    # Split on any whitespace: the text fields are joined with newlines, and
    # splitting on " " alone leaves tokens such as "story\nabout" glued
    # together, which then never match the vocabulary.
    words = doc.split() if isinstance(doc, str) else list(doc)

    # remove out-of-vocabulary words; a word that is only known in lowercase
    # (e.g. "Afghanistan" vs "afghanistan") is used in its lowercase form
    valid_words = []
    for word in words:
        if word in word2vec_model:
            valid_words.append(word)
        elif word.lower() in word2vec_model:
            valid_words.append(word.lower())

    if len(valid_words)==0:
        print ("No words in vocab")
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model[valid_words], axis=0)

In [None]:
print (TEXT[0])

In [None]:
#[word for word in doc if word not in model.vocab]

In [None]:
# Use word2vec_model: `model` is not defined until a much later cell (and is a different kind of model there).
np.matrix([document_vector(word2vec_model, TEXT[1]),document_vector(word2vec_model, TEXT[0])]).shape

In [None]:
# Use word2vec_model: `model` is not defined until a much later cell.
document_vector(word2vec_model, TEXT[1])

In [None]:
len(tfidf.vocabulary_)

In [None]:
tfidf_matrix.shape

In [None]:
# Use word2vec_model: `model` is not defined until a much later cell.
document_vector(word2vec_model, TEXT[0])

In [None]:
swiftapply(df['all_text'].head(), lambda text: document_vector(word2vec_model=word2vec_model, doc=text))

In [None]:
swiftapply(df['all_text'].head(), lambda text: document_vector(word2vec_model=word2vec_model, doc=text)).tolist()

In [None]:
# One averaged word vector per talk, stacked into a 2-D array (n_talks x vector_size).
# A plain ndarray is used instead of np.matrix: np.matrix is discouraged by
# NumPy and recent scikit-learn versions refuse it as input.
word2vec_matrix = np.array (
                        swiftapply(df['all_text']
                             , lambda text: document_vector(word2vec_model=word2vec_model, doc=text)).tolist()
                        
                    )
word2vec_matrix.shape

In [None]:
?document_vector

In [None]:
def get_similar_articles_word2vec(text:str,word2vec_matrix=word2vec_matrix, df=df
                                         , word2vec_model=word2vec_model
                                         , num_results=5
                                        ):
    """
    Rank the talks in ``df`` by cosine similarity between averaged word vectors.

    Parameters
    ----------
    text : the query string; it is embedded with ``document_vector``.
    word2vec_matrix : one document vector per talk. Assumed to be in the same
        row order as ``df`` - TODO confirm at the call site.
    df : DataFrame with a ``title`` column.
    word2vec_model : the word-vector model passed on to ``document_vector``.
    num_results : how many of the best matches to return.

    Note: the defaults are bound to the notebook globals that exist when this
    cell is executed.

    Returns
    -------
    DataFrame with columns ``title`` and ``scores``, best match first.
    """
    # Imported locally so the function does not depend on another cell having
    # already loaded the sklearn.metrics.pairwise submodule.
    from sklearn.metrics.pairwise import cosine_similarity

    # embed the query and shape it as a single-row 2-D array
    query_vector = document_vector(doc=text,word2vec_model=word2vec_model).reshape(1, -1)

    # Recent scikit-learn versions reject np.matrix input; a plain ndarray view
    # carries the same values.
    if isinstance(word2vec_matrix, np.matrix):
        word2vec_matrix = np.asarray(word2vec_matrix)

    # similarity of the query with every row of the corpus; [0] flattens the
    # 1 x n_docs result into a plain vector
    sim = cosine_similarity(query_vector, word2vec_matrix)[0]

    # row positions of the highest scores, in descending order, cut to the first n
    top_n_indices = sim.argsort()[::-1][:num_results]

    # argsort() yields row *positions*, so select with .iloc; .loc would look
    # them up as index *labels* and return wrong rows (or raise) whenever the
    # frame's index is not the default 0..n-1 range.
    titles = df['title'].iloc[top_n_indices]
    scores = sim[top_n_indices]
    return pd.DataFrame({'title': titles, 'scores': scores})



@interact_manual(text="north korea")
def get_similar_articles_helper_word2vec(text:str):
    """Widget callback: look up the 20 talks most similar to ``text`` with the word-vector model."""
    lookup_options = dict(
        word2vec_matrix=word2vec_matrix,
        df=df,
        word2vec_model=word2vec_model,
        num_results=20,
    )
    return get_similar_articles_word2vec(text=text, **lookup_options)

In [None]:
document_vector(doc="brain",word2vec_model=word2vec_model)

In [None]:
get_similar_articles_word2vec("nort_korea")

In [None]:
# NOTE(review): this helper is defined three times in the notebook with the
# same body; each definition shadows the previous one and registers another widget.
@interact_manual(text="north korea")
def get_similar_articles_helper_tfidf(text:str):
    """Widget callback: look up talks similar to ``text`` with the tf-idf model (default number of results)."""
    matches = get_similar_articles_tfidf(text=text, tfidf=tfidf, corpus=tfidf_matrix, df=df)
    return matches

In [None]:
# NOTE(review): this helper is defined three times in the notebook with the
# same body; each definition shadows the previous one and registers another widget.
@interact_manual(text="north korea")
def get_similar_articles_helper_tfidf(text:str):
    """Widget callback: look up talks similar to ``text`` with the tf-idf model (default number of results)."""
    similar_talks = get_similar_articles_tfidf(text=text, tfidf=tfidf, corpus=tfidf_matrix, df=df)
    return similar_talks

In [None]:
df.head(10)

In [None]:
df['']

In [None]:
df.columns

In [None]:
df['categories']

In [None]:
# `categories` holds the *string form* of a list (e.g. "['Nonprofits & Activism']"),
# so an exact isin() match on the bare category name never hits; match it as a
# literal substring instead. Boolean indexing keeps only the matching rows,
# whereas df.where() keeps every row and blanks the non-matching ones to NaN.
df[df['categories'].str.contains('Science & Technology', regex=False, na=False)]

In [None]:
df[['id','title','upload_date']].sort_values(['upload_date'], ascending=False)

In [None]:
# Use word2vec_model (`model` is not defined until a later cell) and pass the
# raw text: document_vector does its own splitting.
document_vector(word2vec_model, doc=TEXT[0])

# Tensorflow Hub

In [None]:
!pip install --quiet tensorflow-hub
!pip install --quiet seaborn

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

In [None]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]        

In [None]:
# Import the Universal Sentence Encoder's TF Hub module
embed = hub.Module(module_url)

# Compute a representation for each message, showing various lengths supported.
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  message_embeddings = session.run(embed(messages))

  for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
    print("Message: {}".format(messages[i]))
    print("Embedding size: {}".format(len(message_embedding)))
    message_embedding_snippet = ", ".join(
        (str(x) for x in message_embedding[:3]))
    print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

In [None]:
def plot_similarity(labels, features, rotation):
  """Draw a heatmap of the pairwise inner products of ``features``.

  ``labels`` are used as tick labels on both axes; ``rotation`` is the
  rotation applied to the x tick labels. The colour scale is fixed to [0, 1].
  """
  similarity = np.inner(features, features)
  sns.set(font_scale=1.2)
  heatmap_options = dict(
      xticklabels=labels,
      yticklabels=labels,
      vmin=0,
      vmax=1,
      cmap="YlOrRd")
  axes = sns.heatmap(similarity, **heatmap_options)
  axes.set_xticklabels(labels, rotation=rotation)
  axes.set_title("Semantic Textual Similarity")


def run_and_plot(session_, input_tensor_, messages_, encoding_tensor):
  """Evaluate ``encoding_tensor`` for ``messages_`` and plot the similarity heatmap.

  ``messages_`` are fed to ``input_tensor_`` when running ``encoding_tensor``
  in ``session_``; the result is plotted with the messages themselves as
  labels and x tick labels rotated by 90.
  """
  feed = {input_tensor_: messages_}
  embeddings = session_.run(encoding_tensor, feed_dict=feed)
  plot_similarity(messages_, embeddings, 90)

In [None]:
messages = [
    # Smartphones
    "I like my phone",
    "My phone is not good.",
    "Your cellphone looks great.",

    # Weather
    "Will it snow tomorrow?",
    "Recently a lot of hurricanes have hit the US",
    "Global warming is real",

    # Food and health
    "An apple a day, keeps the doctors away",
    "Eating strawberries is healthy",
    "Is paleo better than keto?",

    # Asking about age
    "How old are you?",
    "what is your age?",
]

similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
similarity_message_encodings = embed(similarity_input_placeholder)
with tf.Session() as session:
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  run_and_plot(session, similarity_input_placeholder, messages,
               similarity_message_encodings)

In [None]:
from gensim.models.wrappers import FastText

model = FastText.load_fasttext_format('wiki.simple')

print(model.most_similar('teacher'))
# Output = [('headteacher', 0.8075869083404541), ('schoolteacher', 0.7955552339553833), ('teachers', 0.733420729637146), ('teaches', 0.6839243173599243), ('meacher', 0.6825737357139587), ('teach', 0.6285147070884705), ('taught', 0.6244685649871826), ('teaching', 0.6199781894683838), ('schoolmaster', 0.6037642955780029), ('lessons', 0.5812176465988159)]


print(model.similarity('teacher', 'teaches'))
# Output = 0.683924396754

In [None]:
# Fit on the transcripts directly: the `Text` list used here originally is only
# created in a later cell, so the cell failed on a fresh kernel.
# NOTE(review): this re-fits the shared `tfidf` vectorizer, so its vocabulary
# no longer matches the earlier `tfidf_matrix`.
matrix=tfidf.fit_transform(df['transcript'].tolist())


In [None]:
matrix.shape

In [None]:
matrix.shape

In [None]:
sklearn.feature_extraction.text.TfidfVectorizer

In [None]:
?sklearn.feature_extraction.text.TfidfVectorizer

In [None]:
df.head()

In [None]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    """Callable tokenizer: NLTK ``word_tokenize`` followed by ``WordNetLemmatizer.lemmatize`` on each token."""

    def __init__(self):
        # One lemmatizer instance is created up front and reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens of the text ``articles``."""
        lemmas = []
        for token in word_tokenize(articles):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

In [None]:
?text.TfidfVectorizer

In [None]:
from sklearn.feature_extraction import text
Text=df['transcript'].tolist()
# `input` tells the vectorizer what *kind* of thing it receives ('filename',
# 'file' or 'content'); it is not where the documents go. Passing the document
# list there is invalid, so it is left at its default ('content') and the
# documents are supplied to fit_transform() below.
# NOTE(review): this rebinds the notebook-level `tfidf`, replacing the
# vectorizer that was fitted on df["all_text"] earlier.
tfidf=text.TfidfVectorizer(stop_words="english", tokenizer=LemmaTokenizer())
matrix=tfidf.fit_transform(Text)
#print(matrix.shape)

In [None]:
### Get Similarity Scores using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
sim_unigram=cosine_similarity(matrix)

In [None]:
sim_unigram

In [None]:
def get_similar_articles(x):
    """
    Return the titles of four talks similar to the one whose similarity row is ``x``.

    ``x`` is one row of a pairwise similarity matrix (one score per talk in the
    notebook-level ``df``). The slice ``[-5:-1]`` takes the positions ranked
    2nd to 5th by score, listed from lowest to highest score; the top-ranked
    position is skipped - presumably because it is the talk itself.

    Returns the four titles joined by newlines.
    """
    # argsort() yields row *positions*, so select with .iloc; .loc would look
    # them up as index *labels* and return wrong rows (or raise) whenever the
    # frame's index is not the default 0..n-1 range.
    return "\n".join(df['title'].iloc[x.argsort()[-5:-1]])
df['similar_articles_unigram']=[get_similar_articles(x) for x in sim_unigram]

In [None]:
df['title'][1]

In [None]:
print (df['similar_articles_unigram'][1])

In [None]:
# Problems with this model

# Doc2Vec

In [None]:
df.iloc[0]

In [None]:
[row["title"]] + row["categories"]

In [None]:
# Build the Doc2Vec training corpus: one TaggedDocument per talk.
talk_corpus = []
for _, talk in df.iterrows():
    # The transcript, run through gensim's simple_preprocess, is the document
    # body; the talk title is its only tag.
    talk_corpus.append(
        gensim.models.doc2vec.TaggedDocument(
            gensim.utils.simple_preprocess(talk["transcript"]),
            [talk["title"]],
        )
    )

In [None]:
talk_corpus[0]

In [None]:
# `vector_size` and `epochs` are the current parameter names; the older
# spellings `size` and `iter` are deprecated in gensim 3.x and no longer
# accepted by gensim 4.
model = gensim.models.Doc2Vec(vector_size = 300, 
                              min_count = 3, 
                              epochs = 100)

In [None]:
model.build_vocab(talk_corpus)
print("model's vocabulary length:", len(model.wv.vocab))

In [None]:
model.train(talk_corpus,total_examples=model.corpus_count,epochs=model.epochs)

In [None]:
model.docvecs.most_similar('My escape from North Korea | Hyeonseo Lee')

In [None]:
df.head(1)

In [None]:
content = gensim.utils.simple_preprocess (df.iloc[0]["transcript"])

In [None]:
content

In [None]:
new_vector = model.infer_vector(content)
sims = model.docvecs.most_similar([new_vector],topn=20)
sims