In [None]:
! pip install -U spacy
! pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.0.0/en_core_web_lg-3.0.0.tar.gz

import gc
import json
import os
import os.path
import pprint
import random
import re
import string
import time
import warnings
from datetime import datetime
from multiprocessing import Manager, Process

import bs4
import numpy as np
import pandas as pd
import plotly.express as px
import requests
from bs4 import BeautifulSoup
from google.colab import drive

import nltk
from nltk.text import Text
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.collocations import BigramCollocationFinder,BigramAssocMeasures

import spacy
from spacy import displacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics.pairwise import cosine_similarity

# Silence every warning raised by the libraries used in this notebook.
warnings.filterwarnings('ignore')

# Download the complete NLTK data collection.
nltk.download('all')

# Shared helpers: a compact pretty-printer and the large English spaCy pipeline.
pp = pprint.PrettyPrinter(indent=4, compact=True)
nlp = spacy.load("en_core_web_lg")

# Name of the CSV file that holds the scraped TED talk data.
ted_file = 'Ted_Talk.csv'

# Language passed to the NLTK tokenizers and stop-word list.
language = 'english'

# Stop-word set: the NLTK stop words for `language` plus each ASCII punctuation character.
language_stops = set(stopwords.words(language)) | set(string.punctuation)

In [6]:
def remove_stop_words(text):
  """
    Drop the tokens that are stop words (or punctuation) from a token list.

    A token is discarded when its lower-cased form belongs to the module-level
    ``language_stops`` set.

    Attributes
    ----------
    text: list
      list of word tokens to filter

    Returns
    -------
    list
      the tokens that are not in ``language_stops``, in their original order
  """
  kept_tokens = []
  for token in text:
    if token.lower() in language_stops:
      continue
    kept_tokens.append(token)
  return kept_tokens

In [7]:
def lematize_words(text):
  """
    Lemmatize a list of tokens with the module-level spaCy pipeline ``nlp``.

    The tokens are joined with single spaces into one string, which is then
    processed by ``nlp``; the result therefore follows spaCy's own
    tokenization of that string.

    Attributes
    ----------
    text: list
      list of word tokens to lemmatize

    Returns
    -------
    list
      the ``lemma_`` of every token spaCy produced for the joined text
  """
  joined_text = " ".join(text)
  lemmas = []
  for spacy_token in nlp(joined_text):
    lemmas.append(spacy_token.lemma_)
  return lemmas

In [8]:
def remove_meaningless_words(text):
  """
    Drop tokens that carry no meaning according to a fixed set of regular expressions.

    A token is removed when it consists only of two or more '=' characters,
    or when it contains a zero-width space (U+200B).

    Attributes
    ----------
    text: list
      list of word tokens to filter

    Returns
    -------
    list
      a new list with the tokens that match none of the patterns
  """
  compiled_patterns = [re.compile(r"(^={1,}=$)"), re.compile(r'\u200b')]
  return [
      token for token in text
      if not any(regexp.search(token) for regexp in compiled_patterns)
  ]

In [9]:
def clean_short_words(text):
  """
    Discard the tokens whose length is 1 or less.

    Attributes
    ----------
    text: list
      list of word tokens to filter

    Returns
    -------
    list
      the tokens that are longer than one character, in their original order
  """
  return list(filter(lambda word: len(word) > 1, text))

In [10]:
def tokenize(text, mode='word'):
  """
    Split a document into tokens, either word by word or sentence by sentence.

    Tokenization is delegated to NLTK using the module-level ``language``.

    Attributes
    ----------
    text: str
      document to tokenize
    mode: str, optional
      'word' to split into words (default) or 'sentence' to split into sentences

    Returns
    -------
    list
      list of tokens

    Raises
    ------
      Exception
        if mode is neither 'word' nor 'sentence'
  """
  # Reject unknown modes up front so the happy paths below stay flat.
  if mode not in ('word', 'sentence'):
    raise Exception('metodo de tokenizacion no encontrado')
  if mode == 'sentence':
    return sent_tokenize(text, language=language)
  return word_tokenize(text, language=language)

In [11]:
def similarity_btw_docs(matrix):
  """
    Compute the pairwise cosine similarity between the rows of a matrix.

    Attributes
    ----------
    matrix: scipy matrix
      matrix whose rows are compared against each other

    Returns
    -------
    pd.DataFrame
      dataframe wrapping the result of ``cosine_similarity(matrix)``
  """
  return pd.DataFrame(cosine_similarity(matrix))

In [12]:
def pre_procesamiento_texto(text):
  """
    Run the full text pre-processing chain on a document.

    The document is tokenized by word and then passed, in order, through
    stop-word removal, meaningless-token removal, short-token removal and
    lemmatization.

    Attributes
    ----------
    text: str
      document to process

    Returns
    -------
    list
      the lemmas produced by the last step of the chain
  """
  tokens = tokenize(text)
  cleaning_steps = (
      remove_stop_words,
      remove_meaningless_words,
      clean_short_words,
      lematize_words,
  )
  for step in cleaning_steps:
    tokens = step(tokens)
  return tokens

In [13]:
def remove_character(serie, char):
  """
    Remove every occurrence of a character from the strings of a series.

    The replacement is always literal (``regex=False``): characters such as
    '[' or '.' are removed as-is instead of being read as a regular
    expression, whatever the default of the installed pandas version is.

    Attributes
    ----------
    serie: pd.Series
      string column to clean
    char: str
      character (or literal substring) to remove

    Returns
    -------
    pd.Series
      a new series with ``char`` removed from every string
  """
  return serie.str.replace(char, '', regex=False)

In [14]:
def get_urls(page_list):
  """
    Build the absolute URL of every talk found in a list of page elements.

    For each element, the ``href`` of the first node matching
    ``div.media__image a.ga-link`` is appended to ``https://www.ted.com``.

    Attributes
    ----------
    page_list: list
      parsed elements of a talk listing page, one per talk

    Returns
    -------
    list
      list with one absolute talk URL per element
  """
  urls = []
  for talk_element in page_list:
    anchor = talk_element.select("div.media__image a.ga-link")[0]
    urls.append("https://www.ted.com" + anchor.get("href"))
  return urls

In [15]:
def _fetch_transcript_once(url):
  """Request the transcript page once and return ``(response, transcript_text)``."""
  response = requests.get(url, headers = {'User-agent': 'your bot 0.1'})
  soup = BeautifulSoup(response.text)
  # NOTE(review): the ':4' in this selector is unescaped; confirm that the installed
  # BeautifulSoup/soupsieve version parses it as intended.
  blocks = soup.select('div.Grid.Grid--with-gutter.p-b:4')

  pieces = []
  for block in blocks:
    paragraphs = block.select('p')
    if not paragraphs:
      # A block without any <p> has no text to contribute; skip it instead of failing.
      continue
    pieces.append(paragraphs[0].text.strip().replace('\t', '').replace('\n', ' '))
  return response, "".join(pieces)


def get_transcript(url, count):
  """
    Download the transcript of a TED talk.

    The page is requested once; when the response is not a 200, its body is
    empty or no transcript text was extracted, the request is retried up to
    3 more times with a random pause of up to 0.9 seconds before each retry.
    Every attempt starts from an empty transcript, so text is never
    duplicated across retries.

    Attributes
    ----------
    url: str
      URL of the transcript page of the talk
    count: int
      index of the URL in the list being processed (not used by this function)

    Returns
    -------
    str
      the transcript text obtained by the last attempt ('' when nothing was extracted)
  """
  def _succeeded(response, transcript):
    return (response.status_code == 200) and (response.text != '') and (transcript != '')

  transcript_res, transcript = _fetch_transcript_once(url)

  retries = 0
  while retries < 3 and not _succeeded(transcript_res, transcript):
    time.sleep(random.randint(0,900)/1000)
    transcript_res, transcript = _fetch_transcript_once(url)
    retries += 1

  return transcript

In [16]:
def get__json_obj(url):
  """
    Extract the JSON payload embedded in the ``talkPage.init`` script of a page.

    The page is downloaded and the text located between the opening
    ``<script data-spec="q">q("talkPage.init",`` marker and the following
    ``)</script>`` is returned.

    Attributes
    ----------
    url: str
      URL of the page to download (surrounding whitespace is stripped)

    Returns
    -------
    str
      the text found between both markers
  """
  opening_marker = '<script data-spec="q">q("talkPage.init",'
  page_text = requests.get(url.strip(), headers = {'User-agent': 'your bot 0.1'}).text

  start_index = page_text.find(opening_marker)
  # The closing marker is searched relative to the opening one.
  end_offset = page_text[start_index:].find(')</script>')
  script_tag = page_text[start_index: start_index + end_offset]
  return script_tag[len(opening_marker):]

In [19]:
def get_value(l, m):
  """
    Walk a nested structure following a path of keys/indexes.

    Attributes
    ----------
    l: list
      path to follow; each item is used to subscript the current value
    m: dict or list
      nested structure to read from

    Returns
    -------
    object
      the value found at the end of the path, or '' as soon as one step of
      the path cannot be resolved (missing key, index out of range, or a
      value that cannot be subscripted with that step)
  """
  value = m
  for key in l:
    try:
      value = value[key]
    except (LookupError, TypeError):
      # Only "path not found" errors are treated as a missing value; anything
      # else (e.g. KeyboardInterrupt) is allowed to propagate.
      return ''
  return value

In [20]:
def html_to_text(html):
  """
    Convert an HTML fragment into plain text.

    Attributes
    ----------
    html: str
      HTML fragment to convert

    Returns
    -------
    str
      the text content of the fragment; values whose ``str()`` is 'nan'
      (such as a float NaN) are returned unchanged
  """
  # Missing values are passed through untouched instead of being parsed.
  if str(html) == 'nan':
    return html
  return BeautifulSoup(html).get_text()

In [22]:
def get_elements_dict_from_url(count, url, json_obj):
  """
    Build the record of a TED talk from the JSON payload of its page.

    The values are read from the ``__INITIAL_DATA__`` entry of the payload;
    any field that is absent is stored as '' (see ``get_value``). The
    transcript is downloaded from ``<url>/transcript?language=<language>``.

    Attributes
    ----------
    count: int
      index of the URL in the list being processed (forwarded to ``get_transcript``)
    url: str
      URL of the talk page
    json_obj: str
      JSON payload extracted from the talk page

    Returns
    -------
    dict
      dictionary with the metadata of the talk, its transcript and the
      number of tags, resources, recommendations and related talks
  """
  metadata = json.loads(json_obj)["__INITIAL_DATA__"]
  language = get_value(["language"], metadata)
  url__transcript = url + "/transcript?language=" + language
  recorded_at = get_value(["talks", 0, "recorded_at"], metadata)
  published = get_value(["talks", 0, "player_talks", 0, "published"], metadata)

  # The published timestamp may be missing ('' from get_value) or not numeric;
  # store '' in that case instead of failing on int().
  try:
    published_timestamp = datetime.utcfromtimestamp(int(published)).strftime('%Y-%m-%d %H:%M:%S')
  except (TypeError, ValueError):
    published_timestamp = ''

  # Collections that are stored together with their size are looked up only once.
  tags = get_value(["talks", 0, "tags"], metadata)
  more_resources = get_value(["talks", 0, "more_resources"], metadata)
  recommendations = get_value(["talks", 0, "recommendations", "rec_lists"], metadata)
  related_talks = get_value(["talks", 0, "related_talks"], metadata)

  d = dict()
  d["language"]  =  language
  d["talk__id"]  =  get_value(["current_talk"], metadata)
  d["talk__name"]  =  get_value(["talks", 0, "title"], metadata)
  d["talk__description"]  =  get_value(["description"], metadata)
  d["view_count"]  =  get_value(["viewed_count"], metadata)
  d["duration"]  =  get_value(["talks", 0, "duration"], metadata)
  d["transcript"]  =  get_transcript(url__transcript,count)
  d["video_type_name"]  =  get_value(["talks", 0, "video_type", "name"], metadata)
  d["event"]  =  get_value(["event"], metadata)
  d["speaker__id"]  =  get_value(["speakers", 0, "id"], metadata)
  d["speaker__name"]  =  get_value(["talks", 0, "speaker_name"], metadata)
  d["speaker__description"]  =  get_value(["speakers", 0, "description"], metadata)
  d["speaker__who_he_is"]  =  get_value(["speakers", 0, "whotheyare"], metadata)
  d["speaker__why_listen"]  =  html_to_text(get_value(["speakers", 0, "whylisten"], metadata))
  d["all_speakers_details"]  =  get_value(["speakers"], metadata)
  # Only the first 10 characters of the recording date are kept; None is stored as-is.
  d["recording_date"]  =  recorded_at if recorded_at is None else recorded_at[:10]
  d["published_timestamp"]  =  published_timestamp
  d["talks__tags"]  =  tags
  d["number_of__tags"]  =  len(tags or "")
  d["native_language"]  =  get_value(["talks", 0, "player_talks", 0, "nativeLanguage"], metadata)
  d["url__webpage"]  =  get_value(["url"], metadata)
  d["talk__more_resources"]  =  more_resources
  d["number_of__talk__more_resources"]  =  len(more_resources or "")
  d["talk__recommendations__blurb"]  =  get_value(["talks", 0, "recommendations", "blurb"], metadata)
  d["talk__recommendations"]  =  recommendations
  d["number_of__talk__recommendations"]  =  len(recommendations or "")
  d["related_talks"]  =  related_talks
  d["number_of__related_talks"]  =  len(related_talks or "")

  return d

In [23]:
def download(urls, id_, csv_list):
  """
    Download the information of a batch of TED talks.

    For every URL the JSON payload of the page is requested, retrying up to
    3 more times while it comes back empty. URLs whose payload is still
    empty after the retries are skipped; for the others, the record built by
    ``get_elements_dict_from_url`` is appended to ``csv_list``.

    Attributes
    ----------
    urls: list
      URLs of the talks to download
    id_: int
      identifier of the batch (not used by this function)
    csv_list: list
      list (shared between processes by the caller) that collects one record per talk
  """
  for count, url in enumerate(urls):
    json_obj = get__json_obj(url)

    # A dedicated retry counter keeps `count` equal to the position of the URL.
    retries = 0
    while not json_obj and retries < 3:
      json_obj = get__json_obj(url)
      retries += 1

    if not json_obj:
      continue

    csv_list.append(get_elements_dict_from_url(count, url, json_obj))

In [37]:
def scrape_ted_urls(urls, file_name, n_processes=100):
  """
    Scrape a list of TED talks in parallel and store the result as a CSV file.

    The URLs are split into ``n_processes`` batches of equal size (the
    remaining URLs are handed out one per batch, starting from the first),
    and each non-empty batch is downloaded by its own process. The collected
    records are sorted by ``view_count`` in descending order and written to
    ``file_name``.

    Attributes
    ----------
    urls: list
      URLs of the TED talks to scrape
    file_name: str
      path of the CSV file where the talks are stored
    n_processes: int, optional
      maximum number of worker processes (default: 100)

    Raises
    ------
      ValueError
        if n_processes is lower than 1
  """
  if n_processes < 1:
    raise ValueError('n_processes must be at least 1')

  urls = list(urls)
  chunk_size = len(urls) // n_processes
  chunks = [urls[(i*chunk_size):((i+1)*chunk_size)] for i in range(n_processes)]

  # URLs that do not fill a whole batch are spread over the first batches.
  leftovers = urls[(n_processes*chunk_size):]
  for i, leftover_url in enumerate(leftovers):
    chunks[i] += [leftover_url]

  rows = []
  with Manager() as manager:
    csv_list = manager.list()
    processes = []

    for id_, chunk in enumerate(chunks):
      if not chunk:
        # Nothing to download: do not pay for an idle process.
        continue
      p = Process(target=download, args=(chunk, id_, csv_list))
      processes.append(p)
      p.start()

    for p in processes:
      p.join()

    # Copy the shared list while the manager is still alive.
    rows = list(csv_list)

  dataframe_ted = pd.DataFrame(rows)
  dataframe_ted = dataframe_ted.sort_values("view_count", ascending=False)
  dataframe_ted.to_csv(file_name, index=False, encoding='utf-8')

In [25]:
def get_page_text(page_number):
  """
    Download one page of the TED talk listing (sorted by popularity) and
    return the elements that hold its talks.

    Attributes
    ----------
    page_number: int
      number of the listing page to download

    Returns
    -------
    list
      the elements of the page matching ``div.container.results div.col``
  """
  page_url = "https://www.ted.com/talks?sort=popular&page=" + str(page_number)
  response = requests.get(page_url, headers = {'User-agent': 'your bot 0.1'})
  return bs4.BeautifulSoup(response.text).select("div.container.results div.col")

In [27]:
def retrieve_pages_url():
  """
    Collect the URL of every talk in the TED talk listing.

    Listing pages are requested one after another, starting at page 1, until
    a page without talk elements is found. The collected URLs are also
    written, one per line, to ``TED_Talk_URLs.txt`` in the current directory.

    Returns
    -------
    list
      list with the URL of every talk found
  """
  urls = []
  page_number = 1

  while True:
    page_list_urls = get_page_text(page_number)

    # An empty page marks the end of the listing.
    if len(page_list_urls) == 0:
      break

    urls += get_urls(page_list_urls)
    page_number += 1

  # The context manager closes the file even if the write fails.
  with open('TED_Talk_URLs.txt', 'w') as file_ted:
    file_ted.write('\n'.join(urls))

  return urls

In [None]:
# Mount Google Drive, which is used to store the data
drive.mount('/content/gdrive')

# Switch to the directory where the file with the TED talk data is saved / looked up
os.chdir('/content/gdrive/My Drive')

In [42]:
# If the TED talk file does not exist yet, run the whole web-scraping process (takes about 6 minutes)
if not os.path.isfile(ted_file):
  ted_urls = retrieve_pages_url()
  scrape_ted_urls(ted_urls, ted_file)

In [None]:
# Load the TED talk data from the CSV file and preview its first rows
tedx_df = pd.read_csv(ted_file)
tedx_df.head()

In [None]:
# The tags column holds the text form of a list (e.g. "['a', 'b']"): split it on commas
# and produce one row per tag.
tedx_df = tedx_df.assign(talks__tags=tedx_df['talks__tags'].str.split(',')).explode('talks__tags')
# Remove the list brackets and quotes, then strip the blank left after each comma so that
# the same tag always yields the same label (otherwise "x" and " x" would be two classes).
tedx_df['tag'] = remove_character(remove_character(remove_character(tedx_df['talks__tags'], "["), "'"), "]").str.strip()

In [45]:
# Keep only the rows that have a transcript. NaN never compares equal to anything, so a
# `!= np.nan` comparison is True for every row and filters nothing; notna() is the actual test.
tedx_df = tedx_df[tedx_df['transcript'].notna()]

In [47]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
# NOTE(review): after the tag explode there is one row per (talk, tag), so the same transcript
# can appear on both sides of the split - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(tedx_df.transcript, tedx_df.tag, test_size=.30, random_state=42)
# The vectorizer expects text: convert the transcripts to unicode string arrays.
X_train = X_train.values.astype('U')
X_test = X_test.values.astype('U')
# The full dataframe is no longer needed; release its memory.
del tedx_df
gc.collect()

In [49]:
# TF-IDF vectorizer that tokenizes each document with the custom pre-processing chain
tfidf_vectorizer = TfidfVectorizer(tokenizer=pre_procesamiento_texto)

In [50]:
# Base learners of the stacked model. MultinomialNB is used as the naive Bayes learner
# because GaussianNB only accepts dense arrays, while the TF-IDF step feeds every
# estimator a sparse matrix (fitting would fail with "dense data is required").
estimators = [
    ('rf', RandomForestClassifier()),
    ('svr', LinearSVC()),
    ('nb', MultinomialNB()),
    ('dt', DecisionTreeClassifier()),
    ('ab', AdaBoostClassifier())
]

# The predictions of the base learners are combined by a logistic regression.
model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

In [51]:
# End-to-end pipeline: TF-IDF features followed by the stacked classifier.
pipeline_steps = [
    ('preprocessor', tfidf_vectorizer),
    ('classifier', model),
]
ml_pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the whole pipeline (vectorizer + stacked classifier) on the training split
ml_pipeline.fit(X_train, y_train)

In [None]:
# TODO: Apply the similarity_btw_docs function