<a href="https://colab.research.google.com/github/lromano97/text-mining/blob/main/webScraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scraping de la página TED

En ted_main de Kaggle hay 2550 charlas, mientras que en la página de [TED](https://www.ted.com/talks?page=142&sort=popular) hay hoy (12/mayo/21) 5092 charlas.

De acuerdo a la descarga, estos son los datos con los que contamos.

*   5092 charlas (5072 en inglés)
*   4482 transcripciones (4479 en inglés)
*   En la descarga hay varias charlas que están en otros idiomas, pero las transcripciones están en inglés

Esta notebook es una adaptación de [TED-Scraper en GitHub](https://github.com/The-Gupta/TED-Scraper).

Una posible mejora a este script es descargar el título, la descripción y la transcripción en español (aparentemente hay 4283).


In [None]:
# Preparación del entorno ----

# Descarga de metadatos y transcripciones
import time, requests, bs4, json, random
from multiprocessing import Manager, Process
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime


# Sin warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Mount Google Drive at /content/gdrive so the scraped data can be saved there.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
cd "/content/gdrive/My Drive/Colab Notebooks/Text Mining/TopicsTED"

/content/gdrive/My Drive/Colab Notebooks/Text Mining/TopicsTED


## Descarga de Metadatos y Transcripciones

In [None]:
# Start of the wall-clock timer for the whole download; the elapsed time is
# printed once the data has been saved (about 6 minutes on Colab).
time__ = time.time()


# Walk the paginated "most popular" talk listing and collect every talk URL.
urls = []
page_number = 0

while True:
    page_number += 1

    listing_res = requests.get(
        "https://www.ted.com/talks?sort=popular&page=" + str(page_number),
        headers={'User-agent': 'your bot 0.1'},
        timeout=30,  # never hang forever on a single stalled page
    )

    listing_soup = bs4.BeautifulSoup(listing_res.text)
    cards = listing_soup.select("div.container.results div.col")

    # A page without any result card means we ran past the last page.
    if not cards:
        break

    for card in cards:
        links = card.select("div.media__image a.ga-link")
        # Skip cards that carry no talk link instead of failing on links[0].
        if not links:
            continue
        urls.append("https://www.ted.com" + links[0].get("href"))


# Persist the URL list, one per line; the context manager guarantees the file
# is closed even if the write fails.
with open('TED_Talk_URLs.txt', 'w', encoding='utf-8') as url_file:
    url_file.write('\n'.join(urls))


# Header sent with every request issued by the download helpers.
_REQUEST_HEADERS = {'User-agent': 'your bot 0.1'}
# Seconds to wait for a response before treating the request as failed.
_REQUEST_TIMEOUT = 30
# Number of additional attempts made after a first failed attempt.
_EXTRA_ATTEMPTS = 3
# Text that precedes / follows the JSON payload embedded in a talk page.
_TALK_INIT_PREFIX = '<script data-spec="q">q("talkPage.init",'
_TALK_INIT_SUFFIX = ')</script>'
# CSS selector of the blocks holding the transcript paragraphs.
# NOTE(review): ":4" is part of a class name here; BeautifulSoup releases that
# delegate selectors to soupsieve may require it to be escaped - confirm
# against the installed version.
_TRANSCRIPT_SELECTOR = 'div.Grid.Grid--with-gutter.p-b:4'


def _fetch(url):
    """Return the response for *url*, or None when the request itself failed."""
    try:
        return requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        return None


def _extract_transcript(html):
    """Concatenate the first paragraph of every transcript block found in *html*."""
    soup = BeautifulSoup(html)
    pieces = []
    for block in soup.select(_TRANSCRIPT_SELECTOR):
        paragraphs = block.select('p')
        if not paragraphs:
            continue  # a block without text contributes nothing
        text = paragraphs[0].text
        pieces.append(text.strip().replace('\t', '').replace('\n', ' '))
    return ''.join(pieces)


def _get_transcript(url):
    """Download the transcript at *url*; return '' when every attempt fails.

    Each attempt starts from scratch, so text from an earlier attempt is never
    duplicated in the result.
    """
    for attempt in range(1 + _EXTRA_ATTEMPTS):
        if attempt:
            # Randomly wait for 0-0.9 seconds before trying again.
            time.sleep(random.randint(0, 900) / 1000)
        res = _fetch(url)
        if res is None or res.status_code != 200 or not res.text:
            continue
        transcript = _extract_transcript(res.text)
        if transcript:
            return transcript
    return ''


def _get_talk_json(url):
    """Return the JSON text passed to ``talkPage.init`` on the page, or ''."""
    res = _fetch(url.strip())
    if res is None:
        return ''
    html = res.text

    start = html.find(_TALK_INIT_PREFIX)
    if start == -1:
        return ''
    start += len(_TALK_INIT_PREFIX)

    end = html.find(_TALK_INIT_SUFFIX, start)
    if end == -1:
        return ''
    return html[start:end]


def _load_metadata(url):
    """Return the ``__INITIAL_DATA__`` object of the talk page, or None."""
    for _ in range(1 + _EXTRA_ATTEMPTS):
        raw = _get_talk_json(url)
        if raw:
            break
    else:
        return None

    try:
        return json.loads(raw)["__INITIAL_DATA__"]
    except (ValueError, KeyError, TypeError):
        # Malformed payload or unexpected shape: treat like a failed download.
        return None


def _get_value(path, node):
    """Follow *path* (keys / indexes) into *node*; return '' if a step is missing."""
    for step in path:
        try:
            node = node[step]
        except (KeyError, IndexError, TypeError):
            return ''
    return node


def _html_to_text(html):
    """Strip the markup from *html*; None and 'nan' values pass through unchanged."""
    if html is None or str(html) == 'nan':
        return html
    return BeautifulSoup(html).get_text()


def _format_published(raw):
    """Format a Unix timestamp as 'YYYY-MM-DD HH:MM:SS' (UTC); '' if unusable."""
    try:
        return datetime.utcfromtimestamp(int(raw)).strftime('%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError, OverflowError, OSError):
        return ''


def _build_record(url, metadata):
    """Build the output row for one talk from its page *metadata*."""
    d = dict()

    # About the talk.
    d["talk__id"] = _get_value(["current_talk"], metadata)
    d["talk__name"] = _get_value(["talks", 0, "title"], metadata)
    d["talk__description"] = _get_value(["description"], metadata)
    d["view_count"] = _get_value(["viewed_count"], metadata)
    d["duration"] = _get_value(["talks", 0, "duration"], metadata)    # In seconds.

    language = _get_value(["language"], metadata)
    # str(... or "") keeps the URL well-formed when the language is missing.
    url__transcript = url + "/transcript?language=" + str(language or "")
    d["transcript"] = _get_transcript(url__transcript)

    # One of:  TED Stage Talk, TEDx Talk, TED-Ed Original, TED Institute Talk,
    # Best of Web, Original Content, TED Salon Talk (partner),
    # Custom sponsored content
    d["video_type_name"] = _get_value(["talks", 0, "video_type", "name"], metadata)
    d["event"] = _get_value(["event"], metadata)

    # About the speaker.
    d["speaker__id"] = _get_value(["speakers", 0, "id"], metadata)
    d["speaker__name"] = _get_value(["talks", 0, "speaker_name"], metadata)
    d["speaker__description"] = _get_value(["speakers", 0, "description"], metadata)
    d["speaker__who_he_is"] = _get_value(["speakers", 0, "whotheyare"], metadata)
    d["speaker__why_listen"] = _html_to_text(_get_value(["speakers", 0, "whylisten"], metadata))
    d["all_speakers_details"] = _get_value(["speakers"], metadata)

    # Recorded and published time.
    recorded = _get_value(["talks", 0, "recorded_at"], metadata)
    # Keep only the first 10 characters of the recording time string.
    d["recording_date"] = recorded[:10] if isinstance(recorded, str) else recorded

    published = _get_value(["talks", 0, "player_talks", 0, "published"], metadata)
    d["published_timestamp"] = _format_published(published)

    # Tags.
    tags = _get_value(["talks", 0, "tags"], metadata)
    d["talks__tags"] = tags
    d["number_of__tags"] = len(tags or "")

    d["language"] = language
    d["native_language"] = _get_value(["talks", 0, "player_talks", 0, "nativeLanguage"], metadata)

    # URLs.
    d["url__webpage"] = _get_value(["url"], metadata)

    # More resources.
    more_resources = _get_value(["talks", 0, "more_resources"], metadata)
    d["talk__more_resources"] = more_resources
    d["number_of__talk__more_resources"] = len(more_resources or "")

    # Recommendations.
    d["talk__recommendations__blurb"] = _get_value(["talks", 0, "recommendations", "blurb"], metadata)

    rec_lists = _get_value(["talks", 0, "recommendations", "rec_lists"], metadata)
    d["talk__recommendations"] = rec_lists
    d["number_of__talk__recommendations"] = len(rec_lists or "")

    # Related talks.
    related = _get_value(["talks", 0, "related_talks"], metadata)
    d["related_talks"] = related
    d["number_of__related_talks"] = len(related or "")

    return d


def download(urls, id_, csv_list):
    """Scrape every talk page in *urls* and append one record per talk.

    Args:
        urls: Talk page URLs to process.
        id_: Identifier of the calling worker; not used by this function.
        csv_list: List-like collector; one dict per successfully scraped talk
            is appended to it.

    A talk whose metadata cannot be downloaded or parsed is reported by
    printing its URL and is then skipped, so a single bad page does not stop
    the rest of the batch.
    """
    for url in urls:
        metadata = _load_metadata(url)
        if metadata is None:
            print(url)
            continue
        csv_list.append(_build_record(url, metadata))

# Fan the URL list out over worker processes, gather the scraped records and
# save them as Excel and CSV files.
csv_list_ = []
with Manager() as manager:
    # Proxy list shared with the workers. It is only usable while the manager
    # is alive, hence the copy into csv_list_ before leaving this block.
    csv_list = manager.list()
    Processess = []

    concurreny_count = 100
    # Deal the URLs round-robin into one batch per worker; batch sizes differ
    # by at most one and every URL lands in exactly one batch.
    urls_ = [urls[i::concurreny_count] for i in range(concurreny_count)]

    for (id_, urls__) in enumerate(urls_):
        if not urls__:
            continue  # fewer URLs than workers: no process needed
        p = Process(target=download, args=(urls__, id_, csv_list))
        Processess.append(p)
        p.start()

    # Block until every worker process has finished its download() call.
    for p in Processess:
        p.join()

    csv_list_ = list(csv_list)

# Creating DataFrame.
df = pd.DataFrame(csv_list_)

# Sort - most popular first. The order is computed on a numeric copy of the
# column so rows whose view count is missing (stored as '') sort last instead
# of raising on a str/int comparison; the stored values are left untouched.
# The column check avoids a KeyError when nothing was scraped.
if "view_count" in df.columns:
    view_order = pd.to_numeric(df["view_count"], errors="coerce").sort_values(ascending=False).index
    df = df.loc[view_order]

# Saving. to_excel() takes no encoding argument in current pandas releases.
df.to_excel('TED_Talk.xlsx', index=False)
df.to_csv('TED_Talk.csv', index=False, encoding='utf-8')

# Elapsed wall-clock seconds since the timer was started.
print(round(time.time() - time__))

337
