In [None]:
# pip install youtube-transcript-api
# pip install selenium
# pip install pandas

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import sys
import csv
from youtube_transcript_api import YouTubeTranscriptApi  # pip install youtube-transcript-api
import re

In [10]:
# Text file holding the Insight Timer guided-meditation links, one per line
FILE_NAME_LINKS = 'guided-links.txt'
# Directory where each scraped Insight Timer transcript is stored as a .txt file
SCRIPTS_DIR = 'transcripts'
# Meditation categories: used as YouTube search terms and as Meditation_Type labels
MED_TYPES = ['focused', 'body-scan', 'visualization', 'reflection', 'movement']
# CSV built from the Insight Timer transcript files (later passed to data_clean)
INSIGHT_DATA_FILE = 'insight-transcripts-data.csv'

Note about using Selenium

Do the following if the Chrome driver is not installed on your PATH or the usual `webdriver.Chrome()` call does not work.
Install `webdriver-manager` (see the commented `pip` line in the next cell), then run the two import lines in that cell.

Replace `driver = webdriver.Chrome()` with the following line
`driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))`

In [6]:
# !pip install webdriver-manager  # alternatively install using the command line
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

### Webscrape Youtube

In [17]:
# CSV of (meditation type, video url) rows collected from the playlists
YOUTUBE_LINKS_FILE = 'yt-links.csv'
# CSV of playlist links found by searching YouTube for each meditation type
YOUTUBE_PLAYLISTS_LINKS_FILE = 'yt-playlists.csv'
# NOTE(review): not referenced by any visible cell - presumably lists playlists already processed; confirm
YOUTUBE_DONE_PLAYLISTS = 'yt-playlists-done.csv'
# CSV of (meditation type, video url, transcript) rows
YOUTUBE_DATA_FILE = 'yt-transcripts-data.csv'

In [4]:
def scroll_to_bottom(driver, times_to_scroll=None):
    """Keep jumping to the bottom of the page until the scroll position stops changing.

    Args:
        driver: object exposing ``execute_script`` (a Selenium WebDriver).
        times_to_scroll: optional cap on the number of jumps; when falsy the
            page is scrolled until two consecutive jumps end at the same position.
    """
    read_position_js = "return document.documentElement.scrollTop"
    jump_to_bottom_js = "window.scrollTo(0, document.documentElement.scrollHeight);"

    previous_position = driver.execute_script(read_position_js)
    jumps_done = 0

    while True:
        driver.execute_script(jump_to_bottom_js)
        current_position = driver.execute_script(read_position_js)
        # Pause between jumps so the page has time to append more content
        time.sleep(1)

        if times_to_scroll:
            jumps_done += 1
            if jumps_done >= times_to_scroll:
                print(f"Scrolled {jumps_done} times")
                return

        # An unchanged position means the last jump did not move the page
        if current_position == previous_position:
            return
        previous_position = current_position

In [5]:
# How many times each search-results page is scrolled to load more playlists
times_to_scroll_page = 3


def search_for_yt_playlists(driver, search_med_type):
    """Search YouTube for '<search_med_type> meditation playlists' and append the hits to a CSV.

    Rows of (meditation type, playlist url, lower-cased playlist title) are
    appended to YOUTUBE_PLAYLISTS_LINKS_FILE. A header row is written first when
    the file is empty. Playlists whose title mentions music/songs, or mentions
    neither the meditation type nor meditation/mindfulness/relaxation, are skipped.

    Args:
        driver: Selenium WebDriver used to load and query the results page.
        search_med_type: meditation type to search for (e.g. 'body-scan').
    """
    from urllib.parse import quote_plus

    # URL-encode the search term so spaces and special characters survive;
    # the `sp` parameter restricts the results to playlists only.
    query = quote_plus(f"{search_med_type} meditation playlists")
    url = f"https://www.youtube.com/results?search_query={query}&sp=EgIQAw%253D%253D"
    driver.get(url)
    time.sleep(3)

    scroll_to_bottom(driver, times_to_scroll_page)

    # Reduce the type to a stem (e.g. 'focused' -> 'focus', 'visualization' -> 'visual')
    # so that close variants in a title still match. Loop-invariant, so computed once.
    title_keyword = search_med_type.replace('-', ' ').replace('ed', '').replace('ization', '').replace('ion', '')

    with open(YOUTUBE_PLAYLISTS_LINKS_FILE, 'a', newline='', encoding="utf-8") as csvf:
        writer = csv.writer(csvf)

        # Add headers if file is empty (three columns, matching the rows written below)
        if os.path.getsize(YOUTUBE_PLAYLISTS_LINKS_FILE) == 0:
            writer.writerow(['Meditation Type', 'Playlist Link', 'Playlist Title'])

        playlist_box_list = driver.find_elements(By.TAG_NAME, 'ytd-playlist-renderer')
        for playlist_box in playlist_box_list:
            title = playlist_box.find_element(By.ID, 'video-title').text.lower()

            # Skip playlists that have music or ones that dont have meditation or the meditation type
            if 'music' in title or 'song' in title:
                print(f"'{title}' playlist skipped bc it has 'music' or 'song'")
                continue
            if not any(word in title for word in (title_keyword, 'meditat', 'mindful', 'relax')):
                print(f"'{title}' playlist skipped bc it doesn't have '{title_keyword}' or 'meditat' or 'mindful' or 'relax' in it")
                continue

            playlist_url = playlist_box.find_element(
                By.ID, 'view-more').find_element(
                    By.TAG_NAME, 'a').get_attribute('href')

            writer.writerow([search_med_type, playlist_url, title])
    print(f"Done writing {search_med_type} playlists to {YOUTUBE_PLAYLISTS_LINKS_FILE}")

In [6]:
def get_youtube_links_from_playlist(driver, playlist_url, med_type):
    """Append the url of every video in a playlist to YOUTUBE_LINKS_FILE.

    Each appended row is (med_type, video url).

    Args:
        driver: Selenium WebDriver used to load the playlist page.
        playlist_url: address of the playlist page.
        med_type: meditation type stored alongside every video url.

    Returns:
        The path of the links file (YOUTUBE_LINKS_FILE).
    """
    driver.get(playlist_url)
    time.sleep(3)

    # Scroll all the way down so every link of the playlist is revealed
    scroll_to_bottom(driver)

    list_locator = (By.TAG_NAME, 'ytd-playlist-video-list-renderer')
    playlist_element = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located(list_locator)
    )
    video_entries = playlist_element.find_elements(By.TAG_NAME, 'ytd-playlist-video-renderer')

    with open(YOUTUBE_LINKS_FILE, 'a', newline='') as links_csv:
        links_writer = csv.writer(links_csv)
        for entry in video_entries:
            # The first anchor inside an entry carries the video url
            video_url = entry.find_element(By.TAG_NAME, 'a').get_attribute('href')
            links_writer.writerow([med_type, video_url])

    print(f'Wrote {len(video_entries)} more links into {YOUTUBE_LINKS_FILE}')
    return YOUTUBE_LINKS_FILE

In [5]:
def _extract_youtube_id(youtube_url):
    """Return the video id of a YouTube watch url (or a youtu.be short link)."""
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(youtube_url.strip())
    # Standard watch links carry the id in the `v` query parameter; any other
    # parameters (list, index, pp, ...) are ignored.
    video_ids = parse_qs(parsed.query).get('v')
    if video_ids:
        return video_ids[0]
    # Short links (https://youtu.be/<id>) carry the id as the path
    if parsed.netloc.endswith('youtu.be'):
        short_id = parsed.path.strip('/')
        if short_id:
            return short_id
    raise ValueError(f'Could not find a video id in {youtube_url!r}')


def _clean_transcript_text(raw_text):
    """Flatten a transcript to one line and drop [bracketed] / (parenthesised) cues."""
    # Newlines go first so a cue that spans a line break is still removed
    text = re.sub(r'\n', ' ', raw_text)
    # Remove bracketed phrases  e.g [Music]
    text = re.sub(r'\[(.*?)\]', '', text)
    # Remove parenthesised phrases e.g (Music)
    return re.sub(r'\((.*?)\)', ' ', text)


def read_youtube_transcript(youtube_url, med_type):
    """Download the English transcript of a video and append it to YOUTUBE_DATA_FILE.

    The appended row is (med_type, youtube_url, cleaned transcript).

    The video id is parsed from the url instead of using ``str.lstrip``. That
    method removes a *set of characters*, so it silently truncated ids that
    begin with any character of 'https://www.youtube.com/watch?v='.

    Args:
        youtube_url: link to the video; extra query parameters are allowed.
        med_type: meditation type label stored in the first column.

    Raises:
        ValueError: if no video id can be found in ``youtube_url``.
        Exception: whatever ``YouTubeTranscriptApi.get_transcript`` raises
            (the calling cell notes this happens when subtitles are turned off).
    """
    youtube_id = _extract_youtube_id(youtube_url)

    # Returns a list of dicts; each dict holds one caption segment under 'text'
    script = YouTubeTranscriptApi.get_transcript(youtube_id, languages=['en'])

    script_string = ''.join(segment['text'] + ' ' for segment in script)
    fixed_script = _clean_transcript_text(script_string)

    # Write to data file
    with open(YOUTUBE_DATA_FILE, 'a', newline='', encoding='utf8') as csvf:
        writer = csv.writer(csvf)
        writer.writerow([med_type, youtube_url, fixed_script])
    print(f'Written line for {youtube_url}')

In [None]:
# DONE
# Get the list of links to playlists for each type of meditation and store it in a file
# Skip playlists if their playlist titles don't seem right

driver = webdriver.Chrome()

try:
    for med_type in MED_TYPES:
        search_for_yt_playlists(driver, med_type)
finally:
    # quit() ends the whole WebDriver session (browser and driver process);
    # close() only closes the current window. The finally block makes sure
    # the browser is released even if a search fails part-way.
    driver.quit()

In [None]:
# DONE
# Get links to each of the videos in each of the playlists
# Writes the links into YOUTUBE_LINKS_FILE
driver = webdriver.Chrome()

try:
    with open(YOUTUBE_PLAYLISTS_LINKS_FILE, 'r', encoding='utf8') as csvf:
        reader = csv.reader(csvf)
        next(reader, None)  # Skip the header (tolerates an empty file)
        for row in reader:
            if len(row) < 2:
                continue  # blank or malformed line: no playlist link to follow
            meditation_type = row[0]
            link = row[1]
            print('playlist link: ', link)
            get_youtube_links_from_playlist(driver, link, meditation_type)
finally:
    # quit() ends the whole WebDriver session; close() only closes the window.
    # The finally block releases the browser even if a playlist fails to load.
    driver.quit()


In [6]:
# DONE
# Use the YouTube transcript API to read the transcripts into the data file

# Read the (meditation type, video url) pairs from the links file
med_types = []
yt_vid_urls = []
with open(YOUTUBE_LINKS_FILE, 'r') as csvf:
    for row in csv.reader(csvf):
        if len(row) < 2:
            continue  # blank or malformed line
        med_types.append(row[0])
        yt_vid_urls.append(row[1].rstrip())

# Add the header if the DATA file is missing or empty. (The check used to
# look at the playlists file, so the header was never written here.)
if not os.path.exists(YOUTUBE_DATA_FILE) or os.path.getsize(YOUTUBE_DATA_FILE) == 0:
    with open(YOUTUBE_DATA_FILE, 'a', newline='', encoding='utf8') as csvf:
        csv.writer(csvf).writerow(['Meditation_Type', 'URL', 'Script'])

# For each url, read the transcript into the data file
failed_writes = 0
for med_type, yt_vid_url in zip(med_types, yt_vid_urls):
    try:
        read_youtube_transcript(yt_vid_url, med_type)
    except Exception as exc:
        # Typically occurs if subtitles are turned off for the video. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        failed_writes += 1
        print(f'Skipped {yt_vid_url} ({type(exc).__name__})')

print("Done!")
print(f'Failed writes: {failed_writes}')
print(f'Successful writes: {len(yt_vid_urls) - failed_writes} new transcripts')

Written line for https://www.youtube.com/watch?v=9B_NSBWGYCw&list=PLNXcT403FhX-B-eR5cDPPosPT6dMvqC8Q&index=44&pp=iAQB
Written line for https://www.youtube.com/watch?v=8MYf_k2mJsg&list=PLNXcT403FhX-B-eR5cDPPosPT6dMvqC8Q&index=45&pp=iAQB
Written line for https://www.youtube.com/watch?v=CcqZ47d398k&list=PLNXcT403FhX-B-eR5cDPPosPT6dMvqC8Q&index=46&pp=iAQB
Written line for https://www.youtube.com/watch?v=L5FybAebFdA&list=PLNXcT403FhX-B-eR5cDPPosPT6dMvqC8Q&index=47&pp=iAQB
Written line for https://www.youtube.com/watch?v=QJ_V0IgONo0&list=PLNXcT403FhX-B-eR5cDPPosPT6dMvqC8Q&index=48&pp=iAQB
Written line for https://www.youtube.com/watch?v=6kVVrE_sCNA&list=PLNXcT403FhX-B-eR5cDPPosPT6dMvqC8Q&index=49&pp=iAQB
Written line for https://www.youtube.com/watch?v=OK4tWkM8NkE&list=PLtLiP0-8GtS04laRvWbgwBO97fnKv6WSn&index=2&pp=iAQB
Written line for https://www.youtube.com/watch?v=iHLQOHZJync&list=PLtLiP0-8GtS04laRvWbgwBO97fnKv6WSn&index=3&pp=iAQB
Written line for https://www.youtube.com/watch?v=2rwCdGW-7

In [None]:
# %%bash
# cat yt-transcripts_data.csv |
# # cat yt-transcripts_data.csv | grep "music" | grep -v "movement"
# # cat yt-transcripts_data.csv | grep "song"
# cat yt-transcripts_data.csv | grep -v ".*,.*,.*[medit|mindful|relax]"
# cat yt-transcripts_data.csv | grep "♪" | grep -v "list=PLLNX"

# # Remove religious words, e.g. words that contain catholic, allah, buddh

### Data Cleaning

In [3]:
import pandas as pd
import re
import csv

In [19]:
# Regex patterns for channel-specific intros. Each one starts with a greedy
# `.*`, so everything up to and including the phrase is removed.
PHRASES_TO_REMOVE = [
    # Remove the intro
    r'.*welcome to carries conscious living are you ready to',
    r'.*on the 7th of this month are you ready to',
    r'.*are (you|we) ready to meditate (with|the)',
    r'.*join the patreon the links are in the description',
    r'.*if you are returning welcome back here at the',
    r".*I'm Sara Raymond here at the mindful movement",
    r'.*your host Brian Scott',
    r'.*create your free Mindvalley account today at mindvalley.com'
]


def _clean_script(script):
    """Return one transcript with cues, repeated filler words and known intros removed."""
    s = str(script)
    s = re.sub(r'\[.*?\]', ' ', s)  # Remove content inside square brackets
    s = re.sub(r'\(.*?\)', ' ', s)  # Remove content inside parentheses
    s = re.sub(r'\s+', ' ', s)  # Replace consecutive whitespace with a single space

    # Collapse runs of a repeated filler word (seen in the YouTube data). The
    # word boundaries stop the pattern from eating the start of the next word
    # ("so sound" used to become "sound").
    s = re.sub(r'\bso( so)+\b', 'so', s)
    s = re.sub(r'\bforeign( foreign)+\b', 'foreign', s)

    for phrase in PHRASES_TO_REMOVE:
        s = re.sub(phrase, '', s)
    return s


def data_clean(csv_file_path):
    """Write a cleaned copy of a transcripts CSV next to the original.

    The input must have the columns Meditation_Type, URL and Script. The output
    has the same columns and is named '<input without .csv>-cleaned.csv'.
    Rows without a Script are dropped because there is nothing to clean.

    Args:
        csv_file_path: path of the CSV file to clean.
    """
    # Remove the extension as a suffix. str.rstrip('.csv') strips a set of
    # characters and would also eat trailing 'c', 's', 'v' or '.' of the name.
    suffix = '.csv'
    if csv_file_path.endswith(suffix):
        base_name = csv_file_path[:-len(suffix)]
    else:
        base_name = csv_file_path
    cleaned_file_name = base_name + '-cleaned.csv'

    df = pd.read_csv(csv_file_path, encoding='utf8')
    # dropna returns a new frame, so the result has to be kept
    df = df.dropna(subset=['Script'])

    with open(cleaned_file_name, 'w', encoding='utf8', newline='') as csvf:
        writer = csv.writer(csvf)
        writer.writerow(['Meditation_Type','URL','Script'])
        for _, item in df.iterrows():
            writer.writerow([item['Meditation_Type'], item['URL'], _clean_script(item['Script'])])

    print(f'Written cleaned data to {cleaned_file_name}')

In [20]:
# Clean the YouTube transcripts; the result is written to a '-cleaned.csv' file (see the output below)
data_clean(YOUTUBE_DATA_FILE)

Written cleaned data to yt-transcripts-data-cleaned.csv


In [14]:
# Clean the Insight Timer transcripts; the result is written to a '-cleaned.csv' file (see the output below)
data_clean(INSIGHT_DATA_FILE)

Written cleaned data to insight-transcripts-data-cleaned.csv


### Webscrape Insighttimer Website

In [4]:
def save_guided_links(driver, scroll_times=400, scroll_pause=0.5):
    """Scroll the currently loaded page and save the links labelled 'GUIDED'.

    FILE_NAME_LINKS is overwritten with one link per line. Links whose label
    does not contain 'GUIDED' (e.g. 'MUSIC') are skipped.

    Args:
        driver: Selenium WebDriver with the listing page already loaded.
        scroll_times: how many times to jump to the bottom of the page so more
            links get loaded.
        scroll_pause: seconds to wait after each jump.

    Returns:
        The path of the links file (FILE_NAME_LINKS).
    """
    time.sleep(3)
    grid = driver.find_element(By.CSS_SELECTOR, '.MuiGrid-root.MuiGrid-container.MuiGrid-spacing-xs-3')

    print('Starting to Scroll Down')
    # Scroll to the bottom of page to load more links
    for _ in range(scroll_times):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)
    print('Done Scrolling Down')

    links = grid.find_elements(By.TAG_NAME, 'a')
    print("Num Links", len(links))

    # Mode 'w' already empties the file, so no separate truncate step is needed
    with open(FILE_NAME_LINKS, 'w') as f:
        for link_ele in links:
            guided_or_music = link_ele.find_element(By.CSS_SELECTOR, '.chakra-text.css-gxmra2').text
            if 'GUIDED' not in guided_or_music:
                continue  # skip the link if it lists 'MUSIC'
            link = link_ele.get_attribute('href')
            # get_attribute returns None when the anchor has no href
            if link:
                f.write(link + '\n')

    return FILE_NAME_LINKS

In [5]:
def page_has_transcript(driver, url):
    """Return True if the currently loaded page shows an element whose text is 'Transcript'.

    Args:
        driver: Selenium WebDriver with the page already loaded.
        url: address of the page; not used by the check itself.

    Returns:
        True when a matching element is found and its text is exactly
        'Transcript'. False otherwise, including when the lookup fails.
    """
    try:
        transcript = driver.find_element(By.XPATH, "//*[text()='Transcript']")
    except Exception:
        # A failed lookup means "no transcript". Catching Exception rather than
        # using a bare except lets KeyboardInterrupt / SystemExit stop a long run.
        return False

    if transcript.text == 'Transcript':
        print('Transcript Found')
        return True
    return False

In [6]:
def read_transcript_insighttimer(driver, url):
    """Reads the transcript on the page into a .txt file.

    The file is SCRIPTS_DIR/script-<slug>.txt, where <slug> is the last path
    segment of ``url``. The first line written is the url and every following
    line is one transcript element. The file is opened in append mode.

    The slug is parsed from the url instead of using ``str.lstrip``. That
    method removes a *set of characters*, so it mangled slugs (for example
    'morning-meditation' was reduced to an empty string).

    Args:
        driver: Selenium WebDriver with the meditation page already loaded.
        url: address of the page; surrounding whitespace/newlines are ignored.

    Raises:
        ValueError: if ``url`` has no path segment to name the file after.
    """
    from urllib.parse import urlparse

    div = driver.find_element(By.CLASS_NAME, 'css-14kzvyt')
    more_button = driver.find_element(By.CSS_SELECTOR, '.chakra-button.css-ryu1zs')

    # scroll button into view and click
    driver.execute_script("arguments[0].scrollIntoView({ behavior: 'smooth', block: 'nearest', inline: 'start' });", div)
    time.sleep(2)
    more_button.click()

    # Wait until the expanded section is present before reading the lines
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located(
            (By.CLASS_NAME, 'MuiCollapse-wrapperInner')
        )
    )

    line_text = driver.find_elements(By.CLASS_NAME, 'css-dbagas')

    # Urls read with readlines() still end in '\n'
    clean_url = url.strip()
    slug = urlparse(clean_url).path.rstrip('/').rsplit('/', 1)[-1]
    if not slug:
        raise ValueError(f'Could not derive a file name from {url!r}')
    print('id', slug)

    os.makedirs(SCRIPTS_DIR, exist_ok=True)
    new_file = os.path.join(SCRIPTS_DIR, f'script-{slug}.txt')

    # Write text to a .txt file
    with open(new_file, 'a') as f:
        f.write(clean_url + '\n')  # First line has url

        for line in line_text:
            f.write(line.text + '\n')
    print(f'Written .txt for {clean_url}')

In [7]:
def get_transcripts_from_links(driver):
    """Visit every link in FILE_NAME_LINKS and save the transcript of each page that has one.

    The driver is always quit at the end, even when a page fails, so the
    caller must not reuse it afterwards.

    Args:
        driver: Selenium WebDriver used to load the pages.
    """
    with open(FILE_NAME_LINKS, 'r') as f:
        # Drop the trailing newline of every line and ignore blank lines
        guided_urls = [line.strip() for line in f if line.strip()]

    try:
        for url in guided_urls:
            driver.get(url)
            time.sleep(3)
            if page_has_transcript(driver, url):
                # Extract the Script
                read_transcript_insighttimer(driver, url)

        print('DONE WITH ALL LINKS')

        time.sleep(10)  # Let the user actually see something!
    finally:
        driver.quit()

In [8]:
def remove_newlines(transcripts_dir):
    """Rewrite every file in ``transcripts_dir`` without its blank lines.

    A line counts as blank when it is exactly a newline, or a single space
    followed by a newline. All other lines are kept unchanged.
    """
    blank_lines = ('\n', ' \n')

    for entry in os.listdir(transcripts_dir):
        target = os.path.join(transcripts_dir, entry)

        # Read everything first, because the same file is rewritten below
        with open(target, 'r') as source:
            kept_lines = [row for row in source.readlines() if row not in blank_lines]

        with open(target, 'w') as destination:
            destination.writelines(kept_lines)

In [9]:
def create_csv(csv_file, scripts_dir):
    """Create csv file given scripts_dir.

    Each file in ``scripts_dir`` must start with a line holding its http(s) url.
    The rest of the file is the transcript. One row is written per file with the
    columns Meditation_Type, URL and Script. This is the layout ``data_clean``
    reads; the URL used to be read from the first line and then discarded.

    Meditation_Type is the first entry of MED_TYPES that the file name starts
    with, or an empty string when none matches.

    Args:
        csv_file: path of the CSV file to (over)write.
        scripts_dir: directory holding the transcript .txt files.

    Raises:
        SystemExit: if a file does not start with an http link.
    """
    with open(csv_file, 'w', newline='') as csvf:
        writer = csv.writer(csvf)

        # Column Names
        writer.writerow(['Meditation_Type', 'URL', 'Script'])

        # Sorted so the row order does not depend on the file system
        for file_name in sorted(os.listdir(scripts_dir)):
            with open(os.path.join(scripts_dir, file_name), 'r') as f:

                first_line = f.readline()
                if not first_line.startswith('http'):
                    print(f'file {file_name} does not start with a http link')
                    sys.exit()

                # Add the type of meditation if listed. No trailing space, so the
                # label is identical to the entry in MED_TYPES.
                key = next((med_type for med_type in MED_TYPES if file_name.startswith(med_type)), '')
                print(key)

                text = f.read()

                writer.writerow([key, first_line.strip(), text])

In [None]:
# Listing page that shows the guided meditations
URL = 'https://insighttimer.com/guided-meditations'
driver = webdriver.Chrome()
# Use this line instead if the driver is not installed on the PATH
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
driver.get(URL)
time.sleep(2)

# Scroll through the page and save the guided-meditation links (about 10,000 according to the original note)
save_guided_links(driver)

# Using the list of links, go through each page, check for a transcript and save it.
# Note: this call also quits the driver when it is done.
get_transcripts_from_links(driver)

# Strip the blank lines from the saved transcript files
remove_newlines(SCRIPTS_DIR)

# Combine the transcript files into a single CSV
create_csv(INSIGHT_DATA_FILE, SCRIPTS_DIR)